In [1]:
import numpy as np

In [2]:
import pandas as pd

In [3]:
import tensorflow as tf

In [4]:
from string import punctuation

In [5]:
import nltk    
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [6]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [7]:
from tensorflow.keras.models import Sequential

In [8]:
from tensorflow.keras.layers import CategoryEncoding, Input, Dense, StringLookup, Embedding, LSTM

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
nltk.download('punkt') # Necessário para word_tokenize
nltk.download('stopwords')
palavras_remover = set(stopwords.words())

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\anton\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\anton\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
df = pd.read_csv("../../../dados/nlp/news_sentiment_analysis.csv", encoding="utf-8")

In [12]:
df.head()

Unnamed: 0,Source,Author,Title,Description,URL,Published At,Sentiment,Type
0,stgnews,Bridger Palmer,Pine View High teacher wins Best in State awar...,"ST. GEORGE — Kaitlyn Larson, a first-year teac...",https://www.stgeorgeutah.com/news/archive/2024...,2024-07-12T23:45:25+00:00,positive,Business
1,Zimbabwe Mail,Staff Reporter,Businesses Face Financial Strain Amid Liquidit...,"Harare, Zimbabwe – Local businesses are grappl...",https://www.thezimbabwemail.com/business/busin...,2024-07-12T22:59:42+00:00,neutral,Business
2,4-traders,,Musk donates to super pac working to elect Tru...,(marketscreener.com) Billionaire Elon Musk has...,https://www.marketscreener.com/business-leader...,2024-07-12T22:52:55+00:00,positive,Business
3,4-traders,,US FTC issues warning to franchisors over unfa...,(marketscreener.com) A U.S. trade regulator on...,https://www.marketscreener.com/quote/stock/MCD...,2024-07-12T22:41:01+00:00,negative,Business
4,PLANET,,Rooftop solar's dark side,4.5 million households in the U.S. have solar ...,https://www.npr.org/2024/07/12/1197961036/roof...,2024-07-12T22:28:19+00:00,positive,Business


In [13]:
def limpar_texto( texto ):
    translator = str.maketrans('—’', '  ', punctuation)
    texto_limpo = texto.lower().translate( translator )
    tokens = word_tokenize(texto_limpo)
    tokens_limpos = []
    for token in tokens:
        if token not in palavras_remover:
            tokens_limpos.append(token)
    return " ".join(tokens_limpos)

In [14]:
df["TextClean"] = df["Description"].apply(limpar_texto)

In [15]:
MAX_PALAVRAS = 5000
tokenizer = Tokenizer(num_words = MAX_PALAVRAS, oov_token="<UNKNOW>") # OOV = Out Of Vocabulary

In [16]:
tokenizer.fit_on_texts( df["TextClean"] )

In [138]:
len(tokenizer.word_index)
# tokenizer.word_index["firstyear"]

20723

In [18]:
token_sequences = tokenizer.texts_to_sequences( df["TextClean"] )

In [19]:
len(token_sequences[2])

30

In [20]:
# token_sequences[0]
# [365,  1,  1, 1, 292, 1, 1, 1, 117, 217, 161, 233, 61, 713, 13, 863, 1, 1, 89, 1, 1, 1, 1, 453, 1, 1, 1, 820, 1, 90, 1, 1, 785, 97, 1, 5]
# token_sequences[1]
# [1, 1, 76, 165, 86, 1, 1, 1, 1, 1]

In [21]:
# Palavras      Label
# [365,  1]       1
# [365, 1, 1]     1
# [365, 1, 1, 1]  292 

In [22]:
entrada_features = []
saida_label = []
for tokens in token_sequences:
    for i in range(2, len(tokens)):
        features = tokens[0:i]
        label = tokens[i]
        # print(f"{i} - Tokens: ", tokens, "      Features: ", features,  "    Label: ", label)
        entrada_features.append( features )
        saida_label.append( label )
        # print(f"Features: {features}           Label: {label}")
# entrada_features
# saida_label

In [23]:
len(entrada_features)

96028

In [24]:
sequencias = pad_sequences( entrada_features, padding="pre" )
sequencias.shape

(96028, 70)

In [25]:
sequencias[0:4]

array([[  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0, 365,   1],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0, 365,   1,   1],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0

In [26]:
saida_label[0:4]

[1, 1, 292, 3988]

In [27]:
MAX_PALAVRAS

5000

In [28]:
output_encoder = CategoryEncoding(num_tokens=MAX_PALAVRAS, output_mode="one_hot")
saida_encoded = output_encoder( saida_label ).numpy()
saida_encoded[0:10]

array([[0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [76]:
modelo = Sequential()
modelo.add( Embedding( input_dim=(MAX_PALAVRAS + 1), output_dim=50, mask_zero=False))
modelo.add( LSTM( 128 ) )
modelo.add( Dense( MAX_PALAVRAS, activation="softmax") )
modelo.summary()
        

In [78]:
modelo.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["Accuracy"])

In [80]:
X_train, X_test, Y_train, Y_test = train_test_split( sequencias, saida_encoded, random_state=50, test_size=0.2 )

In [81]:
X_train.shape, X_test.shape

((76822, 70), (19206, 70))

In [82]:
Y_train.shape, Y_test.shape

((76822, 5000), (19206, 5000))

In [None]:
Y_test.shape

In [83]:
resultado = modelo.fit( X_train, Y_train, epochs=15, batch_size=32 )

Epoch 1/15
[1m2401/2401[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m205s[0m 79ms/step - Accuracy: 0.2411 - loss: 6.6914
Epoch 2/15
[1m2401/2401[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m198s[0m 82ms/step - Accuracy: 0.2898 - loss: 5.4391
Epoch 3/15
[1m2401/2401[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m194s[0m 81ms/step - Accuracy: 0.3551 - loss: 4.6784
Epoch 4/15
[1m2401/2401[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m198s[0m 82ms/step - Accuracy: 0.3960 - loss: 4.1581
Epoch 5/15
[1m2401/2401[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m164s[0m 68ms/step - Accuracy: 0.4333 - loss: 3.7259
Epoch 6/15
[1m2401/2401[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m231s[0m 80ms/step - Accuracy: 0.4677 - loss: 3.3360
Epoch 7/15
[1m2401/2401[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m244s[0m 102ms/step - Accuracy: 0.5006 - loss: 3.0066
Epoch 8/15
[1m2401/2401[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m160s[0m 67ms/step - Accuracy: 0.5378 - loss: 2.6920

In [88]:
modelo.evaluate( X_test, Y_test )

[1m601/601[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 37ms/step - Accuracy: 0.5397 - loss: 3.9720


[4.029179573059082, 0.5426949858665466]

In [116]:
df["TextClean"][10]

'drink kirland ellis partner flexes dexterity mid business callgotta love rankings check list amazing small boutique midsize firmsreally details bar legal educators show concern means studentsshaky memory absent law license florida supreme court strips former judge ability practiceimmunity immunity trump thinks yelling repeatedly rid legal troublesthe post quick wits quicker reflexes 8212 appeared law'

In [104]:
def predict_next_word( text ):
    predict_tokens = tokenizer.texts_to_sequences( [ text ] )
    predict_padded = pad_sequences( predict_tokens, maxlen=sequencias.shape[1], padding="pre" )
    output_vector = modelo.predict( predict_padded )
    output_index = np.argmax( output_vector )
    palavra = "<NAO ENCONTRADA>"
    for token, index in tokenizer.word_index.items():
        if index == output_index:
            palavra = token
            break
    return palavra
        

In [134]:
texto = "drink kirland ellis partner flexes dexterity mid business callgotta love rankings check list amazing small boutique midsize firmsreally details bar legal educators show concern means studentsshaky memory absent law license florida supreme court strips former judge ability practiceimmunity"

In [136]:
predict_next_word( texto )

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 348ms/step


'<UNKNOW>'