In [1]:
import numpy as np
import tensorflow as tf
import pandas as pd
import random
import matplotlib.pyplot as plt

In [2]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import CategoryEncoding, Embedding, GlobalAveragePooling1D, LSTM
from tensorflow.keras.layers import Input, Dense
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import SGD, Adam, RMSprop

In [3]:
# Load the news dataset from a relative path, decoding the file as UTF-8.
caminho_csv = "../../../dados/nlp/news_sentiment_analysis.csv"
df = pd.read_csv(caminho_csv, encoding="utf-8")

In [4]:
# Keep only the "Description" text column as a one-column DataFrame.
df_news = df.loc[:, ["Description"]]

In [30]:
# Vocabulary cap: passed to the Tokenizer as `num_words` and reused as the
# number of output classes of the model.
num_palavra = 3_000

In [5]:
# Word-level tokenizer: lower-cases the text, is limited to `num_palavra`
# indices when encoding, and uses "<OOV>" for words outside that limit.
tokenizer = Tokenizer(
    lower=True,
    oov_token="<OOV>",
    num_words=num_palavra,
)
tokenizer.fit_on_texts(df_news["Description"])

# `word_index` lists every distinct word seen while fitting, so its length is
# the full vocabulary size and is not capped by `num_palavra`.
vocabulario = tokenizer.word_index
len(vocabulario)

20860

In [6]:
# Encode every description as a list of integer word indices.
entrada_sequencias = tokenizer.texts_to_sequences(
    df_news["Description"]
)

In [None]:
# Build next-word training pairs: for each position after the first token,
# the input is the prefix before that position and the target is the token
# at that position. Sequences with fewer than two tokens yield no pairs.
entrada_features = []
saida_classes = []
for seq_palavras in entrada_sequencias:
    for posicao in range(1, len(seq_palavras)):
        entrada_features.append(seq_palavras[:posicao])
        saida_classes.append(seq_palavras[posicao])

In [42]:
# Sample count divided by 32, the batch_size passed to modelo.fit:
# the (fractional) number of batches in one epoch.
len(entrada_features) / 32

5706.96875

In [22]:
# Peek at the first five input prefixes.
entrada_features[:5]

[[599], [599, 1], [599, 1, 118], [599, 1, 118, 1], [599, 1, 118, 1, 1]]

In [9]:
# Peek at the first five target tokens (one per input prefix).
saida_classes[0:5]

[1, 118, 1, 1, 6]

In [20]:
# Length of the longest input prefix (0 when there are no samples); used as
# the common length when padding.
max_size_entrada = max(map(len, entrada_features), default=0)
max_size_entrada

111

In [24]:
# Left-pad every prefix with zeros up to the longest prefix length.
entrada_padded = pad_sequences(
    entrada_features,
    maxlen=max_size_entrada,
    padding='pre',
    value=0,
)

In [38]:
# One-hot encode the target tokens over `num_palavra` classes. The result is
# a dense float32 tensor with one row per target and `num_palavra` columns.
output_encoder = CategoryEncoding(
    num_tokens=num_palavra,
    output_mode="one_hot",
)
saida_encoded = output_encoder(saida_classes)
saida_encoded[:5]

<tf.Tensor: shape=(5, 3000), dtype=float32, numpy=
array([[0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)>

In [48]:
# Next-word model: token embedding -> LSTM -> softmax over `num_palavra` classes.
modelo = Sequential(
    [
        Embedding(input_dim=(num_palavra + 1), output_dim=100, mask_zero=False),
        LSTM(64),
        Dense(num_palavra, activation="softmax"),
    ]
)
modelo.summary()

In [50]:
# Categorical cross-entropy matches the one-hot encoded targets.
modelo.compile(
    loss="categorical_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

In [52]:
# Train on the padded prefixes against the one-hot targets.
resultado = modelo.fit(
    entrada_padded,
    saida_encoded,
    epochs=10,
    batch_size=32,
)

Epoch 1/10
[1m5707/5707[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m320s[0m 55ms/step - accuracy: 0.2285 - loss: 5.6393
Epoch 2/10
[1m5707/5707[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m268s[0m 47ms/step - accuracy: 0.3264 - loss: 4.3338
Epoch 3/10
[1m5707/5707[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m244s[0m 43ms/step - accuracy: 0.3815 - loss: 3.7973
Epoch 4/10
[1m5707/5707[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m262s[0m 46ms/step - accuracy: 0.4157 - loss: 3.4447
Epoch 5/10
[1m5707/5707[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m231s[0m 41ms/step - accuracy: 0.4364 - loss: 3.2284
Epoch 6/10
[1m5707/5707[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m277s[0m 49ms/step - accuracy: 0.4574 - loss: 3.0248
Epoch 7/10
[1m5707/5707[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m269s[0m 47ms/step - accuracy: 0.4766 - loss: 2.8582
Epoch 8/10
[1m5707/5707[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m213s[0m 37ms/step - accuracy: 0.4928 - loss: 2.7257


In [60]:
# Show the description stored under index label 100 as a sample text.
df_news.loc[100, "Description"]

'In the July 9 episode of EssentiallySports Think Tank, industry titan Rob Kligman, Chief Revenue Officer of Anthem Sports Group and former WWE executive, pulls back the curtain on the future of sports entertainment. The Ex-VP of Global Digital & Integrated Sponsorship Sales at WWE—a seasoned leader with a proven track record—offers C-suite executives an [&#8230;]The post Mastermind Behind WWE’s Netflix Deal Predicts a Bold New Chapter With UFC and TNA: “Sky’s the Limit” appeared first on EssentiallySports.'

sequencia_predict:  [[   1   46   98    5  192 1038  758]]


In [112]:
# Predict the word that follows a seed text.
texto = "In the July 9 episode of EssentiallySports Think Tank"

# Encode and left-pad the seed the same way as the training inputs.
sequencia_predict = np.array(tokenizer.texts_to_sequences([texto]))
print("sequencia_predict: ", sequencia_predict)
sequencia_predict_padded = pad_sequences(
    sequencia_predict, maxlen=max_size_entrada, padding="pre", value=0
)
print("sequencia_predict_padded:", sequencia_predict_padded)

# One row of class probabilities per input sequence; the array is passed
# directly instead of being wrapped in a list.
previsao = modelo.predict(sequencia_predict_padded)
print("Previsao: ", previsao)
print("Previsao Shape: ", previsao.shape)

# Most probable class for the single sequence in the batch. The index is now
# actually printed (the original print call omitted the value).
word_index = int(np.argmax(previsao[0]))
print("Previsao Word Index: ", word_index)

# Reverse lookup through the tokenizer's index -> word mapping instead of
# scanning the whole vocabulary. An index with no entry (e.g. 0, which is
# only used as the padding value) is reported rather than silently skipped.
palavra = tokenizer.index_word.get(word_index)
if palavra is None:
    print("Palavra: ", None, "(no vocabulary entry for index", word_index, ")")
else:
    print("Palavra: ", palavra)


sequencia_predict:  [[   7    2   32  187 1351    3  562 2515    1]]
sequencia_predict_padded: [[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    7    2   32  187 1351    3  562 2515    1]]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 117ms/step
Previsao:  [[3.2466636e-09 8.1433080e-02 1.3924631e-02 ... 3.2764657e-08
  1.9850326e-08 1.1071960e-07]]
Previsao Shape:  (1, 3000)
Previsao Word Index: 
Palavra:  <OOV>
