In [1]:
import numpy as np

In [2]:
import tensorflow as tf

In [6]:
import pandas as pd

In [19]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [87]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [134]:
from tensorflow.keras.layers import CategoryEncoding

In [136]:
from sklearn.preprocessing import LabelEncoder

In [148]:
from sklearn.model_selection import train_test_split

In [176]:
from tensorflow.keras.models import Sequential

In [182]:
from tensorflow.keras.layers import Input, Dense

In [290]:
from tensorflow.keras.optimizers import SGD, Adam, RMSprop

## Importar os dados

In [9]:
df = pd.read_csv("../../../dados/nlp/news_sentiment_analysis.csv", encoding="utf-8")

In [91]:
# df

In [13]:
news_df = df[ ["Description", "Type"] ] 

In [117]:
news_df["Type"].unique()

array(['Business', 'Entertainment', 'General', 'Health', 'Science',
       'Sports', 'Technology'], dtype=object)

In [49]:
tokenizer = Tokenizer()

In [51]:
tokenizer.fit_on_texts( news_df["Description"] )

In [53]:
len(tokenizer.index_word)

20859

In [67]:
tokenizer.index_word[12892]

'kaitlyn'

In [57]:
sequencias = tokenizer.texts_to_sequences( news_df["Description"] )

In [71]:
len(sequencias)

3500

In [83]:
print("Tamanho Sequencia 0: ", len(sequencias[0]))
print("Tamanho Sequencia 1: ", len(sequencias[1]))
print("Tamanho Sequencia 2: ", len(sequencias[2]))

Tamanho Sequencia 0:  58
Tamanho Sequencia 1:  15
Tamanho Sequencia 2:  77


In [85]:
max_features = 0
for sequencia in sequencias:
    if len(sequencia) > max_features:
        max_features = len(sequencia)
max_features

112

In [95]:
sequences_padded = pad_sequences( sequencias, padding="post" )

In [97]:
print("Tamanho Sequencia 0: ", len(sequences_padded[0]))
print("Tamanho Sequencia 1: ", len(sequences_padded[1]))
print("Tamanho Sequencia 2: ", len(sequences_padded[2]))

Tamanho Sequencia 0:  112
Tamanho Sequencia 1:  112
Tamanho Sequencia 2:  112


In [99]:
sequences_padded

array([[  598,  6262,   117, ...,     0,     0,     0],
       [12897,  3561,   211, ...,     0,     0,     0],
       [   28,    22,  2751, ...,     0,     0,     0],
       ...,
       [ 1070,   216,   231, ...,     0,     0,     0],
       [ 2980,   933,    31, ...,     0,     0,     0],
       [ 1070,   216,   231, ...,     0,     0,     0]])

## Codificando as saidas

['Business', 'Entertainment', 'General', 'Health', 'Science',
       'Sports', 'Technology']

Business = [ 1, 0, 0, 0, 0, 0, 0 ]
Entertainment = [ 0, 1, 0, 0, 0, 0, 0 ]
General = [ 0, 0, 1 , 0, 0, 0, 0 ]

In [138]:
output_encoder = LabelEncoder()

In [142]:
output_numeric = output_encoder.fit_transform( news_df["Type"] )
output_numeric

array([0, 0, 0, ..., 6, 6, 6])

In [144]:
output_encoder = CategoryEncoding(num_tokens=7, output_mode="one_hot")
output_encoded = output_encoder( output_numeric )  # Não funciona precisa transformar para numeros primeiro
output_encoded

<tf.Tensor: shape=(3500, 7), dtype=float32, numpy=
array([[1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.]], dtype=float32)>

In [270]:
X = sequences_padded
print("X Shape: ", X.shape)
Y = output_encoded.numpy()
print("Y Shape: ", Y.shape)

X Shape:  (3500, 112)
Y Shape:  (3500, 7)


In [172]:
X_train, X_test, Y_train, Y_test = train_test_split( X, Y, random_state = 100 )

In [272]:
modelo = Sequential()
modelo.add( Input( (112,) ) )
modelo.add( Dense( 128, activation="relu" ) )
modelo.add( Dense( 7, activation="softmax" ) )
modelo.summary()

In [292]:
# opt = SGD(learning_rate = 0.000001)
# opt = Adam(learning_rate = 0.000001)
opt = RMSprop(learning_rate=0.001)
modelo.compile(optimizer = opt, loss="categorical_crossentropy", metrics=["accuracy"])

In [294]:
resultado = modelo.fit(X_train, Y_train, epochs=100, batch_size=32)

Epoch 1/100
[1m83/83[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.3733 - loss: 238.9330
Epoch 2/100
[1m83/83[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.4658 - loss: 159.5372
Epoch 3/100
[1m83/83[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.4856 - loss: 135.1796
Epoch 4/100
[1m83/83[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.5188 - loss: 118.0268
Epoch 5/100
[1m83/83[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.5491 - loss: 95.7308
Epoch 6/100
[1m83/83[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.5886 - loss: 80.5626
Epoch 7/100
[1m83/83[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.6489 - loss: 61.0541
Epoch 8/100
[1m83/83[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.6451 - loss: 61.2569
Epoch 9/100
[1m83/83[0m [32m━━━━━

In [296]:
resultado_teste = modelo.evaluate(X_test, Y_test)

[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.4803 - loss: 483.5494  


In [308]:
frase = "We’ve had a ‘constitutional dictatorship’ before. Trump is different."
sequencia_predict = np.array(tokenizer.texts_to_sequences( [frase] ))
print("sequencia_predict: ", sequencia_predict.shape)
sequencia_predict_padded = pad_sequences( sequencia_predict, maxlen=max_features, padding="post" )
print("sequencia_predict_padded:", sequencia_predict_padded.shape)

sequencia_predict:  (1, 7)
sequencia_predict_padded: (1, 7)
