<a href="https://colab.research.google.com/github/carloscesar182/ai_advanced_course/blob/main/Notebooks/NLP/NLPRna.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import random

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Dropout
from tensorflow.keras.models import Model

In [None]:
# definir seed pro numpy, tensorflow e random
# a ideia é tentar repetir o resultado se rodar esse notebook mais de uma vez em termos de performance
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)
random.seed(SEED)

In [None]:
# importar e visualizar os dados
spam = pd.read_csv('spam.csv')
spam.head()

In [None]:
# ver o formato
spam.shape

In [None]:
# conferir categoria desbalanceada
count = spam['Category'].value_counts()
print(count)

In [None]:
# balanceamento de classes
ham_samples = spam[spam['Category'] == 'ham'].sample(n=747, random_state=42)
spam_samples = spam[spam['Category'] == 'spam']

spam = pd.concat([ham_samples, spam_samples]).sample(frac=1, random_state=42).reset_index(drop=True)
spam.shape

In [None]:
# criar e aplicar o label encoder na classe
labelencoder = LabelEncoder()
y = labelencoder.fit_transform(spam['Category'])
y

array([1, 1, 1, ..., 1, 1, 1])

In [None]:
# converter mensagens pro objeto numpy
mensagens = spam['Message'].values

# fazer tokenização
token = Tokenizer(num_words=1000)
token.fit_on_texts(mensagens)

# separar treino e teste
X_train, X_test, y_train, y_test = train_test_split(mensagens, y, test_size=0.3, random_state=42)

### Aqui começa o PLN de fato

In [None]:
# usar o metedo text to sequence pra transformar numa caracterização numérica
X_train = token.texts_to_sequences(X_train)
X_test = token.texts_to_sequences(X_test) # temos que aplicar tbm nos dados de teste

In [None]:
print(X_train)

[[32, 3, 15, 28, 157, 415, 15, 857, 1, 218, 25, 74, 1, 102, 15, 596, 61, 39, 696, 697, 11, 308, 143, 108, 37], [341, 170, 36, 19, 416, 279, 20, 417, 171, 144, 25, 117, 1, 512, 10, 280, 513, 342, 28, 63, 103, 280, 698, 858, 281, 418, 78], [117, 699, 4, 71, 1, 68, 2, 118, 202, 7, 309, 9, 31], [2, 61, 29, 859, 219, 220, 79, 700, 29, 128, 31, 64, 72, 310, 12, 597, 219, 220, 282, 80, 65, 158, 159, 13, 145], [8, 16, 3, 514, 515, 75, 9, 598, 13, 221, 73, 22, 8, 109, 48, 75, 189, 111, 419, 75, 516, 19, 51, 239, 6, 18], [2, 24, 262, 283, 32, 7, 28, 284, 23, 42, 72, 6, 222, 468, 860, 420, 263, 861, 12, 37, 145], [112, 517, 41, 240, 39, 223, 57, 5, 224], [69, 50, 24, 190, 1, 73, 8, 203, 99, 138, 38, 2, 16, 55, 3, 225, 47, 85, 6, 23, 204, 124, 36, 191, 226, 37], [183, 227, 131, 50, 30, 74, 701, 264, 702, 11, 16, 703, 518], [343, 519, 599, 184, 1, 32, 17, 9, 38, 104], [90, 344, 285, 12, 380, 345, 600, 344, 862, 380, 345, 11, 468, 344, 863, 864, 863, 100, 346], [9, 3, 865, 41, 17, 172, 51, 381, 1], 

In [None]:
# aplicar o pad sequence pra transformar o nosso texto em tamanho padrão
X_train = pad_sequences(X_train, padding='post', maxlen=500)
X_test = pad_sequences(X_test, padding='post', maxlen=500)

# Convert to numpy arrays
X_train = np.array(X_train)
X_test = np.array(X_test)

In [None]:
# criar a rede neural
# camada de entrada
input_layer = Input(shape=(500,))

# camada de embedding
embedding_layer = Embedding(input_dim=token.num_words + 1, output_dim=50)(input_layer)

# camada de achatamento
flatten_layer = Flatten()(embedding_layer) # achata a camada de entrada

# camada densa
dense_layer = Dense(units=10, activation='relu')(flatten_layer)

# camada de dropout pra reduzir overfitting
dropout_layer = Dropout(0.1)(dense_layer)

# camada de saída
output_layer = Dense(units=1, activation='sigmoid')(dropout_layer)

# criar o modelo
modelo = Model(inputs=input_layer, outputs=output_layer)

In [None]:
# compilar o modelo
modelo.compile(optimizer='adam', loss='mean_squared_error', metrics=['accuracy'])

In [None]:
# treinar o modelo
modelo.fit(X_train, y_train, epochs=20, batch_size=10, verbose=True, validation_data=(X_test, y_test))

In [None]:
# gerar as metricas com os dados de teste
loss, accuracy = modelo.evaluate(X_test, y_test)
print(f'Loss: {loss}')
print(f'Accuracy: {accuracy}')

[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.4840 - loss: 0.2501 
Loss: 0.25001397728919983
Accuracy: 0.4966592490673065


In [None]:
# fazer uma nova previsão para os dados de teste
nova_previsao = modelo.predict(X_test)
print(nova_previsao[2:5])

[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
[[0.50167245]
 [0.50167245]
 [0.50167245]]


In [None]:
# transformar as previsões em valores booleanos
previsoes_bool = (nova_previsao > 0.5)
print(previsoes_bool[2:5])

[[ True]
 [ True]
 [ True]]


In [None]:
# gerar uma matriz de confusão
matriz_confusao = confusion_matrix(y_test, previsoes_bool)
print(matriz_confusao)

[[  0 226]
 [  0 223]]
