In [32]:
import pandas as pd
import numpy as np
import random

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Dropout
from tensorflow.keras.models import Model

In [16]:
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)
random.seed(SEED)

In [17]:
spam = pd.read_csv('spam.csv')
spam.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [18]:
spam.shape

(5572, 2)

In [19]:
count = spam['Category'].value_counts()
print(count)

Category
ham     4825
spam     747
Name: count, dtype: int64


In [20]:
ham_samples = spam[spam["Category"]=='ham'].sample(n=747, random_state=42)
spam_samples = spam[spam["Category"]=='spam']

spam = pd.concat([ham_samples,spam_samples]).sample(frac=1, random_state=42).reset_index(drop=True)

In [21]:
spam.shape

(1494, 2)

In [22]:
labelencoder = LabelEncoder()
y = labelencoder.fit_transform(spam["Category"])

In [24]:
y

array([1, 1, 1, ..., 1, 1, 1])

In [26]:
mensagens = spam['Message'].values
X_train, X_test, y_train, y_test = train_test_split(mensagens, y, test_size=0.3, random_state=43)

In [27]:
token = Tokenizer(num_words=1000)
token.fit_on_texts(X_train)
X_train = token.texts_to_sequences(X_train)
X_test = token.texts_to_sequences(X_test)

In [28]:
print(X_train)

[[404, 404, 101, 11, 46, 880], [702, 702, 102, 453, 1, 5, 87, 2, 302, 22, 302], [610, 881, 83, 87, 48, 2, 77, 22, 1, 148, 453], [256, 36, 8, 60, 22], [50, 272, 1, 80, 2, 362, 7, 1, 43, 192, 21, 3, 51, 58, 882, 12, 273, 883, 32, 15, 6, 274, 9, 193], [204, 129, 13, 35, 20, 405, 257, 21, 454, 231, 28, 114, 1, 510, 10, 275, 511, 276, 27, 71, 107, 884, 277, 363, 406, 88, 19, 278, 15, 29], [139, 407, 6, 885, 1, 35, 7, 81, 124, 53, 15, 205, 303, 304, 1, 3, 149, 337, 10], [4, 125, 112, 703, 455, 338, 21, 886, 16, 102, 1, 704, 4, 102, 3, 887, 305, 170, 304, 306, 888, 21, 84], [232, 217, 4, 218, 705, 21, 12], [150, 206, 408, 51, 456, 364, 1, 7, 27, 126, 61], [22, 13, 307, 4, 512, 5, 409, 33, 115, 46, 889, 308, 96, 47, 8], [10, 19, 410, 15, 20, 171, 457, 411, 410, 108, 1, 184, 1, 339, 12, 30], [154, 34, 17, 11, 17, 52, 233, 340, 11, 155, 52, 116, 46, 4, 185, 2, 3, 194, 67, 2, 309, 5, 458, 97, 341, 170, 9, 5], [172, 611, 4, 18, 1], [20, 53, 513, 11, 514, 207, 310, 1, 706, 20, 53, 17, 10, 44, 53, 1

In [30]:
X_train = pad_sequences(X_train, padding='post', maxlen=500)
X_test  = pad_sequences(X_test, padding='post', maxlen=500)

In [33]:
input_layer = Input(shape=(500,))
embedding_layer = Embedding(input_dim=len(token.word_index), output_dim=50)(input_layer)
flatten_layer = Flatten()(embedding_layer)
dense_layer = Dense(units=10, activation='relu')(flatten_layer)
dropout_layer = Dropout(0.1)(dense_layer)
output_layer = Dense(units=1, activation='sigmoid')(dropout_layer)
modelo = Model(inputs=input_layer, outputs=output_layer)

In [34]:
modelo.compile(loss='mean_squared_error', optimizer='adam', metrics=["accuracy"])

In [36]:
modelo.fit(X_train, y_train, epochs=20, batch_size=10, verbose=True, validation_data=(X_test,y_test))

Epoch 1/20
[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 10ms/step - accuracy: 0.5944 - loss: 0.2463 - val_accuracy: 0.8285 - val_loss: 0.1612
Epoch 2/20
[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.8213 - loss: 0.1251 - val_accuracy: 0.9220 - val_loss: 0.0724
Epoch 3/20
[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.8996 - loss: 0.0721 - val_accuracy: 0.9555 - val_loss: 0.0480
Epoch 4/20
[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.9262 - loss: 0.0489 - val_accuracy: 0.9688 - val_loss: 0.0369
Epoch 5/20
[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.9126 - loss: 0.0465 - val_accuracy: 0.9688 - val_loss: 0.0343
Epoch 6/20
[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.9392 - loss: 0.0395 - val_accuracy: 0.9666 - val_loss: 0.0323
Epoch 7/20
[1m105/105[0m 

<keras.src.callbacks.history.History at 0x7a5abf473010>

In [37]:
loss, accuracy = modelo.evaluate(X_test, y_test)
print("Loss: ", loss)
print("Acurácia: ", accuracy)


[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.9556 - loss: 0.0330
Loss:  0.02971014752984047
Acurácia:  0.9621380567550659


In [38]:
nova_previsao = modelo.predict(X_test)
print(nova_previsao[2:5])

[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
[[0.99242294]
 [0.01563588]
 [0.00687532]]


In [39]:
prev = (nova_previsao > 0.5 )
print(prev[2:5])

[[ True]
 [False]
 [False]]


In [40]:
cm = confusion_matrix(y_test, prev)
print(cm)

[[225   7]
 [ 10 207]]
