In [1]:
import pandas as pd

df = pd.read_csv('/kaggle/input/email-spam-classification-dataset/combined_data.csv')
df

Unnamed: 0,label,text
0,1,ounce feather bowl hummingbird opec moment ala...
1,1,wulvob get your medircations online qnb ikud v...
2,0,computer connection from cnn com wednesday es...
3,1,university degree obtain a prosperous future m...
4,0,thanks for all your answers guys i know i shou...
...,...,...
83443,0,hi given a date how do i get the last date of ...
83444,1,now you can order software on cd or download i...
83445,1,dear valued member canadianpharmacy provides a...
83446,0,subscribe change profile contact us long term ...


In [2]:
df.isnull(), df.isnull().sum()

(       label   text
 0      False  False
 1      False  False
 2      False  False
 3      False  False
 4      False  False
 ...      ...    ...
 83443  False  False
 83444  False  False
 83445  False  False
 83446  False  False
 83447  False  False
 
 [83448 rows x 2 columns],
 label    0
 text     0
 dtype: int64)

In [3]:
import string

def remove_special_characters(word):
    return word.translate(str.maketrans('', '', string.punctuation))

df['no_punctuation'] = df['text'].apply(remove_special_characters)
df

Unnamed: 0,label,text,no_punctuation
0,1,ounce feather bowl hummingbird opec moment ala...,ounce feather bowl hummingbird opec moment ala...
1,1,wulvob get your medircations online qnb ikud v...,wulvob get your medircations online qnb ikud v...
2,0,computer connection from cnn com wednesday es...,computer connection from cnn com wednesday es...
3,1,university degree obtain a prosperous future m...,university degree obtain a prosperous future m...
4,0,thanks for all your answers guys i know i shou...,thanks for all your answers guys i know i shou...
...,...,...,...
83443,0,hi given a date how do i get the last date of ...,hi given a date how do i get the last date of ...
83444,1,now you can order software on cd or download i...,now you can order software on cd or download i...
83445,1,dear valued member canadianpharmacy provides a...,dear valued member canadianpharmacy provides a...
83446,0,subscribe change profile contact us long term ...,subscribe change profile contact us long term ...


In [4]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df['no_punctuation'], df['label'])

In [5]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Tokenization
max_words = 5000 # top frequent unique words to keep
max_len = 500 # Maximum length per padding

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(df['no_punctuation'])

# Convert texts to sequences
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

2025-05-02 11:18:31.532746: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746184711.950168      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746184712.078433      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [6]:
# Pad sequences to make them equal length
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len, padding='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len, padding='post')

y is no need of encoding because it is of 0/1

In [7]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional, Input
from tensorflow.keras.optimizers import Adam

model = Sequential([
    Input(shape=(max_len,)),  # Explicit input shape
    Embedding(input_dim=max_words, output_dim=128),
    Bidirectional(LSTM(128, return_sequences=True)),
    Dropout(0.5),
    LSTM(64),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])

model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['accuracy', 'Precision', 'Recall']
)

model.summary()

I0000 00:00:1746184753.491092      31 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 13942 MB memory:  -> device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5
I0000 00:00:1746184753.491858      31 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 13942 MB memory:  -> device: 1, name: Tesla T4, pci bus id: 0000:00:05.0, compute capability: 7.5


In [8]:
from tensorflow.keras.callbacks import EarlyStopping

early_stop = EarlyStopping(patience=3, restore_best_weights=True)

# Train the model
history = model.fit(
    X_train_pad, 
    y_train,
    validation_data=(X_test_pad, y_test),
    batch_size=64,
    epochs=20,
    callbacks=[early_stop],
    verbose=1
)

y_pred_nn = model.predict(X_test_pad)
y_pred_nn = (y_pred_nn > 0.5).astype(int).flatten()

Epoch 1/20


I0000 00:00:1746184761.664890      92 cuda_dnn.cc:529] Loaded cuDNN version 90300


[1m978/978[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 75ms/step - Precision: 0.5631 - Recall: 0.8671 - accuracy: 0.5751 - loss: 0.6545 - val_Precision: 0.9070 - val_Recall: 0.9766 - val_accuracy: 0.9350 - val_loss: 0.1853
Epoch 2/20
[1m978/978[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m77s[0m 78ms/step - Precision: 0.9444 - Recall: 0.9508 - accuracy: 0.9442 - loss: 0.1795 - val_Precision: 0.9779 - val_Recall: 0.9741 - val_accuracy: 0.9748 - val_loss: 0.0847
Epoch 3/20
[1m978/978[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m76s[0m 78ms/step - Precision: 0.9751 - Recall: 0.9844 - accuracy: 0.9787 - loss: 0.0785 - val_Precision: 0.9737 - val_Recall: 0.9912 - val_accuracy: 0.9813 - val_loss: 0.0658
Epoch 4/20
[1m978/978[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m76s[0m 78ms/step - Precision: 0.9726 - Recall: 0.9857 - accuracy: 0.9778 - loss: 0.0749 - val_Precision: 0.9775 - val_Recall: 0.9887 - val_accuracy: 0.9821 - val_loss: 0.0659
Epoch 5/20
[1m978/978[0m 

In [9]:
import pickle

# saving
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [10]:
model.save('model.keras')