In [12]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [13]:
data = pd.read_csv('spam_ham_dataset.csv')
X = data['text']
y = data['label']
y = y.map({'ham': 0, 'spam': 1})

In [14]:
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=42)

In [15]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_tr)
X_tr_seq = tokenizer.texts_to_sequences(X_tr)
X_te_seq = tokenizer.texts_to_sequences(X_te)
max_seq_len = 100
X_tr_pad = pad_sequences(X_tr_seq, maxlen=max_seq_len, padding='post')
X_te_pad = pad_sequences(X_te_seq, maxlen=max_seq_len, padding='post')

In [16]:
vocab_size = len(tokenizer.word_index) + 1
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=128, input_length=max_seq_len))
model.add(LSTM(64, return_sequences=True))
model.add(Dropout(0.5))
model.add(LSTM(64))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_tr_pad, y_tr, epochs=10, batch_size=32, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x78bd259d00a0>

In [17]:
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 100, 128)          6625664   
                                                                 
 lstm_4 (LSTM)               (None, 100, 64)           49408     
                                                                 
 dropout_2 (Dropout)         (None, 100, 64)           0         
                                                                 
 lstm_5 (LSTM)               (None, 64)                33024     
                                                                 
 dense_2 (Dense)             (None, 1)                 65        
                                                                 
Total params: 6,708,161
Trainable params: 6,708,161
Non-trainable params: 0
_________________________________________________________________


In [18]:
loss, acc = model.evaluate(X_te_pad, y_te)
print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {acc:.4f}")


Test Loss: 0.1547
Test Accuracy: 0.9700


In [19]:
email_text = input("Enter an email text: ")

sequence = tokenizer.texts_to_sequences([email_text])
padded_sequence = pad_sequences(sequence, maxlen=100, padding='post')
prediction = model.predict(padded_sequence)

if prediction > 0.5:
    print("Prediction: Spam")
else:
    print("Prediction: Ham")

Enter an email text: you won 1 mill
Prediction: Spam
