# Deep Learning for Email Classification with LSTM and Word2Vec

## Task 1: Import Libraries

In [3]:
pip install gensim


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [4]:
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

## Task 2: Load the Dataset

In [6]:
df = pd.read_csv('Dataset.csv', encoding='latin1')
print(df.head())

  Label                                              Email
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


## Task 3: Extract Email Texts and Labels

In [7]:
texts = df['Email'].tolist()
labels = df['Label'].map({'ham':0,'spam':1}).tolist()
print("Total no. of spam emails:", sum(labels))
print("Total no. of ham emails:", len(labels) - sum(labels))

Total no. of spam emails: 747
Total no. of ham emails: 4825


## Task 4: Split the Dataset

In [8]:
X_train, X_temp, y_train, y_temp = train_test_split(texts, labels, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

## Task 5: Tokenize and Pad Sequences

In [9]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train + X_val + X_test)

sequences_train = tokenizer.texts_to_sequences(X_train)
sequences_val = tokenizer.texts_to_sequences(X_val)
sequences_test = tokenizer.texts_to_sequences(X_test)

max_sequence_length = max([len(seq) for seq in sequences_train + sequences_val + sequences_test])
vocab_size = len(tokenizer.word_index) + 1

data_train = pad_sequences(sequences_train, maxlen=max_sequence_length)
data_val = pad_sequences(sequences_val, maxlen=max_sequence_length)
data_test = pad_sequences(sequences_test, maxlen=max_sequence_length)

## Task 6: Train a Word2Vec Model

In [10]:
sentences = [text.split() for text in X_train + X_val + X_test]
word2vec_model = Word2Vec(sentences=sentences, vector_size=100, window=5, min_count=1, workers=4)

## Task 7: Prepare the Embedding Matrix

In [11]:
embedding_matrix = np.zeros((vocab_size, word2vec_model.vector_size))
for word, i in tokenizer.word_index.items():
    if word in word2vec_model.wv:
        embedding_matrix[i] = word2vec_model.wv[word]

## Task 8: Build an LSTM Model

In [12]:
model = Sequential()
model.add(Embedding(vocab_size, word2vec_model.vector_size, weights=[embedding_matrix], input_length=max_sequence_length, trainable=False))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))
model.summary()



## Task 9: Compile the Model

In [13]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

## Task 10: Train the Model

In [14]:
model.fit(data_train, np.array(y_train), epochs=10, batch_size=32, validation_data=(data_val, np.array(y_val)))

Epoch 1/10
[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 126ms/step - accuracy: 0.8629 - loss: 0.3454 - val_accuracy: 0.9139 - val_loss: 0.2240
Epoch 2/10
[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 139ms/step - accuracy: 0.8965 - loss: 0.2531 - val_accuracy: 0.9151 - val_loss: 0.2056
Epoch 3/10
[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 138ms/step - accuracy: 0.9070 - loss: 0.2305 - val_accuracy: 0.9103 - val_loss: 0.2203
Epoch 4/10
[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 146ms/step - accuracy: 0.9085 - loss: 0.2237 - val_accuracy: 0.9199 - val_loss: 0.2034
Epoch 5/10
[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 130ms/step - accuracy: 0.9076 - loss: 0.2265 - val_accuracy: 0.9175 - val_loss: 0.2048
Epoch 6/10
[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 134ms/step - accuracy: 0.9014 - loss: 0.2522 - val_accuracy: 0.9163 - val_loss: 0.1978
Epoch 7/10

<keras.src.callbacks.history.History at 0x31a1c6e10>

## Task 11: Evaluate the Model

In [15]:
evaluation_results = model.evaluate(data_test, np.array(y_test))
print("Test Loss:", evaluation_results[0])
print("Test Accuracy:", evaluation_results[1])

[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 34ms/step - accuracy: 0.8966 - loss: 0.2375
Test Loss: 0.24403464794158936
Test Accuracy: 0.8959330320358276


## Task 12: Generate Predictions

In [16]:
predictions = model.predict(data_test)
predictions = (predictions > 0.5).astype(int)

[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 35ms/step


## Task 13: Print the Classification Report

In [17]:
print("Classification Report:")
print(classification_report(np.array(y_test), predictions))

Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.96      0.94       724
           1       0.65      0.47      0.55       112

    accuracy                           0.90       836
   macro avg       0.79      0.72      0.75       836
weighted avg       0.89      0.90      0.89       836

