In [20]:
import numpy as np 
import pandas as pd
from tensorflow.keras.layers import Dense,LSTM,Embedding,Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

import re

In [2]:
dataset = pd.read_csv("D://Datasets//spam.csv", encoding='ISO-8859-1')

In [3]:
dataset.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
dataset.drop(columns=['Unnamed: 2','Unnamed: 3','Unnamed: 4'],inplace=True)

In [5]:
dataset.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
dataset.rename(columns={'v1':'target','v2':'message'},inplace=True)

In [7]:
dataset.head()

Unnamed: 0,target,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
encoder = LabelEncoder()
y = dataset['target']
x = dataset['message']

In [9]:
y_encoded = encoder.fit_transform(y)

In [10]:
stemmer = PorterStemmer()

Preprocessing the data

In [28]:
text = []

for i in range(len(dataset)):
    sentence = re.sub('[^a-zA-z]',' ',x[i])
    sentence = sentence.lower()
    sentence = sentence.split(' ')
    sentence = [stemmer.stem(word) for word in sentence if word not in set(stopwords.words('english'))]
    while '' in sentence:
        sentence.remove('')
    sentence = ' '.join(sentence)
    text.append(sentence)
    

In [29]:
temp = []
for sentence in text:
    sentence = sentence.split(' ')
    temp.append(len(sentence))
    
print(max(temp))

79


Tokenisation

In [30]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text)
word_index = tokenizer.word_index
vocab_size = len(word_index) + 1
text_sequences = tokenizer.texts_to_sequences(text)

In [31]:
max_seq_len = 79
padding_sequence = pad_sequences(text_sequences,maxlen=80)

In [32]:
x_train,x_test,y_train,y_test = train_test_split(padding_sequence,y_encoded,test_size=0.2,random_state=0)
x_train,x_valid,y_train,y_valid = train_test_split(x_train,y_train,test_size=0.2,random_state=0)

Creating LSTM model

In [54]:
model = Sequential()
model.add(Embedding(input_dim=vocab_size,output_dim=200,input_length=80))
model.add(LSTM(100))
model.add(Dense(50))
model.add(Dense(1,activation='sigmoid'))

In [55]:
model.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_5 (Embedding)     (None, 80, 200)           1254200   
                                                                 
 lstm_5 (LSTM)               (None, 100)               120400    
                                                                 
 dense_10 (Dense)            (None, 50)                5050      
                                                                 
 dense_11 (Dense)            (None, 1)                 51        
                                                                 
Total params: 1,379,701
Trainable params: 1,379,701
Non-trainable params: 0
_________________________________________________________________


In [56]:
model.compile(optimizer='adam', loss='binary_crossentropy',metrics=['accuracy'])
history = model.fit(x_train,y_train, batch_size=32, epochs=10, validation_data=(x_valid,y_valid))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [23]:
model.save('D:\\vs code\python\DeepLearning\\NLP\projects\emailSpamDetector\model.h5', history)

In [57]:
y_pred = model.predict(x_test)



In [63]:
# Apply threshold of 0.5 for binary classification
threshold = 0.5
binary_predictions = (y_pred >= threshold).astype(int)
cm = confusion_matrix(y_test, binary_predictions)
tn, fp, fn, tp = cm.ravel()

# Calculate accuracy
accuracy = (tp + tn) / (tp + tn + fp + fn)

print("Confusion Matrix:")
print(cm)
print("Accuracy:", accuracy)

Confusion Matrix:
[[947   2]
 [ 12 154]]
Accuracy: 0.9874439461883409
