In [None]:
import pandas as pd
import numpy as np
import nltk


In [None]:
# Read the tab-separated SMS spam file; column names are supplied explicitly via `names`.
sms_path = r"/content/drive/My Drive/Spam_SMS_dataset/SMSSpamCollection"
df = pd.read_csv(
    sms_path,
    sep="\t",
    names=['labels', 'message'],
)

In [None]:
# Download the NLTK stop-word corpus used when cleaning the messages.
nltk.download('stopwords')
import re

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
import nltk
# Download the WordNet corpus for the WordNetLemmatizer that the following cells use.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [None]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Shared lemmatizer instance used by the text-cleaning loop.
lm = WordNetLemmatizer()

In [None]:
# Build the cleaned corpus: one normalized string per SMS message.
#
# The stop-word list is materialized once as a set. Calling
# stopwords.words('english') inside the comprehension re-read the list for
# every token of every message and searched it linearly.
stop_words = set(stopwords.words('english'))

corpus = []
for message in df['message']:
    # Replace every non-letter with a space, lower-case, and split on whitespace.
    tokens = re.sub('[^a-zA-Z]', ' ', message).lower().split()
    # Drop stop words and lemmatize what remains.
    kept = [lm.lemmatize(word) for word in tokens if word not in stop_words]
    corpus.append(' '.join(kept))




In [None]:
# Bag-of-words model: restrict the vocabulary to at most 4000 terms and build a
# dense document-term count matrix from the cleaned corpus.

from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(max_features=4000)
bow_counts = cv.fit_transform(corpus)  # sparse document-term matrix
X = bow_counts.toarray()

In [None]:
# Binary target: take the second indicator column produced by get_dummies
# (presumably 'spam', which sorts after 'ham' — verify against the label values).
# dtype=int pins the labels to plain 0/1 integers; without it the column is
# uint8 or bool depending on the pandas version, which is fragile for
# arithmetic-based metrics and for Keras training later on.
y = pd.get_dummies(df['labels'], dtype=int).iloc[:, 1]

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for evaluation; the fixed seed keeps the split repeatable.
split_parts = train_test_split(X, y, test_size=0.20, random_state=0)
X_train, X_test, y_train, y_test = split_parts


In [None]:
from sklearn.naive_bayes import MultinomialNB

# Fit a multinomial Naive Bayes classifier with default hyperparameters on the training split.
nb_classifier = MultinomialNB()
spam_detect = nb_classifier.fit(X_train, y_train)  # fit() returns the estimator itself


In [None]:
# Predict labels for the held-out test split with the fitted Naive Bayes model.
y_spam =spam_detect.predict(X_test)

In [None]:
from sklearn.metrics import mean_squared_error

# With 0/1 labels the mean squared error equals the fraction of misclassified messages.
mean_squared_error(y_true=y_test, y_pred=y_spam)

0.016143497757847534

In [None]:
from sklearn.metrics import confusion_matrix

# Rows are the true classes, columns are the predicted classes.
confusion_matrix(y_true=y_test, y_pred=y_spam)

array([[946,   9],
       [  9, 151]])

In [None]:
from sklearn.metrics import accuracy_score

# Fraction of test messages the Naive Bayes model labelled correctly.
accuracy_score(y_true=y_test, y_pred=y_spam)

0.9838565022421525

### **We achieved 98.39 percent accuracy with the Naive Bayes classifier. Next, we'll try an LSTM.**



In [None]:
import tensorflow as tf

from tensorflow.keras.layers import Dense, Embedding, LSTM
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Show the installed TensorFlow version.
tf.__version__

'2.2.0'

In [None]:
# Vocabulary size: passed to one_hot() when encoding words and to the Embedding layer as its input dimension.
voc_size = 10000


In [None]:
# Encode every cleaned message as a list of integer word indices, using voc_size as the index range.
# NOTE(review): one_hot is hash-based as far as I know, so distinct words may share an index — confirm.
onehot = [one_hot(message, voc_size) for message in corpus]
onehot

[[4754,
  3671,
  6134,
  3038,
  1406,
  4536,
  7312,
  4341,
  5320,
  6303,
  2790,
  5115,
  8795,
  6971,
  8992,
  7301],
 [9878, 1100, 3256, 6772, 5655, 3392],
 [956,
  3765,
  6583,
  5877,
  2576,
  3028,
  6717,
  8507,
  5122,
  5870,
  217,
  3115,
  3028,
  6366,
  3765,
  5352,
  3243,
  4832,
  1700,
  6933,
  4490],
 [5655, 6632, 2126, 7129, 5031, 5655, 6933, 5811, 2126],
 [3320, 8808, 4754, 1778, 3635, 7030, 8620],
 [1247,
  6976,
  9677,
  4003,
  9948,
  7725,
  6965,
  2068,
  6662,
  2498,
  9878,
  9546,
  3243,
  4082,
  839,
  9659],
 [4864, 4121, 6965, 2253, 8285, 6965, 2568, 5013],
 [4113,
  684,
  1016,
  1016,
  3819,
  9276,
  6959,
  1435,
  6890,
  4934,
  601,
  132,
  6181,
  3735,
  4934],
 [508,
  7138,
  1390,
  7009,
  6359,
  9553,
  7210,
  8749,
  6577,
  6919,
  6577,
  2176,
  2935,
  8105,
  194],
 [6760,
  5298,
  5655,
  6779,
  9299,
  3203,
  741,
  5499,
  6760,
  6236,
  956,
  6919,
  6760,
  3203,
  4911,
  956],
 [9645, 8486, 1115, 4

In [None]:
# Embedding representation: left-pad every encoded message with zeros to a fixed
# length so the whole corpus forms one rectangular integer matrix.

sent_length = 150
embedded_docs = pad_sequences(
    onehot,
    maxlen=sent_length,
    padding='pre',
)
print(embedded_docs)

[[   0    0    0 ... 6971 8992 7301]
 [   0    0    0 ... 6772 5655 3392]
 [   0    0    0 ... 1700 6933 4490]
 ...
 [   0    0    0 ... 6699 4271 6695]
 [   0    0    0 ... 3566 5655  956]
 [   0    0    0 ... 9404 3334 3785]]


In [None]:
# Size of the dense vector learned for each word index.
embedding_vector_feature = 60

# Embedding -> LSTM(100) -> single sigmoid unit for binary classification.
model = Sequential()
model.add(Embedding(voc_size, embedding_vector_feature, input_length=sent_length))
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer = 'adam', metrics =['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in print()
# only appended a stray "None" line to the cell output.
model.summary()


Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 150, 60)           600000    
_________________________________________________________________
lstm (LSTM)                  (None, 100)               64400     
_________________________________________________________________
dense (Dense)                (None, 1)                 101       
Total params: 664,501
Trainable params: 664,501
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
import numpy as np
# Convert the padded sequences and the label Series to NumPy arrays for Keras.
X_final = np.array(embedded_docs)
y_final = np.array(y)

In [None]:
 # Split the padded sequences and labels into train and test sets (20% held out, fixed seed).
 # NOTE: this rebinds X_train/X_test/y_train/y_test, replacing the bag-of-words split made earlier.

 from sklearn.model_selection import train_test_split

 X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size = 0.2, random_state=42)

In [None]:
# Train the LSTM classifier on the training split: 15 epochs, mini-batches of 64.
model.fit(
    x=X_train,
    y=y_train,
    batch_size=64,
    epochs=15,
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x7f80855386a0>

In [None]:
# Sequential.predict_classes() was deprecated and later removed from tf.keras.
# For a single sigmoid output unit it thresholded the predicted probability at
# 0.5, so doing that explicitly gives the same labels and works on both older
# and newer TensorFlow releases.
ypred = (model.predict(X_test) > 0.5).astype("int32")

In [None]:
from sklearn.metrics import confusion_matrix

# Rows are the true classes, columns are the LSTM's predicted classes.
confusion_matrix(y_true=y_test, y_pred=ypred)

array([[965,   1],
       [  8, 141]])

In [None]:
from sklearn.metrics import accuracy_score

# Fraction of test messages the LSTM model labelled correctly.
accuracy_score(y_true=y_test, y_pred=ypred)

0.9919282511210762

In [None]:
# Accuracy of the LSTM model is about 99.2 percent