In [2]:
import numpy as np
import pandas as pd

In [4]:
df = pd.read_csv('SMSSpamCollection', sep='\t', names=['label', 'message'])

In [5]:
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
df['label'] = pd.get_dummies(df['label'], drop_first=True)

In [7]:
df.head()

Unnamed: 0,label,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
data = df['message']

In [9]:
#Cleaning Data

import nltk
import re
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

In [10]:
corpus = []
ps = PorterStemmer()

for i in range(len(data)):
    msg = re.sub('[^A-Za-z]', ' ', data[i])
    msg = msg.lower()
    msg = msg.split()
    msg = [ps.stem(word) for word in msg if word not in stopwords.words('english')]
    msg = ' '.join(msg)
    corpus.append(msg)

In [11]:
len(corpus)

5572

In [12]:
corpus[0]

'go jurong point crazi avail bugi n great world la e buffet cine got amor wat'

In [13]:
vocab_size = 10000

In [14]:
from tensorflow.keras.preprocessing.text import one_hot

In [15]:
one_hot_rep = [one_hot(word, vocab_size) for word in corpus]
print(one_hot_rep)

[[4305, 2787, 885, 2267, 1517, 1379, 3714, 3744, 7401, 1067, 6565, 1877, 2215, 5354, 2496, 2611], [3724, 2569, 299, 3760, 1299, 4138], [9221, 4179, 7000, 3985, 8388, 3866, 7354, 6092, 8866, 9969, 6081, 5190, 3866, 6760, 4179, 9503, 9564, 6441, 2102, 7404, 4857], [1299, 752, 7364, 8833, 7276, 1299, 7404, 6741, 7364], [5216, 9242, 8195, 4903, 2780, 7547, 4258], [1841, 3267, 2982, 3940, 7162, 6776, 7090, 868, 2028, 2104, 3724, 8313, 9564, 6235, 4735, 9801], [5675, 6283, 7090, 9744, 1522, 7090, 5265, 1635], [4772, 4243, 3726, 3726, 8121, 2385, 4104, 7141, 2396, 3995, 2220, 1731, 4829, 5792, 3995], [7725, 1658, 9648, 9062, 3098, 2690, 233, 605, 9003, 8699, 9003, 4929, 6237, 5929, 5392], [4027, 6499, 1299, 2964, 4156, 7005, 4724, 5449, 4027, 8699, 9221, 8699, 4027, 7005, 6661, 9221], [8022, 6512, 6395, 9543, 3208, 4537, 9073, 8430, 2776, 9962, 4779, 2670], [3539, 8384, 8388, 34, 5380, 6441, 3552, 4735, 3197, 7532, 6670, 6670, 6987, 4857, 3970, 3346, 1347], [2553, 3940, 9221, 8161, 233, 1059,

In [16]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [17]:
sent_len = 20
embedded = pad_sequences(one_hot_rep,maxlen=sent_len)
print(embedded)

[[   0    0    0 ... 5354 2496 2611]
 [   0    0    0 ... 3760 1299 4138]
 [4179 7000 3985 ... 2102 7404 4857]
 ...
 [   0    0    0 ...  314 3095 7887]
 [   0    0    0 ... 1130 9993 9221]
 [   0    0    0 ... 1260  596  190]]


In [18]:
X = np.array(embedded)
y = df['label'].values

In [19]:
from sklearn.model_selection import train_test_split

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [21]:
print(X_train.shape)
print(y_train.shape)

(4457, 20)
(4457,)


In [22]:
np.unique(y_train)

array([0, 1], dtype=uint8)

In [23]:
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

In [24]:
from tensorflow.keras.models import Sequential

In [25]:
feature_vector = 20

model = Sequential()
model.add(Embedding(vocab_size, feature_vector, input_length=sent_len))
#model.add(Dropout(0.5))
model.add(LSTM(100))
#model.add(Dropout(0.5))
model.add(Dense(1,activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [26]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 20, 20)            200000    
_________________________________________________________________
lstm (LSTM)                  (None, 100)               48400     
_________________________________________________________________
dense (Dense)                (None, 1)                 101       
Total params: 248,501
Trainable params: 248,501
Non-trainable params: 0
_________________________________________________________________


In [27]:
model.fit(X_train,y_train, batch_size=50, epochs=10, validation_data=(X_test,y_test))

Train on 4457 samples, validate on 1115 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x12bae988>

In [28]:
y_pred = model.predict_classes(X_test)

In [29]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [30]:
print(confusion_matrix(y_test,y_pred))
print()
print(accuracy_score(y_test,y_pred))
print()
print(classification_report(y_test,y_pred, zero_division=1))

[[960   5]
 [ 10 140]]

0.9865470852017937

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       965
           1       0.97      0.93      0.95       150

    accuracy                           0.99      1115
   macro avg       0.98      0.96      0.97      1115
weighted avg       0.99      0.99      0.99      1115



In [33]:
model.save('nlp_model.h5')