#SMS SPAM Text Classification using LSTM
Download the file https://www.kaggle.com/uciml/sms-spam-collection-dataset/data

In [1]:
from google.colab import files

uploaded = files.upload()

for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))

Saving spam.csv to spam.csv
User uploaded file "spam.csv" with length 503663 bytes


In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.models import Model
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
from tensorflow.keras.optimizers import RMSprop
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from tensorflow.keras.utils import to_categorical
from keras.callbacks import EarlyStopping

In [4]:
df = pd.read_csv('spam.csv',delimiter=',',encoding='latin-1')
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [5]:
X = df.v2
Y = df.v1
le = LabelEncoder()
Y = le.fit_transform(Y)
print(Y)
Y = Y.reshape(-1,1)
print(Y)
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.15)

[0 0 1 ... 0 0 0]
[[0]
 [0]
 [1]
 ...
 [0]
 [0]
 [0]]


In [8]:
max_words = 1000
max_len = 150
tok = Tokenizer(num_words=max_words)
tok.fit_on_texts(X_train)
sequences = tok.texts_to_sequences(X_train)
print(sequences)
sequences_matrix = sequence.pad_sequences(sequences,maxlen=max_len)
print(sequences_matrix)

[[130, 26, 509, 71, 235, 81, 9, 59, 38, 134, 57, 165], [284, 35, 4, 357, 173, 196, 43, 37, 267, 164, 25], [3, 183, 13, 43, 304, 26, 583, 609, 510, 16, 22, 610, 787, 200, 208, 611], [48, 488, 70, 213, 2, 12, 4, 668, 26, 70, 36, 2, 12, 4, 489, 217, 272, 258, 161, 365, 48, 463, 21, 2, 74, 83], [34, 559, 381, 3], [135, 29, 511, 23, 29, 634, 15, 6, 23, 140, 82, 6, 153], [412, 37, 315, 669, 584, 986, 490, 585, 48, 464, 20, 82, 560, 535, 285, 74, 465, 2, 788, 154, 987], [358, 512, 1, 17, 7, 2, 3, 97, 177], [346, 19, 9, 156, 7, 191, 12, 4, 789, 44, 12, 120, 52, 27, 6, 108, 66, 9, 988, 1, 214, 12, 120, 906, 18], [26, 305, 31, 4, 359, 56, 28, 5, 413, 83, 18, 5, 251], [1, 34, 449, 115, 23, 95, 17, 19, 127, 84, 178, 3, 69, 14, 26, 5, 236, 138], [25, 75, 192, 77, 20, 635, 18, 989, 11, 696, 78, 20, 97, 636], [561, 15, 184, 202, 990, 12, 5, 24, 4, 70, 323], [1, 64, 245, 23, 16, 3, 159, 165, 97, 177, 340, 562, 6, 7, 10], [347, 465, 150, 28, 563, 132, 2, 50, 150, 54, 81, 6], [637, 637, 132, 243, 2, 5, 

In [9]:
def RNN():
    inputs = Input(name='inputs',shape=[max_len])
    layer = Embedding(max_words,50,input_length=max_len)(inputs)
    layer = LSTM(64)(layer)
    layer = Dense(256,name='FC1')(layer)
    layer = Activation('relu')(layer)
    layer = Dropout(0.5)(layer)
    layer = Dense(1,name='out_layer')(layer)
    layer = Activation('sigmoid')(layer)
    model = Model(inputs=inputs,outputs=layer)
    return model

model = RNN()
model.summary()
model.compile(loss='binary_crossentropy',optimizer=RMSprop(),metrics=['accuracy'])

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 inputs (InputLayer)         [(None, 150)]             0         
                                                                 
 embedding (Embedding)       (None, 150, 50)           50000     
                                                                 
 lstm (LSTM)                 (None, 64)                29440     
                                                                 
 FC1 (Dense)                 (None, 256)               16640     
                                                                 
 activation (Activation)     (None, 256)               0         
                                                                 
 dropout (Dropout)           (None, 256)               0         
                                                                 
 out_layer (Dense)           (None, 1)                 257   

In [10]:
model.fit(sequences_matrix,Y_train,batch_size=128,epochs=10,
          validation_split=0.2,callbacks=[EarlyStopping(monitor='val_loss',min_delta=0.0001)])

Epoch 1/10
Epoch 2/10


<keras.callbacks.History at 0x7f5fd7d9f410>

In [11]:
test_sequences = tok.texts_to_sequences(X_test)
test_sequences_matrix = sequence.pad_sequences(test_sequences,maxlen=max_len)

accr = model.evaluate(test_sequences_matrix,Y_test)

print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0],accr[1]))


Test set
  Loss: 0.061
  Accuracy: 0.980


In [13]:
Testing_context = ["Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"]

txts = tok.texts_to_sequences(Testing_context)
txts = sequence.pad_sequences(txts, maxlen=max_len)

preds = model.predict(txts)
print(preds)

[[0.96754897]]
