# SPAM CLASSIFICATION with LSTM Network in Keras

### Reading the data

In [3]:
import pandas as pd 
import numpy as np
from keras.layers import Dense, LSTM, Embedding
from keras.models import Sequential

Using TensorFlow backend.


In [4]:
raw_data = pd.read_csv('./spam_train.csv', encoding='latin-1') 
raw_test_data = pd.read_csv('./spam_test.csv', encoding='latin-1')

print(raw_data.shape) 
print(raw_data.columns) 
print('\n')
print(raw_data.head(5)) 

(29000, 2)
Index(['Label', 'Message'], dtype='object')


  Label                                            Message
0   ham                 oh how abt 2 days before Christmas
1  info  Welcome to OVATION HOLD R.No. 184, 114, 395, 3...
2  info  Thank you for using your ICICI bank CREDITcard...
3   ham  schedule a meeting with the entire team in the...
4   ham                                Tommy is my brother


### Check the labels and their frequencies

In [5]:
classes = np.unique(raw_data['Label'], return_counts=True)
print(classes[0]) 
print(classes[1]) 

['ham' 'info' 'spam']
[ 9666 12916  6418]


### Conver text to fixed length sequence

In [6]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words=10000) #Tokenizer is used to tokenize text
tokenizer.fit_on_texts(raw_data.Message) #Fit this to our corpus

x_train = tokenizer.texts_to_sequences(raw_data.Message) #'text to sequences converts the text to a list of indices
x_train = pad_sequences(x_train, maxlen=50) #pad_sequences makes every sequence a fixed size list by padding with 0s 
x_test = tokenizer.texts_to_sequences(raw_test_data.Message) 
x_test = pad_sequences(x_test, maxlen=50)

x_train.shape, x_test.shape # Check the dimensions of x_train and x_test  

((29000, 50), (1000, 50))

### Prepare the target vectors for the network

In [7]:
from keras.utils.np_utils import to_categorical 
unique_labels = list(raw_data.Label.unique()) 
y_train = np.array([unique_labels.index(i) for i in raw_data.Label]) 
y_train = to_categorical(y_train) 
y_test = np.array([unique_labels.index(i) for i in raw_test_data.Label])
y_test = to_categorical(y_test)
y_test.shape

(1000, 3)

In [8]:
import keras.backend as K 

def recall(y_true, y_pred):
    TP = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    PP = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = TP / (PP + K.epsilon())
    return recall

### Building and training an LSTM model

In [9]:
# Student assignment starts here

# Building an LSTM model with Keras Sequential.
# Add an Embedding Layer with input_dim as length of tokenizer word index, input length as 50 and output dimensions as 100. 
# Add just one layer of LSTM to Keras model with 10 units and one Dense Layer with 3 units with softmax activation.
# Do a Keras Compile
# Do a Keras fit with just 1 Epoch and validation_split=0.25
# Predict on Test Dataset and print the accuracy on the test dataset

In [19]:
print(len(tokenizer.index_word))

38393


In [20]:
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.index_word),output_dim=100,input_length=50))
model.add(LSTM(units=10))
model.add(Dense(units=3,activation='softmax'))

Instructions for updating:
Colocations handled automatically by placer.


In [22]:
model.compile(optimizer='adam',loss='categorical_crossentropy')

In [23]:
#training Model
model.fit(x_train,y_train,epochs=1,validation_split=0.25)

Instructions for updating:
Use tf.cast instead.
Train on 21750 samples, validate on 7250 samples
Epoch 1/1


<keras.callbacks.History at 0x1d2f2a4ca88>

In [30]:
# Making Predictions on Test Samples
preds = model.predict(x_test)

In [31]:
# Calculating Accuracy
print(len(preds))
print(np.shape(preds))
print(np.shape(y_test))
print(np.sum(np.multiply(preds,y_test)))
num= float(np.sum(np.multiply(preds,y_test)))
den = float(len(y_test))
print('Test Acc.: %.3f' % (num/den))

1000
(1000, 3)
(1000, 3)
989.22437
Test Acc.: 0.989
