In [61]:
import os

import pandas as pd

# Location of the raw spam/ham CSV. The original absolute path is kept as the
# default so existing runs behave the same, but it can be overridden with the
# SPAM_CSV_PATH environment variable so the notebook runs on other machines.
SPAM_CSV_PATH = os.environ.get(
    'SPAM_CSV_PATH',
    '/home/abhishek/Documents/NLP/Blog/spam-ham-classifier/spam.csv',
)

# NOTE(review): no explicit encoding is passed; if the file is not UTF-8 this
# may raise UnicodeDecodeError on Python 3 -- confirm against the data file.
df = pd.read_csv(SPAM_CSV_PATH)

In [62]:
# The CSV carries three placeholder columns that are not used by the rest of
# the notebook; remove them in place so only the label and text columns remain.
unused_columns = ['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4']
df.drop(labels=unused_columns, axis=1, inplace=True)

In [63]:
# Give the raw v1/v2 columns descriptive names. `index=str` additionally maps
# every row label through str(), exactly as the mapping is applied to columns.
column_names = {"v1": "label", "v2": "msg_text"}
df.rename(index=str, columns=column_names, inplace=True)

In [64]:
# Preview the first five rows to confirm the frame now has `label` and
# `msg_text` columns.
df.head(n=5)

Unnamed: 0,label,msg_text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [65]:
# Encode the positive class: every row whose label is the string 'spam'
# gets the integer 1.
is_spam = df['label'] == 'spam'
df.loc[is_spam, 'label'] = 1

In [66]:
# Encode the negative class: every row whose label is the string 'ham'
# gets the integer 0.
is_ham = df['label'] == 'ham'
df.loc[is_ham, 'label'] = 0

In [67]:
# Preview the first five rows to confirm the labels are now 0/1 instead of
# 'ham'/'spam'.
df.head(n=5)

Unnamed: 0,label,msg_text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [71]:
# Message texts stay a plain Python list of strings; that is what the
# tokenizer consumes further down.
X = df.msg_text.tolist()

# Labels as an integer NumPy array rather than a Python list. The `label`
# column was filled by assigning 0/1 into a column that originally held
# strings, so it is cast explicitly here. Keras documents array/tensor targets
# for fit()/evaluate(), and some releases reject a bare list of ints.
# The cast also fails loudly if any label was left un-encoded.
Y = df.label.astype(int).values

In [75]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the messages for the final evaluation.
#   random_state: fixes the shuffle so a fresh kernel reproduces the same split
#                 (the metrics recorded in this notebook came from an unseeded
#                 split and will differ slightly after re-running).
#   stratify:     keeps the spam/ham proportion identical in both partitions.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42, stratify=Y
)

In [87]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence

# top_words caps the tokenizer vocabulary (and is reused as the embedding
# input size); MAX_LEN is the fixed length every sequence is padded/cut to.
top_words, MAX_LEN = 1000, 150

# Fit the vocabulary on the training texts only.
tokenizer = Tokenizer(num_words=top_words)
tokenizer.fit_on_texts(X_train)

# Turn each training message into a list of word indices, then bring all of
# them to the same length MAX_LEN.
sequences = tokenizer.texts_to_sequences(X_train)
padded_sequences = sequence.pad_sequences(
    sequences,
    maxlen=MAX_LEN,
)

In [88]:
# Display the padded training matrix (one row per message, MAX_LEN columns)
# to sanity-check the padding.
padded_sequences

array([[  0,   0,   0, ..., 809, 573, 228],
       [  0,   0,   0, ..., 476, 433,  87],
       [  0,   0,   0, ...,   4,  40,  64],
       ...,
       [  0,   0,   0, ...,  95, 513,  66],
       [  0,   0,   0, ...,  83,  15, 106],
       [  0,   0,   0, ...,  34,  70, 851]], dtype=int32)

In [107]:
from keras.layers import Input, Embedding, LSTM, Activation, Dense, Dropout
from keras.models import Model

# Functional-API graph:
#   token ids -> embedding -> LSTM -> dense (relu) -> dropout -> sigmoid unit

# One padded sequence of MAX_LEN token ids per sample.
inputs = Input(shape=(MAX_LEN,), name='inputs')
print(inputs.shape)

# Map each of the top_words token ids to a 50-dimensional vector.
embeddings = Embedding(input_dim=top_words, output_dim=50, input_length=MAX_LEN)(inputs)
print(embeddings.shape)

# Summarise the embedded sequence with a 64-unit LSTM, then pass it through a
# 256-unit hidden layer with 50% dropout.
recurrent_features = LSTM(units=64)(embeddings)
hidden_features = Dense(units=256, activation='relu', name='dense_layer_1')(recurrent_features)
NN_output = Dropout(rate=0.5)(hidden_features)

# Single sigmoid unit for the binary label.
outputs = Dense(units=1, activation='sigmoid', name='output_layer')(NN_output)
print(outputs.shape)

model = Model(inputs=inputs, outputs=outputs)

(?, 150)
(?, 150, 50)
(?, 1)


In [108]:
# Print the layer-by-layer overview (output shapes and parameter counts) of
# the model built above.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
inputs (InputLayer)          (None, 150)               0         
_________________________________________________________________
embedding_16 (Embedding)     (None, 150, 50)           50000     
_________________________________________________________________
lstm_15 (LSTM)               (None, 64)                29440     
_________________________________________________________________
dense_layer_1 (Dense)        (None, 256)               16640     
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
output_layer (Dense)         (None, 1)                 257       
Total params: 96,337
Trainable params: 96,337
Non-trainable params: 0
_________________________________________________________________


In [110]:
# Configure training: binary cross-entropy loss for the single sigmoid output,
# the RMSprop optimizer, and accuracy reported alongside the loss.
model.compile(
    optimizer='RMSprop',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [112]:
# Train for 10 epochs in mini-batches of 128; validation_split=0.2 sets aside
# a fifth of the training data for per-epoch validation. The call is left as
# the cell's last expression, so the returned History object is displayed.
model.fit(
    padded_sequences,
    Y_train,
    batch_size=128,
    epochs=10,
    validation_split=0.2,
)

Train on 3565 samples, validate on 892 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fb2366add90>

In [113]:
# Encode the held-out messages with the tokenizer that was fitted on the
# training texts, then pad them to the same MAX_LEN used for training.
test_sequences = tokenizer.texts_to_sequences(X_test)
padded_test_sequences = sequence.pad_sequences(
    test_sequences,
    maxlen=MAX_LEN,
)

In [114]:
# Score the trained model on the held-out set. The result is indexed below as
# accr[0] (loss) and accr[1] (accuracy).
accr = model.evaluate(x=padded_test_sequences, y=Y_test)



In [115]:
# Report the held-out loss and accuracy, each rounded to three decimals.
report_template = 'Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'
print(report_template.format(accr[0], accr[1]))

Test set
  Loss: 0.116
  Accuracy: 0.984
