In [1]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation
from keras.layers.embeddings import Embedding

## Plot
import matplotlib as plt

# NLTK
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

# Other
import re
import string
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE

Using TensorFlow backend.


In [2]:
df = pd.read_csv('spam.csv',encoding='latin-1')
df = df[['v1','v2']]
df

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [3]:
for i in range(df.shape[0]):
    if df.iloc[i,0]=="spam":
        df.iloc[i,0] = 1
    else:
        df.iloc[i,0] = 0

In [4]:
df.describe()


Unnamed: 0,v1,v2
count,5572,5572
unique,2,5169
top,0,"Sorry, I'll call later"
freq,4825,30


In [5]:
def clean_text(text):
    
    ## Remove puncuation
    text = text.translate(string.punctuation)
    
    ## Convert words to lower case and split them
    text = text.lower().split()
    
    ## Remove stop words
    stops = set(stopwords.words("english"))
    text = [w for w in text if not w in stops and len(w) >= 3]
    
    text = " ".join(text)

    # Clean the text
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    
    text = text.split()
    stemmer = SnowballStemmer('english')
    stemmed_words = [stemmer.stem(word) for word in text]
    text = " ".join(stemmed_words)

    return text

In [6]:
df['v2'] = df['v2'].map(lambda x: clean_text(x))

In [7]:
df

Unnamed: 0,v1,v2
0,0,jurong point crazi avail bugi great world buff...
1,0,lar joke wif oni
2,1,free entri wkli comp win cup final tkts 21st m...
3,0,dun say earli hor alreadi say
4,0,nah think goe usf live around though
5,1,freemsg hey darl week word back ! i would like...
6,0,even brother like speak me treat like aid patent
7,0,per request mell mell oru minnaminungint nurun...
8,1,winner ! ! valu network custom select receivea...
9,1,mobil month more entitl updat latest colour mo...


In [8]:
vocabulary_size = 20000
tokenizer = Tokenizer(num_words= vocabulary_size)
tokenizer.fit_on_texts(df['v2'])

sequences = tokenizer.texts_to_sequences(df['v2'])
data = pad_sequences(sequences, maxlen=50)

In [9]:
data.shape

(5572, 50)

In [19]:
def create_conv_model():
    model_conv = Sequential()
    model_conv.add(Embedding(vocabulary_size, 100, input_length=50))
    model_conv.add(Dropout(0.2))
    model_conv.add(Conv1D(64, 5, activation='relu'))
    model_conv.add(MaxPooling1D(pool_size=4))
    model_conv.add(LSTM(100))
    model_conv.add(Dense(1, activation='sigmoid'))
    model_conv.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model_conv

In [20]:
model_conv = create_conv_model()
model_conv.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 50, 100)           2000000   
_________________________________________________________________
dropout_4 (Dropout)          (None, 50, 100)           0         
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 46, 64)            32064     
_________________________________________________________________
max_pooling1d_3 (MaxPooling1 (None, 11, 64)            0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 100)               66000     
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 101       
Total params: 2,098,165
Trainable params: 2,098,165
Non-trainable params: 0
_________________________________________________________________


In [21]:
history = model_conv.fit(data[:4000], np.array(df.iloc[:4000,0]), epochs = 3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [22]:
def cnfmatrix(y_test,results):
    fp = 0.0
    fn = 0.0
    tp = 0.0
    tn = 0.0
    results.shape
    for i in range(results.shape[0]):
        if y_test[i]==1 and results[i]==1:
            tp+=1
        elif y_test[i]==1 and results[i]==0:
            fn+=1
        elif y_test[i]==0 and results[i]==1:
            fp+=1
        elif y_test[i]==0 and results[i]==0:
            tn+=1
    print(tp/results.shape[0],fp/results.shape[0])
    print(fn/results.shape[0],tn/results.shape[0])
    Precision  = tp/(tp+fp)
    Recall = tp/(tp+fn)
    print("Precision: ",Precision,"Recall: ",Recall)
    f1score = (2*Precision*Recall)/(Precision+Recall)
    print("f1score: ",f1score)
    print("accuracy: ",(tp+tn)/results.shape[0])

In [23]:
result = model_conv.predict(data[4000:], batch_size=None, verbose=0, steps=None)

In [25]:
data[4000:]

array([[   0,    0,    0, ...,   71,  163,  114],
       [   0,    0,    0, ...,    3,  120,  317],
       [   0,    0,    0, ...,   99,  281,  343],
       ...,
       [   0,    0,    0, ...,  276, 1525, 1483],
       [   0,    0,    0, ...,   34,  886,    8],
       [   0,    0,    0, ..., 2274,  377,  178]])

In [24]:
results = [i>0.5 for i in result]
cnfmatrix(np.array(df.iloc[4000:,0]),np.array(results))

0.1272264631043257 0.0057251908396946565
0.007633587786259542 0.8594147582697201
Precision:  0.9569377990430622 Recall:  0.9433962264150944
f1score:  0.9501187648456059
accuracy:  0.9866412213740458


In [16]:
model_conv.save('models/sms_classifier.h5')