In [81]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [82]:
import nltk
import numpy as np
import pandas as pd
import re
from sklearn.model_selection import train_test_split
import string

##Data Preperation (Cleaning)

In [83]:
nltk.download('stopwords')
stopwords= nltk.corpus.stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [84]:
msg= pd.read_csv('/content/spam.csv',encoding ='latin-1')
msg= msg.drop(labels= ['Unnamed: 2','Unnamed: 3','Unnamed: 4'], axis=1)
msg.columns= ['label', 'text']
labels= np.where(msg['label']=='spam',1,0)
msg.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [85]:
def clean_text(text):
  #Remove Punctuations
  text= ''.join([word.lower() for word in text
                  if word not in string.punctuation])
  #Tokenize
  tokens= re.split('\W+', text)

  #Remove Stopwords
  text= [word for word in tokens if word not in stopwords]

  return text

In [86]:
msg['clean_text'] = msg['text'].apply( lambda x: clean_text(x) ) 
msg.head()

Unnamed: 0,label,text,clean_text
0,ham,"Go until jurong point, crazy.. Available only ...","[go, jurong, point, crazy, available, bugis, n..."
1,ham,Ok lar... Joking wif u oni...,"[ok, lar, joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, 2, wkly, comp, win, fa, cup, fin..."
3,ham,U dun say so early hor... U c already then say...,"[u, dun, say, early, hor, u, c, already, say]"
4,ham,"Nah I don't think he goes to usf, he lives aro...","[nah, dont, think, goes, usf, lives, around, t..."


In [87]:
x_train, x_test, y_train, y_test=  train_test_split( msg['clean_text'], msg['label'],
                                                    test_size= 0.2)

In [88]:
x_train[:10]

3446    [sitting, ard, nothing, lor, u, leh, busy, w, ...
1137                                          [lol, busy]
2087                                      [alright, babe]
169      [yes, completely, formclark, also, utter, waste]
4353    [important, information, 4, orange, user, 0789...
4527    [u, missed, u, havent, 2, much, bit, bored, ho...
4829    [word, checkmate, chess, comes, persian, phras...
1650    [dont, file, bagi, work, called, mei, tell, fi...
871     [going, goodno, problembut, still, need, littl...
1032                         [happy, new, year, no1, man]
Name: clean_text, dtype: object

In [89]:
y_train[:10]

3446     ham
1137     ham
2087     ham
169      ham
4353    spam
4527     ham
4829     ham
1650     ham
871      ham
1032     ham
Name: label, dtype: object

Convert data into csv files

In [90]:
x_train.to_csv('/content/x_train.csv', index= False, header= True)
x_test.to_csv('/content/x_test.csv', index= False, header= True)
x_train.to_csv('/content/y_train.csv', index= False, header= True)
x_test.to_csv('/content/y_test.csv', index= False, header= True)

## 1. Tfidf Model

In [91]:
from sklearn.feature_extraction.text import TfidfVectorizer
x_train = pd.read_csv('/content/x_train.csv')
x_test = pd.read_csv('/content/x_test.csv')
x_train = pd.read_csv('/content/y_train.csv')
x_test = pd.read_csv('/content/y_test.csv')

In [92]:
x_train.head()

Unnamed: 0,clean_text
0,"['sitting', 'ard', 'nothing', 'lor', 'u', 'leh..."
1,"['lol', 'busy']"
2,"['alright', 'babe']"
3,"['yes', 'completely', 'formclark', 'also', 'ut..."
4,"['important', 'information', '4', 'orange', 'u..."


In [93]:
y_test.head()

1521     ham
4656    spam
2579     ham
4107     ham
1        ham
Name: label, dtype: object

Create Tfidf Vectors

In [94]:
tfidf_vect= TfidfVectorizer()
tfidf_vect.fit(x_train['clean_text'])

x_train_vect= tfidf_vect.transform(x_train['clean_text'])
x_test_vect= tfidf_vect.transform(x_test['clean_text'])

what words did the model learn

In [95]:
tfidf_vect.vocabulary_

{'sitting': 6539,
 'ard': 1092,
 'nothing': 5113,
 'lor': 4429,
 'leh': 4289,
 'busy': 1623,
 'work': 7993,
 'lol': 4410,
 'alright': 969,
 'babe': 1228,
 'yes': 8150,
 'completely': 2023,
 'formclark': 3079,
 'also': 973,
 'utter': 7624,
 'waste': 7792,
 'important': 3793,
 'information': 3840,
 'orange': 5275,
 'user': 7611,
 '0789xxxxxxx': 38,
 'today': 7320,
 'lucky': 4488,
 'day2find': 2284,
 'log': 4400,
 'onto': 5247,
 'httpwwwurawinnercom': 3698,
 'theres': 7212,
 'fantastic': 2899,
 'surprise': 7004,
 'awaiting': 1207,
 'missed': 4767,
 'havent': 3502,
 'much': 4895,
 'bit': 1408,
 'bored': 1481,
 'holiday': 3620,
 'want': 7776,
 'go': 3288,
 'bak': 1250,
 'college': 1981,
 'sad': 6214,
 'isnt': 3923,
 'itxx': 3945,
 'word': 7988,
 'checkmate': 1835,
 'chess': 1852,
 'comes': 1996,
 'persian': 5462,
 'phrase': 5500,
 'shah': 6402,
 'maat': 4521,
 'means': 4647,
 'king': 4133,
 'dead': 2295,
 'goodmorning': 3326,
 'good': 3318,
 'day': 2282,
 'dont': 2526,
 'file': 2962,
 'bagi

Build the model

In [96]:
from sklearn.ensemble import RandomForestClassifier
rf= RandomForestClassifier()
rf_model = rf.fit(x_train_vect , y_train.values.ravel())

Prediction

In [97]:
y_pred = rf_model.predict(x_test_vect)

Evaluate the Model

In [98]:
from sklearn.metrics import precision_score, recall_score, accuracy_score
#prcision score= TPR=TP/TP+FP 
precision= precision_score(y_test, y_pred, pos_label='spam')
#Recall score= TP/TP+FN
recall = recall_score(y_test, y_pred, pos_label='spam')
#Accuracy score
accuracy= accuracy_score(y_test, y_pred)

print('Precision: {} / Recall: {} '.format(round(precision, 3), round(recall, 3)))
print('Accuracy: {} ',accuracy)

Precision: 1.0 / Recall: 0.755 
Accuracy: {}  0.9659192825112107


Here We can observe that,
Evaluation metrics are good for TFIDF Model
High precision, Recall(sensitivity), Accuracy
We will compare these evaluation metrics with other methods

##2. Word2Vec Model

In [99]:
import gensim

Aggregate Sentence Vectors generation

In [100]:
#adjecent words vectors taken into consideration for Sentence vector
w2v_model= gensim.models.Word2Vec(x_train, size= 100, window= 5, min_count=2)
#indexing of words
words= set(w2v_model.wv.index2word)

#nested arrays to generate aggregate sentence vectors
x_train_vect= np.array([np.array([w2v_model.wv[i]
                                  for i in ls if i in words])
                                  for ls in x_train['clean_text']
                        ])
x_test_vect= np.array([np.array([w2v_model.wv[i]
                                  for i in ls if i in words])
                                  for ls in x_test['clean_text']
                        ])

  if __name__ == '__main__':
  del sys.path[0]


single vector representation using average

In [101]:
#Standardizing size by zero padding
x_train_vect_avg = []
for v in x_train_vect:
  if v.size:
    x_train_vect_avg.append(v.mean(axis =0 ))
  else:
    x_train_vect_avg.append(np.zeros(100))

x_test_vect_avg = []
for v in x_test_vect:
  if v.size:
    x_test_vect_avg.append(v.mean(axis =0 ))
  else:
    x_test_vect_avg.append(np.zeros(100))

In [102]:
x_test_vect_avg[0]

array([-5.2561762e-04, -2.8732978e-03, -2.9415300e-03, -1.3200998e-03,
        2.6716024e-03,  9.6831827e-05, -2.7942113e-03, -3.3689296e-04,
       -4.3036877e-03, -5.2125589e-04, -1.1373855e-03,  4.4006370e-03,
       -3.0622205e-03,  3.9813989e-03,  1.4544857e-03,  2.5494904e-03,
       -2.3509183e-03,  2.4994656e-03,  4.4527734e-03, -3.5284190e-03,
        4.2209565e-03,  6.7420478e-05, -3.7061112e-04, -4.0048077e-03,
       -4.4059162e-03, -3.8443403e-03,  1.9727789e-03, -2.1678719e-03,
       -4.2226436e-03, -4.3032719e-03, -1.1109781e-03, -8.7673467e-04,
       -2.9700145e-03,  2.9724136e-03, -2.1802271e-03,  4.8593832e-03,
        1.1444227e-03,  4.4962107e-03, -4.7983156e-04,  9.3861209e-04,
        3.0434700e-03, -1.3340096e-03, -3.6902823e-03, -4.0204902e-03,
        4.6026791e-03,  2.4487872e-03, -1.2824469e-03,  4.6696221e-03,
       -2.9283261e-03,  2.9367753e-03,  1.7404304e-03,  9.7386225e-04,
       -1.6268203e-03,  3.1263835e-03, -1.4661117e-03, -4.2743795e-03,
      

In [103]:
x_test_vect_avg[11]

array([-2.4996727e-04, -3.1587058e-03, -1.4108357e-03, -1.5568475e-03,
        3.4759892e-03, -1.3859653e-03, -3.5098009e-03,  5.8656780e-04,
       -4.3261321e-03,  5.3195271e-04, -2.3323556e-03,  1.4651226e-03,
       -2.0924346e-03,  1.1815993e-03,  8.7951316e-04,  1.2310176e-03,
       -9.3538329e-05,  2.8615075e-03,  3.1888175e-03, -2.5729497e-03,
        4.4103418e-03,  1.7470779e-03,  1.5151601e-03, -2.5439220e-03,
       -4.1349172e-03, -3.5260892e-03, -5.6497316e-04, -3.2278565e-03,
       -1.8783441e-03, -1.5845599e-03, -5.0080247e-04, -2.1429942e-03,
       -7.1880489e-04,  1.6039378e-03, -1.9780784e-03,  4.0734746e-03,
        2.3646776e-03,  4.5738220e-03, -1.8841518e-03, -7.3259478e-05,
        2.9026861e-03,  4.1240011e-05, -3.8783313e-03, -6.0116383e-04,
        4.5821862e-03,  1.3801978e-03,  3.9787039e-05,  4.1932142e-03,
       -3.2550921e-03,  2.8354302e-03,  2.3409287e-03,  2.0180787e-03,
       -1.8917179e-03,  1.6544147e-03, -2.5075637e-03, -7.4283232e-04,
      

Fit the Model

In [104]:
from sklearn.ensemble import RandomForestClassifier
rf= RandomForestClassifier()
rf_model= rf.fit(x_train_vect_avg, y_train.values.ravel())

Prediction

In [105]:
y_pred= rf_model.predict(x_test_vect_avg)
y_pred


array(['ham', 'spam', 'ham', ..., 'ham', 'ham', 'ham'], dtype=object)

Evaluate the performance of Model

In [106]:
from sklearn.metrics import precision_score, recall_score, accuracy_score
#prcision score= TPR=TP/TP+FP 
precision= precision_score(y_test, y_pred, pos_label='spam')
#Recall score= TP/TP+FN
recall = recall_score(y_test, y_pred, pos_label='spam')
#Accuracy score
accuracy= accuracy_score(y_test, y_pred)

print('Precision: {} / Recall: {} '.format(round(precision, 3), round(recall, 3)))
print('Accuracy:  ',accuracy)

Precision: 0.5 / Recall: 0.213 
Accuracy:   0.8609865470852018


Here We can clearly observe that it is a poor model
with low performance metrics

##3. Doc2Vec Model

Tagged docs required for unique representation

In [107]:
tagged_docs_train = [gensim.models.doc2vec.TaggedDocument(v, [i])
                    for i,v in enumerate(x_train['clean_text']) ]

tagged_docs_test = [gensim.models.doc2vec.TaggedDocument(v, [i])
                    for i,v in enumerate(x_test['clean_text']) ]

v represents the word and i as its unique index 

Build the Model

In [108]:
d2v_model= gensim.models.Doc2Vec( tagged_docs_train,
                                 vector_size=100, window=5, min_count=2 )



Infer vector method to convert list of words into Numeric vector Representation

In [109]:
train_vector = [d2v_model.infer_vector(eval(v.words))
                for v in tagged_docs_train]
test_vector = [d2v_model.infer_vector(eval(v.words))
                for v in tagged_docs_test]

eval function extracts list of words from string

Fit the model

In [110]:
from sklearn.ensemble import RandomForestClassifier
rf= RandomForestClassifier()
rf_model= rf.fit(train_vector, y_train.values.ravel())
#predict
y_pred= rf_model.predict(test_vector)
y_pred


array(['ham', 'ham', 'ham', ..., 'ham', 'ham', 'ham'], dtype=object)

Evaluation of model

In [111]:
from sklearn.metrics import precision_score, recall_score, accuracy_score
#prcision score= TPR=TP/TP+FP 
precision= precision_score(y_test, y_pred, pos_label='spam')
#Recall score= TP/TP+FN
recall = recall_score(y_test, y_pred, pos_label='spam')
#Accuracy score
accuracy= accuracy_score(y_test, y_pred)

print('Precision: {} / Recall: {} '.format(round(precision, 3), round(recall, 3)))
print('Accuracy:  ',accuracy)

Precision: 0.754 / Recall: 0.316 
Accuracy:   0.8905829596412556


Here we can observe that performance is slighly better than word2vec but not upto the level of tfidf.
we are able to overcome some drawbacks of word2vec 

##4. RNN Model

In [130]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

Initialize and Fit

In [131]:
tokenizer= Tokenizer()
tokenizer.fit_on_texts(x_train)

In [132]:
x_train_seq= tokenizer.texts_to_sequences(x_train)
x_test_seq= tokenizer.texts_to_sequences(x_test)

In [133]:
x_test_seq[0]

[49, 2, 1473, 560, 97, 185, 14]

Padding for equal size


In [134]:
x_train_seq_padded= pad_sequences(x_train_seq, 50)
x_test_seq_padded= pad_sequences(x_test_seq, 50)

In [135]:
x_train_seq_padded[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
       3821,   70,   10,   46,  264,  119], dtype=int32)

Build the model

In [136]:
import keras.backend as K
from keras.layers import Dense, Embedding, LSTM
from keras.models import Sequential

In [137]:
model= Sequential()
model.add(Embedding(len(tokenizer.index_word)+1,  32 ))
model.add(LSTM(32  , dropout= 0,  recurrent_dropout=0))
model.add(Dense(32, activation='relu'))
model.add(Dense(1 ,  activation='sigmoid'))

In [138]:
model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, None, 32)          254048    
                                                                 
 lstm_4 (LSTM)               (None, 32)                8320      
                                                                 
 dense_8 (Dense)             (None, 32)                1056      
                                                                 
 dense_9 (Dense)             (None, 1)                 33        
                                                                 
Total params: 263,457
Trainable params: 263,457
Non-trainable params: 0
_________________________________________________________________


Compile the Model

In [139]:
import tensorflow as tf

In [140]:
model.compile(optimizer= tf.keras.optimizers.Adam(learning_rate=1e-3),
              loss= tf.keras.losses.BinaryCrossentropy(),
              metrics=[tf.keras.metrics.BinaryAccuracy(),
                       tf.keras.metrics.FalseNegatives()])

In [141]:
history=  model.fit(x_train_seq_padded, y_train,
                    batch_size=32, epochs=10,
                    validation_data=(x_test_seq_padded, y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


If we see values of accuracy it beats the tfidf model
so it is the best model but highly complex

model selection depends on problem adn cost of differnt types of error