In [1]:
import pandas as pd
import gensim
from sklearn.ensemble import RandomForestClassifier

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from nltk.tokenize import sent_tokenize
from gensim.utils import simple_preprocess
import numpy as np

In [2]:
#importing the dataset

messages = pd.read_csv('SMSSpamCollection', sep = '\t', names = ['Output/Label','Message'])
messages

Unnamed: 0,Output/Label,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [3]:
messages.shape

(5572, 2)

In [4]:
messages['Message'].loc[10]

"I'm gonna be home soon and i don't want to talk about this stuff anymore tonight, k? I've cried enough today."

In [5]:
messages['Message'].iloc[10]

"I'm gonna be home soon and i don't want to talk about this stuff anymore tonight, k? I've cried enough today."

In [6]:
## Text Preprocessing 
#nltk.download('stopwords')

In [7]:
stemming = PorterStemmer()

corpus = []
for i in range(len(messages)):
    clean_sentence = re.sub('[^a-zA-Z0-9]',' ',messages['Message'][i])
    clean_sentence = clean_sentence.lower()
    clean_sentence = clean_sentence.split()
    
    clean_sentence = [stemming.stem(word) for word in clean_sentence if not word in stopwords.words('english')]
    clean_sentence = ' '.join(clean_sentence)
    corpus.append(clean_sentence)

In [8]:
corpus

['go jurong point crazi avail bugi n great world la e buffet cine got amor wat',
 'ok lar joke wif u oni',
 'free entri 2 wkli comp win fa cup final tkt 21st may 2005 text fa 87121 receiv entri question std txt rate c appli 08452810075over18',
 'u dun say earli hor u c alreadi say',
 'nah think goe usf live around though',
 'freemsg hey darl 3 week word back like fun still tb ok xxx std chg send 1 50 rcv',
 'even brother like speak treat like aid patent',
 'per request mell mell oru minnaminungint nurungu vettam set callertun caller press 9 copi friend callertun',
 'winner valu network custom select receivea 900 prize reward claim call 09061701461 claim code kl341 valid 12 hour',
 'mobil 11 month u r entitl updat latest colour mobil camera free call mobil updat co free 08002986030',
 'gonna home soon want talk stuff anymor tonight k cri enough today',
 'six chanc win cash 100 20 000 pound txt csh11 send 87575 cost 150p day 6day 16 tsandc appli repli hl 4 info',
 'urgent 1 week free mem

## Bag of Words - Follow data leakage practices and any classifier algorithm can be used

In [9]:
cv = CountVectorizer(binary=True, max_features=3000,ngram_range=(2,3))

x = cv.fit_transform(corpus).toarray()

In [10]:
x

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [11]:
x.shape

(5572, 3000)

In [12]:
y = pd.get_dummies(messages['Output/Label'])
y = y.iloc[:,1].values

#y.iloc[:,1].values
y.shape

(5572,)

In [13]:
x_train,x_test,y_train,y_test = train_test_split(x,y, test_size=0.20, random_state=42)

print(x_train.shape)
print(x_test.shape)

(4457, 3000)
(1115, 3000)


In [14]:
spam_detection = MultinomialNB()

spam_detection.fit(x_train,y_train)
y_pred =  spam_detection.predict(x_test)

print(accuracy_score(y_pred,y_test))
print()
print(confusion_matrix(y_pred, y_test))
print()
print(classification_report(y_pred, y_test))

0.9695067264573991

[[966  34]
 [  0 115]]

              precision    recall  f1-score   support

           0       1.00      0.97      0.98      1000
           1       0.77      1.00      0.87       115

    accuracy                           0.97      1115
   macro avg       0.89      0.98      0.93      1115
weighted avg       0.98      0.97      0.97      1115



In [15]:
lr = LogisticRegression()

lr.fit(x_train,y_train)
lr_pred = lr.predict(x_test)

print(accuracy_score(lr_pred,y_test))
print()
print(confusion_matrix(lr_pred, y_test))
print()
print(classification_report(lr_pred, y_test))

0.95695067264574

[[966  48]
 [  0 101]]

              precision    recall  f1-score   support

           0       1.00      0.95      0.98      1014
           1       0.68      1.00      0.81       101

    accuracy                           0.96      1115
   macro avg       0.84      0.98      0.89      1115
weighted avg       0.97      0.96      0.96      1115



## TF-IDF

In [16]:
tfidf = TfidfVectorizer(ngram_range=(1,3), max_features=3000)
x_tfidf = tfidf.fit_transform(corpus).toarray()
x_tfidf

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [17]:
x_tfidf.shape

(5572, 3000)

In [18]:
y.shape

(5572,)

In [19]:
x_train,x_test,y_train,y_test = train_test_split(x_tfidf,y, test_size=0.20, random_state=42)

In [20]:
spam_using_tfidf = MultinomialNB()
spam_using_tfidf.fit(x_train,y_train)
y_pred = spam_using_tfidf.predict(x_test)

print(accuracy_score(y_pred,y_test))
print()
print(confusion_matrix(y_pred,y_test))
print()
print(classification_report(y_pred,y_test))

0.9820627802690582

[[966  20]
 [  0 129]]

              precision    recall  f1-score   support

           0       1.00      0.98      0.99       986
           1       0.87      1.00      0.93       129

    accuracy                           0.98      1115
   macro avg       0.93      0.99      0.96      1115
weighted avg       0.98      0.98      0.98      1115



## Word2Vec 

In [21]:
#!pip install gensim

### Pre-trained model with 300 dimensions

### Building a word2vec model from scratch

In [22]:
lemmatizer = WordNetLemmatizer()

lemmatizedcorpus = []
for i in range(len(messages)):
    clean_sentence = re.sub('[^a-zA-Z0-9]',' ',messages['Message'][i])
    clean_sentence = clean_sentence.lower()
    clean_sentence = clean_sentence.split()
    
    clean_sentence = [lemmatizer.lemmatize(word) for word in clean_sentence if not word in stopwords.words('english')]
    clean_sentence = ' '.join(clean_sentence)
    lemmatizedcorpus.append(clean_sentence)

In [23]:
[[i,j,k] for i,j,k in zip(list(map(len,lemmatizedcorpus)),lemmatizedcorpus, messages['Message']) if i<1]

[[0, '', 'What you doing?how are you?'],
 [0, '', 'Where @'],
 [0, '', 'Can a not?'],
 [0, '', ':) '],
 [0, '', 'What you doing?how are you?'],
 [0, '', ':( but your not here....'],
 [0, '', ':-) :-)']]

In [24]:
len(lemmatizedcorpus)

5572

In [25]:
words = []
for doc in lemmatizedcorpus:
    doc_token = sent_tokenize(doc) #converts corpus into sentence/document
    for sent in doc_token:
        #words.append(nltk.word_tokenize(sent))
        words.append(simple_preprocess(sent)) #Converts each document into words in lower case

In [26]:
len(words)

5565

In [27]:
len(words[1])

5

In [28]:
own_model = gensim.models.Word2Vec(words, vector_size=100, window=5, min_count=2)

In [29]:
own_model.wv.index_to_key ##created our own word 2 vec in 100 dimensions

['call',
 'get',
 'ur',
 'gt',
 'go',
 'lt',
 'ok',
 'day',
 'free',
 'know',
 'come',
 'like',
 'good',
 'time',
 'got',
 'text',
 'love',
 'want',
 'send',
 'need',
 'one',
 'txt',
 'today',
 'going',
 'stop',
 'home',
 'lor',
 'sorry',
 'see',
 'still',
 'mobile',
 'take',
 'back',
 'da',
 'reply',
 'dont',
 'think',
 'tell',
 'week',
 'phone',
 'hi',
 'new',
 'later',
 'please',
 'pls',
 'co',
 'msg',
 'make',
 'dear',
 'night',
 'message',
 'well',
 'say',
 'min',
 'thing',
 'much',
 'great',
 'claim',
 'hope',
 'oh',
 'hey',
 'give',
 'number',
 'happy',
 'friend',
 'work',
 'wat',
 'way',
 'yes',
 'www',
 'let',
 'prize',
 'right',
 'tomorrow',
 'already',
 'ask',
 'said',
 'win',
 'amp',
 'life',
 'cash',
 'yeah',
 'im',
 'tone',
 'really',
 'babe',
 'meet',
 'find',
 'miss',
 'morning',
 'uk',
 'last',
 'service',
 'thanks',
 'care',
 'com',
 'would',
 'anything',
 'year',
 'also',
 'lol',
 'nokia',
 'every',
 'feel',
 'keep',
 'sure',
 'pick',
 'urgent',
 'contact',
 'sent',


In [30]:
own_model.wv['need'].shape

(100,)

In [31]:
own_model.wv.most_similar('need')

[('day', 0.9998332858085632),
 ('one', 0.9998080134391785),
 ('see', 0.9997929334640503),
 ('tell', 0.9997909665107727),
 ('today', 0.9997825026512146),
 ('make', 0.9997751712799072),
 ('think', 0.9997750520706177),
 ('got', 0.9997727870941162),
 ('like', 0.999770998954773),
 ('much', 0.9997702836990356)]

In [32]:
#own_model.wv.most_similar('apple') ##Out of vocabulary problem when we set min_count is 2 (i.e) word used atleast 2 times

In [33]:
#own_model.wv.most_similar('going')
#own_model.wv.most_similar('king')
#own_model.wv.most_similar('apple')

In [34]:
own_model.wv.similarity('happy','better')

0.99922895

In [35]:
## Check for out of vocabulary if word not exists return zero array of dimension 100 as vector_dim is 100
def avgWord2Vec(doc):
    return np.mean([own_model.wv[word] for word in doc if word in own_model.wv.index_to_key]
                or [np.zeros(100)], axis=0)

In [36]:
x = []
for i in range(len(words)):
    x.append(avgWord2Vec(words[i]))

In [37]:
x_word2vec = np.array(x)
x_word2vec = np.stack(x, axis=0)

x_word2vec.shape

(5565, 100)

In [38]:
y_word2vec = messages[list(map(lambda x: len(x)>0 ,lemmatizedcorpus))]
y_word2vec = pd.get_dummies(y_word2vec['Output/Label'])
y_word2vec = y_word2vec.iloc[:,1].values

y_word2vec.shape

(5565,)

In [39]:
x_train,x_test,y_train,y_test = train_test_split(x_word2vec,y_word2vec, test_size=0.20, random_state=42)

print(x_train.shape)
print(x_test.shape)

(4452, 100)
(1113, 100)


In [40]:
lr_word2vec = LogisticRegression()
lr_word2vec.fit(x_train,y_train)
y_pred = lr_word2vec.predict(x_test)

print(accuracy_score(y_pred, y_test))
print()
print(classification_report(y_pred,y_test,zero_division='warn'))
print()
print(confusion_matrix(y_pred,y_test))

0.8544474393530997

              precision    recall  f1-score   support

           0       1.00      0.85      0.92      1113
           1       0.00      0.00      0.00         0

    accuracy                           0.85      1113
   macro avg       0.50      0.43      0.46      1113
weighted avg       1.00      0.85      0.92      1113


[[951 162]
 [  0   0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [41]:
rf = RandomForestClassifier()
rf.fit(x_train,y_train)
y_pred = rf.predict(x_test)

print(accuracy_score(y_pred, y_test))
print()
print(classification_report(y_pred,y_test,zero_division='warn'))
print()
print(confusion_matrix(y_pred,y_test))

0.9505840071877808

              precision    recall  f1-score   support

           0       0.99      0.96      0.97       984
           1       0.73      0.91      0.81       129

    accuracy                           0.95      1113
   macro avg       0.86      0.94      0.89      1113
weighted avg       0.96      0.95      0.95      1113


[[940  44]
 [ 11 118]]
