In [1]:
!pip install gensim



In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import re
import nltk
from nltk.tokenize import sent_tokenize
import gensim
from gensim.utils import simple_preprocess ## Convert a document into a list of lowercase tokens, ignoring tokens that are too short or too long.
from gensim.models import Word2Vec, KeyedVectors

In [3]:
messages=pd.read_csv('SMSSpamCollection.txt',
                     sep='\t', names=['label','message'])

In [4]:
from nltk.stem import WordNetLemmatizer
lemmatizer=WordNetLemmatizer()

In [5]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [6]:
corpus=[]
for i in range(len(messages)):
  review=re.sub('[^a-zA-Z]',' ',messages['message'][i])
  review=review.lower()
  review=review.split()
  review=[lemmatizer.lemmatize(word) for word in review]
  review=' '.join(review)
  corpus.append(review)

In [7]:
[[i,j,k] for i,j,k in zip(list(map(len,corpus)),corpus, messages['message']) if i<1]

[[0, '', '645'], [0, '', ':) '], [0, '', ':-) :-)']]

In [8]:
list(map(len,corpus))

[102,
 23,
 126,
 43,
 57,
 135,
 74,
 147,
 126,
 135,
 106,
 99,
 128,
 191,
 33,
 142,
 22,
 74,
 54,
 121,
 40,
 44,
 44,
 83,
 55,
 137,
 29,
 125,
 72,
 59,
 128,
 175,
 28,
 83,
 154,
 111,
 46,
 28,
 21,
 150,
 78,
 121,
 137,
 18,
 59,
 27,
 44,
 29,
 66,
 142,
 56,
 110,
 79,
 278,
 112,
 70,
 121,
 32,
 22,
 38,
 102,
 44,
 24,
 53,
 108,
 130,
 114,
 124,
 73,
 33,
 37,
 28,
 44,
 39,
 17,
 41,
 71,
 46,
 35,
 72,
 21,
 29,
 32,
 36,
 13,
 50,
 115,
 139,
 42,
 37,
 55,
 187,
 136,
 106,
 100,
 129,
 31,
 48,
 168,
 31,
 55,
 75,
 73,
 147,
 176,
 44,
 91,
 42,
 82,
 107,
 28,
 39,
 30,
 88,
 124,
 131,
 138,
 116,
 67,
 82,
 100,
 121,
 50,
 125,
 47,
 33,
 18,
 237,
 21,
 103,
 24,
 8,
 37,
 24,
 110,
 141,
 37,
 33,
 42,
 143,
 71,
 32,
 25,
 141,
 40,
 25,
 46,
 140,
 83,
 31,
 27,
 43,
 53,
 19,
 147,
 363,
 27,
 24,
 149,
 109,
 115,
 115,
 59,
 82,
 128,
 148,
 48,
 127,
 72,
 63,
 55,
 48,
 78,
 31,
 95,
 56,
 65,
 25,
 129,
 65,
 122,
 151,
 21,
 63,
 21,
 26,
 139

In [9]:
corpus

['go until jurong point crazy available only in bugis n great world la e buffet cine there got amore wat',
 'ok lar joking wif u oni',
 'free entry in a wkly comp to win fa cup final tkts st may text fa to to receive entry question std txt rate t c s apply over s',
 'u dun say so early hor u c already then say',
 'nah i don t think he go to usf he life around here though',
 'freemsg hey there darling it s been week s now and no word back i d like some fun you up for it still tb ok xxx std chgs to send to rcv',
 'even my brother is not like to speak with me they treat me like aid patent',
 'a per your request melle melle oru minnaminunginte nurungu vettam ha been set a your callertune for all caller press to copy your friend callertune',
 'winner a a valued network customer you have been selected to receivea prize reward to claim call claim code kl valid hour only',
 'had your mobile month or more u r entitled to update to the latest colour mobile with camera for free call the mobile up

In [10]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [11]:
words=[]
for sent in corpus:
  sent_token=sent_tokenize(sent)
  for sent in sent_token:
    words.append(simple_preprocess(sent))

In [12]:
words

[['go',
  'until',
  'jurong',
  'point',
  'crazy',
  'available',
  'only',
  'in',
  'bugis',
  'great',
  'world',
  'la',
  'buffet',
  'cine',
  'there',
  'got',
  'amore',
  'wat'],
 ['ok', 'lar', 'joking', 'wif', 'oni'],
 ['free',
  'entry',
  'in',
  'wkly',
  'comp',
  'to',
  'win',
  'fa',
  'cup',
  'final',
  'tkts',
  'st',
  'may',
  'text',
  'fa',
  'to',
  'to',
  'receive',
  'entry',
  'question',
  'std',
  'txt',
  'rate',
  'apply',
  'over'],
 ['dun', 'say', 'so', 'early', 'hor', 'already', 'then', 'say'],
 ['nah',
  'don',
  'think',
  'he',
  'go',
  'to',
  'usf',
  'he',
  'life',
  'around',
  'here',
  'though'],
 ['freemsg',
  'hey',
  'there',
  'darling',
  'it',
  'been',
  'week',
  'now',
  'and',
  'no',
  'word',
  'back',
  'like',
  'some',
  'fun',
  'you',
  'up',
  'for',
  'it',
  'still',
  'tb',
  'ok',
  'xxx',
  'std',
  'chgs',
  'to',
  'send',
  'to',
  'rcv'],
 ['even',
  'my',
  'brother',
  'is',
  'not',
  'like',
  'to',
  'spea

In [13]:
words[0] # 1st sentence

['go',
 'until',
 'jurong',
 'point',
 'crazy',
 'available',
 'only',
 'in',
 'bugis',
 'great',
 'world',
 'la',
 'buffet',
 'cine',
 'there',
 'got',
 'amore',
 'wat']

In [14]:
## Train the Word2Vec model from scratch
model=gensim.models.Word2Vec(words)

In [15]:
# Vocabulary
model.corpus_count

5569

In [16]:
model.epochs

5

In [17]:
model.wv.most_similar('world')

[('never', 0.9992136359214783),
 ('amp', 0.9991951584815979),
 ('had', 0.9991921782493591),
 ('by', 0.9991845488548279),
 ('over', 0.9991813898086548),
 ('yet', 0.9991572499275208),
 ('getting', 0.9991440176963806),
 ('down', 0.999139666557312),
 ('girl', 0.9991241693496704),
 ('which', 0.9991225600242615)]

In [18]:
model.wv['good']

array([-0.34894282,  0.299241  ,  0.1196036 ,  0.07718075,  0.09345647,
       -0.70633686,  0.22739036,  0.7266506 , -0.34187594, -0.18133509,
       -0.26135492, -0.5510484 , -0.13707422,  0.10319717,  0.26942685,
       -0.21018896,  0.16124034, -0.45837575, -0.0899251 , -0.7120234 ,
        0.2934874 ,  0.06464997,  0.17331852, -0.2578358 , -0.04527071,
       -0.01128819, -0.27659392, -0.30133727, -0.39540735,  0.08310325,
        0.51314044, -0.04308658,  0.19220471, -0.38930485, -0.14865331,
        0.5523856 , -0.00408594, -0.21952116, -0.15295003, -0.65883696,
        0.13508272, -0.33168656, -0.24146537, -0.06770984,  0.24477147,
       -0.03465935, -0.09834577, -0.07400943,  0.31891122,  0.21628092,
        0.3234672 , -0.30155247, -0.002013  ,  0.05000332, -0.06969719,
        0.0924336 ,  0.28499612,  0.19426943, -0.52246803,  0.18271236,
        0.04840532,  0.12413558, -0.11996832, -0.17489283, -0.3840359 ,
        0.340499  ,  0.10267816,  0.36459318, -0.3962179 ,  0.54

In [19]:
model.wv['good'].shape

(100,)

In [20]:
def avg_word2vec(doc):
  return np.mean([model.wv[word] for word in doc if word in model.wv.index_to_key], axis=0)

In [21]:
!pip install tqdm



In [22]:
from tqdm import tqdm

In [23]:
# Apply for the entire sentences
X=[]
for i in tqdm(range(len(words))):
  X.append(avg_word2vec(words[i]))

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
100%|██████████| 5569/5569 [00:00<00:00, 6817.12it/s]


In [24]:
X

[array([-0.23919512,  0.20348208,  0.09119185,  0.04795792,  0.08689937,
        -0.5200214 ,  0.14243631,  0.50269145, -0.23072228, -0.14026545,
        -0.20291634, -0.3809953 , -0.12987062,  0.07533947,  0.19698463,
        -0.14184806,  0.09842372, -0.34135368, -0.07104743, -0.4996876 ,
         0.17455989,  0.03965029,  0.10693006, -0.19125645, -0.02725562,
        -0.0025073 , -0.20832586, -0.19867164, -0.2806635 ,  0.05994445,
         0.34518683, -0.02478959,  0.10655871, -0.26232496, -0.11444373,
         0.3805761 ,  0.01759821, -0.16508165, -0.13239092, -0.45861015,
         0.09175751, -0.23245531, -0.18282609, -0.0527661 ,  0.15764062,
        -0.03325105, -0.05518072, -0.04376438,  0.23810144,  0.15082358,
         0.20180817, -0.22802147,  0.01757083,  0.06830297, -0.09201391,
         0.0734768 ,  0.18867612,  0.11663385, -0.37697178,  0.13957842,
         0.04613022,  0.09021603, -0.07429846, -0.12121893, -0.26788574,
         0.23778728,  0.06207119,  0.2275971 , -0.2

In [25]:
len(X)

5569

In [26]:
X_new = np.array(X, dtype=object)

In [27]:
X_new[0]

array([-0.23919512,  0.20348208,  0.09119185,  0.04795792,  0.08689937,
       -0.5200214 ,  0.14243631,  0.50269145, -0.23072228, -0.14026545,
       -0.20291634, -0.3809953 , -0.12987062,  0.07533947,  0.19698463,
       -0.14184806,  0.09842372, -0.34135368, -0.07104743, -0.4996876 ,
        0.17455989,  0.03965029,  0.10693006, -0.19125645, -0.02725562,
       -0.0025073 , -0.20832586, -0.19867164, -0.2806635 ,  0.05994445,
        0.34518683, -0.02478959,  0.10655871, -0.26232496, -0.11444373,
        0.3805761 ,  0.01759821, -0.16508165, -0.13239092, -0.45861015,
        0.09175751, -0.23245531, -0.18282609, -0.0527661 ,  0.15764062,
       -0.03325105, -0.05518072, -0.04376438,  0.23810144,  0.15082358,
        0.20180817, -0.22802147,  0.01757083,  0.06830297, -0.09201391,
        0.0734768 ,  0.18867612,  0.11663385, -0.37697178,  0.13957842,
        0.04613022,  0.09021603, -0.07429846, -0.12121893, -0.26788574,
        0.23778728,  0.06207119,  0.2275971 , -0.29595578,  0.37

In [28]:
messages

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [29]:
messages.shape

(5572, 2)

In [30]:
X_new.shape  ## 3 records have been lost from the input features

(5569,)

In [31]:
X_new[0].shape

(100,)

In [32]:
type(X_new)

numpy.ndarray

In [33]:
## Dependent/Output feature
y=messages[list(map(lambda x:len(x)>0, corpus))]
y=pd.get_dummies(y['label'])
y=y.iloc[:,0].values

In [34]:
y.shape

(5569,)

In [35]:
X[0]

array([-0.23919512,  0.20348208,  0.09119185,  0.04795792,  0.08689937,
       -0.5200214 ,  0.14243631,  0.50269145, -0.23072228, -0.14026545,
       -0.20291634, -0.3809953 , -0.12987062,  0.07533947,  0.19698463,
       -0.14184806,  0.09842372, -0.34135368, -0.07104743, -0.4996876 ,
        0.17455989,  0.03965029,  0.10693006, -0.19125645, -0.02725562,
       -0.0025073 , -0.20832586, -0.19867164, -0.2806635 ,  0.05994445,
        0.34518683, -0.02478959,  0.10655871, -0.26232496, -0.11444373,
        0.3805761 ,  0.01759821, -0.16508165, -0.13239092, -0.45861015,
        0.09175751, -0.23245531, -0.18282609, -0.0527661 ,  0.15764062,
       -0.03325105, -0.05518072, -0.04376438,  0.23810144,  0.15082358,
        0.20180817, -0.22802147,  0.01757083,  0.06830297, -0.09201391,
        0.0734768 ,  0.18867612,  0.11663385, -0.37697178,  0.13957842,
        0.04613022,  0.09021603, -0.07429846, -0.12121893, -0.26788574,
        0.23778728,  0.06207119,  0.2275971 , -0.29595578,  0.37

In [36]:
X[0].shape

(100,)

In [37]:
X[0].reshape(1,-1).shape

(1, 100)

In [38]:
## Independent features
df_list = []
for i in range(len(X)):
  df_list.append(pd.DataFrame(X[i].reshape(1, -1)))

df = pd.concat(df_list, ignore_index=True)

  df = pd.concat(df_list, ignore_index=True)


In [39]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-0.239195,0.203482,0.091192,0.047958,0.086899,-0.520021,0.142436,0.502691,-0.230722,-0.140265,...,0.392941,0.107642,0.054417,0.008733,0.474992,0.185096,0.134888,-0.220211,0.177349,-0.042446
1,-0.214726,0.177011,0.077127,0.042913,0.081419,-0.450395,0.112608,0.439295,-0.200618,-0.115135,...,0.346632,0.087972,0.042311,0.003456,0.403153,0.155070,0.116553,-0.199958,0.159911,-0.044031
2,-0.254597,0.216946,0.100761,0.055606,0.077313,-0.560148,0.135035,0.501376,-0.240867,-0.167298,...,0.392342,0.110795,0.041793,-0.015894,0.489252,0.177169,0.089996,-0.256452,0.201467,-0.020633
3,-0.328823,0.273405,0.116878,0.070827,0.117196,-0.699452,0.187191,0.682879,-0.315763,-0.180409,...,0.534184,0.141389,0.071522,0.021865,0.632903,0.254666,0.198277,-0.301433,0.241204,-0.067398
4,-0.283984,0.223209,0.105894,0.057700,0.106003,-0.584880,0.156553,0.577089,-0.268652,-0.157837,...,0.452419,0.114217,0.060885,0.020313,0.534885,0.213656,0.161859,-0.263678,0.197683,-0.054064
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5564,-0.287221,0.255976,0.126810,0.062199,0.096400,-0.632097,0.171579,0.590386,-0.280166,-0.188780,...,0.465636,0.131813,0.065843,-0.008524,0.572504,0.208266,0.121739,-0.279236,0.220491,-0.033270
5565,-0.298212,0.237293,0.106956,0.068083,0.105968,-0.623125,0.164581,0.603189,-0.291605,-0.165671,...,0.477910,0.128834,0.045639,0.008572,0.565797,0.219859,0.147964,-0.283574,0.216375,-0.041440
5566,-0.328608,0.276792,0.118117,0.053924,0.105569,-0.691907,0.193781,0.677802,-0.310707,-0.194736,...,0.530842,0.152914,0.081969,0.025122,0.634721,0.255983,0.195614,-0.287424,0.231749,-0.062655
5567,-0.292931,0.245227,0.109934,0.056871,0.098631,-0.626835,0.167880,0.605791,-0.279182,-0.178393,...,0.469547,0.130497,0.071209,0.015648,0.570218,0.225875,0.158351,-0.267820,0.208388,-0.054992


In [40]:
df.shape

(5569, 100)

In [41]:
df['Output']=y

In [42]:
## Independent Feature
X=df

In [43]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,Output
0,-0.239195,0.203482,0.091192,0.047958,0.086899,-0.520021,0.142436,0.502691,-0.230722,-0.140265,...,0.107642,0.054417,0.008733,0.474992,0.185096,0.134888,-0.220211,0.177349,-0.042446,True
1,-0.214726,0.177011,0.077127,0.042913,0.081419,-0.450395,0.112608,0.439295,-0.200618,-0.115135,...,0.087972,0.042311,0.003456,0.403153,0.15507,0.116553,-0.199958,0.159911,-0.044031,True
2,-0.254597,0.216946,0.100761,0.055606,0.077313,-0.560148,0.135035,0.501376,-0.240867,-0.167298,...,0.110795,0.041793,-0.015894,0.489252,0.177169,0.089996,-0.256452,0.201467,-0.020633,False
3,-0.328823,0.273405,0.116878,0.070827,0.117196,-0.699452,0.187191,0.682879,-0.315763,-0.180409,...,0.141389,0.071522,0.021865,0.632903,0.254666,0.198277,-0.301433,0.241204,-0.067398,True
4,-0.283984,0.223209,0.105894,0.0577,0.106003,-0.58488,0.156553,0.577089,-0.268652,-0.157837,...,0.114217,0.060885,0.020313,0.534885,0.213656,0.161859,-0.263678,0.197683,-0.054064,True


In [44]:
X.isna().sum()

Unnamed: 0,0
0,12
1,12
2,12
3,12
4,12
...,...
96,12
97,12
98,12
99,12


In [45]:
df.dropna(inplace=True)

In [46]:
df.isna().sum()

Unnamed: 0,0
0,0
1,0
2,0
3,0
4,0
...,...
96,0
97,0
98,0
99,0


In [47]:
## Final independent and dependent features
X=df.drop('Output', axis=1)
y=df['Output']

In [48]:
## Train Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [49]:
X_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
1394,-0.319652,0.262969,0.118784,0.056411,0.120018,-0.668764,0.17996,0.661596,-0.301951,-0.177768,...,0.519658,0.139679,0.077949,0.023908,0.604394,0.248179,0.18402,-0.293696,0.234592,-0.072385
2637,-0.253211,0.204676,0.095188,0.04593,0.093451,-0.527321,0.139653,0.51737,-0.236798,-0.150561,...,0.405834,0.107766,0.058106,0.012606,0.487058,0.193929,0.137567,-0.229529,0.183045,-0.041106
2578,-0.258112,0.217022,0.095075,0.040108,0.095243,-0.545241,0.145781,0.53044,-0.245746,-0.152834,...,0.404979,0.12239,0.069705,0.012746,0.504102,0.206321,0.148553,-0.225073,0.188484,-0.047406
1257,-0.245025,0.205201,0.095714,0.049168,0.081096,-0.521043,0.140673,0.513276,-0.237789,-0.14184,...,0.397176,0.109784,0.055226,0.011822,0.476184,0.190117,0.130808,-0.221673,0.177554,-0.044069
4235,-0.296286,0.251574,0.114522,0.061226,0.09712,-0.651348,0.157888,0.582515,-0.278098,-0.187953,...,0.455074,0.128539,0.054455,-0.009305,0.570873,0.211243,0.118633,-0.283598,0.227849,-0.030697


In [50]:
y_train.head()

Unnamed: 0,Output
1394,True
2637,True
2578,True
1257,True
4235,False


In [51]:
## Train the model
classifier=RandomForestClassifier()
classifier.fit(X_train, y_train)

In [52]:
## Prediction
y_pred=classifier.predict(X_test)

In [53]:
## Evaluate the model
print(f"Accuracy: {accuracy_score(y_test, y_pred)*100:.2f}%")
print(classification_report(y_test, y_pred))

Accuracy: 96.49%
              precision    recall  f1-score   support

       False       0.89      0.84      0.86       148
        True       0.98      0.98      0.98       964

    accuracy                           0.96      1112
   macro avg       0.93      0.91      0.92      1112
weighted avg       0.96      0.96      0.96      1112

