### Here we are going to solve the spam-ham classification problem, by training an entire word2vec model from scratch.

In [4]:
import gensim
from gensim.models import KeyedVectors,Word2Vec
import gensim.downloader as api

path="D:\Google-W2V\GoogleNews-vectors-negative300.bin.gz"

w2v=KeyedVectors.load_word2vec_format(path,limit=20000,binary=True)

In [5]:
# test the w2v
vec_king=w2v['king']
print(vec_king)

[ 1.25976562e-01  2.97851562e-02  8.60595703e-03  1.39648438e-01
 -2.56347656e-02 -3.61328125e-02  1.11816406e-01 -1.98242188e-01
  5.12695312e-02  3.63281250e-01 -2.42187500e-01 -3.02734375e-01
 -1.77734375e-01 -2.49023438e-02 -1.67968750e-01 -1.69921875e-01
  3.46679688e-02  5.21850586e-03  4.63867188e-02  1.28906250e-01
  1.36718750e-01  1.12792969e-01  5.95703125e-02  1.36718750e-01
  1.01074219e-01 -1.76757812e-01 -2.51953125e-01  5.98144531e-02
  3.41796875e-01 -3.11279297e-02  1.04492188e-01  6.17675781e-02
  1.24511719e-01  4.00390625e-01 -3.22265625e-01  8.39843750e-02
  3.90625000e-02  5.85937500e-03  7.03125000e-02  1.72851562e-01
  1.38671875e-01 -2.31445312e-01  2.83203125e-01  1.42578125e-01
  3.41796875e-01 -2.39257812e-02 -1.09863281e-01  3.32031250e-02
 -5.46875000e-02  1.53198242e-02 -1.62109375e-01  1.58203125e-01
 -2.59765625e-01  2.01416016e-02 -1.63085938e-01  1.35803223e-03
 -1.44531250e-01 -5.68847656e-02  4.29687500e-02 -2.46582031e-02
  1.85546875e-01  4.47265

In [6]:
# read the dataset
import pandas as pd

messages_df=pd.read_csv("E:\ml-bootcamp\section-51\Spam-Ham Project\SMSSpamCollection.csv",sep="\t",names=["result","text"])

In [7]:
messages_df.head()

Unnamed: 0,result,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [9]:
# now do data cleaning and text preprocessing
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

lemmatizer=WordNetLemmatizer()

In [113]:
# perform all steps of data cleaning and preprocessing

messages=messages_df['text'].to_list()
corpus=[]

for i in range(len(messages)):
    # take each sentence
    sentence=messages[i]
    # remove all non english characters
    sentence=re.sub(pattern='[^a-zA-Z]',repl=' ',string=sentence)
    # convert the sentences to lower string
    sentence=sentence.lower()
    # take all the words into list
    words=sentence.split()
    # and then lemmatize each word if they are not a stopword
    lemmatized_words=[lemmatizer.lemmatize(word,pos='v') for word in words]
    # then join it back to form a lemmatized sentence
    lemmatized_sentence=" ".join(lemmatized_words)
    # then append the lemmatized sentence
    corpus.append(lemmatized_sentence)

In [114]:
print(corpus)

['go until jurong point crazy available only in bugis n great world la e buffet cine there get amore wat', 'ok lar joke wif u oni', 'free entry in a wkly comp to win fa cup final tkts st may text fa to to receive entry question std txt rate t c s apply over s', 'u dun say so early hor u c already then say', 'nah i don t think he go to usf he live around here though', 'freemsg hey there darling it s be week s now and no word back i d like some fun you up for it still tb ok xxx std chgs to send to rcv', 'even my brother be not like to speak with me they treat me like aid patent', 'as per your request melle melle oru minnaminunginte nurungu vettam have be set as your callertune for all callers press to copy your friends callertune', 'winner as a value network customer you have be select to receivea prize reward to claim call claim code kl valid hours only', 'have your mobile months or more u r entitle to update to the latest colour mobiles with camera for free call the mobile update co fr

In [115]:
[[i,j,k] for i,j,k in zip(list(map(len,corpus)),corpus,messages_df['text']) if i<1]

[[0, '', '645'], [0, '', ':) '], [0, '', ':-) :-)']]

In [116]:
# also you can apply sent_tokenize on each sentence in the corpus list, so that we get the tokenized words from each sentence
from gensim.utils import simple_preprocess
from nltk import sent_tokenize

words=[]

for sentence in corpus:
    # tokenize each sentence
    tokenized_sentence=sent_tokenize(sentence)
    for token in tokenized_sentence:
        # append to the words list
        words.append(simple_preprocess(doc=token))

In [117]:
print(words)

[['go', 'until', 'jurong', 'point', 'crazy', 'available', 'only', 'in', 'bugis', 'great', 'world', 'la', 'buffet', 'cine', 'there', 'get', 'amore', 'wat'], ['ok', 'lar', 'joke', 'wif', 'oni'], ['free', 'entry', 'in', 'wkly', 'comp', 'to', 'win', 'fa', 'cup', 'final', 'tkts', 'st', 'may', 'text', 'fa', 'to', 'to', 'receive', 'entry', 'question', 'std', 'txt', 'rate', 'apply', 'over'], ['dun', 'say', 'so', 'early', 'hor', 'already', 'then', 'say'], ['nah', 'don', 'think', 'he', 'go', 'to', 'usf', 'he', 'live', 'around', 'here', 'though'], ['freemsg', 'hey', 'there', 'darling', 'it', 'be', 'week', 'now', 'and', 'no', 'word', 'back', 'like', 'some', 'fun', 'you', 'up', 'for', 'it', 'still', 'tb', 'ok', 'xxx', 'std', 'chgs', 'to', 'send', 'to', 'rcv'], ['even', 'my', 'brother', 'be', 'not', 'like', 'to', 'speak', 'with', 'me', 'they', 'treat', 'me', 'like', 'aid', 'patent'], ['as', 'per', 'your', 'request', 'melle', 'melle', 'oru', 'minnaminunginte', 'nurungu', 'vettam', 'have', 'be', 'set'

In [118]:
words[0]

['go',
 'until',
 'jurong',
 'point',
 'crazy',
 'available',
 'only',
 'in',
 'bugis',
 'great',
 'world',
 'la',
 'buffet',
 'cine',
 'there',
 'get',
 'amore',
 'wat']

In [119]:
# now lets train this word2vec model from scratch
w2v_model=Word2Vec(sentences=words,vector_size=100,min_count=5)

In [120]:
# display all the available vocabularies
w2v_model.wv.index_to_key

['be',
 'to',
 'you',
 'the',
 'and',
 'in',
 'have',
 'me',
 'my',
 'it',
 'do',
 'for',
 'get',
 'call',
 'your',
 'of',
 'that',
 'go',
 'on',
 'now',
 'can',
 'so',
 'but',
 'not',
 'or',
 'we',
 'at',
 'will',
 'ur',
 'if',
 'with',
 'no',
 'just',
 'this',
 'come',
 'gt',
 'lt',
 'how',
 'up',
 'when',
 'ok',
 'what',
 'know',
 'free',
 'from',
 'send',
 'all',
 'out',
 'll',
 'like',
 'time',
 'love',
 'want',
 'good',
 'day',
 'then',
 'there',
 'he',
 'say',
 'its',
 'only',
 'think',
 'tell',
 'take',
 'text',
 'need',
 'win',
 'see',
 'txt',
 'as',
 'make',
 'one',
 'by',
 'stop',
 'don',
 'home',
 'she',
 'about',
 'reply',
 'lor',
 'today',
 'sorry',
 'still',
 'back',
 'da',
 'dont',
 'our',
 'mobile',
 'phone',
 'please',
 'hi',
 'meet',
 'they',
 'new',
 'work',
 'miss',
 'later',
 'pls',
 'any',
 'her',
 'ask',
 'give',
 'some',
 'week',
 'dear',
 'message',
 'here',
 'wait',
 'who',
 'well',
 'where',
 'leave',
 're',
 'night',
 'try',
 'much',
 'hope',
 'an',
 'oh',


In [121]:
w2v_model.corpus_total_words

78568

In [122]:
w2v_model.epochs

5

In [123]:
w2v_model.wv.similar_by_word('boy')

[('bring', 0.999370813369751),
 ('lot', 0.9993609189987183),
 ('today', 0.9993091821670532),
 ('things', 0.9992930889129639),
 ('really', 0.9992867708206177),
 ('quite', 0.9992830753326416),
 ('hey', 0.9992788434028625),
 ('even', 0.9992755055427551),
 ('why', 0.9992694854736328),
 ('very', 0.9992554783821106)]

In [124]:
w2v_model.wv['good']

array([-0.132227  ,  0.19540215,  0.0515003 ,  0.07652427,  0.15802187,
       -0.4707406 ,  0.31913605,  0.94754845, -0.35720432, -0.3127631 ,
       -0.0572104 , -0.51825297, -0.05380532,  0.1704492 ,  0.270109  ,
       -0.15172823,  0.00528217, -0.43310106, -0.00353064, -0.68822014,
        0.16640478,  0.29590935,  0.37285304, -0.24720863, -0.04309875,
       -0.12748212, -0.27032623, -0.27878058, -0.33039865, -0.05991078,
        0.31053528, -0.12192066,  0.14648528, -0.29721045, -0.01032315,
        0.61399764,  0.2345337 , -0.22227763, -0.27105635, -0.7168535 ,
       -0.02522835, -0.41836688,  0.01880001,  0.03597564,  0.296579  ,
       -0.15176357, -0.3007181 , -0.04824509,  0.18421485,  0.1916081 ,
        0.23152281, -0.42644337, -0.0995866 , -0.04920533, -0.14276534,
        0.12104073,  0.24263254, -0.01010024, -0.43893474,  0.03928008,
        0.04848248,  0.02634832,  0.19065408, -0.14026344, -0.3605019 ,
        0.4171358 ,  0.20721625,  0.34461963, -0.5295693 ,  0.44

In [125]:
import numpy as np

In [126]:
# get the avgword2vec function

def avgword2vec(doc):
    # get all the available vocab words from the input sentence, and convert into vectors
    # word_vecs=[w2v_model.wv[word] for word in doc if word in w2v_model.wv.index_to_key]
    return np.mean([w2v_model.wv[word] for word in doc if word in w2v_model.wv.index_to_key],axis=0)

In [127]:
!pip install tqdm



In [128]:
from tqdm import tqdm

In [129]:
# now apply the avgword2vec
X=[]

for i in tqdm(range(len(words))):
    X.append(avgword2vec(words[i]))

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
100%|██████████| 5569/5569 [00:01<00:00, 3322.80it/s]


In [130]:
X

[array([-0.11137972,  0.15172201,  0.03194349,  0.0323384 ,  0.10558211,
        -0.34071267,  0.20589894,  0.63564855, -0.25664395, -0.21711189,
        -0.04355763, -0.36392263, -0.02267396,  0.11383478,  0.18514527,
        -0.11076735,  0.01333433, -0.3142584 ,  0.01689336, -0.5041823 ,
         0.11691621,  0.19408636,  0.26327896, -0.16186984, -0.03386281,
        -0.08204146, -0.19578572, -0.17666788, -0.21824162, -0.02032687,
         0.2275026 , -0.07524604,  0.08469046, -0.19481517, -0.0035578 ,
         0.41625577,  0.15591873, -0.15514284, -0.19813034, -0.51312405,
        -0.0278293 , -0.30140877,  0.01400649,  0.02681508,  0.20072079,
        -0.10838962, -0.18612602, -0.04584944,  0.11548809,  0.11738426,
         0.16916794, -0.3233798 , -0.07656103, -0.03899705, -0.10048992,
         0.11157966,  0.1844272 ,  0.00152356, -0.31495962,  0.03437221,
         0.0408765 , -0.00289631,  0.12826319, -0.08657298, -0.25049952,
         0.29088643,  0.15478173,  0.24135861, -0.3

In [131]:
len(X)

5569

In [155]:
X[0].reshape(1,-1).shape

(1, 100)

In [157]:
# get my independent features
df_input=pd.DataFrame()
for i in range(len(X)):
    df_input=pd.concat([df_input,pd.DataFrame(X[i].reshape(1,-1))],ignore_index=True)

  df_input=pd.concat([df_input,pd.DataFrame(X[i].reshape(1,-1))],ignore_index=True)


In [158]:
df_input.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-0.11138,0.151722,0.031943,0.032338,0.105582,-0.340713,0.205899,0.635649,-0.256644,-0.217112,...,0.339283,0.17656,0.02227,0.096329,0.472219,0.242477,0.162434,-0.222165,0.095673,-0.190529
1,-0.1155,0.150943,0.024531,0.035452,0.112385,-0.341311,0.206258,0.631147,-0.248539,-0.213411,...,0.346834,0.171621,0.02626,0.092799,0.468842,0.239351,0.163802,-0.221793,0.09433,-0.187356
2,-0.157749,0.174592,0.048162,0.01272,0.114886,-0.376066,0.203405,0.664408,-0.285644,-0.251401,...,0.358942,0.172249,-0.010944,0.113717,0.518888,0.225586,0.16823,-0.287473,0.084219,-0.186017
3,-0.134489,0.209469,0.034088,0.066408,0.158422,-0.463361,0.287876,0.876062,-0.345647,-0.28202,...,0.471739,0.243439,0.049764,0.119081,0.636151,0.341198,0.225714,-0.286406,0.138406,-0.271555
4,-0.118568,0.172606,0.03256,0.038928,0.14164,-0.394269,0.237132,0.738977,-0.298517,-0.247941,...,0.388934,0.203099,0.033289,0.10494,0.53922,0.286284,0.196726,-0.254545,0.115097,-0.226025


In [150]:
# get my dependent features
# all the docs having len>0
y=messages_df[list(map(lambda x: len(x)>0,corpus))]
y=pd.get_dummies(y['result'],dtype=int)
y

Unnamed: 0,ham,spam
0,1,0
1,1,0
2,0,1
3,1,0
4,1,0
...,...,...
5567,0,1
5568,1,0
5569,1,0
5570,1,0


In [151]:
y=y.iloc[:,1].values

In [152]:
y

array([0, 0, 1, ..., 0, 0, 0])

In [153]:
y.shape

(5569,)

In [160]:
# now lets get my independent and dependent features
X=df_input
X.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-0.11138,0.151722,0.031943,0.032338,0.105582,-0.340713,0.205899,0.635649,-0.256644,-0.217112,...,0.339283,0.17656,0.02227,0.096329,0.472219,0.242477,0.162434,-0.222165,0.095673,-0.190529
1,-0.1155,0.150943,0.024531,0.035452,0.112385,-0.341311,0.206258,0.631147,-0.248539,-0.213411,...,0.346834,0.171621,0.02626,0.092799,0.468842,0.239351,0.163802,-0.221793,0.09433,-0.187356
2,-0.157749,0.174592,0.048162,0.01272,0.114886,-0.376066,0.203405,0.664408,-0.285644,-0.251401,...,0.358942,0.172249,-0.010944,0.113717,0.518888,0.225586,0.16823,-0.287473,0.084219,-0.186017
3,-0.134489,0.209469,0.034088,0.066408,0.158422,-0.463361,0.287876,0.876062,-0.345647,-0.28202,...,0.471739,0.243439,0.049764,0.119081,0.636151,0.341198,0.225714,-0.286406,0.138406,-0.271555
4,-0.118568,0.172606,0.03256,0.038928,0.14164,-0.394269,0.237132,0.738977,-0.298517,-0.247941,...,0.388934,0.203099,0.033289,0.10494,0.53922,0.286284,0.196726,-0.254545,0.115097,-0.226025


In [161]:
y

array([0, 0, 1, ..., 0, 0, 0])

In [162]:
# now perform train test split
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25,random_state=42)

In [163]:
X_train.shape,X_test.shape

((4176, 100), (1393, 100))

In [164]:
# now apply a classifier
from sklearn.ensemble import RandomForestClassifier

rf_classifier=RandomForestClassifier(n_estimators=200,verbose=1,n_jobs=-1)

# train using random forest
rf_classifier.fit(X_train,y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:    2.4s
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:    2.5s finished


In [165]:
y_pred=rf_classifier.predict(X_test)
y_pred_prob=rf_classifier.predict_proba(X_test)

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 200 out of 200 | elapsed:    0.0s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 200 out of 200 | elapsed:    0.1s finished


In [166]:
from sklearn.metrics import confusion_matrix,accuracy_score,precision_score,recall_score,roc_auc_score

cm=confusion_matrix(y_pred=y_pred,y_true=y_test)
acc=accuracy_score(y_pred=y_pred,y_true=y_test)
precision=precision_score(y_pred=y_pred,y_true=y_test)
recall=recall_score(y_pred=y_pred,y_true=y_test)
roc=roc_auc_score(y_score=y_pred_prob[:,1],y_true=y_test)

print(f"Accuracy: {acc}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"ROC: {roc}")
print(f"Confusion matrix: \n {cm}")

Accuracy: 0.9748743718592965
Precision: 0.9144385026737968
Recall: 0.9
ROC: 0.9694229338933369
Confusion matrix: 
 [[1187   16]
 [  19  171]]
