In [2]:
# Source : https://medium.com/@dilip.voleti/classification-using-word2vec-b1d79d375381
# Data : https://www.kaggle.com/datasets/uciml/sms-spam-collection-dataset

In [1]:
!pip install gensim==4.2.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
# Read in the data and clean up column names
import gensim
print(gensim.__version__)
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

4.2.0


In [3]:
pd.set_option('display.max_colwidth', 100)
messages = pd.read_csv('SelfShiksha_ANN_MCQ89_Word2Vec.csv', encoding='latin-1')

In [4]:
messages

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives around here though",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u. U have won the å£750 Pound prize. 2 claim is eas...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other suggestions?",,,
5570,ham,The guy did some bitching but I acted like i'd be interested in buying something else next week ...,,,


In [5]:
messages = messages.drop(labels = ["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis = 1)
messages.columns = ["label", "text"]
messages.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives around here though"


In [6]:
# Clean data using the built in cleaner in gensim
messages['text_clean'] = messages['text'].apply(lambda x: gensim.utils.simple_preprocess(x))
messages.head()

Unnamed: 0,label,text,text_clean
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g...","[go, until, jurong, point, crazy, available, only, in, bugis, great, world, la, buffet, cine, th..."
1,ham,Ok lar... Joking wif u oni...,"[ok, lar, joking, wif, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,"[free, entry, in, wkly, comp, to, win, fa, cup, final, tkts, st, may, text, fa, to, to, receive,..."
3,ham,U dun say so early hor... U c already then say...,"[dun, say, so, early, hor, already, then, say]"
4,ham,"Nah I don't think he goes to usf, he lives around here though","[nah, don, think, he, goes, to, usf, he, lives, around, here, though]"


In [7]:
# Encoding the label column
messages['label']=messages['label'].map({'ham':1,'spam':0})

In [8]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split (messages['text_clean'], messages['label'] , test_size=0.2)

In [10]:
# Train the word2vec model

# vector_size - size of the vectors we want

# window - number words before and after the focus word that it’ll consider as context for the word

# min_count - the number of times a word must appear in our corpus in order to create a word vector.

w2v_model = gensim.models.Word2Vec(X_train, vector_size = 100, window = 5, min_count = 2)

# This line trains the Word2Vec model using our X_train dataset.
# You can also use pre-trained Word2Vec vectors and compare how these perform 
# as compared to the above model : https://www.kaggle.com/datasets/leadbest/googlenewsvectorsnegative300

In [11]:
# Find the most similar words to "king" based on word vectors from our trained model
w2v_model.wv.most_similar('king')

[('always', 0.9956147074699402),
 ('would', 0.9955505132675171),
 ('gonna', 0.9954706430435181),
 ('yes', 0.995452344417572),
 ('over', 0.9954511523246765),
 ('might', 0.9954290390014648),
 ('dun', 0.9954077005386353),
 ('make', 0.9953897595405579),
 ('hav', 0.9953482151031494),
 ('wait', 0.9953464269638062)]

In [12]:
w2v_model.wv.similarity('life', 'death')

0.90162915

In [13]:
w2v_model.wv.similarity('hello', 'bye')

0.9854611

In [14]:
w2v_model.wv.similarity('hello', 'canada')

0.96220064

In [15]:
w2v_model.wv.similarity('cup', 'canada')

0.9574447

In [16]:
# This creates the embedding vector for each sentence in the dataset.

words = set(w2v_model.wv.index_to_key )
X_train_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words])
                         for ls in X_train])
X_test_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words])
                         for ls in X_test])

  This is separate from the ipykernel package so we can avoid doing imports until
  """


In [17]:
# Why is the length of the sentence different than the length of the sentence vector?
for i, v in enumerate(X_train_vect):
    print(len(X_train.iloc[i]), len(v))

2 2
2 2
7 7
3 3
6 6
8 8
19 15
10 10
8 8
30 29
21 19
29 29
7 7
12 12
17 17
0 0
7 7
16 16
13 13
4 3
13 13
28 26
6 6
15 14
22 22
6 6
8 8
27 27
5 5
5 5
9 7
22 19
30 29
4 4
1 1
8 7
24 22
21 21
6 5
30 28
13 12
14 14
5 5
2 2
12 12
8 8
4 4
13 10
13 13
4 4
21 16
8 8
21 21
23 23
4 4
30 27
12 8
17 16
22 22
19 16
16 15
27 26
17 17
68 53
31 30
22 22
12 12
14 14
17 17
2 2
6 6
1 1
27 26
22 22
3 3
15 15
7 7
15 15
5 5
19 17
27 25
12 12
9 9
8 8
5 3
26 25
23 21
17 14
21 18
4 4
7 7
6 6
5 5
11 10
10 10
8 7
23 23
3 3
10 8
15 15
30 29
25 20
13 12
10 10
4 4
11 11
12 12
20 17
11 11
28 19
8 8
14 12
20 17
13 13
12 11
5 5
19 19
5 5
12 12
10 10
9 9
11 10
14 12
6 6
26 25
22 22
4 4
27 27
8 8
27 27
9 9
17 15
13 12
12 10
17 16
8 6
4 4
9 9
26 25
8 8
27 23
8 7
8 7
17 15
12 11
6 6
22 22
11 11
6 6
6 5
12 12
5 5
4 4
18 16
27 27
6 6
4 4
21 20
17 17
10 9
74 71
28 24
16 16
23 23
9 6
6 6
9 9
7 7
25 24
11 10
7 7
9 9
23 18
13 13
30 30
7 7
24 23
5 4
5 5
5 5
26 26
31 29
18 18
12 9
7 5
41 16
8 8
21 21
8 8
17 16
3 3
46 40
11 9
18 16

In [18]:
# Compute sentence vectors by averaging the word vectors for the words contained in the sentence
X_train_vect_avg = []
for v in X_train_vect:
    if v.size:
        X_train_vect_avg.append(v.mean(axis=0))
    else:
        X_train_vect_avg.append(np.zeros(100, dtype=float))
        
X_test_vect_avg = []
for v in X_test_vect:
    if v.size:
        X_test_vect_avg.append(v.mean(axis=0))
    else:
        X_test_vect_avg.append(np.zeros(100, dtype=float))

In [19]:
# Are our sentence vector lengths consistent?
for i, v in enumerate(X_train_vect_avg):
    print(len(X_train.iloc[i]), len(v))

2 100
2 100
7 100
3 100
6 100
8 100
19 100
10 100
8 100
30 100
21 100
29 100
7 100
12 100
17 100
0 100
7 100
16 100
13 100
4 100
13 100
28 100
6 100
15 100
22 100
6 100
8 100
27 100
5 100
5 100
9 100
22 100
30 100
4 100
1 100
8 100
24 100
21 100
6 100
30 100
13 100
14 100
5 100
2 100
12 100
8 100
4 100
13 100
13 100
4 100
21 100
8 100
21 100
23 100
4 100
30 100
12 100
17 100
22 100
19 100
16 100
27 100
17 100
68 100
31 100
22 100
12 100
14 100
17 100
2 100
6 100
1 100
27 100
22 100
3 100
15 100
7 100
15 100
5 100
19 100
27 100
12 100
9 100
8 100
5 100
26 100
23 100
17 100
21 100
4 100
7 100
6 100
5 100
11 100
10 100
8 100
23 100
3 100
10 100
15 100
30 100
25 100
13 100
10 100
4 100
11 100
12 100
20 100
11 100
28 100
8 100
14 100
20 100
13 100
12 100
5 100
19 100
5 100
12 100
10 100
9 100
11 100
14 100
6 100
26 100
22 100
4 100
27 100
8 100
27 100
9 100
17 100
13 100
12 100
17 100
8 100
4 100
9 100
26 100
8 100
27 100
8 100
8 100
17 100
12 100
6 100
22 100
11 100
6 100
6 100
12 100
5 10

In [23]:
# Instantiate and fit a basic Random Forest model on top of the vectors
# Write your own code to use Logistic Regression and ANN to do this classification.

from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf_model = rf.fit(X_train_vect_avg, y_train.values.ravel())

In [24]:
# Use the trained model to make predictions on the test data
y_pred = rf_model.predict(X_test_vect_avg)


In [25]:
from sklearn.metrics import precision_score, recall_score
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
print('Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(precision, 3), round(recall, 3), round((y_pred==y_test).sum()/len(y_pred), 3)))

Precision: 0.965 / Recall: 0.986 / Accuracy: 0.958
