In [None]:
# Source : https://medium.com/@dilip.voleti/classification-using-word2vec-b1d79d375381
# Data : https://www.kaggle.com/datasets/uciml/sms-spam-collection-dataset

In [None]:
# !pip install gensim==4.2.0

In [None]:
# Import the libraries used throughout the notebook and report the installed gensim version
import gensim
print(gensim.__version__)
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
# Let pandas show up to 100 characters per column value before truncating.
pd.options.display.max_colwidth = 100

# Load the CSV; it is decoded as latin-1 instead of pandas' default encoding.
messages = pd.read_csv(
    filepath_or_buffer='SelfShiksha_ANN_MCQ89_Word2Vec.csv',
    encoding='latin-1',
)

In [None]:
# Inspect the raw frame exactly as loaded, before any columns are dropped or renamed.
messages

In [None]:
# Discard the three "Unnamed" columns. errors="ignore" keeps this cell re-runnable:
# on a second execution the columns are already gone and a plain drop() would
# raise KeyError.
messages = messages.drop(
    columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], errors="ignore"
)

# Rename the first two columns by position instead of overwriting
# `messages.columns`, so the cell still works after later cells have appended
# further columns to the frame.
messages = messages.rename(
    columns=dict(zip(messages.columns[:2], ["label", "text"]))
)
messages.head()

In [None]:
# Run gensim's built-in preprocessor over every message and keep the result
# in a new `text_clean` column next to the original text.
messages['text_clean'] = messages['text'].apply(
    gensim.utils.simple_preprocess
)
messages.head()

In [None]:
# Encode the label column: ham -> 1, spam -> 0.
# Values that are not keys of the mapping are passed through unchanged, so
# re-running this cell leaves already-encoded 1/0 labels intact; a plain
# .map(dict) would silently turn every one of them into NaN.
label_codes = {'ham': 1, 'spam': 0}
messages['label'] = messages['label'].map(
    lambda value: label_codes.get(value, value)
)

In [None]:
# Split data into train and test sets (20% of the rows are held out for testing).
# random_state pins the shuffle so the same rows land in each split on every run;
# without it the vocabulary, the model and the reported metrics change per execution.
X_train, X_test, y_train, y_test = train_test_split(
    messages['text_clean'],
    messages['label'],
    test_size=0.2,
    random_state=42,
)

In [None]:
# Train the word2vec model on the training split (X_train).
#
#   vector_size - size of the vectors we want
#   window      - number of words before and after the focus word that are
#                 considered as context for that word
#   min_count   - number of times a word must appear in our corpus in order
#                 to get a word vector
w2v_model = gensim.models.Word2Vec(
    sentences=X_train,
    vector_size=100,
    window=5,
    min_count=2,
)

# Pre-trained Word2Vec vectors are an alternative; compare how they perform
# against the model trained above:
# https://www.kaggle.com/datasets/leadbest/googlenewsvectorsnegative300

In [None]:
# Find the most similar words to "king" based on word vectors from our trained model.
# The model only holds vectors for words that reached min_count in the training split,
# and most_similar() raises KeyError for a word it does not know, so check first.
query_word = 'king'
(
    w2v_model.wv.most_similar(query_word)
    if query_word in w2v_model.wv.key_to_index
    else f"'{query_word}' is not in the trained vocabulary"
)

In [None]:
# Similarity between the vectors of two words. similarity() raises KeyError when
# either word is missing from the trained vocabulary, so guard the lookup.
word_a, word_b = 'life', 'death'
(
    w2v_model.wv.similarity(word_a, word_b)
    if word_a in w2v_model.wv.key_to_index and word_b in w2v_model.wv.key_to_index
    else f"'{word_a}' and/or '{word_b}' is not in the trained vocabulary"
)

In [None]:
# Similarity between the vectors of two words. similarity() raises KeyError when
# either word is missing from the trained vocabulary, so guard the lookup.
word_a, word_b = 'hello', 'bye'
(
    w2v_model.wv.similarity(word_a, word_b)
    if word_a in w2v_model.wv.key_to_index and word_b in w2v_model.wv.key_to_index
    else f"'{word_a}' and/or '{word_b}' is not in the trained vocabulary"
)

In [None]:
# Similarity between the vectors of two words. similarity() raises KeyError when
# either word is missing from the trained vocabulary, so guard the lookup.
word_a, word_b = 'hello', 'canada'
(
    w2v_model.wv.similarity(word_a, word_b)
    if word_a in w2v_model.wv.key_to_index and word_b in w2v_model.wv.key_to_index
    else f"'{word_a}' and/or '{word_b}' is not in the trained vocabulary"
)

In [None]:
# Similarity between the vectors of two words. similarity() raises KeyError when
# either word is missing from the trained vocabulary, so guard the lookup.
word_a, word_b = 'cup', 'canada'
(
    w2v_model.wv.similarity(word_a, word_b)
    if word_a in w2v_model.wv.key_to_index and word_b in w2v_model.wv.key_to_index
    else f"'{word_a}' and/or '{word_b}' is not in the trained vocabulary"
)

In [None]:
# This looks up the word vectors for each sentence in the dataset.

# Words the trained model has a vector for; a set makes the membership test cheap.
words = set(w2v_model.wv.index_to_key)


def _sentence_word_vectors(sentences):
    """Return a 1-D object array with one word-vector matrix per sentence.

    Each element is an array with one row per in-vocabulary token of the
    sentence (an empty array when none of its tokens is in the vocabulary).

    The object array is filled slot by slot on purpose: the matrices have
    different numbers of rows, and calling np.array() on such a ragged list
    raises ValueError on NumPy >= 1.24.
    """
    per_sentence = np.empty(len(sentences), dtype=object)
    for pos, tokens in enumerate(sentences):
        per_sentence[pos] = np.array(
            [w2v_model.wv[token] for token in tokens if token in words]
        )
    return per_sentence


X_train_vect = _sentence_word_vectors(X_train)
X_test_vect = _sentence_word_vectors(X_test)

In [None]:
# Why is the length of the sentence different than the length of the sentence vector?
# Tokens without an entry in the word2vec vocabulary are skipped when the word vectors
# are looked up, so a sentence can yield fewer vectors than it has tokens.
# Only the first few sentences are printed; one line per training message would
# flood the cell output.
n_preview = 10
for i, v in enumerate(X_train_vect[:n_preview]):
    print(len(X_train.iloc[i]), len(v))

In [None]:
# Compute sentence vectors by averaging the word vectors for the words contained in the sentence


def _average_word_vectors(sentence_vectors, dim):
    """Collapse each sentence's word vectors into one fixed-length vector.

    Parameters
    ----------
    sentence_vectors : iterable of np.ndarray
        One array per sentence with its word vectors stacked row-wise; an
        empty array means the sentence had no in-vocabulary token.
    dim : int
        Length of the all-zero vector substituted for such empty sentences.

    Returns
    -------
    list of np.ndarray
        One vector per sentence, in input order.
    """
    return [
        vectors.mean(axis=0) if vectors.size else np.zeros(dim, dtype=float)
        for vectors in sentence_vectors
    ]


# Read the dimensionality from the trained model rather than hard-coding 100, so the
# zero-vector fallback cannot drift out of sync with the vector_size used for training.
embedding_dim = w2v_model.wv.vector_size
X_train_vect_avg = _average_word_vectors(X_train_vect, embedding_dim)
X_test_vect_avg = _average_word_vectors(X_test_vect, embedding_dim)

In [None]:
# Are our sentence vector lengths consistent?
# Collect the distinct lengths instead of printing one line per training message:
# every averaged vector must have the same length for the classifier to accept them.
vector_lengths = {len(v) for v in X_train_vect_avg}
assert len(vector_lengths) <= 1, f"inconsistent sentence vector lengths: {vector_lengths}"

# A short preview: number of tokens in the sentence vs. length of its averaged vector.
for i, v in enumerate(X_train_vect_avg[:10]):
    print(len(X_train.iloc[i]), len(v))

vector_lengths

In [None]:
# Instantiate and fit a basic Random Forest model on top of the vectors
# Write your own code to use Logistic Regression and ANN to do this classification.

from sklearn.ensemble import RandomForestClassifier

# random_state fixes the forest's internal randomness so refitting on the same
# data reproduces the same model and the same downstream metrics.
rf = RandomForestClassifier(random_state=42)
rf_model = rf.fit(X_train_vect_avg, y_train.values.ravel())

In [None]:
# Use the trained model to predict a label for every averaged sentence vector of the test set
y_pred = rf_model.predict(X_test_vect_avg)


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

# precision_score / recall_score treat label 1 as the positive class by default.
# With the encoding used in this notebook (ham -> 1, spam -> 0) these two numbers
# therefore describe the "ham" class, not the spam class.
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

# accuracy_score replaces the hand-rolled (y_pred == y_test).sum() / len(y_pred).
accuracy = accuracy_score(y_test, y_pred)

print('Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(precision, 3), round(recall, 3), round(accuracy, 3)))