## Tutorial on using word2vec for classification and visualisation.
Data included in repo classifies if a sentence is spam or ham.

 

In [1]:
#Import the following 
import pandas as pd 
from sklearn.ensemble import AdaBoostClassifier
from sklearn.pipeline import Pipeline
from sklearn import model_selection
from sklearn.metrics import accuracy_score,f1_score 

#Custom transformer for w2v
import transformer as w2vt
#Custom plotter
import tsne as t


### Train the word2vec model and check output

## Load data . CSV contains ham or spam emails and is labelled.

In [2]:
# load data
txts = pd.read_csv("./spam.csv", encoding="latin-1")

# remove dirty columns
txts_clean = txts[["v1", "v2"]]

# rename the columns
txts_clean = txts_clean.rename(columns={"v1": "category", "v2": "message"})

# view
txts_clean.shape

(5572, 2)

## Split data into train and test for classification of emails into spam or ham.

In [3]:
#y 
#recode y as the standard 0, 1
y = txts_clean["category"].map({"ham": 0, "spam": 1})

#X
X = txts_clean["message"]
#Clean text.
X= X.str.replace('[^a-z ]', '').str.lower().str.split()

#Split data into train and test. Leave imbalance in class labels 
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.30, random_state=42)

#Check baseline
print(f" Baseline is {y_test.value_counts(normalize=True)[0]}") 


 Baseline is 0.8690191387559809


## Run word2vec(CBOW) upsteam and adaboost classifier downsteam.

In [7]:
#Word2vec Hyperparameters:
#Dimensions=300, Window=5,Sample =.1, sg=0, hs=1,alpha=0.025, iterations/epoochs=20).

#Define steps for pipeline
#Gensim model
gensim_word2vec_tr = w2vt.GensimWord2VecVectorizer(size=300,window=5, min_count=3,sample=.1, sg=0,hs=1, alpha=0.025, iter=20)
#Classifier
xgb = AdaBoostClassifier(n_estimators=1000, random_state=0)
 
#Define pipeline 
w2v_xgb = Pipeline([('w2v', gensim_word2vec_tr),('xgb', xgb)])

#Fit and score pipe
w2v_xgb.fit(X_train,y_train)
y_pred=w2v_xgb.predict(X_test)
print(f"Model accuracy {w2v_xgb.score(X_test,y_test)}") 
print(f"Model f1_score {f1_score(y_test,y_pred,average='weighted')}") 

#Assign w2v model to variable for similary scores
wv = w2v_xgb.named_steps['w2v'].model_.wv



Model accuracy 0.9748803827751196
Model f1_score 0.9743105322740244


## Test word2vec on word association task.

In [5]:
#What words are most similar, return similar words with respective cosine.
wv.most_similar(positive=['prize'])

[('won', 0.7462610602378845),
 ('award', 0.7102481126785278),
 ('guaranteed', 0.7018029689788818),
 ('iod', 0.6873871684074402),
 ('rize', 0.6749557852745056),
 ('gift', 0.6169447302818298),
 ('player', 0.6068623661994934),
 ('fantastic', 0.5967285633087158),
 ('ideo', 0.5869741439819336),
 ('okia', 0.5858338475227356)]