In [2]:
import os
import numpy as np
import pickle


GLOVE_DIR = 'glove.6B.100d.txt'
glove_dict = {}
with open(GLOVE_DIR) as f:
    for line in f:
        word, *vector = line.split()
        glove_dict[word]=np.array(vector).astype(float)

In [3]:
re_tokenized = pickle.load(open('re_tokenized_lemma.pkl','rb'))

In [5]:
def get_average_glove_vec(list_of_tokens):
    avg_vec=[]
    for token in list_of_tokens:
        if token in glove_dict:
            avg_vec.append(glove_dict[token])

    if len(avg_vec)>0:
        avg_vec = np.mean(avg_vec, axis=0)
    else:
        avg_vec = np.zeros(100)

    return avg_vec

In [6]:
import pandas as pd
from tqdm import tqdm

df_original = pd.read_csv("WikiLarge_Train.csv")
df_vecs = []


for ls in tqdm(re_tokenized):
    avg_vec = get_average_glove_vec(ls)
    df_vecs.append(avg_vec)

df_original['glove_avg'] = df_vecs

100%|██████████| 416768/416768 [00:09<00:00, 43102.19it/s]


In [55]:
from sklearn.model_selection import train_test_split

df = df_original.sample(10000, random_state=42)
X = df['glove_avg']
y = df['label']
Train_X, Test_X, Train_Y, Test_Y = train_test_split(X,y,test_size=0.2, random_state=42)
# save for later complex NN
train_indexes = Train_X.index
test_indexes = Test_X.index
Train_X = np.vstack(Train_X)
Test_X = np.vstack(Test_X)

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score

rf = RandomForestClassifier(bootstrap= True,
 max_depth=70,
 max_features='auto',
 min_samples_leaf=4,
 min_samples_split=10,
 n_estimators=800)

rf.fit(Train_X, Train_Y)
rf_preds = rf.predict(Test_X)
rf_acc = accuracy_score(Test_Y, rf_preds)
rf_f1 = f1_score(Test_Y, rf_preds)
rf_acc, rf_f1

In [12]:
pickle.dump(rf, open('trained_models/rf_glove_1w.pkl', 'wb'))

In [53]:
rf_acc, rf_f1

(64.75, 66.12205670350792)

# SVM

In [13]:
from sklearn import svm

SVM = svm.SVC(C=0.5, kernel='linear', degree=3, gamma='auto')
SVM.fit(Train_X,Train_Y)

# consider as failure
predictions_SVM = SVM.predict(Test_X)
svm_acc = accuracy_score(Test_Y, predictions_SVM)*100
svm_f1 = f1_score(Test_Y, predictions_SVM)*100
svm_acc, svm_f1

(56.05, 59.66039467645709)

In [45]:
train_score = SVM.score(Train_X, Train_Y)*100

In [46]:
train_score

59.775

In [14]:
pickle.dump(SVM, open('trained_models/svm_glove_1w.pkl', 'wb'))

In [15]:
SVM2 = svm.SVC(C=0.5, kernel='rbf', degree=3, gamma='auto')
SVM2.fit(Train_X,Train_Y)

# consider as failure
predictions_SVM2 = SVM2.predict(Test_X)
svm_acc2 = accuracy_score(Test_Y, predictions_SVM2)*100
svm_f12 = f1_score(Test_Y, predictions_SVM2)*100
svm_acc2, svm_f12

(60.050000000000004, 66.72219908371511)

# MLP

In [18]:
from sklearn.neural_network import MLPClassifier
# from sklearn.metrics import f1_score, accuracy_score

mlp = MLPClassifier(hidden_layer_sizes=(150), random_state=1, max_iter=300).fit(Train_X, Train_Y)
mlp_predict = mlp.predict(Test_X)
mlp_acc = accuracy_score(Test_Y, mlp_predict)
mlp_f1 = f1_score(Test_Y, mlp_predict)
mlp_acc, mlp_f1



(0.601, 0.5856697819314642)

In [47]:
mlp_train_score = mlp.score(Train_X, Train_Y)

In [48]:
mlp_train_score

0.9695

### This is obvious overfitting, try parameter tuning later

In [19]:
pickle.dump(mlp, open('trained_models/mlp_glove_1w.pkl', 'wb'))

# Summary

In [25]:
# mlp_acc = mlp_acc*100
# mlp_f1 = mlp_f1*100
# rf_acc = rf_acc*100
# rf_f1 = rf_f1*100

In [58]:
print('    accuracy\t f1 score')
print(' mlp:', round(mlp_acc,3),'\t', mlp_f1, 
      '\n SVM:', svm_acc,'\t', svm_f1, 
      '\n rf: ',rf_acc,'\t', rf_f1)

    accuracy	 f1 score
 mlp: 60.1 	 58.566978193146426 
 SVM: 56.05 	 59.66039467645709 
 rf:  64.75 	 66.12205670350792


In [42]:
score_df = pd.DataFrame([[mlp_acc, svm_acc, rf_acc],[mlp_f1, svm_f1, rf_f1]]).T
score_df.columns=['accuracy', 'f1 score']
score_df.index = ['MLP','SVM','RandomForest']

In [43]:
score_df.to_csv('scores_1w_Glove.csv')