# Word2Vec baseline

Here we train a word2vec model and use the averaged word vectors of each note as features for a logistic regression that predicts note helpfulness.

**Credit:** Notebook was initially written by Michael Wang (@mwang14), and is merely
adapted here

In [1]:
import pandas as pd
import gensim.downloader as api
from gensim.models.word2vec import Word2Vec
from collections import OrderedDict
from tqdm import tqdm
import nltk
import numpy as np
from sklearn.linear_model import LogisticRegression


from core_functions import evaluate_model



In [2]:
# Read the pre-split train / validation / test tables from disk.
train_data, val_data, test_data = map(
    pd.read_csv,
    ('../data/processed/training_data.csv',
     '../data/processed/val_data.csv',
     '../data/processed/test_data.csv'),
)

### Word2Vec Featurization

We'll load the `text8` corpus through gensim's downloader and train a Word2Vec model
on it. We see that the most similar words make intuitive sense.

In [3]:
# Fetch the text8 corpus via the gensim downloader, then train a fresh
# Word2Vec model on it using three worker threads.
corpus = api.load('text8')
wv_model = Word2Vec(sentences=corpus, workers=3)

In [4]:
# Sanity check: the ten nearest neighbours of 'tree' in the embedding space.
sims = wv_model.wv.most_similar(positive='tree', topn=10)
print(sims)

[('trees', 0.6944294571876526), ('leaf', 0.6876848340034485), ('bark', 0.651253879070282), ('flower', 0.6359479427337646), ('fruit', 0.6223763823509216), ('bird', 0.6040874719619751), ('avl', 0.6036485433578491), ('cactus', 0.5858634114265442), ('beetle', 0.5834983587265015), ('vault', 0.5765056014060974)]


In [5]:
# For every split keep two views of the same table:
#   'notes'         - the first 22 columns, de-duplicated and re-indexed from 0
#   'notes_ratings' - the full table as loaded
datasets = OrderedDict(
    (split, {'notes': frame.iloc[:, :22].drop_duplicates().reset_index(drop=True),
             'notes_ratings': frame})
    for split, frame in (('train', train_data),
                         ('val', val_data),
                         ('test', test_data))
)

In [6]:
def tokenize_text(text):
    """Return the NLTK word tokens of ``text``, each converted to lowercase."""
    return [token.lower() for token in nltk.word_tokenize(text)]

# Build the model inputs for every split.
# Each note's summary is tokenized; tokens that are not in the Word2Vec
# vocabulary are skipped and the remaining word vectors are averaged into a
# single note vector. A note with no in-vocabulary token gets an all-zero
# vector. The note-level vectors are then inner-joined to `notes_ratings` on
# `noteId`, and the resulting features / targets / weights are stored back on
# the split's dict under 'X', 'Y', 'scaling' and 'noteId'.

# Width of the fallback zero vector is taken from the model so it stays
# correct if Word2Vec is trained with a different vector size.
embedding_dim = wv_model.wv.vector_size

for name, data_dict in tqdm(datasets.items(), total=len(datasets)):
    notes_df = data_dict['notes']
    notes_ratings_df = data_dict['notes_ratings']
    tokenized_notes = [tokenize_text(text) for text in notes_df['summary']]

    note_arrays = []
    for tokens in tqdm(tokenized_notes):
        # Keep only the tokens the embedding model knows about.
        note_vectors = [wv_model.wv[t] for t in tokens if t in wv_model.wv]
        if note_vectors:
            avg_note_vector = np.mean(note_vectors, axis=0)
        else:
            avg_note_vector = np.zeros(embedding_dim)
        note_arrays.append(avg_note_vector)

    X_df = pd.DataFrame(np.array(note_arrays))
    # note_arrays is in the same row order as notes_df, so attach the ids
    # positionally instead of depending on the two frames' indexes matching.
    X_df['noteId'] = notes_df['noteId'].to_numpy()
    merged_X_df = X_df.merge(notes_ratings_df[['noteId', 'output', 'scaling']],
                             how='inner', on='noteId')

    data_dict['X'] = merged_X_df.drop(['noteId', 'output', 'scaling'], axis=1)
    data_dict['Y'] = merged_X_df['output']
    data_dict['scaling'] = merged_X_df['scaling']
    data_dict['noteId'] = merged_X_df['noteId']
    print(name, 'size', merged_X_df.shape[0])


  0%|          | 0/3 [00:00<?, ?it/s]
  0%|          | 0/8861 [00:00<?, ?it/s][A
 17%|█▋        | 1488/8861 [00:00<00:00, 14878.52it/s][A
 35%|███▍      | 3088/8861 [00:00<00:00, 15535.26it/s][A
 53%|█████▎    | 4686/8861 [00:00<00:00, 15736.66it/s][A
 71%|███████   | 6274/8861 [00:00<00:00, 15789.75it/s][A
100%|██████████| 8861/8861 [00:00<00:00, 15732.89it/s][A
 33%|███▎      | 1/3 [00:03<00:07,  3.64s/it]

train size 88185



100%|██████████| 1108/1108 [00:00<00:00, 15469.00it/s]
 67%|██████▋   | 2/3 [00:04<00:01,  1.78s/it]

val size 11641



100%|██████████| 1108/1108 [00:00<00:00, 15327.22it/s]
100%|██████████| 3/3 [00:04<00:00,  1.52s/it]

test size 10507





In [7]:
# Fit a logistic regression on the training features, weighting each row by
# its 'scaling' value. The fit call is left as the last expression so the
# fitted estimator is echoed as the cell output.
train_split = datasets['train']
model = LogisticRegression(random_state=7)
model.fit(X=train_split['X'], y=train_split['Y'],
          sample_weight=train_split['scaling'])

LogisticRegression(random_state=7)

In [8]:
# Score every split with the fitted model and store the hard predictions,
# the positive-class probabilities, and whatever `evaluate_model` returns
# back on the split's dict.
for name, data_dict in datasets.items():
    print(name)
    features = data_dict['X']
    p_hat = model.predict_proba(features)[:, 1]  # probability of class 1
    predictions = model.predict(features)
    data_dict.update({
        'Y_hat': predictions,
        'p_hat': p_hat,
        'performance': evaluate_model(predictions, p_hat, data_dict['Y'],
                                      data_dict['scaling'], name),
    })

train
val
test


In [9]:
# Shape of the test-split feature matrix. The split is looked up explicitly
# rather than through the `data_dict` loop variable left over from an earlier
# cell, so this cell does not depend on hidden kernel state.
datasets['test']['X'].shape

(10507, 100)

In [9]:
# Collect the per-split performance records (in dataset order) into one table.
performance_metrics = [split['performance'] for split in datasets.values()]
performance_metrics_df = pd.DataFrame(performance_metrics)
performance_metrics_df

Unnamed: 0,accuracy,f1,precision,recall,roc-auc,avg_precision,name
0,0.652702,0.77989,0.658054,0.957092,0.618299,0.725287,train
1,0.649907,0.776325,0.652287,0.958613,0.601198,0.693242,val
2,0.64642,0.7773,0.652385,0.961378,0.614756,0.725418,test


## Feature Importances

In [10]:
# One row per vocabulary word: its embedding values in integer-named columns,
# plus the word itself in a 'label' column.
wv_vector_df = pd.DataFrame(wv_model.wv.vectors).assign(
    label=np.asarray(wv_model.wv.index_to_key)
)


### Most positive coefficient

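If we look at the feature with the highest coefficient in the model, we see that words
like 'election' and 'elections' are heavily down-weighted in the model. (Word2Vec
training with several worker threads is not deterministic, so the exact words listed
below can differ from run to run.)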

In [11]:
# Embedding dimension with the largest logistic-regression coefficient;
# list the 10 vocabulary words with the lowest values on that dimension.
top_coef_dim = model.coef_.argmax()
wv_vector_df[['label', top_coef_dim]].sort_values(top_coef_dim).head(10)

Unnamed: 0,label,24
2327,iso,-4.699834
149,g,-4.559182
193,international,-4.228397
121,c,-3.975081
2117,bwv,-3.869603
271,isbn,-3.742976
2269,directory,-3.672459
296,written,-3.65027
582,com,-3.513254
2285,java,-3.504974


In [12]:
# Same dimension (largest coefficient); list the 10 vocabulary words with the
# highest values on it, in ascending order.
top_coef_dim = model.coef_.argmax()
wv_vector_df[['label', top_coef_dim]].sort_values(top_coef_dim).tail(10)

Unnamed: 0,label,24
29,his,3.647378
1439,prevent,3.72391
39,were,3.834209
40,has,3.943205
377,took,4.049976
383,having,4.31465
588,our,4.416505
48,had,5.11627
38,have,6.274026
43,their,6.395884


### Most negative coefficient

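We see 'www' with a strong negative value on the feature that has the most negative
coefficient. A negative value times a negative coefficient pushes the prediction up,
suggesting the model is learning that links are typically helpful in notes. (The exact
words listed below can differ from run to run, because Word2Vec training with several
worker threads is not deterministic.)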

In [13]:
# Embedding dimension with the smallest (most negative) logistic-regression
# coefficient; list the 10 vocabulary words with the lowest values on it.
bottom_coef_dim = model.coef_.argmin()
wv_vector_df[['label', bottom_coef_dim]].sort_values(bottom_coef_dim).head(10)

Unnamed: 0,label,33
271,isbn,-5.073486
67,see,-4.305952
1126,ed,-3.878332
1052,derived,-3.841801
499,w,-3.755247
1767,testament,-3.612944
77,between,-3.454426
1654,dictionary,-3.435673
62,into,-3.369842
713,classical,-3.346953


In [14]:
# Same dimension (most negative coefficient); list the 10 vocabulary words
# with the highest values on it, in ascending order.
bottom_coef_dim = model.coef_.argmin()
wv_vector_df[['label', bottom_coef_dim]].sort_values(bottom_coef_dim).tail(10)

Unnamed: 0,label,33
448,foreign,3.782907
233,public,3.798503
1947,drugs,3.800926
392,islands,3.821885
1732,care,3.842179
888,enough,3.892266
929,difficult,3.950784
519,prime,3.980305
1807,risk,4.146846
53,been,4.360452
