# Word2Vec baseline

Here we use averaged Word2Vec embeddings of each note's text as features for a logistic regression model that predicts note helpfulness.

**Credit:** This notebook was originally written by Michael Wang (@mwang14) and is merely
adapted here.

In [11]:
import pandas as pd
import gensim.downloader as api
from gensim.models.word2vec import Word2Vec
from collections import OrderedDict
from tqdm import tqdm
import nltk
import numpy as np
from sklearn.linear_model import LogisticRegression


from core_functions import evaluate_model

In [2]:
# Read the pre-split train / validation / test tables from disk.
train_data, val_data, test_data = map(
    pd.read_csv,
    (
        '../data/processed/training_data.csv',
        '../data/processed/val_data.csv',
        '../data/processed/test_data.csv',
    ),
)

### Word2Vec Featurization

We'll train a Word2Vec model on the `text8` corpus, which we download through gensim.
As a sanity check, we see that the most similar words make intuitive sense.

In [64]:
# Fetch the 'text8' dataset through gensim's downloader and build the
# Word2Vec model from it (three worker threads, all other settings default).
corpus = api.load('text8')
wv_model = Word2Vec(sentences=corpus, workers=3)

In [4]:
# Sanity check on the embeddings: the ten nearest neighbours of 'tree'.
sims = wv_model.wv.most_similar(positive='tree', topn=10)
print(sims)

[('trees', 0.7089941501617432), ('leaf', 0.6803917288780212), ('bark', 0.6572981476783752), ('cactus', 0.621181845664978), ('avl', 0.6153997182846069), ('flower', 0.6134554743766785), ('bird', 0.6078888773918152), ('fruit', 0.599010705947876), ('nest', 0.5607653260231018), ('whale', 0.5602169632911682)]


In [7]:
# For every split keep two views of the same table:
#   'notes'         - the first 22 columns, de-duplicated and re-indexed
#                     (presumably the note-level fields; verify against the schema)
#   'notes_ratings' - the full table as loaded
datasets = OrderedDict()
for split_name, ratings_df in (('train', train_data),
                               ('val', val_data),
                               ('test', test_data)):
    unique_notes = ratings_df.iloc[:, :22].drop_duplicates().reset_index(drop=True)
    datasets[split_name] = {'notes': unique_notes,
                            'notes_ratings': ratings_df}

In [25]:
def tokenize_text(text):
    """Split ``text`` into NLTK word tokens and lowercase each of them.

    Args:
        text: The string to tokenize.

    Returns:
        A list of lowercased token strings, in their original order.
    """
    return [token.lower() for token in nltk.word_tokenize(text)]

def average_word_vector(tokens, keyed_vectors):
    """Average the embeddings of the tokens found in the vocabulary.

    Args:
        tokens: Iterable of token strings.
        keyed_vectors: Word-vector lookup supporting ``in``, ``[]`` and the
            ``vector_size`` attribute (here ``wv_model.wv``).

    Returns:
        A 1-D array of length ``keyed_vectors.vector_size``. It is all zeros
        when none of the tokens is in the vocabulary.
    """
    known_vectors = [keyed_vectors[t] for t in tokens if t in keyed_vectors]
    if known_vectors:
        return np.mean(known_vectors, axis=0)
    # Size the fallback from the model rather than hard-coding 100, so the
    # feature matrix stays rectangular if the embedding size is ever changed.
    return np.zeros(keyed_vectors.vector_size)


# Build, for every split, one averaged-embedding feature row per rating.
for name, data_dict in tqdm(datasets.items(), total=len(datasets)):
    notes_df = data_dict['notes']
    notes_ratings_df = data_dict['notes_ratings']

    # One averaged embedding per unique note.
    tokenized_notes = [tokenize_text(text) for text in notes_df['summary']]
    note_arrays = [average_word_vector(tokens, wv_model.wv)
                   for tokens in tqdm(tokenized_notes)]

    # Attach noteId by position; this relies on notes_df carrying a fresh
    # 0..n-1 index so the assignment aligns with the rows of X_df.
    X_df = pd.DataFrame(np.array(note_arrays))
    X_df['noteId'] = notes_df['noteId']

    # Expand note-level features to one row per rating via an inner join.
    merged_X_df = X_df.merge(notes_ratings_df[['noteId', 'output', 'scaling']],
                             how='inner', on='noteId')

    data_dict['X'] = merged_X_df.drop(['noteId', 'output', 'scaling'], axis=1)
    data_dict['Y'] = merged_X_df['output']
    data_dict['scaling'] = merged_X_df['scaling']
    data_dict['noteId'] = merged_X_df['noteId']
    print(name, 'size', merged_X_df.shape[0])


  0%|          | 0/3 [00:00<?, ?it/s]
  0%|          | 0/8861 [00:00<?, ?it/s][A
 13%|█▎        | 1172/8861 [00:00<00:00, 11713.20it/s][A
 31%|███       | 2704/8861 [00:00<00:00, 13831.11it/s][A
 49%|████▊     | 4309/8861 [00:00<00:00, 14842.69it/s][A
 66%|██████▌   | 5855/8861 [00:00<00:00, 15085.63it/s][A
 83%|████████▎ | 7364/8861 [00:00<00:00, 14927.08it/s][A
100%|██████████| 8861/8861 [00:00<00:00, 14551.83it/s][A
 33%|███▎      | 1/3 [00:04<00:08,  4.21s/it]

train size 88185



100%|██████████| 1108/1108 [00:00<00:00, 15096.00it/s]
 67%|██████▋   | 2/3 [00:04<00:02,  2.02s/it]

val size 11641



100%|██████████| 1108/1108 [00:00<00:00, 13598.11it/s]
100%|██████████| 3/3 [00:05<00:00,  1.73s/it]

test size 10507





In [31]:
# Fit the baseline classifier on the training split, weighting each row by
# its 'scaling' value.
train_split = datasets['train']
model = LogisticRegression(random_state=7)
model.fit(
    train_split['X'],
    train_split['Y'],
    sample_weight=train_split['scaling'],
)

LogisticRegression(random_state=7)

In [58]:
# Score every split with the fitted model and store the predictions and
# metrics back on the split's dict.
for name, data_dict in datasets.items():
    print(name)
    features = data_dict['X']
    hard_labels = model.predict(features)
    # Column 1 of predict_proba is the probability of the second entry in
    # model.classes_.
    positive_proba = model.predict_proba(features)[:, 1]
    metrics = evaluate_model(hard_labels, positive_proba, data_dict['Y'],
                             data_dict['scaling'], name)
    data_dict.update(Y_hat=hard_labels, p_hat=positive_proba,
                     performance=metrics)

train
accuracy
f1
precision
recall
roc-auc
avg_precision
val
accuracy
f1
precision
recall
roc-auc
avg_precision
test
accuracy
f1
precision
recall
roc-auc
avg_precision


In [60]:
# Gather the per-split metric dicts into a single table, one row per split.
performance_metrics = [split['performance'] for split in datasets.values()]
performance_metrics_df = pd.DataFrame(performance_metrics)
performance_metrics_df

Unnamed: 0,accuracy,f1,precision,recall,roc-auc,avg_precision,name
0,0.651488,0.779139,0.657384,0.956246,0.618056,0.724974,train
1,0.646021,0.774362,0.64962,0.958395,0.600257,0.695015,val
2,0.647839,0.778194,0.653135,0.962484,0.613216,0.726344,test


## Feature Importances

In [72]:
# One row per vocabulary word: its embedding components in integer-named
# columns, plus the word itself in 'label'.
wv_vector_df = pd.DataFrame(wv_model.wv.vectors).assign(
    label=np.asarray(wv_model.wv.index_to_key)
)


### Most positive coefficient

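If we look at the embedding dimension with the highest (most positive) coefficient in the
model, we see that words like 'election' and 'elections' have strongly negative values
along it, so they are heavily down-weighted by the model.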

In [74]:
# Embedding dimension with the largest coefficient; list the ten words with
# the lowest values along it.
top_dim = model.coef_.argmax()
wv_vector_df.sort_values(by=top_dim).head(10)[['label', top_dim]]

Unnamed: 0,label,74
743,election,-6.89918
920,elections,-6.662499
396,days,-6.032875
445,rights,-5.748539
812,months,-5.572523
1317,hours,-5.53118
80,years,-5.365267
1996,weeks,-5.199754
609,season,-5.141959
48,had,-5.022526


In [75]:
# Embedding dimension with the largest coefficient; list the ten words with
# the highest values along it.
top_dim = model.coef_.argmax()
wv_vector_df.sort_values(by=top_dim).tail(10)[['label', top_dim]]

Unnamed: 0,label,74
160,based,3.241172
2787,cuisine,3.304648
2696,verb,3.336894
1122,composer,3.357516
967,variety,3.364342
2460,painter,3.368794
837,dutch,3.412879
1669,hydrogen,3.803331
1279,acid,4.240408
541,italian,4.411085


### Most negative coefficient

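For the embedding dimension with the most negative coefficient, we see 'www' (and 'links')
with strongly negative values. A negative value multiplied by a negative coefficient raises
the predicted score, suggesting the model is learning that links are typically helpful in notes.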

In [77]:
# Embedding dimension with the smallest (most negative) coefficient; list the
# ten words with the lowest values along it.
bottom_dim = model.coef_.argmin()
wv_vector_df.sort_values(by=bottom_dim).head(10)[['label', bottom_dim]]

Unnamed: 0,label,19
321,km,-4.762671
455,minister,-4.423274
303,air,-4.380177
158,links,-4.10375
555,est,-4.064242
1153,www,-3.929491
2112,marine,-3.911918
1637,zealand,-3.803357
1487,iv,-3.780884
392,islands,-3.748726


In [78]:
# Embedding dimension with the smallest (most negative) coefficient; list the
# ten words with the highest values along it.
bottom_dim = model.coef_.argmin()
wv_vector_df.sort_values(by=bottom_dim).tail(10)[['label', bottom_dim]]

Unnamed: 0,label,19
2419,audience,3.201761
168,she,3.2084
45,first,3.241337
276,house,3.298155
245,old,3.473671
57,new,3.582438
639,african,3.71384
35,he,3.745849
56,who,4.027489
27,from,4.20245
