In [5]:
import json
import os

# Directory that holds the partial result files, and the naming pattern
# (prefix + suffix) a file must match to be merged.
result_path = 'results/'
name_start = 'word2vec_results_'
name_end = '.json'

# Merged mapping: every top-level key found in the partial files -> its value.
final_results = dict()

print('Loading results from:')

# os.listdir() yields entries in arbitrary, filesystem-dependent order.
# Sorting makes the merge reproducible: when the same key occurs in several
# files, the file that sorts last always wins.
for file in sorted(os.listdir(result_path)):
    if file.startswith(name_start) and file.endswith(name_end):
        print(f' - {file}')
        with open(os.path.join(result_path, file)) as f:
            data = json.load(f)
        # Later files overwrite earlier ones on duplicate keys.
        final_results.update(data)

print('')
print(f"Number of assesed word2vec models: {len(final_results)}")

# The combined file name does not start with `name_start`, so it is not
# picked up as an input when this cell is run again.
with open(os.path.join(result_path, 'final_word2vec_results.json'), 'w') as fp:
    json.dump(final_results, fp)



Loading results from:
 - word2vec_results_not_100_250_500.json
 - word2vec_results_others.json
 - word2vec_results_100_250_500.json
 - word2vec_results_top25.json

Number of assesed word2vec models: 96


In [8]:
import copy as cp
import numpy as np
# Work on an independent deep copy, so changes made to `word2vec_results`
# (including its nested values) do not affect `final_results`.
word2vec_results = cp.deepcopy(final_results)

In [9]:
# Mean AUC per word2vec model, averaged over every entry stored for that model
# (presumably one entry per downstream predictive model -- confirm against the
# code that produced the result files).
word2vec_mean_auc = {}

for w2v_name, per_entry in word2vec_results.items():
    # Collect the 'auc' value of each entry recorded for this word2vec model.
    auc_values = [per_entry[entry_name]['auc'] for entry_name in per_entry]
    word2vec_mean_auc[w2v_name] = np.mean(auc_values)

In [10]:
# Rebuild the dict with the models ordered from highest to lowest mean AUC
# (dicts preserve insertion order), then display it.
word2vec_mean_auc = {
    name: word2vec_mean_auc[name]
    for name in sorted(word2vec_mean_auc, key=word2vec_mean_auc.get, reverse=True)
}
word2vec_mean_auc

{'word2vec_vs100_win7_sg0': 0.7890426879799324,
 'word2vec_vs100_win5_sg0': 0.787785489909264,
 'word2vec_vs100_win6_sg0': 0.7871360008180823,
 'word2vec_vs100_win4_sg0': 0.7870277434985007,
 'word2vec_vs100_win8_sg0': 0.7862109972027802,
 'word2vec_vs3000_win8_sg0': 0.7850329406013923,
 'word2vec_vs100_win6_sg1': 0.7849038763894657,
 'word2vec_vs2500_win8_sg0': 0.7848708740314725,
 'word2vec_vs3000_win6_sg0': 0.7848609972874838,
 'word2vec_vs2500_win7_sg0': 0.7842587764831479,
 'word2vec_vs1500_win8_sg0': 0.7841778288257945,
 'word2vec_vs2000_win7_sg0': 0.7839632595965677,
 'word2vec_vs2500_win5_sg0': 0.7837082987781279,
 'word2vec_vs100_win4_sg1': 0.7836175563082366,
 'word2vec_vs2500_win6_sg0': 0.7835401970020174,
 'word2vec_vs2000_win8_sg0': 0.7835381290029174,
 'word2vec_vs3000_win5_sg0': 0.7835348800820793,
 'word2vec_vs2500_win4_sg0': 0.7835087963114831,
 'word2vec_vs2000_win6_sg0': 0.7834710659543558,
 'word2vec_vs100_win8_sg1': 0.7834619912761486,
 'word2vec_vs250_win8_sg0': 0

In [11]:
# Mean F1 per word2vec model, averaged over every entry stored for that model
# (presumably one entry per downstream predictive model -- confirm against the
# code that produced the result files).
word2vec_mean_f1 = {}

for w2v_name, per_entry in word2vec_results.items():
    # Collect the 'f1' value of each entry recorded for this word2vec model.
    f1_values = [per_entry[entry_name]['f1'] for entry_name in per_entry]
    word2vec_mean_f1[w2v_name] = np.mean(f1_values)

In [12]:
# Rebuild the dict with the models ordered from highest to lowest mean F1
# (dicts preserve insertion order), then display it.
word2vec_mean_f1 = {
    name: word2vec_mean_f1[name]
    for name in sorted(word2vec_mean_f1, key=word2vec_mean_f1.get, reverse=True)
}
word2vec_mean_f1

{'word2vec_vs100_win7_sg0': 0.7449460781542674,
 'word2vec_vs100_win5_sg0': 0.7433047138088177,
 'word2vec_vs100_win6_sg0': 0.742502174744267,
 'word2vec_vs100_win4_sg0': 0.7421776169800208,
 'word2vec_vs100_win8_sg0': 0.7411699781679632,
 'word2vec_vs100_win6_sg1': 0.7405598309689426,
 'word2vec_vs3000_win8_sg0': 0.740410508190622,
 'word2vec_vs3000_win6_sg0': 0.7402388439925077,
 'word2vec_vs2500_win8_sg0': 0.7402270885874529,
 'word2vec_vs1500_win8_sg0': 0.7393972017889998,
 'word2vec_vs2500_win7_sg0': 0.7392757976976251,
 'word2vec_vs2000_win7_sg0': 0.7391985465582737,
 'word2vec_vs2500_win5_sg0': 0.7386388719385656,
 'word2vec_vs100_win8_sg1': 0.7386360385086619,
 'word2vec_vs3000_win5_sg0': 0.7385523150708846,
 'word2vec_vs100_win4_sg1': 0.7385015759501656,
 'word2vec_vs2500_win6_sg0': 0.7384222805767682,
 'word2vec_vs2000_win8_sg0': 0.7384015725940302,
 'word2vec_vs2500_win4_sg0': 0.7383516710721029,
 'word2vec_vs2000_win6_sg0': 0.7382239256907638,
 'word2vec_vs250_win8_sg0': 0.

In [13]:
# Take the first ten entries of each score table. This relies on the tables
# having been sorted best-first by the earlier cells.
top_10_auc = {name: word2vec_mean_auc[name] for name in list(word2vec_mean_auc)[:10]}
top_10_f1 = {name: word2vec_mean_f1[name] for name in list(word2vec_mean_f1)[:10]}

# Models that appear in both top-ten lists.
top_10_auc_f1 = set(top_10_auc) & set(top_10_f1)
top_10_auc_f1

{'word2vec_vs100_win4_sg0',
 'word2vec_vs100_win5_sg0',
 'word2vec_vs100_win6_sg0',
 'word2vec_vs100_win6_sg1',
 'word2vec_vs100_win7_sg0',
 'word2vec_vs100_win8_sg0',
 'word2vec_vs2500_win8_sg0',
 'word2vec_vs3000_win6_sg0',
 'word2vec_vs3000_win8_sg0'}

In [14]:
# Display the ten models with the highest mean AUC and their scores.
top_10_auc

{'word2vec_vs100_win7_sg0': 0.7890426879799324,
 'word2vec_vs100_win5_sg0': 0.787785489909264,
 'word2vec_vs100_win6_sg0': 0.7871360008180823,
 'word2vec_vs100_win4_sg0': 0.7870277434985007,
 'word2vec_vs100_win8_sg0': 0.7862109972027802,
 'word2vec_vs3000_win8_sg0': 0.7850329406013923,
 'word2vec_vs100_win6_sg1': 0.7849038763894657,
 'word2vec_vs2500_win8_sg0': 0.7848708740314725,
 'word2vec_vs3000_win6_sg0': 0.7848609972874838,
 'word2vec_vs2500_win7_sg0': 0.7842587764831479}

In [15]:
# Display the ten models with the highest mean F1 and their scores.
top_10_f1

{'word2vec_vs100_win7_sg0': 0.7449460781542674,
 'word2vec_vs100_win5_sg0': 0.7433047138088177,
 'word2vec_vs100_win6_sg0': 0.742502174744267,
 'word2vec_vs100_win4_sg0': 0.7421776169800208,
 'word2vec_vs100_win8_sg0': 0.7411699781679632,
 'word2vec_vs100_win6_sg1': 0.7405598309689426,
 'word2vec_vs3000_win8_sg0': 0.740410508190622,
 'word2vec_vs3000_win6_sg0': 0.7402388439925077,
 'word2vec_vs2500_win8_sg0': 0.7402270885874529,
 'word2vec_vs1500_win8_sg0': 0.7393972017889998}

## WINNER - `word2vec_vs100_win7_sg0`
Since it achieved both the highest mean AUC and the highest mean F1 score, averaged across the different predictive models, we will use this model to generate the word embeddings for the rest of the project.