In [2]:
import json
import os

# Directory holding the per-run result files, and the naming pattern of those files.
result_path = 'results/'
name_start = 'word2vec_results_'
name_end = '.json'

# The merged results are cached in this file so the partial files are only
# combined on the first run.
final_results_file = os.path.join(result_path, 'final_word2vec_results.json')

# isfile() is already False when the directory itself is missing, so a single
# check covers both "no directory" and "no cached file".
if not os.path.isfile(final_results_file):
    # exist_ok=True tolerates an already existing directory but still surfaces
    # real failures (e.g. missing permissions) instead of silently hiding them.
    os.makedirs(result_path, exist_ok=True)

    final_results = dict()

    print('Loading results from:')

    # os.listdir() returns entries in arbitrary order and later files overwrite
    # earlier ones on duplicate keys; sorting keeps the merge reproducible.
    for file in sorted(os.listdir(result_path)):
        if file.startswith(name_start) and file.endswith(name_end):
            print(f' - {file}')
            with open(os.path.join(result_path, file)) as f:
                data = json.load(f)
            final_results.update(data)

    print('')
    print(f"Number of assesed word2vec models: {len(final_results)}")

    # Persist the merged mapping so the next run takes the cached branch below.
    with open(final_results_file, 'w') as fp:
        json.dump(final_results, fp)
else:
    print('Final results already exist, skipping...')
    with open(final_results_file) as f:
        final_results = json.load(f)
    print('Imported final results from: ' + final_results_file)
    print(f"Number of assesed word2vec models: {len(final_results)}")




Loading results from:
 - word2vec_results_500_1000.json
 - word2vec_results_10_20_50.json
 - word2vec_results_100_250.json

Number of assesed word2vec models: 48


In [3]:
import copy as cp
import numpy as np
# Deep copy: changes made to word2vec_results do not affect final_results.
word2vec_results = cp.deepcopy(final_results)

In [4]:
# Mean AUC per word2vec model, averaged over all entries stored for that model
# (presumably one entry per downstream evaluation -- confirm against the
# code that wrote the result files).
word2vec_mean_auc = {}

for model, results in word2vec_results.items():
    # collect the 'auc' value of every entry belonging to this model
    aucs = [entry['auc'] for entry in results.values()]
    # average them and store the score under the model's name
    mean_auc = np.mean(aucs)
    word2vec_mean_auc[model] = mean_auc

In [None]:
# Rebuild the mapping with the highest mean AUC first; dicts keep insertion
# order, so iterating it afterwards yields the ranking.
word2vec_mean_auc = {
    name: word2vec_mean_auc[name]
    for name in sorted(word2vec_mean_auc, key=word2vec_mean_auc.get, reverse=True)
}
word2vec_mean_auc

In [6]:
# Mean F1 per word2vec model, averaged over all entries stored for that model
# (presumably one entry per downstream evaluation -- confirm against the
# code that wrote the result files).
word2vec_mean_f1 = {}

for model, results in word2vec_results.items():
    # collect the 'f1' value of every entry belonging to this model
    f1s = [entry['f1'] for entry in results.values()]
    # average them and store the score under the model's name
    mean_f1 = np.mean(f1s)
    word2vec_mean_f1[model] = mean_f1

In [None]:
# Rebuild the mapping with the highest mean F1 first; dicts keep insertion
# order, so iterating it afterwards yields the ranking.
word2vec_mean_f1 = {
    name: word2vec_mean_f1[name]
    for name in sorted(word2vec_mean_f1, key=word2vec_mean_f1.get, reverse=True)
}
word2vec_mean_f1

In [None]:
# Keep the first 10 entries of each mapping. This is only a "top 10" if the
# mappings were sorted in descending order beforehand.
top_10_auc = {name: word2vec_mean_auc[name] for name in list(word2vec_mean_auc)[:10]}
top_10_f1 = {name: word2vec_mean_f1[name] for name in list(word2vec_mean_f1)[:10]}

# Models that appear in both top-10 selections.
top_10_auc_f1 = set(top_10_auc) & set(top_10_f1)
top_10_auc_f1

In [9]:
# Display the first-10 selection by mean AUC (model name -> mean AUC).
top_10_auc

{'word2vec_vs100_win6_sg0': 0.8435953104646113,
 'word2vec_vs100_win3_sg0': 0.8430424084527427,
 'word2vec_vs50_win8_sg1': 0.8423389781444494,
 'word2vec_vs50_win6_sg1': 0.8402431610942249,
 'word2vec_vs50_win3_sg1': 0.8389839339991315,
 'word2vec_vs100_win6_sg1': 0.8381010276451006,
 'word2vec_vs100_win7_sg1': 0.8379128672745694,
 'word2vec_vs50_win5_sg1': 0.8360631060935013,
 'word2vec_vs50_win7_sg1': 0.8351165146909828,
 'word2vec_vs50_win4_sg1': 0.8345230858300768}

In [10]:
# Display the first-10 selection by mean F1 (model name -> mean F1).
top_10_f1

{'word2vec_vs100_win3_sg0': 0.8359361867078722,
 'word2vec_vs50_win8_sg1': 0.8351750041465993,
 'word2vec_vs50_win6_sg1': 0.8347857845171834,
 'word2vec_vs100_win6_sg0': 0.8345403045073183,
 'word2vec_vs50_win3_sg1': 0.8311635822280156,
 'word2vec_vs100_win6_sg1': 0.8303567277924101,
 'word2vec_vs100_win7_sg1': 0.830227169765107,
 'word2vec_vs50_win7_sg1': 0.830078893156708,
 'word2vec_vs50_win4_sg1': 0.8291217076372399,
 'word2vec_vs50_win5_sg1': 0.8266371236675608}