In [151]:
from collections import defaultdict
from os import walk, path
from pickle import load
from random import Random, sample

import numpy as np
from gensim.models import KeyedVectors
from pandas import DataFrame, read_csv, concat
from scipy.spatial.distance import cosine
from scipy.stats import spearmanr

# Additional data

In [183]:
def check_occurrence(*words):
    """Tell whether every given word has a count inside the accepted band.

    Each word is looked up in the module-level ``counts`` mapping and compared
    with the module-level bounds ``counts_threshold_lower`` (inclusive) and
    ``counts_threshold_upper`` (exclusive).

    Returns:
        ``True`` when no word falls outside the band (including when no words
        are passed), ``False`` as soon as one word does.
    """
    return not any(
        counts[word] < counts_threshold_lower or counts[word] >= counts_threshold_upper
        for word in words
    )

In [164]:
# Word counts consulted by check_occurrence().
# NOTE(review): unpickling can execute arbitrary code — only load a counts.pickle you trust.
counts_path = path.join('..', 'embeddings-evaluation', 'word-analogy', 'counts.pickle')
with open(counts_path, 'rb') as f:
    counts = load(f)

# Load dataset

## 1. Word similarity

In [39]:
# Directory with the English–Russian cross-lingual word-similarity CSVs.
path_to_en_ru = path.join('..', 'word-benchmarks', 'intrinsic', 'word-similarity', 'cross-lingual', 'en-ru')

# (csv path, display label) pairs, in the order the result columns are reported.
# The first ten files live under path_to_en_ru; rg-65 and mc-30 are read
# directly from '../word-benchmarks'.
# NOTE(review): the number in each label presumably is the pair count kept
# after translation — confirm against the CSVs.
english_russian_word_similarity = [
    (path.join(path_to_en_ru, file_name), label)
    for file_name, label in (
        ('simverb-3500.csv', 'SimVerb-3074'),
        ('men.csv', 'MEN-1146'),
        ('rw.csv', 'RareWord-968'),
        ('simlex999.csv', 'SimLex-739'),
        ('mturk-771.csv', 'MTurk-551'),
        ('semeval-2017.csv', 'SemEval-243'),
        ('wordsim353-rel.csv', 'WordRel-193'),
        ('wordsim353-sim.csv', 'WordSim-193'),
        ('verb-143.csv', 'Verb-115'),
        ('yp-130.csv', 'YP-111'),
    )
] + [
    (path.join('..', 'word-benchmarks', file_name), label)
    for file_name, label in (
        ('rg-65.csv', 'RG-54'),
        ('mc-30.csv', 'MC-28'),
    )
]

## 2. Word analogy

In [None]:
# Scratch cell (never executed): directory of the English monolingual word-analogy benchmarks.
'../word-benchmarks/intrinsic/word-analogy/monolingual/en/'

In [205]:
# Column names used to read the analogy CSVs.
type_id = 'type'
word_1_id = 'word1'
word_2_id = 'word2'
word_3_id = 'word3'

# Per-category containers; each one maps a category name to a list of entries.
temp_analogies = defaultdict(list)
ru_en_analogies = defaultdict(list)
en_ru_analogies = defaultdict(list)

analogy_dir = path.join('..', 'word-benchmarks', 'intrinsic', 'word-analogy', 'monolingual')
russian = read_csv(path.join(analogy_dir, 'ru', 'google-analogies.csv'))
english = read_csv(path.join(analogy_dir, 'en', 'google-analogies.csv'))

# First five distinct values of the Russian `category` column
# (presumably the semantic categories, going by the variable name — TODO confirm).
semantic_cats = russian.category.unique()[:5]

for value in english.type.unique():
    if value not in semantic_cats:
        continue
    for ind, item in english.loc[english[type_id] == value].iterrows():
        # The Russian row is fetched by the English row's index label, so both
        # CSVs are assumed to be row-aligned — TODO confirm.
        russian_row = russian.loc[ind]
        russian_words = (russian_row[word_1_id].lower(),
                         russian_row[word_2_id].lower(),
                         russian_row[word_3_id].lower())
        english_words = (item[word_1_id].lower(),
                         item[word_2_id].lower(),
                         item[word_3_id].lower())
        # Entry layout: (word in the other language, (word1, word2, word3)).
        # NOTE(review): ru_en pairs the Russian triple with English word1, while
        # en_ru pairs the English triple with Russian word2 — confirm the
        # asymmetry is intended.
        ru_en_analogies[value].append((english_words[0], russian_words))
        en_ru_analogies[value].append((russian_words[1], english_words))

In [206]:
# Frequency band applied by check_occurrence(): lower bound inclusive, upper bound exclusive.
counts_threshold_lower = 500
counts_threshold_upper = 10000

# Start from an empty container so that re-running this cell does not append
# the same analogies a second time.
temp_analogies = defaultdict(list)

for key, value in ru_en_analogies.items():
    for item in value:
        # item is (word, (word1, word2, word3)); only the three words of the
        # inner tuple are frequency-checked.
        word1, word2, word3 = item[1]
        if check_occurrence(word1, word2, word3):
            temp_analogies[key].append(item)

In [211]:
# Number of analogies kept per category and the seed that makes the draw repeatable.
sample_amount = 150
sampling_seed = 42

# A dedicated seeded generator keeps the sampled benchmark identical across
# kernel restarts without touching the global random state.
sampler = Random(sampling_seed)

ru_en_analogies = defaultdict(list)

for key, value in temp_analogies.items():
    # Categories with too few frequency-filtered analogies are left out entirely.
    if len(value) < sample_amount:
        print('{}: dropped'.format(key))
        continue
    ru_en_analogies[key] = sampler.sample(value, sample_amount)

family: dropped


In [212]:
# Number of analogy entries currently held for each category.
length_dict = {category: len(entries) for category, entries in ru_en_analogies.items()}
length_dict

{'capital-common-countries': 150,
 'capital-world': 150,
 'currency': 150,
 'city-in-state': 150}

# Calculate intrinsic scores

In [28]:
def get_word_vector(word, model, vector_size=300):
    """Return the embedding of ``word``, averaging over space-separated tokens.

    Args:
        word: A single token or several tokens separated by single spaces.
        model: Object supporting ``model[token]`` lookups that return vectors.
            Its ``vector_size`` attribute is used when present.
        vector_size: Fallback dimensionality for models that do not expose a
            ``vector_size`` attribute.

    Returns:
        ``model[word]`` for a single token, otherwise the element-wise mean of
        the token vectors.

    Raises:
        Whatever ``model[token]`` raises for an unknown token (``KeyError`` for
        dict-like models); callers use that to skip out-of-vocabulary items.
    """
    subwords = word.split(' ')
    if len(subwords) == 1:
        return model[word]

    size = getattr(model, 'vector_size', vector_size)
    total = np.zeros(shape=size)
    for subword in subwords:
        total = np.add(total, model[subword])
    return total / len(subwords)

In [29]:
def calculate_cosines(dataset, src_embeddings, trg_embeddings, verbose=False):
    """Pair model cosine similarities with human ratings for a similarity dataset.

    Args:
        dataset: DataFrame with ``word1``, ``word2`` and ``similarity`` columns.
        src_embeddings: Model used to look up the lower-cased ``word1``.
        trg_embeddings: Model used to look up the lower-cased ``word2``.
        verbose: When true, print how many rows were dropped and how many remain.

    Returns:
        A tuple ``(vector_sims, human_sims)`` of equally long numpy arrays; rows
        whose lookups raise ``KeyError`` are skipped in both.
    """
    dataset_len = len(dataset)
    vector_sims = []
    human_sims = []
    for _, row in dataset.iterrows():
        # Resolve everything that may raise KeyError before touching either
        # list, so the two outputs can never get out of step.
        try:
            src_vector = get_word_vector(row['word1'].lower(), src_embeddings)
            trg_vector = get_word_vector(row['word2'].lower(), trg_embeddings)
            human_score = row['similarity']
        except KeyError:
            continue
        vector_sims.append(1 - cosine(src_vector, trg_vector))
        human_sims.append(human_score)
    if verbose:
        # An empty dataset has nothing to drop; avoid dividing by zero.
        dropped_percent = (dataset_len - len(human_sims)) / dataset_len * 100 if dataset_len else 0.0
        print('Percent of dropped = {:2.1f}%, amount of remanining words = {}'.format(dropped_percent, len(human_sims)))
    return np.array(vector_sims), np.array(human_sims)


In [168]:
# Number of analogy entries held for each category.
# NOTE(review): repeats the earlier length_dict cell; the recorded output below
# appears to come from a run before the per-category sampling — confirm and
# consider removing.
length_dict = {category: len(entries) for category, entries in ru_en_analogies.items()}
length_dict

{'capital-common-countries': 506,
 'capital-world': 4524,
 'currency': 866,
 'city-in-state': 2467,
 'family': 506}

In [149]:
# Number of analogies drawn per category and the seed that makes the draw repeatable.
sample_amount = 500
sampling_seed = 42

# A dedicated seeded generator keeps the draw identical across kernel restarts.
sampler = Random(sampling_seed)

new_analogies = defaultdict(list)

for key, value in ru_en_analogies.items():
    # Sampling more items than a category holds raises ValueError, so short
    # categories are reported and skipped instead.
    if len(value) < sample_amount:
        print('{}: dropped'.format(key))
        continue
    new_analogies[key] = sampler.sample(value, sample_amount)

# Main

In [138]:
# Cut-off whose scores are extracted from res_2.
k = 10
values = defaultdict(lambda: defaultdict(list))

# res_2 is nested as res_2[outer key][inner key][cut-off] -> score; keep only
# the entries whose cut-off equals k.
for key, per_category in res_2.items():
    for key_2, per_cutoff in per_category.items():
        for key_3, score in per_cutoff.items():
            if int(key_3) != k:
                continue
            values[key][key_2] = score

# One row per outer key of res_2, one column per inner key.
results_k = DataFrame(values).transpose()

In [139]:
# Rank the rows of results_k by their 'capital-common-countries' score, best first.
results_k.sort_values(by='capital-common-countries', ascending=False)

Unnamed: 0,capital-common-countries,capital-world,city-in-state,currency,family
bnc-araneum_upos_skipgram_300_2_2018,0.525692,0.124131,0.015734,0.09517,0.057143
wiki-araneum_upos_skipgram_300_2_2018,0.525692,0.124131,0.015734,0.09517,0.057143
googlenews-araneum_upos_skipgram_300_2_2018,0.525692,0.124131,0.015734,0.09517,0.057143
commoncrawl-araneum_upos_skipgram_300_2_2018,0.525692,0.124131,0.015734,0.09517,0.057143
gigaword-araneum_upos_skipgram_300_2_2018,0.525692,0.124131,0.015734,0.09517,0.057143
commoncrawl-web_upos_cbow_300_20_2017,0.347826,0.085743,0.006205,0.045906,0.028571
googlenews-web_upos_cbow_300_20_2017,0.347826,0.085743,0.006205,0.045906,0.028571
gigaword-web_upos_cbow_300_20_2017,0.347826,0.085743,0.006205,0.045906,0.028571
wiki-web_upos_cbow_300_20_2017,0.347826,0.085743,0.006205,0.045906,0.028571
bnc-web_upos_cbow_300_20_2017,0.347826,0.085743,0.006205,0.045906,0.028571


In [89]:
# results: one row per model pair, one column per word-similarity dataset (Spearman correlation).
# res_2:   res_2[model_name][analogy category][k] -> share of analogies whose expected
#          word is among the k nearest English neighbours.
results = DataFrame()
res_2 = defaultdict(lambda: defaultdict(lambda: {}))
p_at_k_list = [1, 5, 10]
cross_lang_root = '../cross-lang/'

for fpath, _, files in walk(cross_lang_root):
    # Evaluate a directory only when it holds BOTH embedding files. Loading them
    # together removes the dependence on directory-listing order and prevents a
    # Russian model left over from a previous directory from being paired with
    # the current English one.
    if 'vectors-ru.txt' not in files or 'vectors-en.txt' not in files:
        continue

    # The first directory level below the root names the model pair.
    model_name = path.relpath(fpath, cross_lang_root).split(path.sep)[0]
    vectors_ru = KeyedVectors.load_word2vec_format(path.join(fpath, 'vectors-ru.txt'))
    vectors_en = KeyedVectors.load_word2vec_format(path.join(fpath, 'vectors-en.txt'))

    # Word similarity: Spearman correlation between model cosines and human ratings.
    intrinsic_metrics = DataFrame(
        {model_name: [spearmanr(*calculate_cosines(read_csv(dataset),
                                                   vectors_en,
                                                   vectors_ru,
                                                   verbose=False))[0]
                      for dataset, _ in english_russian_word_similarity]},
        index=[name for _, name in english_russian_word_similarity]).transpose()
    results = concat([intrinsic_metrics, results], axis=0)
    print('{}: done\n'.format(model_name))

    # Word analogy: query the neighbours once with the largest cut-off and reuse
    # the ranked list for every smaller k.
    max_k = max(p_at_k_list)
    for key in ru_en_analogies.keys():
        hits = {p_at_k: 0 for p_at_k in p_at_k_list}
        amo = 0
        for pair in ru_en_analogies[key]:
            expected = pair[0]
            first, second, third = pair[1]
            try:
                vector = vectors_ru[first] - vectors_ru[second] + vectors_ru[third]
            except KeyError:
                # Analogies with an out-of-vocabulary Russian word are not counted.
                continue
            amo += 1
            neighbours = [q[0].lower() for q in vectors_en.similar_by_vector(vector, topn=max_k)]
            for p_at_k in p_at_k_list:
                if expected in neighbours[:p_at_k]:
                    hits[p_at_k] += 1
        for p_at_k in p_at_k_list:
            # No analogy could be evaluated -> the score is undefined, not a crash.
            res_2[model_name][key][p_at_k] = hits[p_at_k] / amo if amo else float('nan')

commoncrawl-araneum_upos_skipgram_300_2_2018: done

googlenews-news_upos_cbow_300_2_2017: done

gigaword-web_upos_cbow_300_20_2017: done

googlenews-ruscorpora_upos_skipgram_300_5_2018: done

gigaword-ruscorpora_upos_skipgram_300_5_2018: done

wiki-ruscorpora_upos_skipgram_300_5_2018: done

wiki-araneum_upos_skipgram_300_2_2018: done

googlenews-web_upos_cbow_300_20_2017: done

wiki-web_upos_cbow_300_20_2017: done

commoncrawl-taiga_upos_skipgram_300_2_2018: done

googlenews-taiga_upos_skipgram_300_2_2018: done

googlenews-ruwikiruscorpora-superbigrams_skipgram_300_2_2018: done

gigaword-araneum_upos_skipgram_300_2_2018: done

commoncrawl-news_upos_cbow_300_2_2017: done

gigaword-news_upos_cbow_300_2_2017: done

bnc-ruwikiruscorpora-superbigrams_skipgram_300_2_2018: done

bnc-news_upos_cbow_300_2_2017: done

bnc-taiga_upos_skipgram_300_2_2018: done

gigaword-taiga_upos_skipgram_300_2_2018: done

commoncrawl-ruwikiruscorpora-superbigrams_skipgram_300_2_2018: done

wiki-ruwikiruscorpora-

In [70]:
# Rank the rows of results by their 'SimLex-739' score, best first.
results.sort_values(by='SimLex-739', ascending=False)

Unnamed: 0,SimVerb-3074,MEN-1146,RareWord-968,SimLex-739,MTurk-551,SemEval-243,WordRel-193,WordSim-193,Verb-115,YP-111,RG-54,MC-28
commoncrawl-araneum_upos_skipgram_300_2_2018,0.141711,,0.442495,0.238718,0.541967,0.637047,0.530806,0.672836,0.341336,0.34453,0.441808,0.590257
wiki-araneum_upos_skipgram_300_2_2018,0.141711,,0.442495,0.238718,0.541967,0.637047,0.530806,0.672836,0.341336,0.34453,0.441808,0.590257
googlenews-araneum_upos_skipgram_300_2_2018,0.141711,,0.442495,0.238718,0.541967,0.637047,0.530806,0.672836,0.341336,0.34453,0.441808,0.590257
bnc-araneum_upos_skipgram_300_2_2018,0.141711,,0.442495,0.238718,0.541967,0.637047,0.530806,0.672836,0.341336,0.34453,0.441808,0.590257
gigaword-araneum_upos_skipgram_300_2_2018,0.141711,,0.442495,0.238718,0.541967,0.637047,0.530806,0.672836,0.341336,0.34453,0.441808,0.590257
bnc-taiga_upos_skipgram_300_2_2018,0.096937,,0.299269,0.217732,0.447814,0.48439,0.33325,0.527843,0.22958,0.221071,0.424951,0.532143
commoncrawl-taiga_upos_skipgram_300_2_2018,0.096937,,0.299269,0.217732,0.447814,0.48439,0.33325,0.527843,0.22958,0.221071,0.424951,0.532143
wiki-taiga_upos_skipgram_300_2_2018,0.096937,,0.299269,0.217732,0.447814,0.48439,0.33325,0.527843,0.22958,0.221071,0.424951,0.532143
googlenews-taiga_upos_skipgram_300_2_2018,0.096937,,0.299269,0.217732,0.447814,0.48439,0.33325,0.527843,0.22958,0.221071,0.424951,0.532143
gigaword-taiga_upos_skipgram_300_2_2018,0.096937,,0.299269,0.217732,0.447814,0.48439,0.33325,0.527843,0.22958,0.221071,0.424951,0.532143
