In [1]:
from pandas import read_csv, Series
from gensim.models import Word2Vec, KeyedVectors
import numpy as np
from pickle import load
from glove import Glove
import adagram
from gensim.models.wrappers import FastText, Wordrank
from embed_utils import Word2VecF, Swivel, cosine_sim, get_adagram_sense_prob, wv
from utils.string_utils import morph_parse, make_tokens
from scipy.spatial.distance import cosine
from os import path
from scipy.stats import spearmanr

In [2]:
def get_vector_distance(word1, word2, model, num_features):
    if model == 'word2vec':
        return cosine(word2vec[word1], word2vec[word2])
    elif model == 'wang2vec':
        return cosine(wang2vec[word1], wang2vec[word2])
    elif model == 'glove':
        return cosine(wv(glove, word1), wv(glove, word2))
    elif model == 'word2vecf':
        return cosine(w2vf.word2vec(word1), w2vf.word2vec(word2))
    elif model == 'adagram':
        return cosine(ada_model.sense_vector(word1, get_adagram_sense_prob(ada_model, word1)),
                      ada_model.sense_vector(word2, get_adagram_sense_prob(ada_model, word2)))
    elif model == 'fasttext':
        return cosine(ft[word1], ft[word2])
    elif model == 'swivel':
        return cosine(np.array(swivel.lookup(word1)).squeeze(), np.array(swivel.lookup(word2)).squeeze())

In [3]:
def load_sim_dataset(name):
    df = read_csv(path.join('sim_datasets', '{}.csv'.format(name))).dropna()
    old_len = len(df)
    df.word1 = df.word1.apply(morph_parse)
    df.word2 = df.word2.apply(morph_parse)
    for i, m in df.iterrows():
        if not m['word1'] in w2vf._vocab or not m['word2'] in w2vf._vocab:
            df.drop(i, inplace=True)
    print('Percent of dropped = {}%'.format(len(df)/old_len*100))
    return df.reset_index(drop=True)

Загрузка Word2Vec-модели

In [4]:
word2vec = Word2Vec.load(path.join('models','word2vec','all_lem_100'))

Загрузка Glove-модели

In [5]:
with open(path.join('models', 'glove' , 'all_lem_100'), 'rb') as fp:
    glove = load(fp)

Загрузка Wang2Vec-модели

In [6]:
wang2vec = KeyedVectors.load_word2vec_format(path.join('models', 'wang2vec', 'wang_skipngram'), binary=True)

Загрузка Word2Vec-f-модели

In [7]:
w2vf = Word2VecF.load(path.join('models', 'word2vecf', 'vecs.npy'), path.join('models', 'word2vecf', 'vecs.vocab'))

Загрузка AdaGram-модели

In [8]:
ada_model = adagram.VectorModel.load(path.join('models', 'adagram', 'out.pkl'))

Загрузка Swivel-модели

In [9]:
swivel = Swivel(path.join('models', 'swivel', '2chswivel.txt'), path.join('models', 'swivel', '2chswivel.bin'))

Загрузка FastText-модели

In [10]:
ft = FastText.load_word2vec_format(path.join('models', 'fasttext', '2ch_model_cbow.vec'))

Получение датасетов

In [11]:
def make_sims_dataset(model, df):
    dim = 100
    if model == 'bow':
        dim = 1
    sims = np.zeros(shape=len(df), dtype='float32')
    for i, m in df.iterrows():
        sims[i] = get_vector_distance(m['word1'], m['word2'], model, dim)
    return sims

In [12]:
similarities = {'hj' : dict(), 'rt-test' : dict(), 'ae2-test' : dict()}

for name in {'hj', 
             'rt-test', 
             'ae2-test'
            }:
    for i in {'word2vec', 'glove', 'wang2vec', 'adagram', 'word2vecf', 'fasttext', 'swivel'}:
        dataset = load_sim_dataset(name)
        similarities[name][i] = spearmanr(dataset.sim.values, make_sims_dataset(i, dataset))

Percent of dropped = 16.289140572951368%
Percent of dropped = 16.289140572951368%
Percent of dropped = 16.289140572951368%
Percent of dropped = 16.289140572951368%
Percent of dropped = 16.289140572951368%
Percent of dropped = 16.289140572951368%
Percent of dropped = 16.289140572951368%
Percent of dropped = 26.884422110552762%
Percent of dropped = 26.884422110552762%
Percent of dropped = 26.884422110552762%
Percent of dropped = 26.884422110552762%
Percent of dropped = 26.884422110552762%
Percent of dropped = 26.884422110552762%
Percent of dropped = 26.884422110552762%
Percent of dropped = 4.7444490992878094%
Percent of dropped = 4.7444490992878094%
Percent of dropped = 4.7444490992878094%
Percent of dropped = 4.7444490992878094%
Percent of dropped = 4.7444490992878094%
Percent of dropped = 4.7444490992878094%
Percent of dropped = 4.7444490992878094%


In [13]:
similarities

{'ae2-test': {'adagram': SpearmanrResult(correlation=-0.17863865480002925, pvalue=7.1216756701817543e-05),
  'fasttext': SpearmanrResult(correlation=-0.69825387833519659, pvalue=1.0265987087050723e-72),
  'glove': SpearmanrResult(correlation=-0.59817897281091748, pvalue=8.7959421062475066e-49),
  'swivel': SpearmanrResult(correlation=-0.71758189964645935, pvalue=1.5702874368056979e-78),
  'wang2vec': SpearmanrResult(correlation=-0.46212600352049282, pvalue=3.0513603247699464e-27),
  'word2vec': SpearmanrResult(correlation=-0.6918791243183795, pvalue=6.737307836130957e-71),
  'word2vecf': SpearmanrResult(correlation=-0.14990859902560708, pvalue=0.0008832159285140132)},
 'hj': {'adagram': SpearmanrResult(correlation=-0.12932290276615055, pvalue=0.18431139248593839),
  'fasttext': SpearmanrResult(correlation=-0.50189579399408679, pvalue=3.6414231125651821e-08),
  'glove': SpearmanrResult(correlation=-0.41504495583029738, pvalue=8.7875622872992301e-06),
  'swivel': SpearmanrResult(correlat