In [16]:
from pandas import read_csv, Series
from gensim.models import Word2Vec, KeyedVectors
import numpy as np
from pickle import load
from glove import Glove
import adagram
from gensim.models.wrappers import FastText, Wordrank
from embed_utils import Word2VecF, Swivel, cosine_sim, get_adagram_sense_prob, wv
from utils.string_utils import morph_parse, make_tokens
from scipy.spatial.distance import cosine
from os import path
from scipy.stats import spearmanr

In [45]:
def get_vector_distance(word1, word2, model, num_features):
    """Return the cosine distance between the embeddings of two words.

    The vectors are looked up in one of the module-level models loaded
    elsewhere in this notebook and compared with
    ``scipy.spatial.distance.cosine``.

    Parameters
    ----------
    word1, word2 : str
        The words to compare.
    model : str
        Which embedding to use. One of ``'word2vec'``, ``'wang2vec'``,
        ``'glove'``, ``'word2vecf'``, ``'adagram'``, ``'fasttext'``,
        ``'bow'`` or ``'swivel'``.
    num_features : int
        Unused; kept so existing callers do not have to change.

    Returns
    -------
    float
        The cosine distance, or ``0`` when the lookup or the distance
        computation raises ``KeyError``, ``TypeError``, ``ValueError`` or
        ``AttributeError``. Note that ``0`` is also the distance between
        identical vectors, so a failed lookup cannot be told apart from a
        perfect match in the result.

    Raises
    ------
    ValueError
        If ``model`` is not one of the supported names. (Previously the
        function silently returned ``None`` in that case.)
    """
    # Each entry maps a model name to a "word -> vector" callable. The lambdas
    # resolve the module-level model objects lazily, so only the model that is
    # actually requested has to be loaded.
    lookups = {
        'word2vec': lambda word: word2vec[word],
        'wang2vec': lambda word: wang2vec[word],
        'glove': lambda word: wv(glove, word),
        'word2vecf': lambda word: w2vf.word2vec(word),
        'adagram': lambda word: ada_model.sense_vector(
            word, get_adagram_sense_prob(ada_model, word)),
        'fasttext': lambda word: ft[word],
        'bow': lambda word: bow[word],
        'swivel': lambda word: swivel.lookup(word),
    }

    # Validate outside the try block so this error is not swallowed by the
    # best-effort handler below.
    if model not in lookups:
        raise ValueError('unknown model: {!r}'.format(model))
    lookup = lookups[model]

    try:
        return cosine(lookup(word1), lookup(word2))
    except (KeyError, TypeError, ValueError, AttributeError):
        # Best-effort fallback for words a model cannot represent.
        return 0

In [43]:
def load_sim_dataset(name):
    """Load a word-similarity dataset from ``sim_datasets/<name>.csv``.

    Rows containing missing values are dropped and the index is then reset
    to ``0..n-1``, so that row labels coincide with row positions (the
    labels left behind by ``dropna`` would otherwise have gaps). The
    ``word1`` and ``word2`` columns are passed through ``morph_parse``.

    Parameters
    ----------
    name : str
        Dataset file name without the ``.csv`` extension.

    Returns
    -------
    pandas.DataFrame
        The cleaned dataset with a contiguous integer index.
    """
    csv_path = path.join('sim_datasets', '{}.csv'.format(name))
    pairs = read_csv(csv_path).dropna().reset_index(drop=True)
    pairs.word1 = pairs.word1.apply(morph_parse)
    pairs.word2 = pairs.word2.apply(morph_parse)
    return pairs

Загрузка Word2Vec-модели

In [8]:
# Load the saved Word2Vec model from disk.
word2vec = Word2Vec.load('models/word2vec/all_lem_100')
# Keep a reference to the vocabulary exposed by the model's keyed vectors.
word2vec_vocab = word2vec.wv.vocab

Загрузка Glove-модели

In [9]:
# Unpickle the GloVe model. NOTE(review): unpickling executes code embedded in
# the file, so only load model files that come from a trusted source.
with open('models/glove/all_lem_100', 'rb') as fp:
    glove = load(fp)
# Keep a reference to the model's `dictionary` attribute
# (presumably a word -> index mapping; verify against the glove package).
glove_vocab = glove.dictionary

Загрузка Wang2Vec-модели

In [10]:
# Load the Wang2Vec vectors, stored in the binary word2vec format.
wang2vec = KeyedVectors.load_word2vec_format('models/wang2vec/all_lem_100_cwindow', binary=True)
# Keep a reference to the vocabulary of the loaded vectors.
wang2vec_vocab = wang2vec.vocab

Загрузка Word2Vec-f-модели

In [11]:
# NOTE(review): `path` is already imported in the notebook's import cell;
# this repeated import only matters when cells are run out of order.
from os import path

# Load the Word2Vec-f model from its vector (.npy) and vocabulary (.vocab) files.
w2vf = Word2VecF.load(path.join('models/word2vecf', 'vecs.npy'), path.join('models/word2vecf', 'vecs.vocab'))
# NOTE(review): reads the private `_vocab` attribute of Word2VecF.
w2vf_vocab = w2vf._vocab

Загрузка Adagram-модели

In [12]:
# Load the saved AdaGram model from disk.
ada_model = adagram.VectorModel.load('models/adagram/all_lem_100.pkl')
# Words known to the model: the keys of its word -> id dictionary.
adagram_vocab = ada_model.dictionary.word2id.keys()

Загрузка BOW-модели

In [13]:
# NOTE(review): `load` is already imported in the notebook's import cell;
# this repeated import only matters when cells are run out of order.
from pickle import load

# Unpickle the bag-of-words (tf-idf) model. NOTE(review): unpickling executes
# code embedded in the file, so only load files from a trusted source.
with open('models/tfidf/all_lem', 'rb') as fp:
    bow = load(fp)

Загрузка Swivel-модели

In [14]:
# Build the Swivel model from its vocabulary file and binary vectors file.
swivel = Swivel('models/swivel/vocab_100.txt', 'models/swivel/vecs_100.bin')
# Keep a reference to the model's vocabulary.
swivel_vocab = swivel.vocab

Загрузка Fasttext-модели

In [15]:
# Load the fastText vectors from a `.vec` file through the word2vec-format loader.
ft = FastText.load_word2vec_format('models/fasttext/all_100_skipgram.vec')
# Keep a reference to the vocabulary of the loaded vectors.
ft_vocab = ft.vocab

Получение датасетов

In [35]:
def make_sims_dataset(model, df):
    """Compute the model's cosine distance for every word pair in ``df``.

    Parameters
    ----------
    model : str
        Name of the embedding model, forwarded to ``get_vector_distance``.
    df : pandas.DataFrame
        Dataset with ``word1`` and ``word2`` columns.

    Returns
    -------
    numpy.ndarray
        ``float32`` array of length ``len(df)``; element ``k`` is the
        distance for the ``k``-th row of ``df`` in positional order.
    """
    # Dimensionality forwarded to get_vector_distance: 1 for 'bow', 100 otherwise.
    dim = 1 if model == 'bow' else 100
    sims = np.zeros(shape=len(df), dtype='float32')
    # Index by row position rather than by index label: after rows have been
    # dropped from the frame, labels can have gaps and exceed len(df) - 1.
    for pos, (first, second) in enumerate(zip(df['word1'], df['word2'])):
        sims[pos] = get_vector_distance(first, second, model, dim)
    return sims

In [1]:
# Make NumPy raise on floating-point errors instead of only warning. The
# previous settings are kept in `old_err_state` so they can be restored later
# with `np.seterr(**old_err_state)`.
old_err_state = np.seterr(all='raise')

# Tuples rather than sets, so the evaluation order is deterministic.
dataset_names = ('hj', 'rt-test', 'ae2-test')
model_names = ('word2vec', 'glove', 'wang2vec', 'adagram', 'word2vecf', 'fasttext', 'bow', 'swivel')

# similarities[dataset][model] -> result of spearmanr between the dataset's
# `sim` column and the model's cosine distances.
# NOTE(review): the models are scored by cosine *distance*; if `sim` holds
# similarity scores, a good model shows up as a negative correlation - confirm.
similarities = {name: dict() for name in dataset_names}

for name in dataset_names:
    # The dataset does not depend on the model, so load it once per dataset
    # instead of once per (dataset, model) pair.
    dataset = load_sim_dataset(name)
    for model_name in model_names:
        similarities[name][model_name] = spearmanr(dataset.sim.values, make_sims_dataset(model_name, dataset))

NameError: name 'np' is not defined

In [47]:
# Show the nested {dataset: {model: spearmanr result}} dictionary built above.
similarities

{'ae2-test': {'adagram': SpearmanrResult(correlation=-0.044191642048389379, pvalue=0.015458423575035728),
  'bow': SpearmanrResult(correlation=nan, pvalue=nan),
  'fasttext': SpearmanrResult(correlation=-0.55784097845324077, pvalue=3.7062473768970375e-245),
  'glove': SpearmanrResult(correlation=-0.33243640384170514, pvalue=2.2345020884073807e-78),
  'swivel': SpearmanrResult(correlation=-0.54222064673484227, pvalue=4.2942952617796872e-229),
  'wang2vec': SpearmanrResult(correlation=-0.41997846923793491, pvalue=1.3435536616510378e-128),
  'word2vec': SpearmanrResult(correlation=-0.42410116113024593, pvalue=2.3215699024124498e-131),
  'word2vecf': SpearmanrResult(correlation=0.018254376440922811, pvalue=0.31739115804930479)},
 'hj': {'adagram': SpearmanrResult(correlation=-0.054845194889437253, pvalue=0.27503598850938871),
  'bow': SpearmanrResult(correlation=nan, pvalue=nan),
  'fasttext': SpearmanrResult(correlation=-0.50106463558793835, pvalue=1.095410375867721e-26),
  'glove': Spear