In [19]:
from pandas import read_csv, Series
from gensim.models import Word2Vec, KeyedVectors
import numpy as np
from pickle import load
from glove import Glove
import adagram
from gensim.models.wrappers import FastText, Wordrank
from embed_utils import Word2VecF, Swivel, cosine_sim, get_adagram_sense_prob, wv
from utils.string_utils import morph_parse, make_tokens
from scipy.spatial.distance import cosine

In [31]:
def get_vector_distance(word1, word2, model, num_features):
    try:
        if model == 'word2vec':
            return cosine(word2vec[word1], word2vec[word2])
        elif model == 'wang2vec':
            return cosine(wang2vec[word1], wang2vec[word2])
        elif model == 'glove':
            return cosine(wv(glove, word1), wv(glove, word2))
        elif model == 'word2vecf':
            return cosine(w2vf.word2vec(word1), w2vf.word2vec(word2))
        elif model == 'adagram':
            return cosine(ada_model.sense_vector(word1, get_adagram_sense_prob(ada_model, word1)),
                          ada_model.sense_vector(word2, get_adagram_sense_prob(ada_model, word2)))
        elif model == 'fasttext':
            return cosine(ft[word1], ft[word2])
        elif model == 'bow':
            return cosine(bow[word1], bow[word2])
        elif model == 'swivel':
            return cosine(swivel.lookup(word1), swivel.lookup(word2))
    except (KeyError, AttributeError):
        return 0

In [8]:
df = read_csv('relatedness.csv')
df.word1 = df.word1.apply(morph_parse)
df.word2 = df.word2.apply(morph_parse)

Загрузка Word2Vec-модели

In [10]:
#word2vec = Word2Vec.load('models/2ch_word2vec')
#word2vec_vocab = word2vec.wv.vocab

word2vec = Word2Vec.load('models/2ch_word2vec_all_lem')
word2vec_vocab = word2vec.wv.vocab

Загрузка Glove-модели

In [11]:
with open('models/2ch_glove_all_lem_100', 'rb') as fp:
    glove = load(fp)
glove_vocab = glove.dictionary

Загрузка Wang2Vec-модели

In [12]:
wang2vec = KeyedVectors.load_word2vec_format('models/wang2vec_2ch', binary=True)
wang2vec_vocab = wang2vec.vocab

Загрузка Word2Vec-f-модели

In [13]:
from os import path

w2vf = Word2VecF.load(path.join('models', 'w2vf.npy'), path.join('models', 'w2vf.vocab'))
w2vf_vocab = w2vf._vocab

Загрузка Adagram-модели

In [14]:
ada_model = adagram.VectorModel.load('models/adagram_all_lem_100.pkl')
adagram_vocab = ada_model.dictionary.word2id.keys()

Загрузка BOW-модели

In [15]:
from pickle import load

with open('models/bow.pickle', 'rb') as fp:
    bow = load(fp)

Загрузка Swivel-модели

In [16]:
swivel = Swivel('models/swivel.txt', 'models/swivel.bin')
swivel_vocab = swivel.vocab

Загрузка Fasttext-модели

In [17]:
ft = FastText.load_word2vec_format('models/fasttext2.vec')
ft_vocab = ft.vocab

Загрузка Wordrank-модели

In [18]:
#wr = Wordrank()
#mod = Wordrank.load_wordrank_model('/media/defeater/d5233b78-d9c7-40e7-a454-8b108bcc4b8a/defeater/NLP/wordrank/meta/cooccurrence')
#wr = wr.train('/media/defeater/d5233b78-d9c7-40e7-a454-8b108bcc4b8a/defeater/NLP/wordrank', corpus_file='corpus.txt', out_path='/media/defeater/d5233b78-d9c7-40e7-a454-8b108bcc4b8a/defeater/NLP/wordrank')

Получение датасетов

In [60]:
from scipy.stats import spearmanr

In [70]:
def make_sims_dataset(model):
    sims = np.zeros(shape=len(df), dtype='float32')
    for i, m in df.iterrows():
        sims[i] = get_vector_distance(m['word1'], m['word2'], model, 100)
    return sims

In [71]:
similarities = dict()

for i in {'word2vec', 'glove', 'wang2vec', 'adagram', 'word2vecf', 'fasttext', 'bow'}:
    similarities[i] = spearmanr(df.sim.values, make_sims_dataset(i))

  c /= stddev[:, None]
  c /= stddev[None, :]
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


In [72]:
similarities

{'adagram': SpearmanrResult(correlation=0.032360790148491532, pvalue=0.61058528983577554),
 'bow': SpearmanrResult(correlation=nan, pvalue=nan),
 'fasttext': SpearmanrResult(correlation=-0.41225619811550562, pvalue=1.1209673718948913e-11),
 'glove': SpearmanrResult(correlation=-0.13821090729523278, pvalue=0.028899794197869175),
 'wang2vec': SpearmanrResult(correlation=-0.03818111085409457, pvalue=0.5479132026817185),
 'word2vec': SpearmanrResult(correlation=-0.35163288119534769, pvalue=1.0937687109453958e-08),
 'word2vecf': SpearmanrResult(correlation=nan, pvalue=nan)}

Сравнение

In [56]:
for sim in similarities.items():
    print(sim[0])
    print(np.mean(sim[1]))

adagram
-0.608272
bow
0.288403
glove
-0.246234
word2vecf
0.288403
word2vec
-0.21406
fasttext
-0.285401
wang2vec
-0.0402759
