In [1]:
from pandas import read_csv, Series
from gensim.models import Word2Vec, KeyedVectors
import numpy as np
from pickle import load
from glove import Glove
import adagram
from gensim.models.wrappers import FastText, Wordrank
from embed_utils import Word2VecF, Swivel, cosine_sim, get_adagram_sense_prob, wv
from utils.string_utils import morph_parse, make_tokens
from scipy.spatial.distance import cosine
from os import path
from scipy.stats import spearmanr
from sklearn.metrics import f1_score

In [2]:
def get_vector_distance(word1, word2, model, num_features):
    if model == 'word2vec':
        return 1 - cosine(word2vec[word1], word2vec[word2])
    elif model == 'wang2vec':
        return 1 - cosine(wang2vec[word1], wang2vec[word2])
    elif model == 'glove':
        return 1- cosine(wv(glove, word1), wv(glove, word2))
    elif model == 'word2vecf':
        return 1 - cosine(w2vf.word2vec(word1), w2vf.word2vec(word2))
    elif model == 'adagram':
        return 1 - cosine(ada_model.sense_vector(word1, get_adagram_sense_prob(ada_model, word1)),
                      ada_model.sense_vector(word2, get_adagram_sense_prob(ada_model, word2)))
    elif model == 'fasttext':
        return 1 - cosine(ft[word1], ft[word2])
    elif model == 'swivel':
        return 1 - cosine(np.array(swivel.lookup(word1)).squeeze(), np.array(swivel.lookup(word2)).squeeze())

In [3]:
def load_sim_dataset(name, verbose=False):
    df = read_csv(path.join('datasets', '{}.csv'.format(name))).dropna()
    old_len = len(df)
    df.word1 = df.word1.apply(morph_parse)
    df.word2 = df.word2.apply(morph_parse)
    for i, m in df.iterrows():
        if not m['word1'] in w2vf._vocab or not m['word2'] in w2vf._vocab:
            df.drop(i, inplace=True)
    if verbose:
        print('Percent of dropped = {}%'.format(len(df)/old_len*100))
    return df.reset_index(drop=True)

In [4]:
MODEL_NAME = '2ch_model'

Загрузка Word2Vec-модели

In [5]:
word2vec = Word2Vec.load(path.join('models','word2vec', MODEL_NAME))

Загрузка Glove-модели

In [6]:
with open(path.join('models', 'glove' , MODEL_NAME), 'rb') as fp:
    glove = load(fp)

Загрузка Wang2Vec-модели

In [7]:
wang2vec = KeyedVectors.load_word2vec_format(path.join('models', 'wang2vec', MODEL_NAME), binary=True)

Загрузка Word2Vec-f-модели

In [8]:
w2vf = Word2VecF.load(path.join('models', 'word2vecf', '{}.npy'.format(MODEL_NAME)),\
                      path.join('models', 'word2vecf', '{}.vocab'.format(MODEL_NAME)))

Загрузка AdaGram-модели

In [9]:
ada_model = adagram.VectorModel.load(path.join('models', 'adagram', '{}.pkl'.format(MODEL_NAME)))

Загрузка Swivel-модели

In [10]:
swivel = Swivel(path.join('models', 'swivel', '{}.txt'.format(MODEL_NAME)),\
                path.join('models', 'swivel', '{}.bin'.format(MODEL_NAME)))

Загрузка FastText-модели

In [11]:
ft = FastText.load_word2vec_format(path.join('models', 'fasttext', '{}.vec'.format(MODEL_NAME)))

Получение датасетов

In [12]:
def make_sims_dataset(model, df):
    dim = 100
    if model == 'bow':
        dim = 1
    sims = np.zeros(shape=len(df), dtype='float32')
    for i, m in df.iterrows():
        sims[i] = get_vector_distance(m['word1'], m['word2'], model, dim)
    return sims

In [13]:
similarities = {'hj' : dict(), 'rt-test' : dict(), 'ae2-test' : dict()}

for name in ['hj', 
             'rt-test', 
             'ae2-test'
            ]:
    dataset = load_sim_dataset(name, True)
    for i in ['word2vec', 'glove', 'wang2vec', 'adagram', 'word2vecf', 'fasttext', 'swivel']:
        model_sims = make_sims_dataset(i, dataset)
        if name != 'hj':
            model_sims = [1 if sim > 0.5 else 0 for sim in model_sims]
            similarities[name][i] = '%.2f' % f1_score(dataset.sim.values, model_sims)
        else:
            similarities[name][i] = '%.2f' % spearmanr(dataset.sim.values, model_sims)[0]

Percent of dropped = 26.884422110552762%
Percent of dropped = 4.7444490992878094%
Percent of dropped = 16.289140572951368%


In [14]:
similarities

{'ae2-test': {'adagram': '0.04',
  'fasttext': '0.49',
  'glove': '0.37',
  'swivel': '0.16',
  'wang2vec': '0.69',
  'word2vec': '0.68',
  'word2vecf': '0.71'},
 'hj': {'adagram': '0.13',
  'fasttext': '0.50',
  'glove': '0.42',
  'swivel': '0.59',
  'wang2vec': '0.52',
  'word2vec': '0.60',
  'word2vecf': '-0.02'},
 'rt-test': {'adagram': '0.08',
  'fasttext': '0.45',
  'glove': '0.17',
  'swivel': '0.12',
  'wang2vec': '0.68',
  'word2vec': '0.53',
  'word2vecf': '0.75'}}