In [1]:
from pandas import read_csv, Series
from gensim.models import Word2Vec, KeyedVectors
import numpy as np
from pickle import load
from glove import Glove
import adagram
from gensim.models.wrappers import FastText, Wordrank
from embed_utils import Word2VecF, Swivel, cosine_sim, get_adagram_sense_prob, wv
from utils.string_utils import morph_parse, make_tokens
from os import path
from scipy.stats import spearmanr
from scipy.spatial.distance import cosine
from sklearn.metrics import f1_score
from sklearn.metrics import average_precision_score

In [22]:
def get_vector_distance(word1, word2, model):
    """Return the cosine similarity of two words under the chosen embedding model.

    Despite the function name, the returned value is ``1 - cosine(v1, v2)``,
    where ``cosine`` is ``scipy.spatial.distance.cosine`` (a distance), so a
    larger result means the two vectors are closer.

    The embedding models are read from notebook-level globals (``word2vec``,
    ``wang2vec``, ``glove``, ``ft``, ``swivel``, ``w2vf``, ``ada_model``), which
    must be loaded before this function is called.

    Parameters
    ----------
    word1, word2 : str
        Words to compare.
    model : str
        One of ``'word2vec'``, ``'wang2vec'``, ``'glove'``, ``'fasttext'``,
        ``'swivel'``, ``'word2vecf'`` or ``'adagram'``.

    Returns
    -------
    float
        ``1 - cosine distance`` between the two word vectors.

    Raises
    ------
    ValueError
        If ``model`` is not one of the supported names. The previous
        implementation silently returned ``None`` in that case.

    Errors raised by the underlying model for an unknown word propagate
    unchanged.
    """
    if model == 'word2vec':
        vec1, vec2 = word2vec[word1], word2vec[word2]
    elif model == 'wang2vec':
        vec1, vec2 = wang2vec[word1], wang2vec[word2]
    elif model == 'glove':
        vec1, vec2 = glove[word1], glove[word2]
    elif model == 'fasttext':
        vec1, vec2 = ft[word1], ft[word2]
    elif model == 'swivel':
        # squeeze() removes any length-1 axes from the lookup result so that
        # cosine() receives a flat vector.
        vec1 = np.array(swivel.lookup(word1)).squeeze()
        vec2 = np.array(swivel.lookup(word2)).squeeze()
    elif model == 'word2vecf':
        vec1, vec2 = w2vf.word2vec(word1), w2vf.word2vec(word2)
    elif model == 'adagram':
        # The sense vector is selected using whatever get_adagram_sense_prob
        # returns for the word (helper imported from embed_utils).
        vec1 = ada_model.sense_vector(word1, get_adagram_sense_prob(ada_model, word1))
        vec2 = ada_model.sense_vector(word2, get_adagram_sense_prob(ada_model, word2))
    else:
        raise ValueError('Unknown embedding model: {!r}'.format(model))
    return 1 - cosine(vec1, vec2)

In [27]:
def load_sim_dataset_f(name, verbose=False):
    """Load a word-pair dataset, keeping only pairs known to the Word2VecF model.

    Reads ``datasets/<name>.csv``, drops rows containing missing values, passes
    the ``word1`` and ``word2`` columns through ``morph_parse`` and removes every
    row in which either word is absent from ``w2vf._vocab``.

    Parameters
    ----------
    name : str
        Dataset file name without the ``.csv`` extension.
    verbose : bool, optional
        When true, print the percentage of rows removed by the vocabulary
        filter (relative to the row count after ``dropna``).

    Returns
    -------
    pandas.DataFrame
        The filtered rows with a fresh 0..n-1 index.
    """
    df = read_csv(path.join('datasets', '{}.csv'.format(name))).dropna()
    old_len = len(df)
    df['word1'] = df['word1'].apply(morph_parse)
    df['word2'] = df['word2'].apply(morph_parse)

    # Build one boolean mask instead of dropping rows in place while iterating.
    vocab = w2vf._vocab
    keep = np.array(
        [w1 in vocab and w2 in vocab for w1, w2 in zip(df['word1'], df['word2'])],
        dtype=bool,
    )
    df = df.loc[keep]

    if verbose:
        # Guard against an empty dataset, which used to raise ZeroDivisionError.
        dropped = (old_len - len(df)) / old_len * 100 if old_len else 0.0
        print('Percent of dropped = {:2.1f}%'.format(dropped))
    return df.reset_index(drop=True)

In [24]:
def load_sim_dataset(name, verbose=False, vocab=None):
    """Load a word-pair dataset, keeping only pairs found in a vocabulary.

    Reads ``datasets/<name>.csv``, drops rows containing missing values, passes
    the ``word1`` and ``word2`` columns through ``morph_parse`` and removes every
    row in which either word is absent from the vocabulary.

    Parameters
    ----------
    name : str
        Dataset file name without the ``.csv`` extension.
    verbose : bool, optional
        When true, print the percentage of rows removed by the vocabulary
        filter (relative to the row count after ``dropna``).
    vocab : container, optional
        Any object supporting ``word in vocab``. Defaults to ``swivel.vocab``,
        which was the only vocabulary used before this parameter existed.

    Returns
    -------
    pandas.DataFrame
        The filtered rows with a fresh 0..n-1 index.
    """
    if vocab is None:
        vocab = swivel.vocab

    df = read_csv(path.join('datasets', '{}.csv'.format(name))).dropna()
    old_len = len(df)
    df['word1'] = df['word1'].apply(morph_parse)
    df['word2'] = df['word2'].apply(morph_parse)

    # Build one boolean mask instead of dropping rows in place while iterating.
    keep = np.array(
        [w1 in vocab and w2 in vocab for w1, w2 in zip(df['word1'], df['word2'])],
        dtype=bool,
    )
    df = df.loc[keep]

    if verbose:
        # Guard against an empty dataset, which used to raise ZeroDivisionError.
        dropped = (old_len - len(df)) / old_len * 100 if old_len else 0.0
        print('Percent of dropped = {:2.1f}%'.format(dropped))
    return df.reset_index(drop=True)

In [4]:
# Base name shared by the model artifacts loaded in the cells below; each
# loader combines it with its own sub-directory of models/ and file extension.
MODEL_NAME = '2ch_model'

Загрузка Word2Vec-модели

In [5]:
# Load the saved gensim Word2Vec model from models/word2vec/<MODEL_NAME>.
word2vec = Word2Vec.load(path.join('models','word2vec', MODEL_NAME))

Загрузка Glove-модели

In [6]:
def loadGloveModel(gloveFile, encoding='utf-8'):
    """Read whitespace-separated text embeddings into a ``{word: vector}`` dict.

    Each non-empty line is expected to hold a word followed by its vector
    components, all separated by whitespace.

    Parameters
    ----------
    gloveFile : str
        Path to the text file with the embeddings.
    encoding : str, optional
        Text encoding used to read the file. It is set explicitly so that
        non-ASCII vocabularies load the same way on every platform.

    Returns
    -------
    dict
        Maps each word to a list of floats.

    Raises
    ------
    ValueError
        If a vector component cannot be parsed as a float.
    """
    model = {}
    # The context manager closes the file even if parsing fails part-way.
    with open(gloveFile, 'r', encoding=encoding) as f:
        for line in f:
            splitLine = line.split()
            if not splitLine:
                # Blank lines (e.g. a trailing newline) carry no embedding.
                continue
            model[splitLine[0]] = [float(val) for val in splitLine[1:]]
    return model

In [7]:
# Parse models/glove/<MODEL_NAME>.txt into a plain dict of word -> list of floats.
glove = loadGloveModel(path.join('models', 'glove' , '{}.txt'.format(MODEL_NAME)))

Загрузка Wang2Vec-модели

In [8]:
# Load the Wang2Vec vectors from models/wang2vec/<MODEL_NAME>, stored in the
# binary word2vec format (binary=True).
wang2vec = KeyedVectors.load_word2vec_format(path.join('models', 'wang2vec', MODEL_NAME), binary=True)

Загрузка Swivel-модели

In [9]:
# Build the Swivel wrapper (from embed_utils) from the .txt and .bin files
# under models/swivel/.
# NOTE(review): presumably the .txt file is the vocabulary and the .bin file
# holds the vectors — confirm against embed_utils.Swivel.
swivel = Swivel(path.join('models', 'swivel', '{}.txt'.format(MODEL_NAME)),\
                path.join('models', 'swivel', '{}.bin'.format(MODEL_NAME)))

Загрузка FastText-модели

In [10]:
# Load the FastText vectors from models/fasttext/<MODEL_NAME>.vec through the
# word2vec-format loader.
# NOTE(review): FastText here comes from gensim.models.wrappers, which newer
# gensim releases no longer ship; KeyedVectors.load_word2vec_format is likely
# the drop-in replacement — confirm against the installed gensim version.
ft = FastText.load_word2vec_format(path.join('models', 'fasttext', '{}.vec'.format(MODEL_NAME)))

Загрузка Word2Vecf-модели

In [16]:
# Load the Word2VecF wrapper (from embed_utils) from the .npy and .vocab files
# under models/word2vecf/.
# NOTE(review): presumably .npy holds the vectors and .vocab the word list —
# confirm against embed_utils.Word2VecF.load.
w2vf = Word2VecF.load(path.join('models', 'word2vecf', '{}.npy'.format(MODEL_NAME)),\
                      path.join('models', 'word2vecf', '{}.vocab'.format(MODEL_NAME)))

Загрузка Adagram-модели

In [12]:
# Load the AdaGram model from models/adagram/<MODEL_NAME>.pkl.
# NOTE(review): the .pkl extension suggests a pickle; only load files from a
# trusted source.
ada_model = adagram.VectorModel.load(path.join('models', 'adagram', '{}.pkl'.format(MODEL_NAME)))

Получение датасетов

In [13]:
def make_sims_dataset(model, df):
    """Compute the model similarity for every word pair in ``df``.

    Parameters
    ----------
    model : str
        Model name understood by ``get_vector_distance``.
    df : pandas.DataFrame
        Must contain ``word1`` and ``word2`` columns.

    Returns
    -------
    numpy.ndarray
        ``float32`` array of length ``len(df)``; element ``k`` is the value of
        ``get_vector_distance`` for the ``k``-th row of ``df``.
    """
    sims = np.zeros(shape=len(df), dtype='float32')
    # Fill by row position rather than by index label, so the result is
    # correct even when df does not carry a default 0..n-1 index.
    for pos, (word1, word2) in enumerate(zip(df['word1'], df['word2'])):
        sims[pos] = get_vector_distance(word1, word2, model)
    return sims

In [25]:
# Evaluation scores, keyed first by dataset name and then by model name.
similarities = {'hj' : dict(), 'rt-test' : dict(), 'ae2-test' : dict()}

# Score every model listed below on each dataset; pairs are pre-filtered by
# load_sim_dataset, and verbose=True prints the share of dropped pairs.
for name in ['hj', 
             'rt-test', 
             'ae2-test'
            ]:
    dataset = load_sim_dataset(name, True)
    for i in ['word2vec', 'glove', 'wang2vec', 'adagram', 'fasttext', 'swivel']:
        model_sims = make_sims_dataset(i, dataset)
        if name != 'hj':
            # Similarities are binarised at 0.5 and scored with average precision
            # against the dataset's `sim` column.
            # NOTE(review): average_precision_score is normally given continuous
            # scores; thresholding first discards the ranking — confirm intended.
            model_sims = [1 if sim > 0.5 else 0 for sim in model_sims]
            similarities[name][i] = '%.2f' % average_precision_score(dataset.sim.values, model_sims)
        else:
            # 'hj' is scored with the Spearman correlation (element [0] of the
            # spearmanr result). Scores are stored as two-decimal strings.
            similarities[name][i] = '%.2f' % spearmanr(dataset.sim.values, model_sims)[0]

Percent of dropped = 5.5%
Percent of dropped = 40.9%
Percent of dropped = 9.4%


In [28]:
# Score word2vecf separately: its pairs are filtered by load_sim_dataset_f
# (the w2vf vocabulary). Results are added to the existing `similarities` dict,
# so the cell that creates it must have been run first.
for name in ['hj', 
             'rt-test', 
             'ae2-test'
            ]:
    dataset = load_sim_dataset_f(name, True)
    for i in ['word2vecf']:
        model_sims = make_sims_dataset(i, dataset)
        if name != 'hj':
            # Similarities are binarised at 0.5 and scored with average precision
            # against the dataset's `sim` column.
            # NOTE(review): average_precision_score is normally given continuous
            # scores; thresholding first discards the ranking — confirm intended.
            model_sims = [1 if sim > 0.5 else 0 for sim in model_sims]
            similarities[name][i] = '%.2f' % average_precision_score(dataset.sim.values, model_sims)
        else:
            # 'hj' is scored with the Spearman correlation (element [0] of the
            # spearmanr result). Scores are stored as two-decimal strings.
            similarities[name][i] = '%.2f' % spearmanr(dataset.sim.values, model_sims)[0]

Percent of dropped = 27.4%
Percent of dropped = 74.0%
Percent of dropped = 38.0%


In [29]:
# Display the collected scores: dataset name -> model name -> two-decimal string.
similarities

{'ae2-test': {'adagram': '0.66',
  'fasttext': '0.79',
  'glove': '0.77',
  'swivel': '0.76',
  'wang2vec': '0.78',
  'word2vec': '0.80',
  'word2vecf': '0.74'},
 'hj': {'adagram': '0.11',
  'fasttext': '0.44',
  'glove': '0.40',
  'swivel': '0.52',
  'wang2vec': '0.41',
  'word2vec': '0.51',
  'word2vecf': '0.04'},
 'rt-test': {'adagram': '0.57',
  'fasttext': '0.76',
  'glove': '0.74',
  'swivel': '0.74',
  'wang2vec': '0.72',
  'word2vec': '0.68',
  'word2vecf': '0.73'}}