In [1]:
from pandas import read_csv, Series
from gensim.models import Word2Vec, KeyedVectors
import numpy as np
from pickle import load
from glove import Glove
import adagram
from gensim.models.wrappers import FastText, Wordrank
from embed_utils import Word2VecF, Swivel, cosine_sim, get_adagram_sense_prob, wv
from utils.string_utils import morph_parse, make_tokens
from os import path

In [2]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.dummy import DummyClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.calibration import CalibratedClassifierCV, calibration_curve
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit
from scipy.spatial.distance import cosine
from sklearn.decomposition import PCA

In [3]:
import matplotlib.pyplot as plt
import seaborn
import mpltex

In [4]:
def get_feature_vec(tokens, num_features, model, make_pca, make_sum):
    """Return the mean embedding of ``tokens`` as a ``(1, num_features)`` array.

    Parameters
    ----------
    tokens : sequence of str
        Words whose vectors are averaged.
    num_features : int
        Width of the accumulator (and of the all-zero result for no tokens).
    model : str
        Name selecting which module-level embedding object is queried:
        'word2vec', 'wang2vec', 'glove', 'word2vecf', 'adagram', 'fasttext'
        or 'swivel'. Any other name adds nothing, so the result is all zeros.
    make_pca, make_sum
        Accepted for signature compatibility; not used in this function.

    Returns
    -------
    numpy.ndarray
        Sum of the token vectors divided by ``len(tokens)``, or float32 zeros
        when ``tokens`` is empty.
    """
    # Nothing to average: hand back a fresh zero vector.
    if len(tokens) == 0:
        return np.zeros(shape=(1, num_features), dtype='float32')

    # One lookup callable per supported model; the globals are only touched
    # when the callable is actually invoked.
    lookups = {
        'word2vec': lambda w: word2vec[w],
        'wang2vec': lambda w: wang2vec[w],
        'glove': lambda w: wv(glove, w),
        'word2vecf': lambda w: w2vf.word2vec(w),
        'adagram': lambda w: ada_model.sense_vector(w, get_adagram_sense_prob(ada_model, w)),
        'fasttext': lambda w: ft[w],
        'swivel': lambda w: np.array(swivel.lookup(w)).squeeze(),
    }
    embed = lookups.get(model)

    total = np.zeros(shape=(1, num_features), dtype='float32')
    if embed is not None:
        for token in tokens:
            total = np.add(total, embed(token))
    return np.divide(total, len(tokens))

In [5]:
# Make NumPy raise FloatingPointError for every floating-point error category.
# np.seterr returns the previous settings, kept here so they can be restored
# later with np.seterr(**old_err_state).
old_err_state = np.seterr(all='raise')

def vectorize_message(message1, message2, model, num_features, vocab, make_pca=False, make_sum=False):
    """Turn a pair of messages into a single feature vector.

    Both messages are lower-cased, tokenised with ``make_tokens(..., vocab)``
    and averaged into one embedding each via ``get_feature_vec``. The two
    embeddings are then combined according to the flags.

    Parameters
    ----------
    message1, message2 : str
        The two texts to vectorise.
    model : str
        Embedding model name forwarded to ``get_feature_vec``.
    num_features : int
        Embedding width forwarded to ``get_feature_vec``.
    vocab
        Vocabulary object forwarded to ``make_tokens``.
    make_pca : bool, default False
        Combine the two embeddings by projecting each (v1, v2) coordinate pair
        onto one PCA component.
    make_sum : bool, default False
        Combine the two embeddings by taking their element-wise mean. Takes
        precedence over ``make_pca`` when both are set.

    Returns
    -------
    numpy.ndarray
        ``make_sum``: 1-D array, the mean of the two squeezed embeddings.
        ``make_pca``: squeezed PCA projection, or ``(1, num_features)`` float32
        zeros if the fit raises ``FloatingPointError``.
        Otherwise: ``np.hstack`` of the two embeddings.
    """
    tokens1 = make_tokens(message1.lower(), vocab)
    tokens2 = make_tokens(message2.lower(), vocab)
    fv1 = get_feature_vec(tokens1, num_features, model, make_pca, make_sum)
    fv2 = get_feature_vec(tokens2, num_features, model, make_pca, make_sum)

    if make_sum:
        # Parentheses matter: without them only fv2 is halved (fv1 + fv2/2),
        # which weights the first message twice as much as the second.
        return (fv1.squeeze() + fv2.squeeze()) / 2

    if make_pca:
        try:
            pca = PCA(n_components=1)
            # Rows are embedding coordinates, columns are the two messages.
            paired = np.stack((fv1.squeeze(), fv2.squeeze())).T
            return pca.fit_transform(paired).squeeze()
        except FloatingPointError:
            # Presumably reachable because the notebook sets
            # np.seterr(all='raise'); fall back to an all-zero vector.
            return np.zeros(shape=(1, num_features), dtype='float32')

    return np.hstack((fv1, fv2))

In [6]:
# Read the post / op_post pairs and run morph_parse over both text columns.
df = read_csv(path.join('sim_datasets', 'dsr.csv')).assign(
    post=lambda frame: frame['post'].apply(morph_parse),
    op_post=lambda frame: frame['op_post'].apply(morph_parse),
)
# Target labels taken from the is_related column, as a NumPy array.
Y = df['is_related'].values

Загрузка Word2Vec-модели

In [7]:
# Load the saved gensim Word2Vec model and keep a handle on its vocabulary,
# which is passed on to make_tokens when messages are vectorised.
word2vec = Word2Vec.load(path.join('models','word2vec','all_lem_100'))
word2vec_vocab = word2vec.wv.vocab

Загрузка Glove-модели

In [8]:
# Unpickle the stored GloVe model object.
# NOTE(review): pickle.load executes arbitrary code from the file; only load
# model files that come from a trusted source.
with open(path.join('models', 'glove' , 'all_lem_100'), 'rb') as fp:
    glove = load(fp)
# The model's word dictionary serves as the vocabulary for tokenisation.
glove_vocab = glove.dictionary

Загрузка Wang2Vec-модели

In [9]:
# Load the Wang2Vec vectors from a binary word2vec-format file.
wang2vec = KeyedVectors.load_word2vec_format(path.join('models', 'wang2vec', 'wang_skipngram'), binary=True)
wang2vec_vocab = wang2vec.vocab

Загрузка Word2Vec-f-модели

In [10]:
# Load the Word2Vec-f model from its vector matrix (.npy) and vocabulary file.
w2vf = Word2VecF.load(path.join('models', 'word2vecf', '2', 'vecs.npy'), path.join('models', 'word2vecf', '2', 'vecs.vocab'))
# NOTE(review): reads the private _vocab attribute of the project's Word2VecF helper.
w2vf_vocab = w2vf._vocab

Загрузка Adagram-модели

In [11]:
# Load the saved AdaGram model; its vocabulary is the key view of the
# dictionary's word2id mapping.
ada_model = adagram.VectorModel.load(path.join('models', 'adagram', 'out.pkl'))
adagram_vocab = ada_model.dictionary.word2id.keys()

Загрузка Swivel-модели

In [12]:
# Build the project's Swivel wrapper from its .txt and .bin files.
# NOTE(review): presumably the .txt holds the vocabulary and the .bin the
# vectors — confirm against embed_utils.Swivel.
swivel = Swivel(path.join('models', 'swivel', '2chswivel.txt'), path.join('models', 'swivel', '2chswivel.bin'))
swivel_vocab = swivel.vocab

Загрузка Fasttext-модели

In [13]:
# Load the FastText vectors from a word2vec-format .vec file.
ft = FastText.load_word2vec_format(path.join('models', 'fasttext', '2ch_model_cbow.vec'))
ft_vocab = ft.vocab

Получение датасетов

In [14]:
def make_vectors_dataset(model, vocab, dim, make_pca=False, make_sum=False):
    """Vectorise every (post, op_post) pair of the global ``df`` into a matrix.

    Parameters
    ----------
    model : str
        Embedding model name forwarded to ``vectorize_message``.
    vocab
        Vocabulary forwarded to ``vectorize_message``.
    dim : int
        Width of a single message embedding.
    make_pca, make_sum : bool, default False
        Combination mode forwarded to ``vectorize_message``. Either one makes
        the output ``dim`` wide; with neither set the two embeddings are
        concatenated and the output is ``2 * dim`` wide.

    Returns
    -------
    numpy.ndarray
        float32 array of shape ``(len(df), dim)`` or ``(len(df), 2 * dim)``,
        one row per row of ``df`` in positional order.
    """
    width = dim if (make_pca or make_sum) else dim * 2
    vectors = np.zeros(shape=(len(df), width), dtype='float32')
    # enumerate() gives positional row numbers, so the result stays correct
    # even if df carries a non-default (filtered, shuffled, labelled) index.
    for row, (post, op_post) in enumerate(zip(df['post'], df['op_post'])):
        vectors[row] = vectorize_message(post, op_post, model, dim, vocab, make_pca, make_sum)
    return vectors

In [15]:
# Feature matrices per embedding model, one dict per combination strategy:
# concatenation, element-wise sum/mean, and concatenation reduced with PCA.
vectors_con = dict()
vectors_sum = dict()
vectors_con_pca = dict()

# Each entry is (model name, vocabulary, embedding width). The loop variables
# are named in that same order and passed to make_vectors_dataset in its
# declared (model, vocab, dim) order.
for (model, vocab, dim) in [
        ('word2vec', word2vec_vocab, 100),
        ('glove', glove_vocab, 100),
        ('wang2vec', wang2vec_vocab, 100),
        ('adagram', adagram_vocab, 100),
        ('word2vecf', w2vf_vocab, 100),
        ('fasttext', ft_vocab, 100),
        ('swivel', swivel_vocab, 100),
        ]:
    vectors_con[model] = make_vectors_dataset(model, vocab, dim)
    vectors_con_pca[model] = make_vectors_dataset(model, vocab, dim, make_pca=True)
    vectors_sum[model] = make_vectors_dataset(model, vocab, dim, make_sum=True)

Сравнение

In [34]:
# Font size shared by the figure title, the tick labels and the legend.
SIZE = 30

def set_plt_params(title):
    """Open a new pyplot figure titled ``title`` and apply the shared styling.

    Works on pyplot's global state: sets the serif font, creates a
    ``figsize=(7, 2)`` figure, adds the super-title, and configures grid,
    axes frame, tick label size and y-axis limits. Subsequent ``plt.*`` calls
    draw into the figure created here.

    Parameters
    ----------
    title : str
        Text shown as the figure's super-title.
    """
    plt.rc('font',**{'family':'serif','serif':['Times']})
    #plt.rcParams["font.family"] = "cursive"
    plt.figure(figsize=(7, 2))
    plt.suptitle(title, fontsize=SIZE)
    plt.grid(False)
    # Axes without the surrounding frame.
    plt.axes(frameon = 0)
    plt.tick_params(labelsize=SIZE)
    seaborn.set_style('white')
    # Fixed y-axis limits applied to every figure built with this helper.
    plt.ylim([0.725, 0.855])

In [35]:
# Per-strategy lists of {'model': ..., 'score': ...} entries, filled in by the
# learning-curve loop.
results = {'SUM': [], 'CON': [], 'CON+PCA' : []}

In [36]:
seaborn.set_style('white')

# Loop-invariant settings, built once. learning_curve clones the estimator
# for every fit, and ShuffleSplit with an integer random_state produces the
# same splits on every call, so sharing them across models is safe.
estimator = KNeighborsClassifier(n_neighbors = 3, algorithm='brute', metric='cosine')
cv = ShuffleSplit(n_splits=10, test_size=0.01, random_state=0)
train_fractions = np.linspace(0.01, 0.99, 10)

# One figure per combination strategy, one curve per embedding model.
for NAME, vectors in [
                    ('SUM', vectors_sum),
                    ('CON', vectors_con),
                    ('CON+PCA', vectors_con_pca)
                    ]:
    set_plt_params(NAME)
    # Start from an empty list so re-running this cell does not append
    # duplicate entries to the scores collected earlier.
    results[NAME] = []

    for name, markerstyle, colorstyle in [('glove', 'o', 'brown'),
                    ('word2vec', 'v', 'blue'),
                    ('wang2vec', '^', 'green'),
                    ('word2vecf', '<', 'red'),
                    ('adagram', '>', 'orange'),
                    ('fasttext', 'd', 'lightblue'),
                    ('swivel', 'p', 'olive'),
                    ]:
        # train_sizes comes back as absolute sample counts.
        train_sizes, train_scores, test_scores = learning_curve(
            estimator, vectors[name], Y, cv=cv, train_sizes=train_fractions)
        # Average over the CV splits (axis 1).
        # NOTE(review): the curve and `results` use the training-set scores;
        # the held-out test_scores are returned but not used — confirm this
        # is the intended metric.
        train_scores_mean = np.mean(train_scores, axis=1)
        results[NAME].append({'model' : name, 'score' : train_scores_mean})
        plt.plot(train_sizes, train_scores_mean, marker=markerstyle, markersize=15, label=name, linewidth=3, color=colorstyle)

    plt.grid(True, axis='y', linewidth=1, color='black')
    # Only the last figure carries the (shared) legend, placed outside the axes.
    if NAME == 'CON+PCA':
        plt.legend(loc="upper left", bbox_to_anchor=(1,1), prop={'size':SIZE})
    plt.savefig('{}.png'.format(NAME), bbox_inches='tight')

  (prop.get_family(), self.defaultFamily[fontext]))
  (prop.get_family(), self.defaultFamily[fontext]))
  (prop.get_family(), self.defaultFamily[fontext]))


In [19]:
# Display the mean scores collected for every strategy and model.
results

{'CON': [{'model': 'glove',
   'score': array([ 0.75769231,  0.80607029,  0.81983333,  0.82457723,  0.82759796,
           0.83422313,  0.83729977,  0.8385258 ,  0.84483204,  0.84722116])},
  {'model': 'word2vec',
   'score': array([ 0.77307692,  0.81980831,  0.83233333,  0.83359639,  0.83492334,
           0.84079398,  0.84370709,  0.84383292,  0.84345392,  0.84308164])},
  {'model': 'wang2vec',
   'score': array([ 0.78461538,  0.82492013,  0.83033333,  0.8313416 ,  0.83211244,
           0.84182067,  0.84748284,  0.84923833,  0.85099053,  0.85454197])},
  {'model': 'word2vecf',
   'score': array([ 0.76153846,  0.79233227,  0.79333333,  0.78996618,  0.79565588,
           0.79760438,  0.80291762,  0.80329238,  0.80766581,  0.81023381])},
  {'model': 'adagram',
   'score': array([ 0.78846154,  0.76389776,  0.788     ,  0.79470124,  0.79488927,
           0.79794661,  0.80108696,  0.80358722,  0.80676141,  0.8074358 ])},
  {'model': 'fasttext',
   'score': array([ 0.77692308,  0.8226837

Сравнение косинусного расстояния

In [20]:
# Cosine distance (scipy.spatial.distance.cosine) between the AdaGram sense
# vectors of 'кошка' and 'собака'; each sense is selected via
# get_adagram_sense_prob.
cosine(ada_model.sense_vector('кошка', get_adagram_sense_prob(ada_model, 'кошка')),
                      ada_model.sense_vector('собака', get_adagram_sense_prob(ada_model, 'собака')))

0.93155136090587609

In [21]:
# Same word pair, using the FastText vectors.
cosine(ft['кошка'], ft['собака'])

0.25567083116885791