In [1]:
from pandas import read_csv, Series
from gensim.models import Word2Vec, KeyedVectors
import numpy as np
from pickle import load
from glove import Glove
import adagram
from gensim.models.wrappers import FastText, Wordrank
from embed_utils import Word2VecF, Swivel, cosine_sim, get_adagram_sense_prob, wv
from utils.string_utils import morph_parse, make_tokens
from os import path

In [2]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.dummy import DummyClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.calibration import CalibratedClassifierCV, calibration_curve
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit
from scipy.spatial.distance import cosine
from sklearn.decomposition import PCA

In [3]:
import matplotlib.pyplot as plt
import seaborn
import mpltex

In [4]:
df = read_csv(path.join('datasets', '2sr.csv'))
df.post = df.post.apply(morph_parse)
df.op_post = df.op_post.apply(morph_parse)

In [5]:
MODEL_NAME = '2ch_model'

Загрузка Word2Vec-модели

In [6]:
word2vec = Word2Vec.load(path.join('models','word2vec', MODEL_NAME))
word2vec_vocab = word2vec.wv.vocab

In [7]:
old_len = len(df)

def vectorize_message(message1, message2, model, num_features, vocab, make_pca=False, make_sum=False):
    tokens1 = make_tokens(message1.lower(), vocab)
    tokens2 = make_tokens(message2.lower(), vocab)
    if len(tokens1) == 0 or len(tokens2) == 0:
        return False
    else:
        return True

for i, m in df.iterrows():
    if not vectorize_message(m['post'], m['op_post'], word2vec, 100, word2vec_vocab):
        df.drop(i, inplace=True)
print('Percent of dropped = {:2.1f}%'.format((old_len - len(df))/old_len*100))
df = df.reset_index(drop=True)
Y = df.is_related.values

Percent of dropped = 0.5%


In [8]:
def get_feature_vec(tokens, num_features, model, make_pca, make_sum):
    featureVec = np.zeros(shape=(1, num_features), dtype='float32')
    for word in tokens:
        if model == 'word2vec':
            featureVec = np.add(featureVec, word2vec[word])
        elif model == 'wang2vec':
            featureVec = np.add(featureVec, wang2vec[word])
        elif model == 'glove':
            featureVec = np.add(featureVec, glove[word])
        elif model == 'word2vecf':
            try:
                featureVec = np.add(featureVec, w2vf.word2vec(word))
            except KeyError:
                pass
        elif model == 'adagram':
            featureVec = np.add(featureVec, ada_model.sense_vector(word, get_adagram_sense_prob(ada_model, word)))
        elif model == 'fasttext':
            featureVec = np.add(featureVec, ft[word])
        elif model == 'swivel':
            try:
                featureVec = np.add(featureVec, np.array(swivel.lookup(word)).squeeze())
            except TypeError:
                pass
    return np.divide(featureVec, len(tokens))

In [9]:
old_err_state = np.seterr(all='raise')

def vectorize_message(message1, message2, model, num_features, vocab, make_pca=False, make_sum=False):
    tokens1 = make_tokens(message1.lower(), vocab)
    tokens2 = make_tokens(message2.lower(), vocab)
    fv1 = get_feature_vec(tokens1, num_features, model, make_pca, make_sum)
    fv2 = get_feature_vec(tokens2, num_features, model, make_pca, make_sum)
    if make_sum:
        return fv1.squeeze()+fv2.squeeze()/2
    if make_pca:
        pca = PCA(n_components=1)
        return pca.fit_transform((np.stack((fv1.squeeze(), fv2.squeeze())).T)).squeeze()
    else:
        return np.hstack((fv1, fv2))

Загрузка Glove-модели

In [10]:
def loadGloveModel(gloveFile):
    f = open(gloveFile,'r')
    model = {}
    for line in f:
        splitLine = line.split()
        word = splitLine[0]
        embedding = [float(val) for val in splitLine[1:]]
        model[word] = embedding
    return model

In [11]:
glove = loadGloveModel(path.join('models', 'glove' , '{}.txt'.format(MODEL_NAME)))

Загрузка Wang2Vec-модели

In [12]:
wang2vec = KeyedVectors.load_word2vec_format(path.join('models', 'wang2vec', MODEL_NAME), binary=True)

Загрузка Word2Vec-f-модели

In [13]:
w2vf = Word2VecF.load(path.join('models', 'word2vecf', '{}.npy'.format(MODEL_NAME)),\
                      path.join('models', 'word2vecf', '{}.vocab'.format(MODEL_NAME)))

Загрузка Adagram-модели

In [14]:
ada_model = adagram.VectorModel.load(path.join('models', 'adagram', '{}.pkl'.format(MODEL_NAME)))

Загрузка Swivel-модели

In [15]:
swivel = Swivel(path.join('models', 'swivel', '{}.txt'.format(MODEL_NAME)),\
                path.join('models', 'swivel', '{}.bin'.format(MODEL_NAME)))

Загрузка Fasttext-модели

In [16]:
ft = FastText.load_word2vec_format(path.join('models', 'fasttext', '{}.vec'.format(MODEL_NAME)))

Получение датасетов

In [17]:
def make_vectors_dataset(model, dim, vocab, make_pca=False, make_sum=False):
    multiplier = 2
    if make_pca or make_sum:
        multiplier = 1
    vectors = np.zeros(shape=(len(df), dim*multiplier), dtype='float32')
    for i, m in df.iterrows():
        vectors[i] = vectorize_message(m['post'], m['op_post'], model, dim, vocab, make_pca, make_sum)
    return vectors

In [18]:
vectors_con = dict()
vectors_sum = dict()
vectors_con_pca = dict()

for (model, dim) in [
                    ('word2vec', 100),
                     ('glove', 100),
                     ('wang2vec', 100),
                     ('adagram', 100),
                     ('word2vecf', 100),
                     ('fasttext', 100),
                     ('swivel', 100),
                     ]:
    vectors_con[model] = make_vectors_dataset(model, dim, word2vec_vocab)
    vectors_con_pca[model] = make_vectors_dataset(model, dim, word2vec_vocab, True)
    vectors_sum[model] = make_vectors_dataset(model, dim, word2vec_vocab, False, True)

Сравнение

In [19]:
SIZE = 25
FIGSIZE = (10, 5)
PLOT_LIMITS = [0.725, 0.855]

def set_plt_params(title):
    plt.rc('font',**{'family':'serif','serif':['Times']})
    plt.figure(figsize=FIGSIZE)
    plt.suptitle(title, fontsize=SIZE)
    plt.grid(False)
    plt.axes(frameon = 0)
    plt.tick_params(labelsize=SIZE)
    plt.ylim(PLOT_LIMITS)

In [20]:
results = {'SUM': [], 'CON': [], 'CON+PCA' : []}

In [21]:
seaborn.set_style('white')

CROSS_VAL_FOLDS = 10
TEST_CHUNK = 0.01
CROSS_VAL_CHUNK = 0.99

for NAME, vectors in [
                    ('SUM', vectors_sum),
                    ('CON', vectors_con),
                    ('CON+PCA', vectors_con_pca)
                    ]:
    set_plt_params(NAME)

    for name, markerstyle, colorstyle in [('glove', 'o', 'brown'),
                    ('word2vec', 'v', 'blue'),
                    ('wang2vec', '^', 'green'),
                    ('word2vecf', '<', 'red'),
                    ('adagram', '>', 'orange'),
                    ('fasttext', 'd', 'lightblue'),
                    ('swivel', 'p', 'olive'),
                    ]:
        estimator = KNeighborsClassifier(n_neighbors = 3, algorithm='brute', metric='cosine')
        cv = ShuffleSplit(n_splits=CROSS_VAL_FOLDS, test_size=TEST_CHUNK, random_state=0)
        train_sizes=np.linspace(TEST_CHUNK, CROSS_VAL_CHUNK, CROSS_VAL_FOLDS)
        train_sizes, train_scores, test_scores = learning_curve(estimator, vectors[name], 
                                                                Y, cv=cv, train_sizes=train_sizes)
        train_scores_mean = np.mean(train_scores, axis=1)
        train_scores_std = np.std(train_scores, axis=1)
        test_scores_mean = np.mean(test_scores, axis=1)
        test_scores_std = np.std(test_scores, axis=1)
        results[NAME].append({'model' : name, 'score' : '%.3f' % train_scores_mean[-1]})
        plt.plot(train_sizes, train_scores_mean, marker=markerstyle, markersize=15, label=name, linewidth=3, color=colorstyle)

    plt.grid(True, axis='y', linewidth=1, color='black')
    if NAME == 'CON+PCA':
        plt.legend(loc='upper left', bbox_to_anchor=(1,1), prop={'size':SIZE})
    plt.savefig('{}.png'.format(NAME), bbox_inches='tight')
    #plt.show()

  (prop.get_family(), self.defaultFamily[fontext]))
  (prop.get_family(), self.defaultFamily[fontext]))
  (prop.get_family(), self.defaultFamily[fontext]))


In [23]:
results

{'CON': [{'model': 'glove', 'score': '0.847'},
  {'model': 'word2vec', 'score': '0.852'},
  {'model': 'wang2vec', 'score': '0.850'},
  {'model': 'word2vecf', 'score': '0.787'},
  {'model': 'adagram', 'score': '0.819'},
  {'model': 'fasttext', 'score': '0.854'},
  {'model': 'swivel', 'score': '0.851'}],
 'CON+PCA': [{'model': 'glove', 'score': '0.831'},
  {'model': 'word2vec', 'score': '0.831'},
  {'model': 'wang2vec', 'score': '0.840'},
  {'model': 'word2vecf', 'score': '0.809'},
  {'model': 'adagram', 'score': '0.790'},
  {'model': 'fasttext', 'score': '0.841'},
  {'model': 'swivel', 'score': '0.842'}],
 'SUM': [{'model': 'glove', 'score': '0.834'},
  {'model': 'word2vec', 'score': '0.836'},
  {'model': 'wang2vec', 'score': '0.839'},
  {'model': 'word2vecf', 'score': '0.782'},
  {'model': 'adagram', 'score': '0.805'},
  {'model': 'fasttext', 'score': '0.832'},
  {'model': 'swivel', 'score': '0.839'}]}

Сравнение косинусного расстояния

In [31]:
1 - cosine(ada_model.sense_vector('кошка', get_adagram_sense_prob(ada_model, 'кошка')),
                      ada_model.sense_vector('собака', get_adagram_sense_prob(ada_model, 'собака')))

0.33010898578441095

In [24]:
1 - cosine(ft['кошка'], ft['собака'])

0.74432916883114209

In [25]:
1 - cosine(glove['кошка'], glove['собака'])

0.62079673550182068

In [26]:
1 - cosine(word2vec['кошка'], word2vec['собака'])

0.8286360924350209

In [27]:
1 - cosine(wang2vec['кошка'], wang2vec['собака'])

0.80360189848035224