In [None]:
from pandas import read_csv, Series
from gensim.models import Word2Vec, KeyedVectors
import numpy as np
from pickle import load
from glove import Glove
import adagram
from gensim.models.wrappers import FastText, Wordrank
from embed_utils import Word2VecF, Swivel, cosine_sim, get_adagram_sense_prob, wv
from utils.string_utils import morph_parse, make_tokens
from os import path

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.dummy import DummyClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.calibration import CalibratedClassifierCV, calibration_curve
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit
from scipy.spatial.distance import cosine
from sklearn.decomposition import PCA

In [None]:
import matplotlib.pyplot as plt
import seaborn
import mpltex

In [None]:
def get_feature_vec(tokens, num_features, model, make_pca, make_sum):
    """Average the embedding vectors of ``tokens`` under the selected model.

    Parameters
    ----------
    tokens : sequence of str
        Words to look up; every token is looked up as-is, so a word missing
        from the model raises whatever the model's lookup raises.
    num_features : int
        Dimensionality of the embedding space.
    model : str
        One of 'word2vec', 'wang2vec', 'glove', 'word2vecf', 'adagram',
        'fasttext', 'swivel'. Any other name contributes nothing, so the
        result is an all-zero vector.
    make_pca, make_sum
        Accepted for signature compatibility; not used here.

    Returns
    -------
    numpy.ndarray
        Mean of the token vectors with shape (1, num_features); all zeros
        (float32) when ``tokens`` is empty.
    """
    blank = np.zeros(shape=(1, num_features), dtype='float32')
    if len(tokens) == 0:
        return blank

    # Lambdas postpone touching the module-level model objects until a token
    # is actually looked up, so only the selected model has to be loaded.
    lookups = {
        'word2vec': lambda w: word2vec[w],
        'wang2vec': lambda w: wang2vec[w],
        'glove': lambda w: wv(glove, w),
        'word2vecf': lambda w: w2vf.word2vec(w),
        'adagram': lambda w: ada_model.sense_vector(w, get_adagram_sense_prob(ada_model, w)),
        'fasttext': lambda w: ft[w],
        'swivel': lambda w: np.array(swivel.lookup(w)).squeeze(),
    }
    lookup = lookups.get(model)

    total = blank
    if lookup is not None:
        for token in tokens:
            total = np.add(total, lookup(token))
    return np.divide(total, len(tokens))

In [None]:
# Make NumPy raise FloatingPointError for every kind of floating-point error;
# vectorize_message relies on this to catch failures in its PCA branch.
# The value returned by np.seterr is kept in old_err_state.
old_err_state = np.seterr(all='raise')

def vectorize_message(message1, message2, model, num_features, vocab, make_pca=False, make_sum=False):
    """Turn a pair of messages into a single feature vector.

    Each message is lower-cased, tokenized with ``make_tokens`` against
    ``vocab`` and averaged into one vector by ``get_feature_vec``. The two
    vectors are then combined according to the flags:

    * ``make_sum`` -- element-wise mean of the two vectors (1-D result).
      Checked first, so it wins when both flags are set.
    * ``make_pca`` -- the two vectors are stacked as the columns of a matrix
      and reduced to one PCA component per row (1-D result). If NumPy raises
      ``FloatingPointError`` during the fit, a zero vector of length
      ``num_features`` is returned instead.
    * neither -- the two vectors are concatenated side by side.

    Parameters
    ----------
    message1, message2 : str
        Raw message texts.
    model : str
        Embedding model name understood by ``get_feature_vec``.
    num_features : int
        Dimensionality of the embedding space.
    vocab
        Vocabulary handed to ``make_tokens``.
    make_pca, make_sum : bool
        Combination mode, see above.

    Returns
    -------
    numpy.ndarray
    """
    tokens1 = make_tokens(message1.lower(), vocab)
    tokens2 = make_tokens(message2.lower(), vocab)
    fv1 = get_feature_vec(tokens1, num_features, model, make_pca, make_sum)
    fv2 = get_feature_vec(tokens2, num_features, model, make_pca, make_sum)

    if make_sum:
        # Parenthesised on purpose: without the parentheses only the second
        # vector was halved (a + b/2) instead of averaging both.
        return (fv1.squeeze() + fv2.squeeze()) / 2

    if make_pca:
        try:
            pca = PCA(n_components=1)
            return pca.fit_transform(np.stack((fv1.squeeze(), fv2.squeeze())).T).squeeze()
        except FloatingPointError:
            # Only reachable when NumPy is configured to raise on floating-point
            # errors. The fallback is 1-D, like the regular PCA result.
            return np.zeros(num_features, dtype='float32')

    return np.hstack((fv1, fv2))

In [None]:
# Load the annotated dataset and drop rows with missing values.
# The index is reset afterwards so that index labels coincide with row
# positions again; dropna() alone leaves gaps, which breaks any code that
# uses the label from df.iterrows() to address rows of a NumPy array.
df = (
    read_csv(path.join('sim_datasets', 'dsr.csv'), encoding='cp1251')
    .dropna()
    .reset_index(drop=True)
)
# Both text columns are passed through morph_parse before vectorization.
df['post'] = df['post'].apply(morph_parse)
df['op_post'] = df['op_post'].apply(morph_parse)

In [None]:
#df_unl = read_csv('anno2ch/annotated.csv', encoding='cp1251')

In [None]:
# Target labels for the classifiers: the is_related column as a plain array.
Y = df['is_related'].values

Загрузка Word2Vec-модели

In [None]:
# Word2Vec model plus its vocabulary (later passed to make_tokens as `vocab`).
word2vec = Word2Vec.load(
    path.join('models', 'word2vec', 'all_lem_100')
)
word2vec_vocab = word2vec.wv.vocab

Загрузка Glove-модели

In [None]:
# The GloVe model is stored as a pickle.
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
with open(path.join('models', 'glove', 'all_lem_100'), 'rb') as glove_file:
    glove = load(glove_file)
glove_vocab = glove.dictionary

Загрузка Wang2Vec-модели

In [None]:
# Wang2Vec vectors are read from a binary word2vec-format file.
wang2vec = KeyedVectors.load_word2vec_format(
    path.join('models', 'wang2vec', 'wang_skipngram'),
    binary=True,
)
wang2vec_vocab = wang2vec.vocab

Загрузка Word2Vec-f-модели

In [None]:
# Word2Vec-f model: a vector file and a separate vocabulary file.
w2vf = Word2VecF.load(
    path.join('models', 'word2vecf', 'vecs.npy'),
    path.join('models', 'word2vecf', 'vecs.vocab'),
)
w2vf_vocab = w2vf._vocab

Загрузка Adagram-модели

In [None]:
# AdaGram model; its vocabulary is the key view of the word -> id mapping.
ada_model = adagram.VectorModel.load(
    path.join('models', 'adagram', 'out.pkl')
)
adagram_vocab = ada_model.dictionary.word2id.keys()

Загрузка TF-IDF-модели

In [None]:
# The TF-IDF model is stored as a pickle.
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
with open(path.join('models', 'tfidf', 'all_lem'), 'rb') as tfidf_file:
    bow = load(tfidf_file)

Загрузка Swivel-модели

In [None]:
# Swivel model, built from a .txt file and a .bin file.
swivel = Swivel(
    path.join('models', 'swivel', '2chswivel.txt'),
    path.join('models', 'swivel', '2chswivel.bin'),
)
swivel_vocab = swivel.vocab

Загрузка Fasttext-модели

In [None]:
# FastText vectors are read from a word2vec-format .vec file.
ft = FastText.load_word2vec_format(
    path.join('models', 'fasttext', '2ch_model_cbow.vec')
)
ft_vocab = ft.vocab

Получение датасетов

In [None]:
def make_vectors_dataset(model, vocab, dim, make_pca=False, make_sum=False):
    """Vectorize every (post, op_post) pair of the global ``df``.

    Parameters
    ----------
    model : str
        Embedding model name forwarded to ``vectorize_message``.
    vocab
        Vocabulary forwarded to ``vectorize_message``.
    dim : int
        Dimensionality of the embedding space.
    make_pca, make_sum : bool
        Combination mode forwarded to ``vectorize_message``. With either flag
        set a row has ``dim`` columns, otherwise ``2 * dim``.

    Returns
    -------
    numpy.ndarray
        float32 matrix with one row per row of ``df``, in ``df`` row order.
    """
    width = dim if (make_pca or make_sum) else 2 * dim
    vectors = np.zeros(shape=(len(df), width), dtype='float32')
    # Rows are addressed by position, not by index label: the labels of `df`
    # are not guaranteed to be 0..len(df)-1 (e.g. after dropna()), and using
    # them would misplace rows or run past the end of `vectors`.
    for row, (post, op_post) in enumerate(zip(df['post'], df['op_post'])):
        vectors[row] = vectorize_message(post, op_post, model, dim, vocab,
                                         make_pca=make_pca, make_sum=make_sum)
    return vectors

In [None]:
# Feature matrices keyed by model name, one dict per combination mode:
# concatenation, element-wise sum mode, and concatenation reduced with PCA.
vectors_con = dict()
vectors_sum = dict()
vectors_con_pca = dict()

# Each entry is (model name, vocabulary, embedding dimensionality). The loop
# variables are named in that same order; previously `dim` and `vocab` were
# bound to each other's values and only worked because the call swapped them
# back.
for (model, vocab, dim) in [
                    ('word2vec', word2vec_vocab, 100),
                     ('glove', glove_vocab, 100),
                     ('wang2vec', wang2vec_vocab, 100),
                     ('adagram', adagram_vocab, 100),
                     ('word2vecf', w2vf_vocab, 100),
                     ('fasttext', ft_vocab, 100),
                     ('swivel', swivel_vocab, 100),
                     ]:
    vectors_con[model] = make_vectors_dataset(model, vocab, dim)
    vectors_con_pca[model] = make_vectors_dataset(model, vocab, dim, make_pca=True)
    vectors_sum[model] = make_vectors_dataset(model, vocab, dim, make_sum=True)

Сравнение

In [None]:
def set_plt_params(title):
    """Start a new 20x10 figure prepared for the comparison plots.

    Sets ``title`` as the figure super-title (font size 50), switches the grid
    off, adds frameless axes and sets the tick label size to 40.

    Parameters
    ----------
    title : str
        Text of the figure super-title.
    """
    plt.figure(figsize=(20, 10))
    plt.suptitle(title, fontsize=50)
    plt.grid(False)
    plt.axes(frameon=False)
    plt.tick_params(labelsize=40)
    # Optional fixed y-range, left disabled: plt.ylim([0.72, 0.8])

In [None]:
# Learning curves of a 3-NN cosine classifier on the concatenated (CON)
# vectors, one line per embedding model.
# The seaborn style is set before the figure is created so that it applies to
# this figure's axes.
seaborn.set_style('white')
set_plt_params('CON')
results = []

# Identical for every model, so built once outside the loop.
cv = ShuffleSplit(n_splits=10, test_size=0.01, random_state=0)
train_fractions = np.linspace(0.01, 0.99, 10)

for name, markerstyle, colorstyle in [('glove', 'o', 'brown'),
                ('word2vec', 'v', 'blue'),
                ('wang2vec', '^', 'green'),
                ('word2vecf', '<', 'red'),
                ('adagram', '>', 'orange'),
                ('fasttext', 'd', 'magenta'),
                ('bow', 'p', 'lightskyblue'),
                ('swivel', '.', 'olive'),
                ]:
    if name not in vectors_con:
        # Report models without concatenated vectors instead of failing with a
        # KeyError in the middle of the plot.
        print('No concatenated vectors for {!r}, skipping'.format(name))
        continue
    estimator = KNeighborsClassifier(n_neighbors=3, algorithm='brute', metric='cosine')
    train_sizes, train_scores, test_scores = learning_curve(estimator, vectors_con[name],
                                                            Y, cv=cv, train_sizes=train_fractions)
    # NOTE(review): the curve shows the score on the training subsets and
    # test_scores is unused -- confirm the held-out score was not intended.
    train_scores_mean = np.mean(train_scores, axis=1)
    results.append({'model' : name, 'score' : train_scores_mean})
    plt.plot(train_sizes, train_scores_mean, marker=markerstyle, markersize=18, label=name, linewidth=5, color=colorstyle)

plt.grid(True, axis='y', linewidth=1, color='black')
# The legend has to be drawn before saving, otherwise the PNG lacks it.
plt.legend(loc='best')
plt.savefig('classifiers_concat.png', bbox_inches='tight')
plt.show()

In [None]:
# Learning curves of a 3-NN cosine classifier on the PCA-reduced concatenated
# (CON+PCA) vectors, one line per embedding model.
# The seaborn style is set before the figure is created so that it applies to
# this figure's axes.
seaborn.set_style('white')
set_plt_params('CON+PCA')
results = []

# Identical for every model, so built once outside the loop.
cv = ShuffleSplit(n_splits=10, test_size=0.01, random_state=0)
train_fractions = np.linspace(0.01, 0.99, 10)

for name, markerstyle, colorstyle in [('glove', 'o', 'brown'),
                ('word2vec', 'v', 'blue'),
                ('wang2vec', '^', 'green'),
                ('word2vecf', '<', 'red'),
                ('adagram', '>', 'orange'),
                ('fasttext', 'd', 'magenta'),
                ('swivel', '.', 'olive'),
                ]:
    estimator = KNeighborsClassifier(n_neighbors=3, algorithm='brute', metric='cosine')
    train_sizes, train_scores, test_scores = learning_curve(estimator, vectors_con_pca[name],
                                                            Y, cv=cv, train_sizes=train_fractions)
    # NOTE(review): the curve shows the score on the training subsets and
    # test_scores is unused -- confirm the held-out score was not intended.
    train_scores_mean = np.mean(train_scores, axis=1)
    results.append({'model' : name, 'score' : train_scores_mean})
    plt.plot(train_sizes, train_scores_mean, marker=markerstyle, markersize=18, label=name, linewidth=5, color=colorstyle)

plt.grid(True, axis='y', linewidth=1, color='black')
# The legend has to be drawn before saving, otherwise the PNG lacks it.
plt.legend(loc='best')
# File name matches the plotted data; this cell used to write
# 'classifiers_sum.png' although it plots the CON+PCA vectors.
plt.savefig('classifiers_con_pca.png', bbox_inches='tight')
plt.show()