#  word2vec, doc2vec и fasttext 


1. word2vec – векторное представление слова
2. как сделать вектор документа?
    * усреднить все вектора слов
    * усреднить все вектора слов с весами tf-idf
    * doc2vec
3. fasttext – векторное представление $n$-грамм


In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random
import numpy as np
from sklearn.metrics import *
from sklearn.feature_extraction.text import *
from sklearn.model_selection import train_test_split
from collections import Counter, defaultdict
random.seed(1228)

%matplotlib inline

Загружаем тексты, очищаем их регулярным выражением, лемматизируем (Mystem) и создаем массив текстов

In [2]:
from pymystem3 import Mystem
import re


# Single shared Mystem instance; `lemmatize` below takes it as its default analyzer.
m = Mystem()


# Characters kept by `words_only`: Cyrillic letters (including ё/Ё, which lie
# outside the А-я range and were previously cut out of words), the ASCII range
# A-z and the punctuation used in emoticons.  Written as a raw string so that
# `\)`, `\_`, `\%` are regex escapes rather than invalid string escapes.
# NOTE(review): `A-z` also spans the six ASCII characters between `Z` and `a`
# ([ \ ] ^ _ `); kept as is because emoticons such as ^_^ may rely on it -- confirm.
regex = re.compile(r"[А-Яа-яЁё:=!\)\()A-z\_\%/|]+")


def words_only(text, regex=regex):
    """Keep only the character runs matched by *regex*, joined by single spaces.

    Parameters
    ----------
    text : str
        Raw text.  Non-string values (e.g. ``None`` or a float NaN from a
        missing CSV cell) are treated as empty text.
    regex : compiled pattern, optional
        Pattern whose matches are kept; defaults to the module-level ``regex``.

    Returns
    -------
    str
        The matches joined with ``" "``, or ``""`` when *text* is not a string.
    """
    try:
        return " ".join(regex.findall(text))
    except TypeError:
        # `findall` raises TypeError for non-string input; keep the original
        # best-effort behaviour for that case only instead of hiding every error.
        return ""



def lemmatize(text, mystem=m):
    """Return *text* with its tokens replaced by their lemmas.

    Parameters
    ----------
    text : str
        Text to lemmatize.
    mystem : object, optional
        Analyzer exposing ``lemmatize(text)``; defaults to the shared ``m``.
        The argument is now actually used (the body previously always called
        the global ``m`` regardless of what was passed).

    Returns
    -------
    str
        The analyzer output joined without separators and stripped
        (presumably the output already contains the whitespace tokens --
        TODO confirm), or ``" "`` if lemmatization fails.
    """
    try:
        return "".join(mystem.lemmatize(text)).strip()
    except Exception:
        # Best-effort: one bad document must not abort a whole DataFrame.apply,
        # but KeyboardInterrupt/SystemExit are no longer swallowed.
        return " "


In [None]:
# Only column 3 of each CSV is read; it becomes the 'text' column below.
df_neg = pd.read_csv('datasets/nlp/negative.csv', sep=';', header=None, usecols=[3])
df_pos = pd.read_csv('datasets/nlp/positive.csv', sep=';', header=None, usecols=[3])
df_neg['sent'] = 'neg'
df_pos['sent'] = 'pos'
# ignore_index: avoid duplicate row labels coming from the two source frames.
df = pd.concat([df_neg, df_pos], ignore_index=True)
df.columns = ['text', 'sent']
# Shuffle before truncating.  Taking the first 1000 rows of the concatenation
# would return rows from the negative file only (as long as it has >= 1000
# rows), leaving the classifiers below with a single class.  `sample` also
# returns a new frame, so the column assignment below is not done on a slice.
df = df.sample(n=min(1000, len(df)), random_state=1228).reset_index(drop=True)
# Clean first, then lemmatize what is left.
df['text'] = df['text'].apply(words_only).apply(lemmatize)

In [None]:
# First ten raw texts from the positive file (column 3, before any cleaning).
df_pos[3].head(10).tolist()
# df_neg[3].head(10).tolist()

In [None]:
# Preview the first rows of the frame after cleaning and lemmatization.
df.head()

In [None]:
# One whitespace-tokenised word list per document, in row order.
texts = [document.split() for document in df.text]

## Обучение модели в gensim

In [None]:
# Show the first preprocessed document.  Positional access: the frame comes
# from `pd.concat` of two files, so the row label 0 is not guaranteed to be
# unique (a label lookup could return several rows).
df.text.iloc[0]

In [None]:
%%time
from gensim.models import Word2Vec
# Train 100-dimensional word vectors on the tokenised texts (context window 5,
# min_count=1 so no word is dropped for being rare, 6 worker threads) and save
# the model to disk.
# NOTE(review): `size=` is the gensim 3.x spelling; gensim 4 renamed it to
# `vector_size=` -- confirm which gensim version this notebook targets.
model = Word2Vec(texts, size=100, window=5, min_count=1, workers=6)
model.save('sent_w2v.model')

In [None]:
# Print the summary of the model trained in the previous cell.
print(model)

In [None]:
# List every word in the model's vocabulary.
# NOTE(review): `wv.vocab` exists only in gensim < 4.0 (gensim 4 exposes
# `wv.key_to_index` instead) -- confirm the targeted gensim version.
words = list(model.wv.vocab)
print(words)

Загружаем обученную модель

In [None]:
# from gensim.models import Word2Vec
# model = Word2Vec.load('sent_w2v.model')

In [None]:
# Words whose vectors are most similar to the ":|" emoticon token.
model.wv.most_similar(":|")

In [None]:
# Analogy query: 'хорошо' + 'плохой' - 'хороший' (ideally lands near 'плохо').
model.wv.most_similar(positive=['хорошо', 'плохой'], negative=['хороший'])

In [None]:
# Ask the model which word of the list fits least with the others.
model.wv.doesnt_match('борщ сметана макароны пирожок консомэ кошка'.split())

Визуализация пространства слов

In [None]:
from nltk import FreqDist

# Token frequencies over the whole corpus.
fd = FreqDist(token for text in texts for token in text)
# The 1000 most frequent tokens, most frequent first.
top_words = [token for token, _count in fd.most_common(1000)]

print(top_words)

In [None]:
# Look up the vectors of the most frequent words (one row per word).
# Index through `model.wv`: indexing the Word2Vec object itself is deprecated
# in gensim 3.x and no longer available in gensim 4.
top_words_vec = model.wv[top_words]

In [None]:
from sklearn.manifold import TSNE
# Project the word vectors onto two dimensions; the seed is fixed so the
# layout is the same on every run.
tsne = TSNE(n_components=2, random_state=0)
top_words_tsne = tsne.fit_transform(top_words_vec)

In [None]:
from bokeh.models import ColumnDataSource, LabelSet
from bokeh.plotting import figure, show, output_file
from bokeh.io import output_notebook
output_notebook()

# Data shared by the scatter glyphs and their text labels: the two t-SNE
# coordinates plus the word each point stands for.
source = ColumnDataSource(data={
    'x1': top_words_tsne[:, 0],
    'x2': top_words_tsne[:, 1],
    'names': top_words,
})

p = figure(
    title='word2vec T-SNE for most common words',
    tools='pan, wheel_zoom, reset, save',
    toolbar_location='above',
)
p.scatter(x='x1', y='x2', size=8, source=source)

# Word labels, centred and drawn slightly above each point.
labels = LabelSet(
    x='x1', y='x2', text='names', source=source,
    y_offset=6, text_align='center',
    text_font_size='8pt', text_color='#555555',
)
p.add_layout(labels)

show(p)

### Кластеризация слов

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine distance (1 - cosine similarity) between the top-word vectors.
dist = 1 - cosine_similarity(top_words_vec)

In [None]:
from scipy.cluster.hierarchy import ward, dendrogram

from scipy.spatial.distance import squareform

# `ward` treats a 2-D array as raw observation vectors, so the square distance
# matrix has to be passed in condensed form; otherwise each row of distances
# is clustered as if it were a feature vector.  checks=False skips the exact
# symmetry / zero-diagonal test that floating-point noise in `dist` can fail.
# NOTE(review): Ward linkage is only strictly defined for Euclidean distances;
# confirm that using it with cosine distance is acceptable here.
linkage_matrix = ward(squareform(dist, checks=False))

fig, ax = plt.subplots(figsize=(10, 100))
# Draw on the axes created above and keep `ax` bound to them (the dict returned
# by `dendrogram` is not needed).
dendrogram(linkage_matrix, orientation='right', labels=top_words, ax=ax)

# Hide the x-axis ticks and their labels.  Booleans are required: the string
# 'off' is truthy and would leave the ticks switched on.
ax.tick_params(axis='x',          # changes apply to the x-axis
               which='both',      # both major and minor ticks are affected
               bottom=False,      # ticks along the bottom edge are off
               top=False,         # ticks along the top edge are off
               labelbottom=False)
plt.tight_layout()
plt.savefig('w2_clusters.png', dpi=200)  # save the dendrogram as w2_clusters.png

## Классификация текстов 

По мотивам http://nadbordrozd.github.io/blog/2016/05/20/text-classification-with-word2vec/

In [None]:
# Documents (cleaned, lemmatized strings) and their sentiment labels.
X = np.array(df.text.tolist())
y = np.array(df.sent.tolist())

# Fixed random_state: `random.seed(...)` only seeds the stdlib generator, which
# scikit-learn does not use, so without it the split differs on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.33, random_state=1228)
print(f'total train examples {len(y_train)}')
print(f'total test examples {len(y_test)}')

In [None]:
class MeanEmbeddingVectorizer(object):
    """Represent each document as the plain average of its word vectors.

    Parameters
    ----------
    word2vec : dict
        Maps a token to its embedding vector.  All vectors are expected to
        have the same length.

    Raises
    ------
    ValueError
        If *word2vec* is empty (the embedding size cannot be determined).
    """

    def __init__(self, word2vec):
        if not word2vec:
            raise ValueError("word2vec mapping must not be empty")
        self.word2vec = word2vec
        # If a text has no known words we return a vector of zeros with the
        # same dimensionality as all the other vectors.  The size is read from
        # the mapping that was passed in, without removing anything from it.
        self.dim = len(next(iter(word2vec.values())))

    @staticmethod
    def _tokens(doc):
        """Split a raw string on whitespace; pass token sequences through."""
        return doc.split() if isinstance(doc, str) else doc

    def fit(self, X, y=None):
        """Do nothing; present so the class can be used in a Pipeline."""
        return self

    def transform(self, X):
        """Return an array with one averaged embedding per document in *X*.

        Each document may be a whitespace-separated string or a sequence of
        tokens.  Tokens missing from the mapping are skipped.
        """
        return np.array([
            np.mean([self.word2vec[w] for w in self._tokens(words)
                     if w in self.word2vec]
                    or [np.zeros(self.dim)], axis=0)
            for words in X
        ])

In [None]:
class TfidfEmbeddingVectorizer(object):
    """Represent each document as the mean of its idf-weighted word vectors.

    Parameters
    ----------
    word2vec : dict
        Maps a token to its embedding vector.  All vectors are expected to
        have the same length.

    Raises
    ------
    ValueError
        If *word2vec* is empty (the embedding size cannot be determined).
    """

    def __init__(self, word2vec):
        if not word2vec:
            raise ValueError("word2vec mapping must not be empty")
        self.word2vec = word2vec
        # Filled in by `fit`: token -> idf weight.
        self.word2weight = None
        # Size of the zero vector used for documents with no known words; read
        # from the mapping that was passed in, without removing anything.
        self.dim = len(next(iter(word2vec.values())))

    @staticmethod
    def _tokens(doc):
        """Split a raw string on whitespace; pass token sequences through."""
        return doc.split() if isinstance(doc, str) else doc

    def fit(self, X, y=None):
        """Learn the idf weight of every token that occurs in *X*."""
        # The documents are tokenised by `_tokens`, so the vectorizer must not
        # apply its own preprocessing or tokenisation.
        tfidf = TfidfVectorizer(analyzer=self._tokens)
        tfidf.fit(X)
        # A token that never occurred in X gets the largest idf seen during
        # fitting, i.e. the weight of the rarest known tokens.
        max_idf = max(tfidf.idf_)
        self.word2weight = defaultdict(
            lambda: max_idf,
            [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])

        return self

    def transform(self, X):
        """Return an array with one weighted-average embedding per document.

        Each document may be a whitespace-separated string or a sequence of
        tokens.  Tokens missing from the embedding mapping are skipped.
        """
        return np.array([
            np.mean([self.word2vec[w] * self.word2weight[w]
                     for w in self._tokens(words) if w in self.word2vec]
                    or [np.zeros(self.dim)], axis=0)
            for words in X
        ])

In [None]:
# Plain {word: vector} dict built from the model's word list and vector matrix.
# NOTE(review): `wv.index2word` is the gensim 3.x name (gensim 4 uses
# `wv.index_to_key`) -- confirm the targeted gensim version.
w2v = dict(zip(model.wv.index2word, model.wv.vectors))

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

# Two pipelines that differ only in how a document is turned into a vector:
# plain mean vs. idf-weighted mean of the word embeddings.  (The second step is
# named "extra trees" but the estimator is a 20-tree random forest.)
rfc_w2v = Pipeline(steps=[
    ("word2vec vectorizer", MeanEmbeddingVectorizer(w2v)),
    ("extra trees", RandomForestClassifier(n_estimators=20)),
])
rfc_w2v_tfidf = Pipeline(steps=[
    ("word2vec vectorizer", TfidfEmbeddingVectorizer(w2v)),
    ("extra trees", RandomForestClassifier(n_estimators=20)),
])

In [None]:
# Fit the mean-embedding pipeline and predict the held-out documents.
pred = rfc_w2v.fit(X_train, y_train).predict(X_test)

In [None]:
# Macro-averaged scores and accuracy of the word2vec pipeline on the test set.
macro_precision = precision_score(y_test, pred, average='macro')
macro_recall = recall_score(y_test, pred, average='macro')
macro_f1 = f1_score(y_test, pred, average='macro')
test_accuracy = accuracy_score(y_test, pred)

print("Precision: {0:6.2f}".format(macro_precision))
print("Recall: {0:6.2f}".format(macro_recall))
print("F1-measure: {0:6.2f}".format(macro_f1))
print("Accuracy: {0:6.2f}".format(test_accuracy))
print(classification_report(y_test, pred))

# Class names of the fitted pipeline, used as tick labels on both axes.
labels = rfc_w2v.classes_
conf_matrix = confusion_matrix(y_test, pred)

sns.heatmap(data=conf_matrix, annot=True, fmt="d", cbar=False, xticklabels=labels, yticklabels=labels)
plt.title("Confusion matrix")
plt.show()

### paragraph2vec aka doc2vec

word2vec с дополнительной меткой id документа

In [None]:
from gensim.models.doc2vec import *

In [None]:
# Tokenise every document and tag it with its position in X (as a string).
splitted_texts = [text.split() for text in X]
idx = list(map(str, range(len(X))))

docs = [TaggedDocument(tokens, [tag]) for tokens, tag in zip(splitted_texts, idx)]

# Note: this rebinds `model`, which until now held the Word2Vec model.
model = Doc2Vec(vector_size=300, window=5, min_count=5, workers=12, alpha=.025, min_alpha=.01, dm=0)
model.build_vocab(docs)

# Ways to inspect the document vectors after training:
# docvec1 = model.docvecs[0]
# docvecsyn1 = model.docvecs.doctag_syn0[0]
# docsim1 = model.docvecs.most_similar[id1]

model.train(docs, total_examples=len(docs), epochs=20)

In [None]:
class Doc2VecVectorizer(object):
    """Turn documents into vectors inferred by a trained doc2vec model.

    Parameters
    ----------
    d2v_model : object
        Model exposing ``infer_vector(tokens)``.
    """

    def __init__(self, d2v_model):
        self.d2v_model = d2v_model

    @staticmethod
    def _tokens(doc):
        """Split a raw string on whitespace; pass token sequences through."""
        return doc.split() if isinstance(doc, str) else doc

    def fit(self, X, y=None):
        """Do nothing; the model is already trained.  `y` is optional."""
        return self

    def transform(self, X):
        """Return an array with one inferred vector per document in *X*.

        Each document may be a whitespace-separated string or a sequence of
        tokens.
        """
        return np.array([self.d2v_model.infer_vector(self._tokens(text)) for text in X])
    

# Same classifier as before, fed with vectors inferred by the doc2vec model.
rfc_d2v = Pipeline(steps=[
    ('word2vec vectorizer', Doc2VecVectorizer(model)),
    ('extra trees', RandomForestClassifier(n_estimators=20)),
])

# Fit on the training documents and predict the held-out ones.
pred = rfc_d2v.fit(X_train, y_train).predict(X_test)

In [None]:
# Macro-averaged scores and accuracy of the doc2vec pipeline on the test set.
print("Precision: {0:6.2f}".format(precision_score(y_test, pred, average='macro')))
print("Recall: {0:6.2f}".format(recall_score(y_test, pred, average='macro')))
print("F1-measure: {0:6.2f}".format(f1_score(y_test, pred, average='macro')))
print("Accuracy: {0:6.2f}".format(accuracy_score(y_test, pred)))
print(classification_report(y_test, pred))
# Tick labels come from the pipeline that produced `pred` (this cell used to
# read them from the unrelated word2vec pipeline).
labels = rfc_d2v.classes_


sns.heatmap(data=confusion_matrix(y_test, pred), annot=True, fmt="d", cbar=False, xticklabels=labels, yticklabels=labels)
plt.title("Confusion matrix")
plt.show()

### fast text

Слово $w$ представляем символьными $n$-граммами: 

$n=3$, $G_{where} = \{\_wh,\ whe,\ her,\ ere,\ re\_,\ \_where\_\}$

$sim_{w2v}(u,v) = \langle u, v \rangle$


$sim_{ft}(u,v) = \sum_{e \in G_u} \sum_{g \in G_v} \langle e, g \rangle$


https://github.com/facebookresearch/fasttext


In [None]:
import fasttext

# Write the train and test sets in fastText's supervised format: one example
# per line, "__label__<class> <text>".  The encoding is explicit because the
# texts are Cyrillic and the platform default is not UTF-8 everywhere; plain
# 'w' is enough since the files are only written here.
with open('datasets/nlp/data.train.txt', 'w', encoding='utf-8') as outfile:
    outfile.writelines(
        '__label__' + label + ' ' + text + '\n'
        for label, text in zip(y_train, X_train))

with open('datasets/nlp/test.txt', 'w', encoding='utf-8') as outfile:
    outfile.writelines(
        '__label__' + label + ' ' + text + '\n'
        for label, text in zip(y_test, X_test))

In [None]:
# Train a supervised fastText classifier on the labelled training file
# (learning rate 1.0, 25 epochs), then evaluate it on the test file.
classifier = fasttext.train_supervised('datasets/nlp/data.train.txt', lr=1.0, epoch=25)
result = classifier.test('datasets/nlp/test.txt')

def print_results(N, p, r, k=1):
    """Print an example count with precision and recall at *k*.

    Parameters
    ----------
    N : int
        Number of evaluated examples.
    p : float
        Precision at *k*.
    r : float
        Recall at *k*.
    k : int, optional
        Cut-off shown in the ``P@k`` / ``R@k`` labels; defaults to 1, which is
        what the labels were previously hard-coded to.
    """
    print("N\t" + str(N))
    print("P@{}\t{:.3f}".format(k, p))
    print("R@{}\t{:.3f}".format(k, r))

# Reuse the evaluation already stored in `result` instead of running
# `classifier.test` on the same file a second time.
print_results(*result)

In [None]:
# Predict every test document.  `predict` returns (labels, probabilities) for
# one line of text; keep the top label and strip the '__label__' prefix so the
# predictions are comparable with `y_test`.  Newlines are replaced because
# prediction works on a single line.
label_prefix = '__label__'
pred = np.array([
    classifier.predict(str(item).replace("\n", " "))[0][0][len(label_prefix):]
    for item in X_test
])


print("Precision: {0:6.2f}".format(precision_score(y_test, pred, average='macro')))
print("Recall: {0:6.2f}".format(recall_score(y_test, pred, average='macro')))
print("F1-measure: {0:6.2f}".format(f1_score(y_test, pred, average='macro')))
print("Accuracy: {0:6.2f}".format(accuracy_score(y_test, pred)))
print(classification_report(y_test, pred))
# Sorted union of true and predicted classes: the order `confusion_matrix`
# uses for its rows and columns, independent of any other fitted model.
labels = np.unique(np.concatenate([y_test, pred]))


sns.heatmap(data=confusion_matrix(y_test, pred), annot=True, fmt="d", cbar=False, xticklabels=labels, yticklabels=labels)
plt.title("Confusion matrix")
plt.show()