In [85]:
import pandas as pd
from lxml import html
import numpy as np
from matplotlib import pyplot as plt
from sklearn.decomposition import TruncatedSVD, NMF, PCA
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_distances
from sklearn.ensemble import RandomForestClassifier
import gensim
import numpy as np
from sklearn.cluster import MiniBatchKMeans
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report
from sklearn import metrics
from collections import Counter,defaultdict
from string import punctuation
import os
from nltk.corpus import stopwords
from pymorphy2 import MorphAnalyzer
%matplotlib inline
from sklearn.metrics.pairwise import cosine_similarity


tfidf = TfidfVectorizer()
morph = MorphAnalyzer()
punct = punctuation+'«»—…“”*№–'
stops = set(stopwords.words('russian'))

def normalize(text):
    
    words = [word.strip(punct) for word in text.lower().split()]
    words = [morph.parse(word)[0].normal_form for word in words if word and word not in stops]

    return ' '.join(words)

def tokenize(text):
    
    words = [word.strip(punct) for word in text.lower().split()]

    return ' '.join(words)



In [2]:
corpus_xml = html.fromstring(open('paraphraser/paraphrases.xml', 'rb').read())
texts_1 = []
texts_2 = []
classes = []

for p in corpus_xml.xpath('//paraphrase'):
    texts_1.append(p.xpath('./value[@name="text_1"]/text()')[0])
    texts_2.append(p.xpath('./value[@name="text_2"]/text()')[0])
    classes.append(p.xpath('./value[@name="class"]/text()')[0])
    
data = pd.DataFrame({'text_1':texts_1, 'text_2':texts_2, 'label':classes})

In [3]:
data.head()

Unnamed: 0,label,text_1,text_2
0,0,Полицейским разрешат стрелять на поражение по ...,Полиции могут разрешить стрелять по хулиганам ...
1,0,Право полицейских на проникновение в жилище ре...,Правила внесудебного проникновения полицейских...
2,0,Президент Египта ввел чрезвычайное положение в...,Власти Египта угрожают ввести в стране чрезвыч...
3,-1,Вернувшихся из Сирии россиян волнует вопрос тр...,Самолеты МЧС вывезут россиян из разрушенной Си...
4,0,В Москву из Сирии вернулись 2 самолета МЧС с р...,Самолеты МЧС вывезут россиян из разрушенной Си...


In [4]:
data_rt = pd.read_csv('news_texts.csv')

In [5]:
data_rt.head()

Unnamed: 0,content,content_norm
0,Канцлер Германии Ангела Меркель в ходе брифинг...,канцлер германия ангел меркель ход брифинг пре...
1,Российские и белорусские войска успешно заверш...,российский белорусский войско успешно завершит...
2,"Дзюба, Шатов и Анюков оказались не нужны «Зени...",дзюба шат анюк оказаться нужный зенит российск...
3,"В Испанию без фанатов\nПожалуй, главной пятнич...",испания фанат пожалуй главный пятничный новост...
4,"Постпред России при ООН Виталий Чуркин, говоря...",постпред россия оон виталий чуркин говорить ве...


In [6]:
data_rt.dropna(inplace=True)

In [7]:
cv = CountVectorizer(min_df=3, max_df=0.4, max_features=1000)
X_cv = cv.fit_transform(data_rt['content_norm'])

In [8]:
tfidf = TfidfVectorizer(min_df=3, max_df=0.4, max_features=1000)
X_tv = tfidf.fit_transform(data_rt['content_norm'])

In [9]:
data['text_1_norm'] = data['text_1'].apply(normalize)
data['text_2_norm'] = data['text_2'].apply(normalize)

### SVD

#### CountVectorizer

In [10]:
svd_cv = TruncatedSVD(50)
svd_cv.fit(X_cv)

TruncatedSVD(algorithm='randomized', n_components=50, n_iter=5,
       random_state=None, tol=0.0)

In [11]:
X_text_1_svd_cv = svd_cv.transform(cv.transform(data['text_1_norm']))
X_text_2_svd_cv = svd_cv.transform(cv.transform(data['text_2_norm']))

X_svd_cv = np.concatenate([X_text_1_svd_cv, X_text_2_svd_cv], axis=1)

#### cos

In [91]:
cos_svd_cv = [float(cosine_similarity([x], [y])) for x, y in zip(X_text_1_svd_cv, X_text_2_svd_cv)]


#### TfidfVectorizer

In [12]:
svd_tv = TruncatedSVD(50)
svd_tv.fit(X_tv)

TruncatedSVD(algorithm='randomized', n_components=50, n_iter=5,
       random_state=None, tol=0.0)

In [14]:
X_text_1_svd_tv = svd_tv.transform(tfidf.transform(data['text_1_norm']))
X_text_2_svd_tv = svd_tv.transform(tfidf.transform(data['text_2_norm']))

X_svd_tv = np.concatenate([X_text_1_svd_tv, X_text_2_svd_tv], axis=1)

#### cos

In [92]:
cos_svd_tv = [float(cosine_similarity([x], [y])) for x, y in zip(X_text_1_svd_tv, X_text_2_svd_tv)]


### NMF

#### CountVectorizer

In [16]:
nmf_cv = NMF(50)
nmf_cv.fit(X_cv)

NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200,
  n_components=50, random_state=None, shuffle=False, solver='cd',
  tol=0.0001, verbose=0)

In [17]:
X_text_1_nmf_cv = nmf_cv.transform(cv.transform(data['text_1_norm']))
X_text_2_nmf_cv = nmf_cv.transform(cv.transform(data['text_2_norm']))

In [18]:
X_nmf_cv = np.concatenate([X_text_1_nmf_cv, X_text_2_nmf_cv], axis=1)

#### cos

In [93]:
cos_nmf_cv = [float(cosine_similarity([x], [y])) for x, y in zip(X_text_1_nmf_cv, X_text_2_nmf_cv)]


#### TfidfVectorizer

In [19]:
nmf_tv = NMF(50)
nmf_tv.fit(X_tv)

NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200,
  n_components=50, random_state=None, shuffle=False, solver='cd',
  tol=0.0001, verbose=0)

In [20]:
X_text_1_nmf_tv = nmf_tv.transform(tfidf.transform(data['text_1_norm']))
X_text_2_nmf_tv = nmf_tv.transform(tfidf.transform(data['text_2_norm']))

In [21]:
X_nmf_tv = np.concatenate([X_text_1_nmf_tv, X_text_2_nmf_tv], axis=1)

In [94]:
cos_nmf_tv = [float(cosine_similarity([x], [y])) for x, y in zip(X_text_1_nmf_tv, X_text_2_nmf_tv)]


### Word2Vec

In [24]:
w2v = gensim.models.Word2Vec([text.split() for text in data_rt['content_norm']], size=50, sg=1)

#### c tfidf

In [25]:
def get_embedding_tfidf(text, model, dim):
    text = text.split()
    vocabulary = tfidf.vocabulary_
    array = tfidf.transform([' '.join(text)]).toarray()[0]    
    # чтобы не доставать одно слово несколько раз
    # сделаем счетчик, а потом векторы домножим на частоту
    words = Counter(text)
    total = len(text)
    vectors = np.zeros((len(words), dim))
    
    for i,word in enumerate(words):
        try:
            v = model[word]
            vectors[i] = v*(array[vocabulary[word]]) # просто умножаем вектор на частоту
        except (KeyError, ValueError):
            continue
    
    if vectors.any():
        vector = np.average(vectors, axis=0)
    else:
        vector = np.zeros((dim))
    
    return vector

In [26]:
dim = 50
X_text_1_w2v_tfidf = np.zeros((len(data['text_1_norm']), dim))
X_text_2_w2v_tfidf = np.zeros((len(data['text_2_norm']), dim))

for i, text in enumerate(data['text_1_norm'].values):
    X_text_1_w2v_tfidf[i] = get_embedding_tfidf(text, w2v, dim)
    
for i, text in enumerate(data['text_2_norm'].values):
    X_text_2_w2v_tfidf[i] = get_embedding_tfidf(text, w2v, dim)

  del sys.path[0]


In [27]:
X_text_w2v_tfidf = np.concatenate([X_text_1_w2v_tfidf, X_text_2_w2v_tfidf], axis=1)

#### cos

In [95]:
cos_w2v_tfidf = [float(cosine_similarity([x], [y])) for x, y in zip(X_text_1_w2v_tfidf, X_text_2_w2v_tfidf)]


#### без tfidf

In [28]:
def get_embedding(text, model, dim):
    text = text.split()
    
    # чтобы не доставать одно слово несколько раз
    # сделаем счетчик, а потом векторы домножим на частоту
    words = Counter(text)
    total = len(text)
    vectors = np.zeros((len(words), dim))
    
    for i,word in enumerate(words):
        try:
            v = model[word]
            vectors[i] = v*(words[word]/total) # просто умножаем вектор на частоту
        except (KeyError, ValueError):
            continue
    
    if vectors.any():
        vector = np.average(vectors, axis=0)
    else:
        vector = np.zeros((dim))
    
    return vector

In [29]:
dim = 50
X_text_1_w2v = np.zeros((len(data['text_1_norm']), dim))
X_text_2_w2v = np.zeros((len(data['text_2_norm']), dim))

for i, text in enumerate(data['text_1_norm'].values):
    X_text_1_w2v[i] = get_embedding(text, w2v, dim)
    
for i, text in enumerate(data['text_2_norm'].values):
    X_text_2_w2v[i] = get_embedding(text, w2v, dim)

  if sys.path[0] == '':


In [30]:
X_text_w2v = np.concatenate([X_text_1_w2v_tfidf, X_text_2_w2v], axis=1)

#### cos

In [96]:
cos_w2v = [float(cosine_similarity([x], [y])) for x, y in zip(X_text_1_w2v, X_text_2_w2v)]


### Fastext

#### с нормализацией

In [32]:
fast_text_norm = gensim.models.FastText([text.split() for text in data_rt['content_norm']], size=50, min_n=4, max_n=8)

#### с tfidf

In [33]:
dim = 50
X_text_1_ft_norm_tfidf = np.zeros((len(data['text_1_norm']), dim))
X_text_2_ft_norm_tfidf = np.zeros((len(data['text_2_norm']), dim))

for i, text in enumerate(data['text_1_norm'].values):
    X_text_1_ft_norm_tfidf[i] = get_embedding_tfidf(text, fast_text_norm, dim)
    
for i, text in enumerate(data['text_2_norm'].values):
    X_text_2_ft_norm_tfidf[i] = get_embedding_tfidf(text, fast_text_norm, dim)

  del sys.path[0]


In [34]:
X_text_ft_norm_tfidf = np.concatenate([X_text_1_ft_norm_tfidf, X_text_2_ft_norm_tfidf], axis=1)

#### cos

In [97]:
cos_ft_norm_tfidf = [float(cosine_similarity([x], [y])) for x, y in zip(X_text_1_ft_norm_tfidf, X_text_2_ft_norm_tfidf)]


#### без tfidf

In [35]:
dim = 50
X_text_1_ft_norm = np.zeros((len(data['text_1_norm']), dim))
X_text_2_ft_norm = np.zeros((len(data['text_2_norm']), dim))

for i, text in enumerate(data['text_1_norm'].values):
    X_text_1_ft_norm[i] = get_embedding(text, fast_text_norm, dim)
    
for i, text in enumerate(data['text_2_norm'].values):
    X_text_2_ft_norm[i] = get_embedding(text, fast_text_norm, dim)

  if sys.path[0] == '':


In [36]:
X_text_ft_norm = np.concatenate([X_text_1_ft_norm, X_text_2_ft_norm], axis=1)

#### cos

In [98]:
cos_ft_norm = [float(cosine_similarity([x], [y])) for x, y in zip(X_text_1_ft_norm, X_text_2_ft_norm)]


#### без нормализации

In [38]:
fast_text_notnorm = gensim.models.FastText([text.split() for text in data_rt['content']], size=50, min_n=4, max_n=8)

In [39]:
data['text_1_notnorm'] = data['text_1'].apply(tokenize)
data['text_2_notnorm'] = data['text_2'].apply(tokenize)

#### c tfidf

In [40]:
dim = 50

X_text_1_ft_notnorm_tfidf = np.zeros((len(data['text_1_notnorm']), dim))
X_text_2_ft_notnorm_tfidf = np.zeros((len(data['text_2_notnorm']), dim))

for i, text in enumerate(data['text_1_notnorm'].values):
    X_text_1_ft_notnorm_tfidf[i] = get_embedding_tfidf(text, fast_text_notnorm, dim)
    
for i, text in enumerate(data['text_2_notnorm'].values):
    X_text_2_ft_notnorm_tfidf[i] = get_embedding_tfidf(text, fast_text_notnorm, dim)

  del sys.path[0]


In [41]:
X_text_ft_notnorm_tfidf = np.concatenate([X_text_1_ft_notnorm_tfidf, X_text_2_ft_notnorm_tfidf], axis=1)

#### cos

In [99]:
cos_ft_notnorm_tfidf = [float(cosine_similarity([x], [y])) for x, y in zip(X_text_1_ft_notnorm_tfidf, X_text_2_ft_notnorm_tfidf)]


#### без tfidf

In [42]:
dim = 50

X_text_1_ft_notnorm = np.zeros((len(data['text_1_notnorm']), dim))
X_text_2_ft_notnorm = np.zeros((len(data['text_2_notnorm']), dim))

for i, text in enumerate(data['text_1_notnorm'].values):
    X_text_1_ft_notnorm[i] = get_embedding(text, fast_text_notnorm, dim)
    
for i, text in enumerate(data['text_2_notnorm'].values):
    X_text_2_ft_notnorm[i] = get_embedding(text, fast_text_notnorm, dim)

  if sys.path[0] == '':


In [43]:
X_text_ft_notnorm = np.concatenate([X_text_1_ft_notnorm, X_text_2_ft_notnorm], axis=1)

#### cos

In [100]:
cos_ft_notnorm = [float(cosine_similarity([x], [y])) for x, y in zip(X_text_1_ft_notnorm, X_text_2_ft_notnorm)]


#### Постройте обучающую выборку из этих близостей. Обучите любую модель (Логрег, Рандом форест или что-то ещё) на этой выборке и оцените качество на кросс-валидации (используйте микросреднюю f1-меру).   

In [101]:
data = {'cos_svd_cv': cos_svd_cv, 
     'cos_nmf_cv': cos_nmf_cv, 
     'cos_svd_tv': cos_svd_tv, 
     'cos_nmf_tv': cos_nmf_tv, 
     'cos_w2v': cos_w2v, 
     'cos_w2v_tfidf': cos_w2v_tfidf, 
     'cos_ft_norm': cos_ft_norm, 
     'cos_ft__norm_tfidf': cos_ft_norm_tfidf, 
     'cos_ft_notnorm': cos_ft_notnorm,
     'cos_ft_notnorm_tfidf': cos_ft_notnorm_tfidf}

In [102]:
X = pd.DataFrame(data)

In [103]:
X.head()

Unnamed: 0,cos_ft__norm_tfidf,cos_ft_norm,cos_ft_notnorm,cos_ft_notnorm_tfidf,cos_nmf_cv,cos_nmf_tv,cos_svd_cv,cos_svd_tv,cos_w2v,cos_w2v_tfidf
0,0.290462,0.760473,0.912951,0.988519,0.469175,0.40723,0.20752,0.274143,0.919591,0.492916
1,0.583848,0.808237,0.886812,0.0,0.324605,0.803896,0.419983,0.388221,0.925003,0.655095
2,0.769127,0.840458,0.951643,0.644112,0.020419,0.217881,0.2325,0.457799,0.954454,0.945116
3,0.788949,0.604869,0.826321,1.0,0.464586,0.500415,0.71687,0.583495,0.737108,0.848862
4,0.936233,0.704407,0.536743,1.0,0.990953,0.999454,0.921958,0.983609,0.926154,0.948337


In [104]:
def result(X_text, y):
    train_X, valid_X, train_y, valid_y = train_test_split(X_text, y, random_state=1)
    clf = RandomForestClassifier(n_estimators=100, max_depth=10, min_samples_leaf=10,
                             class_weight='balanced')
    clf.fit(train_X, train_y)
    preds = clf.predict(valid_X)
    print('test', metrics.f1_score(valid_y, preds, average='micro'))

In [105]:
result(X, y)

test 0.5489762036524627


In [106]:
clf = RandomForestClassifier(n_estimators=100, max_depth=10, min_samples_leaf=10,
                             class_weight='balanced')

In [107]:
np.mean(cross_val_score(clf, X, y, cv=5, scoring=metrics.make_scorer(metrics.f1_score, average='micro')))

0.547642636115325