In [2]:
import pandas as pd
from lxml import html
import numpy as np
from matplotlib import pyplot as plt
from sklearn.decomposition import TruncatedSVD, NMF, PCA
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_distances
from sklearn.ensemble import RandomForestClassifier
import gensim
import numpy as np
from sklearn.cluster import MiniBatchKMeans
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from collections import Counter,defaultdict
from string import punctuation
import os
from nltk.corpus import stopwords
from pymorphy2 import MorphAnalyzer
%matplotlib inline

morph = MorphAnalyzer()
punct = punctuation+'«»—…“”*№–'
stops = set(stopwords.words('russian'))

def normalize(text):
    
    words = [word.strip(punct) for word in text.lower().split()]
    words = [morph.parse(word)[0].normal_form for word in words if word and word not in stops]

    return ' '.join(words)

def tokenize(text):
    
    words = [word.strip(punct) for word in text.lower().split()]

    return ' '.join(words)

  from numpy.core.umath_tests import inner1d


## Подготовка данных и обучение моделей  

(это вариант без адаграмма, который слишком долго скачивается и обучается)

In [3]:
data_rt = pd.read_csv('news_texts.csv')
data_rt.dropna(inplace=True)

In [4]:
corpus_xml = html.fromstring(open('paraphraser/paraphrases.xml', 'rb').read())
texts_1 = []
texts_2 = []
classes = []

for p in corpus_xml.xpath('//paraphrase'):
    texts_1.append(p.xpath('./value[@name="text_1"]/text()')[0])
    texts_2.append(p.xpath('./value[@name="text_2"]/text()')[0])
    classes.append(p.xpath('./value[@name="class"]/text()')[0])
    
data = pd.DataFrame({'text_1':texts_1, 'text_2':texts_2, 'label':classes})

In [5]:
data['text_1_norm'] = data['text_1'].apply(normalize)
data['text_2_norm'] = data['text_2'].apply(normalize)

In [6]:
cv = CountVectorizer(min_df=3, max_df=0.4, max_features=1000)
X = cv.fit_transform(data_rt['content_norm'])

In [7]:
nmf = NMF(50)
nmf.fit(X)

NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200,
  n_components=50, random_state=None, shuffle=False, solver='cd',
  tol=0.0001, verbose=0)

In [8]:
svd = TruncatedSVD(50)
svd.fit(X)

TruncatedSVD(algorithm='randomized', n_components=50, n_iter=5,
       random_state=None, tol=0.0)

In [9]:
fast_text = gensim.models.FastText([text.split() for text in data_rt['content_norm']], size=50, min_n=4, max_n=8)
w2v = gensim.models.Word2Vec([text.split() for text in data_rt['content_norm']], size=50, sg=1)

In [22]:
from sklearn.metrics.pairwise import cosine_similarity
def c_sim(x,y):
    sim=[]
    for i in range(len(x)):
        c_sim=cosine_similarity([x[i]], [y[i]])[0]
        sim.append(c_sim)
    return sim

In [25]:
X_text_1_svd = svd.transform(cv.transform(data['text_1_norm']))
X_text_2_svd = svd.transform(cv.transform(data['text_2_norm']))

X_text_svd = np.concatenate([X_text_1_svd, X_text_2_svd], axis=1)
svd_csim=c_sim(X_text_1_svd, X_text_2_svd)

In [29]:
X_text_1_nmf = nmf.transform(cv.transform(data['text_1_norm']))
X_text_2_nmf = nmf.transform(cv.transform(data['text_2_norm']))

X_text_nmf = np.concatenate([X_text_1_nmf, X_text_2_nmf], axis=1)
nmf_csim=c_sim(X_text_1_nmf, X_text_2_nmf)

In [30]:
def get_embedding(text, model, dim):
    text = text.split()
    words = Counter(text)
    total = len(text)
    vectors = np.zeros((len(words), dim))    
    for i,word in enumerate(words):
        try:
            v = model[word]
            vectors[i] = v*(words[word]/total) # просто умножаем вектор на частоту
        except (KeyError, ValueError):
            continue   
    if vectors.any():
        vector = np.average(vectors, axis=0)
    else:
        vector = np.zeros((dim))    
    return vector     

In [31]:
dim = 50
X_text_1_w2v = np.zeros((len(data['text_1_norm']), dim))
X_text_2_w2v = np.zeros((len(data['text_2_norm']), dim))

for i, text in enumerate(data['text_1_norm'].values):
    X_text_1_w2v[i] = get_embedding(text, w2v, dim)
    
for i, text in enumerate(data['text_2_norm'].values):
    X_text_2_w2v[i] = get_embedding(text, w2v, dim)

  


In [32]:
X_text_w2v = np.concatenate([X_text_1_w2v, X_text_2_w2v], axis=1)
w2v_csim=c_sim(X_text_1_w2v, X_text_2_w2v)

In [33]:
dim = 50
data['text_1_notnorm'] = data['text_1'].apply(tokenize)
data['text_2_notnorm'] = data['text_2'].apply(tokenize)

X_text_1_ft = np.zeros((len(data['text_1_notnorm']), dim))
X_text_2_ft = np.zeros((len(data['text_2_notnorm']), dim))

for i, text in enumerate(data['text_1_notnorm'].values):
    X_text_1_ft[i] = get_embedding(text, fast_text, dim)
    
for i, text in enumerate(data['text_2_notnorm'].values):
    X_text_2_ft[i] = get_embedding(text, fast_text, dim)

  


In [34]:
X_text_ft = np.concatenate([X_text_1_ft, X_text_2_ft], axis=1)
ft_csim=c_sim(X_text_1_ft, X_text_2_ft)

In [42]:
X_data=np.concatenate((svd_csim, nmf_csim, w2v_csim, ft_csim), axis=-1)

In [43]:
X_data.shape

(7227, 200)

In [44]:
X_train, X_test, y_train, y_test=train_test_split(X_data, data['label'], train_size=7000)



## Тестирование

Сначала попробуем логистическую регрессию:

In [49]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

In [51]:
clf = LogisticRegression()
print(cross_val_score(clf,X_train,y_train,scoring="f1_micro"))

[0.44001714 0.44687232 0.43310463]


Теперь рандомный лес:

In [52]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier()
print(cross_val_score(clf,X_train,y_train,scoring="f1_micro"))

[0.44130249 0.41945159 0.44639794]


Можно видеть, что при использовании рандомного леса показатели несколько упали. Возможно, для улучшения результатов стоит использовать другой классификатор.  

Попробуем модель опорных векторов:

In [53]:
from sklearn import svm
clf=svm.LinearSVC()
print(cross_val_score(clf,X_train,y_train,scoring="f1_micro"))

[0.44087404 0.44944302 0.43524871]


Результаты чуть лучше, чем у логистической регрессии, но не намного. Попробуем изменить параметры:

In [54]:
clf=svm.LinearSVC(C=2, multi_class="crammer_singer")
print(cross_val_score(clf,X_train,y_train,scoring="f1_micro"))

[0.4361611  0.458012   0.44339623]


Заметно небольшое улучшение. Возможно, для дальнейшего совершенствования классификатора потребуется более длительный поиск оптимальных параметров.