In [92]:
import os
import pandas as pd
import pickle
from string import punctuation as punct

import numpy as np
from lxml import html
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD, NMF
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from gensim.models import Word2Vec, FastText, KeyedVectors

from mystem import MyStem

В рамках нормализации для всех данных использовал лемматизацию и токенизацию с помощью mystem

In [2]:
punct += '«»–—'
MYSTEM = MyStem()

In [3]:
def strip_punct(x):
    return ' '.join([w.strip(punct) for w in x.split()])

Корпус новостей

In [10]:
TRAIN_DATA = pd.concat(
    [pd.read_json(open(os.path.join('data_kw', f_name), encoding='utf-8'), lines=True)
     for f_name in os.listdir('data_kw') if f_name.startswith('ru')
    ]
)

In [11]:
TRAIN_DATA.shape

(7217, 5)

In [20]:
TRAIN_DATA.content = TRAIN_DATA.content.apply(strip_punct)

In [22]:
%%time
content_lem = MYSTEM.run(list(TRAIN_DATA.content.values), flags='-idln')

Wall time: 2min 59s


In [26]:
TRAIN_DATA['content_lem'] = content_lem

In [2]:
with open('TRAIN_DATA', 'rb') as f:
    TRAIN_DATA = pickle.load(f)

Данные парафраза

In [6]:
corpus_xml = html.fromstring(open('paraphrases.xml', 'rb').read())
texts_1 = []
texts_2 = []
classes = []

for p in corpus_xml.xpath('//paraphrase'):
    texts_1.append(p.xpath('./value[@name="text_1"]/text()')[0])
    texts_2.append(p.xpath('./value[@name="text_2"]/text()')[0])
    classes.append(p.xpath('./value[@name="class"]/text()')[0])
    
PARA = pd.DataFrame({'text_1':texts_1, 'text_2':texts_2, 'label':classes})

In [7]:
PARA.label = PARA.label.map({'0': 1, '1': 2, '-1': 0})

In [8]:
PARA['text_1'] = PARA.text_1.apply(strip_punct)
PARA['text_2'] = PARA.text_2.apply(strip_punct)

In [9]:
%%time
PARA['text_norm_1'] = MYSTEM.run(list(PARA.text_1.values), flags='-idln')
PARA['text_norm_2'] = MYSTEM.run(list(PARA.text_2.values), flags='-idln')

Wall time: 5.68 s


In [60]:
with open('PARA_DATA', 'rb') as f:
    PARA_DATA = pickle.load(f)

Построение моделей

In [4]:
CVECT_NORM = CountVectorizer()
TFIDF_NORM = TfidfVectorizer()

CVECT = CountVectorizer()
TFIDF = TfidfVectorizer()

In [5]:
%%time

CVECT_NORM = CVECT_NORM.fit(TRAIN_DATA.content_lem)
TFIDF_NORM = TFIDF_NORM.fit(TRAIN_DATA.content_lem)

CVECT = CVECT.fit(TRAIN_DATA.content)
TFIDF = TFIDF.fit(TRAIN_DATA.content)

Wall time: 17.9 s


In [6]:
TRAIN_DATA_CVECT_NORM = CVECT_NORM.transform(TRAIN_DATA.content_lem)
TRAIN_DATA_TFIDF_NORM = TFIDF_NORM.transform(TRAIN_DATA.content_lem)

TRAIN_DATA_CVECT = CVECT.transform(TRAIN_DATA.content)
TRAIN_DATA_TFIDF = TFIDF.transform(TRAIN_DATA.content)

Возьмем общий размер вектора 128 для всех моделей.  
Для более традиционных размеров 300 (или более) доступных данных, скорее всего, будет недостаточно, что приведет к слишком большому числу обучаемых параметров в соотношении с количеством объектов (для W2V и FastText)  

In [108]:
%%time
SVD_CVECT = TruncatedSVD(128).fit(TRAIN_DATA_CVECT_NORM)
SVD_TFIDF = TruncatedSVD(128).fit(TRAIN_DATA_TFIDF_NORM)

Wall time: 15.3 s


In [33]:
%%time
NMF_CVECT = NMF(128).fit(TRAIN_DATA_CVECT_NORM)
NMF_TFIDF = NMF(128).fit(TRAIN_DATA_TFIDF_NORM)

Wall time: 17min 42s


In [8]:
with open('NMF_CVECT', 'rb') as f:
    NMF_CVECT = pickle.load(f)
    
with open('NMF_TFIDF', 'rb') as f:
    NMF_TFIDF = pickle.load(f)

In [17]:
%%time
W2V = Word2Vec([x.split() for x in TRAIN_DATA.content_lem.values], iter=30, size=128)

Wall time: 1min 46s


In [21]:
W2V.wv.save('W2V.kv')

In [9]:
W2V = KeyedVectors.load('W2V.kv')

In [18]:
W2V.most_similar(positive=['сообщение'])

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


[('заявление', 0.724164605140686),
 ('информация', 0.7203260660171509),
 ('сведение', 0.6698870062828064),
 ('пресс-релиз', 0.6675968170166016),
 ('материал', 0.6334848999977112),
 ('данные', 0.5983684062957764),
 ('отчет', 0.5942883491516113),
 ('уведомление', 0.5902847051620483),
 ('релиз', 0.5874637365341187),
 ('документ', 0.5692962408065796)]

In [19]:
%%time
FT_NORM = FastText(
    [x.split() for x in TRAIN_DATA.content_lem.values],
    iter=30,
    size=128,
    min_n=2,
    max_n=4
)

Wall time: 8min 52s


In [27]:
FT_NORM.wv.save('FT_NORM.kv')

In [10]:
FT_NORM = KeyedVectors.load('FT_NORM.kv')

In [20]:
FT_NORM.most_similar(positive=['сообщение'])

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


[('авиасообщение', 0.8739902973175049),
 ('обобщение', 0.8208562135696411),
 ('заявление', 0.8122040033340454),
 ('разобщение', 0.8047382235527039),
 ('уведомление', 0.7661752104759216),
 ('информация', 0.7655895352363586),
 ('общение', 0.7631301879882812),
 ('сообща', 0.759331226348877),
 ('информирование', 0.7436956167221069),
 ('сведение', 0.7395753860473633)]

In [28]:
%%time
FT = FastText(
    [x.split() for x in TRAIN_DATA.content.values],
    iter=30,
    size=128,
    min_n=2,
    max_n=4
)

Wall time: 9min 17s


In [30]:
FT.wv.save('FT.kv')

In [11]:
FT = KeyedVectors.load('FT.kv')

In [29]:
FT.most_similar(positive=['сообщение'])

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


[('Сообщение', 0.9153209924697876),
 ('авиасообщение', 0.9104920029640198),
 ('заявление', 0.8764517903327942),
 ('общение', 0.8676607012748718),
 ('уведомление', 0.8466607928276062),
 ('оповещение', 0.8172410726547241),
 ('видеообращение', 0.8165761232376099),
 ('освещение', 0.8085266351699829),
 ('кровообращение', 0.8066942691802979),
 ('очищение', 0.805307149887085)]

Функционал для сборки датасета

In [53]:
def get_vectorizer_map(vectorizer):
    return {w: i for i, w in enumerate(vectorizer.get_feature_names())}


def get_tfidf_weights(text, tfidf, tfidf_map):
    vector = tfidf.transform([text]).toarray()[0]
    
    return {w: vector[tfidf_map[w]] if w in tfidf_map else 1.0 for w in text.split()}
    

def get_w2v_embedding(text, model, tfidf=None, tfidf_map=None):
    vectors = list()
    weights = get_tfidf_weights(text, tfidf, tfidf_map) if tfidf is not None else {w: 1.0 for w in text.split()}
    
    for w in text.split():
        if w in model:
            vectors.append(model[w] * weights[w])
    
    return np.mean(vectors, axis=0) if vectors else np.random.normal(size=(model.vector_size,))


def compute_decomposition_features(para, models, vectorizers):
    X = np.zeros((para.shape[0], len(models)))
    i = 0
    
    for model, vectorizer in zip(models, vectorizers):
        V1 = model.transform(vectorizer.transform(para.text_norm_1))
        V2 = model.transform(vectorizer.transform(para.text_norm_2))
        
        X[:,i] = np.diag(cosine_similarity(V1, V2))
        i += 1
        
    return X


def compute_w2v_features(para, model, tfidf, tfidf_map):
    V1 = np.array([get_w2v_embedding(x, model) for x in para.text_norm_1])
    V2 = np.array([get_w2v_embedding(x, model) for x in para.text_norm_2])
    
    VW1 = np.array([get_w2v_embedding(x, model, tfidf, tfidf_map) for x in para.text_norm_1])
    VW2 = np.array([get_w2v_embedding(x, model, tfidf, tfidf_map) for x in para.text_norm_2])
    
    return np.hstack([np.diag(cosine_similarity(V1, V2)).reshape(-1, 1),
                      np.diag(cosine_similarity(VW1, VW2)).reshape(-1, 1)
                     ]
    )
    
    
def compute_ft_features(para, model, model_norm, tfidf, tfidf_map, tfidf_norm, tfidf_norm_map):
    V1 = np.array([get_w2v_embedding(x, model) for x in para.text_1])
    V2 = np.array([get_w2v_embedding(x, model) for x in para.text_2])
    
    VW1 = np.array([get_w2v_embedding(x, model, tfidf, tfidf_map) for x in para.text_1])
    VW2 = np.array([get_w2v_embedding(x, model, tfidf, tfidf_map) for x in para.text_2])
    
    V1_NORM = np.array([get_w2v_embedding(x, model_norm) for x in para.text_norm_1])
    V2_NORM = np.array([get_w2v_embedding(x, model_norm) for x in para.text_norm_2])
    
    VW1_NORM = np.array([get_w2v_embedding(x, model_norm, tfidf_norm, tfidf_norm_map) for x in para.text_norm_1])
    VW2_NORM = np.array([get_w2v_embedding(x, model_norm, tfidf_norm, tfidf_norm_map) for x in para.text_norm_2])
    
    return np.hstack([np.diag(cosine_similarity(V1, V2)).reshape(-1, 1),
                      np.diag(cosine_similarity(VW1, VW2)).reshape(-1, 1),
                      np.diag(cosine_similarity(V1_NORM, V2_NORM)).reshape(-1, 1),
                      np.diag(cosine_similarity(VW1_NORM, VW2_NORM)).reshape(-1, 1)
                     ]
    )

In [20]:
TFIDF_MAP = get_vectorizer_map(TFIDF)
TFIDF_NORM_MAP = get_vectorizer_map(TFIDF_NORM)

In [67]:
%%time
X_decomp = compute_decomposition_features(
    para=PARA_DATA,
    models=[SVD_CVECT,SVD_TFIDF, NMF_CVECT, NMF_TFIDF],
    vectorizers=[CVECT_NORM, TFIDF_NORM, CVECT_NORM, TFIDF_NORM]
)

Wall time: 7.75 s


In [66]:
X_decomp.shape

(7227, 4)

In [79]:
%%time
X_w2v = compute_w2v_features(PARA_DATA, W2V, TFIDF_NORM, TFIDF_NORM_MAP)

Wall time: 8.92 s


In [74]:
X_w2v.shape

(7227, 2)

In [77]:
%%time
X_ft = compute_ft_features(
    PARA_DATA, FT, FT_NORM, TFIDF, TFIDF_MAP, TFIDF_NORM, TFIDF_NORM_MAP
)

Wall time: 27.7 s


In [78]:
X_ft.shape

(7227, 4)

In [80]:
X = np.hstack([X_decomp, X_w2v, X_ft])

In [81]:
X.shape

(7227, 10)

Посмотрим на logreg и RandomForest

In [83]:
lr = LogisticRegressionCV(cv=5, scoring='f1_micro', class_weight='balanced')

In [91]:
cross_val_score(lr, X, PARA_DATA.label, scoring='f1_micro', cv=5).mean()

0.5058633943894317

In [97]:
rf = RandomForestClassifier(n_estimators=1000, max_depth=100, 
                            max_features=None, criterion='entropy',
                            n_jobs=3, class_weight='balanced'
)

In [98]:
cross_val_score(rf, X, PARA_DATA.label, scoring='f1_micro', cv=5).mean()

0.5506822575383388

Как видно, качество не самое высокое, однако лучше случайного ответа.  
Вероятно, перебор гиперпараметров поможет несколько улучшить положение.  
Далее я произведу подбор гиперпараметров по минимальной схеме: 5 параметров по паре значений

In [102]:
def run(models, est):
    X_decomp = compute_decomposition_features(
        para=PARA_DATA,
        models=[models['SVD_CVECT'], models['SVD_TFIDF'], models['NMF_CVECT'], models['NMF_TFIDF']],
        vectorizers=[CVECT_NORM, TFIDF_NORM, CVECT_NORM, TFIDF_NORM]
    )

    X_w2v = compute_w2v_features(PARA_DATA, models['W2V'], TFIDF_NORM, TFIDF_NORM_MAP)

    X_ft = compute_ft_features(
        PARA_DATA, models['FT'], models['FT_NORM'], TFIDF, TFIDF_MAP, TFIDF_NORM, TFIDF_NORM_MAP
    )
    
    X = np.hstack([X_decomp, X_w2v, X_ft])
    
    return cross_val_score(est, X, PARA_DATA.label, scoring='f1_micro', cv=5).mean()


MODELS = {
    'SVD_CVECT': SVD_CVECT,
    'SVD_TFIDF': SVD_TFIDF,
    'NMF_CVECT': NMF_CVECT,
    'NMF_TFIDF': NMF_TFIDF,
    'W2V': W2V,
    'FT_NORM': FT_NORM,
    'FT': FT
}

Увеличим кол-во компонент для `SVD`, рассмотрев значения 128 (уже построено выше) и 256

In [105]:
SVD_CVECT2 = TruncatedSVD(256).fit(CVECT_NORM.transform(TRAIN_DATA.content_lem))
SVD_TFIDF2 = TruncatedSVD(256).fit(TFIDF_NORM.transform(TRAIN_DATA.content_lem))

In [106]:
MODELS.update({'SVD_CVECT': SVD_CVECT2, 'SVD_TFIDF': SVD_TFIDF2})

In [107]:
%%time
run(MODELS, lr)

Wall time: 49 s


0.5036484773200627

In [109]:
MODELS.update({'SVD_CVECT': SVD_CVECT, 'SVD_TFIDF': SVD_TFIDF})

С 256 компонентами прирост получить не удалось.

Для `W2V` увеличим размер векторов до 256 (размера 128 уже построены), а затем рассмотрим оба алгоритма: `cbow` и `skipgram`

In [110]:
%%time
W2V2 = Word2Vec([x.split() for x in TRAIN_DATA.content_lem.values], iter=30, size=256)

Wall time: 2min 22s


In [111]:
MODELS.update({'W2V': W2V2})

In [112]:
%%time
run(MODELS, lr)

  app.launch_new_instance()


Wall time: 48.6 s


0.5080754403119105

С размером векторов в 256 есть прирост + 0.003

In [114]:
%%time
W2V3 = Word2Vec([x.split() for x in TRAIN_DATA.content_lem.values], iter=20, sg=1, size=256)

Wall time: 5min 28s


In [115]:
MODELS.update({'W2V': W2V3})

In [116]:
%%time
run(MODELS, lr)

  app.launch_new_instance()


Wall time: 48.6 s


0.49977342700427985

In [117]:
MODELS.update({'W2V': W2V2})

Однако смена алгоритма с `cbow` на `skipgram` ухудшила качество, что соотносится с рекомендациями Миколова и Ко использовать `cbow` на не достаточно больших данных

Наконец, рассмотрим гипепараметры `min_n`, а затем `max_n` для FastText на ненормализованных данных

In [119]:
%%time
FT2 = FastText(
    [x.split() for x in TRAIN_DATA.content.values],
    iter=20,
    size=128,
    min_n=2,
    max_n=5
)

Wall time: 7min 3s


In [120]:
MODELS.update({'FT': FT2})

In [121]:
%%time
run(MODELS, lr)

  app.launch_new_instance()


Wall time: 52.9 s


0.5075225723091462

In [122]:
%%time
FT3 = FastText(
    [x.split() for x in TRAIN_DATA.content.values],
    iter=20,
    size=128,
    min_n=3,
    max_n=5
)

Wall time: 5min 16s


In [123]:
MODELS.update({'FT': FT3})

In [124]:
%%time
run(MODELS, lr)

  app.launch_new_instance()


Wall time: 49 s


0.5030927413499728

In [125]:
%%time
FT4 = FastText(
    [x.split() for x in TRAIN_DATA.content.values],
    iter=20,
    size=256,
    min_n=2,
    max_n=4
)

Wall time: 9min 22s


In [126]:
MODELS.update({'FT': FT4})

In [127]:
%%time
run(MODELS, lr)

  app.launch_new_instance()


Wall time: 56.2 s


0.509874750257616

Использование векторов размера 256 и n-грамм от 2 до 4 позволило получить еще небольшой прирост

Итого, по итогам такого условного перебора удалось добиться прироста качества +~0.005.  

Рассмотренные гиперпараметры:  
1. SVD.n_components: __128__, 256
2. W2V.size: 128, __256__
3. W2V.sg: __0__, 1
4. FastText.min_n: __2__, 3
4. FastText.max_n: __4__, 5