In [1]:
from sklearnex import patch_sklearn
patch_sklearn()

import re
import sys
import os

import numpy as np
from tqdm import tqdm
import pandas as pd
from pymystem3 import Mystem
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import ParameterGrid
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from gensim.models import word2vec
from matplotlib import pyplot as plt
import seaborn as sns
import warnings
import optuna


# Добавляем в path вышестоящую директорию для импорта  calc_metrics
sys.path.append(os.path.abspath(os.path.join("..")))

from utils import calc_metrics
from word2vec_utils import Word2VecVectorizer, Word2VecTfIdfVectorizer

RANDOM_STATE = 41825352

sns.set_style("whitegrid")
warnings.filterwarnings('ignore')

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
X = pd.read_csv("../data/x.csv", index_col="date")
y = pd.read_csv("../data/y.csv", index_col="date").iloc[:, 0]

In [3]:
# инициализируем лемматизатор
mystem = Mystem()

# загружаем стоп-слова
STOP_WORDS = set(stopwords.words("russian"))


# функция для препроцессинга текста
def preprocessor(text):

    # приводим к нижнему регистру
    text = text.lower()

    # удаляем все символы, кроме пробелов и русских букв.
    regex = re.compile("[^а-я А-ЯЁё]")
    text = regex.sub(" ", text)

    # лемматизируем тексты
    text = " ".join(mystem.lemmatize(text))

    # удаляем стоп-слова
    text = " ".join([word for word in text.split() if word not in STOP_WORDS])
    return text


Xpreproc = X.release.apply(preprocessor)

In [351]:
from gensim.models import word2vec

PADDING_TOKEN = '<pad>'
EMBEDDING_SIZE = 50

data = Xpreproc.str.split().apply(lambda x: x +[PADDING_TOKEN])
w2v = word2vec.Word2Vec(data, window=5, workers=1, sg=0, vector_size=EMBEDDING_SIZE, seed=RANDOM_STATE)

max_len = data.apply(len).max()
padding_idx = w2v.wv.get_index(PADDING_TOKEN)

In [353]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import gensim
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# https://chriskhanhtran.github.io/posts/cnn-sentence-classification/#31-create-cnn-model

class CnnTextClassifier(nn.Module):
    def __init__(self, num_classes, window_sizes=(2,3,5)):
        super(CnnTextClassifier, self).__init__()
        w2vmodel = w2v #gensim.models.KeyedVectors.load('w2v.model')
        weights = w2vmodel.wv
        
        self.embedding = nn.Embedding.from_pretrained(
            torch.FloatTensor(weights.vectors),
            padding_idx=w2vmodel.wv.get_index(PADDING_TOKEN)
        )
        
        self.convs = nn.ModuleList([
            nn.Conv2d(1, 64, [window_size, EMBEDDING_SIZE], padding=(window_size - 1, 0))
            for window_size in window_sizes
        ])

        self.fc = nn.Linear(64 * len(window_sizes), num_classes)
        self.dropout = nn.Dropout(p=0.3)

    def forward(self, x):
        x = self.embedding(x)

        if len(x.size()) < 4:
            x = torch.unsqueeze(x, 1)
        xs = []
        for conv in self.convs:
            x2 = torch.relu(conv(x))
            x2 = torch.squeeze(x2, -1)
            x2 = F.max_pool1d(x2, x2.size(2))
            xs.append(x2)
        x = torch.cat(xs, 2)

        x = x.view(x.size(0), -1)
        logits = self.fc(self.dropout(x))

        probs = F.softmax(logits, dim=1)

        return probs

In [354]:
cnn_model = CnnTextClassifier(num_classes=3)
cnn_model.to(device)
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(cnn_model.parameters(), lr=0.0001)
num_epochs = 100

In [355]:
cnn_model.train()

vec = torch.stack(data.apply(make_word2vec_vector_cnn).tolist()).squeeze(1)
probs = cnn_model(vec)

In [356]:
def make_target(label):
    return torch.tensor([int(label)+1], dtype=torch.long, device=device)

def make_word2vec_vector_cnn(sentence):
    padded_X = [padding_idx for i in range(max_len)]
    i = 0
    for word in sentence:
        if word not in w2v.wv:
            padded_X[i] = 0
        else:
            padded_X[i] = w2v.wv.get_index(word)
        i += 1
    return torch.tensor(padded_X, dtype=torch.long, device=device).view(1, -1)

In [357]:
data1 = torch.stack(data.apply(make_word2vec_vector_cnn).tolist()).squeeze(1).to(device)
y1 = torch.stack(y.apply(make_target).tolist()).squeeze().to(device)

In [358]:
num_epochs = 1000

y_preds = []
y_preds_proba = []

prog = tqdm(range(30, len(y)))
for threshold in prog:
    cnn_model = CnnTextClassifier(num_classes=3)
    cnn_model.to(device)
    loss_function = nn.CrossEntropyLoss()
    optimizer = optim.Adam(cnn_model.parameters(), lr=0.001, fused=True)
    #scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[1500])
    
    cnn_model.train()
    for epoch in range(num_epochs):
        cnn_model.zero_grad()
        bow_vec = data1[:threshold]
        probs = cnn_model(bow_vec)
        target = y1[:threshold]
        loss = loss_function(probs, target)
        loss.backward()
        optimizer.step()
        #scheduler.step()
        if epoch % 20 == 0:
            prog.set_postfix_str(f'loss = {loss.item()}, lr = {optimizer.param_groups[0]['lr']}')

    cnn_model.eval()
    bow_vec = make_word2vec_vector_cnn(data[threshold])
    probs = cnn_model(bow_vec)
    pred = torch.argmax(probs) - 1
    y_preds_proba.append(probs.detach().cpu().numpy())
    y_preds.append(pred.detach().cpu().numpy().item())

100%|██████████| 69/69 [07:37<00:00,  6.63s/it, loss = 0.554999589920044, lr = 0.001] 


In [359]:
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    recall_score,
    precision_score,
    roc_auc_score,
    confusion_matrix,
    classification_report,
)

accuracy_score(y_preds, y[30:])

0.5507246376811594

In [360]:
roc_auc_score(
    y[30:],
    np.concatenate(y_preds_proba, axis=0),
    average="macro",
    multi_class="ovo",
)

0.7067971862022205

In [361]:
print(classification_report(y_preds, y[30:]))

              precision    recall  f1-score   support

          -1       0.57      0.59      0.58        22
           0       0.59      0.47      0.52        34
           1       0.47      0.69      0.56        13

    accuracy                           0.55        69
   macro avg       0.54      0.58      0.55        69
weighted avg       0.56      0.55      0.55        69



In [362]:
confusion_matrix(y_preds, y[30:])

array([[13,  8,  1],
       [ 9, 16,  9],
       [ 1,  3,  9]], dtype=int64)