In [None]:
import multiprocessing as mp
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import re
import math

from tqdm.notebook import tqdm
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import csr_matrix

In [None]:
# Select the first CUDA device when torch reports one, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Worker budget: all cores minus two (presumably headroom for the system), never below one.
cpu_to_use = max(1, mp.cpu_count() - 2)
print(f"Cores to use: {cpu_to_use}")

## Read data

In [None]:
# Load the training CSV; later cells read its 'oid', 'category' and 'text' columns.
df = pd.read_csv("data/train.csv")

In [None]:
# Preview the first rows of the training frame.
df.head()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Row count per category, to see how balanced the classes are.
df['category'].value_counts()

In [None]:
# Load the test CSV (not used by the cells shown here).
df_test = pd.read_csv("data/test.csv")

In [None]:
# Preview the first rows of the test frame.
df_test.head()

## Preprocess data

In [None]:
from natasha import (
    Segmenter,
    MorphVocab,
    
    NewsEmbedding,
    NewsMorphTagger,

    Doc
)


# Shared natasha pipeline objects, built once and reused by tokenize_documents:
# `segmenter` is passed to Doc.segment, `morph_vocab` to Token.lemmatize.
segmenter = Segmenter()
morph_vocab = MorphVocab()

# Morphological tagger backed by the news embedding; passed to Doc.tag_morph.
morph_tagger = NewsMorphTagger(NewsEmbedding())

In [None]:
class Document:
    """Bag-of-words record for a single document.

    Attributes:
        id: Identifier given at construction (defaults to 0).
        word_counts: Mapping from vocabulary index to occurrence count.
            Each instance owns its own dict.
        total: Number of tokens in the document. It starts at 0 and is filled
            in by whoever builds the record, so readers never hit a missing
            attribute.
    """

    def __init__(self, id_=0):
        self.id = id_
        self.word_counts = {}
        # Declared up front: get_docs assigns it and get_tf_idf reads it.
        self.total = 0

In [None]:
def tokenize_documents(df):
    """Lemmatise the ``text`` column of ``df`` and keep only word-like lemmas.

    Every text is segmented, morphologically tagged and lemmatised with the
    module-level natasha objects (``segmenter``, ``morph_tagger``,
    ``morph_vocab``). A lemma is kept only when it consists entirely of
    lowercase Cyrillic or Latin letters, optionally joined by single hyphens.

    Args:
        df: DataFrame with a ``text`` column of strings.

    Returns:
        A list with one list of lemmas per row of ``df``, in row order.
    """
    # 'ё' (U+0451) lies outside the а-я range, so it must be listed explicitly,
    # otherwise every lemma containing it is dropped. The pattern is compiled
    # once and applied with fullmatch so the whole lemma has to fit.
    lemma_pattern = re.compile(r'[а-яёa-z]+(?:-[а-яёa-z]+)*')

    docs_tokens = []
    for text in tqdm(df['text'].tolist()):
        doc = Doc(text)
        doc.segment(segmenter)
        doc.tag_morph(morph_tagger)

        tokens = []
        for token in doc.tokens:
            token.lemmatize(morph_vocab)
            if lemma_pattern.fullmatch(token.lemma):
                tokens.append(token.lemma)
        docs_tokens.append(tokens)
    return docs_tokens

In [None]:
# get vocabulary and index
def get_vocab(docs_tokens):
    """Collect the distinct words of ``docs_tokens`` in first-seen order.

    Args:
        docs_tokens: Iterable of token lists, one per document.

    Returns:
        ``(vocab, word_index)`` where ``vocab`` lists each distinct word once
        and ``word_index[word]`` is that word's position in ``vocab``.
    """
    word_index = {}
    for tokens in tqdm(docs_tokens):
        for word in tokens:
            # A new word receives the next free position; known words are untouched.
            word_index.setdefault(word, len(word_index))

    # Dict keys keep insertion order, so they already form the vocabulary list.
    vocab = list(word_index)
    return vocab, word_index

In [None]:
# calculate word counts for documents
def get_docs(docs_tokens, word_index):
    """Turn token lists into ``Document`` records holding per-index counts.

    Args:
        docs_tokens: Iterable of token lists, one per document.
        word_index: Mapping from word to vocabulary index.

    Returns:
        A list of ``Document`` objects whose ``id`` is the position in
        ``docs_tokens``, whose ``total`` is the token count and whose
        ``word_counts`` maps vocabulary index to count. Tokens missing from
        ``word_index`` are tallied under key ``-1``, which is always present.
    """
    unknown_key = -1
    encoded = []

    for position, tokens in enumerate(tqdm(docs_tokens)):
        record = Document(position)
        record.total = len(tokens)

        counts = record.word_counts
        counts[unknown_key] = 0  # the out-of-vocabulary bucket always exists
        for token in tokens:
            key = word_index.get(token, unknown_key)
            counts[key] = counts.get(key, 0) + 1

        encoded.append(record)

    return encoded

In [None]:
# Collapse all rows sharing an `oid` into one document: texts are joined with
# spaces and the smallest category value of the group is kept as its label.
# The aggregation is named by the string 'min' because passing the builtin
# `min` to agg is deprecated in recent pandas; the result is the same.
gdf = (
    df.groupby('oid')
    .agg({'category': 'min', 'text': ' '.join})
    .reset_index(drop=True)  # replace the `oid` index with 0..n-1
)
gdf.head(5)

In [None]:
# Lemmatise every grouped document: one list of lemmas per row of gdf.
docs_words = tokenize_documents(gdf)

In [None]:
# Encode the category of every grouped document as an integer label;
# `le` is kept so the integers can be mapped back to category names.
le = LabelEncoder()
docs_labels = le.fit_transform(gdf['category'].to_numpy())

In [None]:
# Stratified 80/20 train/val split. The seed is fixed so that the split, and
# therefore the vocabulary and every downstream metric, is reproducible after
# a kernel restart.
words_train, words_val, Y_train, Y_val = train_test_split(
    docs_words,
    docs_labels,
    test_size=0.2,
    stratify=docs_labels,
    random_state=42,
)

In [None]:
# Number of documents in the training and validation parts.
len(words_train), len(words_val)

In [None]:
# Build the vocabulary and word -> index mapping from the training documents only,
# then show the vocabulary size.
vocab, word_index = get_vocab(words_train)
len(vocab)

In [None]:
# Count vocabulary-index occurrences per document for both splits, using the
# training vocabulary; words unknown to it are tallied under key -1 by get_docs.
docs_train = get_docs(words_train, word_index)
docs_val = get_docs(words_val, word_index)

In [None]:
def get_doc_freq(vocab, docs):
    """Count, for every vocabulary index, how many documents contain it.

    Args:
        vocab: Vocabulary list; its length fixes the length of the result.
        docs: Documents exposing ``word_counts`` (vocabulary index -> count).

    Returns:
        A list where position ``i`` is the number of documents whose
        ``word_counts`` has key ``i``.
    """
    doc_freq = [0] * len(vocab)
    for doc in tqdm(docs):
        for idx in doc.word_counts:
            if idx < 0:
                # Negative keys (the -1 bucket filled by get_docs) tally
                # out-of-vocabulary tokens. As a list index they would alias
                # the last vocabulary entry and inflate its frequency.
                continue
            doc_freq[idx] += 1
    return doc_freq

In [None]:
def get_tf_idf(vocab, docs, doc_freq, n_docs=None):
    """Build a dense TF-IDF matrix (a list of rows) for ``docs``.

    Each filled cell is ``count / doc.total * log(n_docs / doc_freq[index])``;
    all other cells stay 0.

    Args:
        vocab: Vocabulary list; its length fixes the number of columns.
        docs: Documents exposing ``word_counts`` (vocabulary index -> count)
            and ``total`` (token count).
        doc_freq: Document frequency per vocabulary index.
        n_docs: Corpus size used as the IDF numerator. Defaults to
            ``len(docs)``, which is the previous behaviour. When encoding
            held-out documents against training ``doc_freq``, pass the size of
            the training corpus so both sets share the same IDF weights.

    Returns:
        ``len(docs)`` rows, each with ``len(vocab)`` numbers.
    """
    if n_docs is None:
        n_docs = len(docs)

    tf_idf = [[0] * len(vocab) for _ in range(len(docs))]
    for i in tqdm(range(len(docs))):
        doc = docs[i]
        row = tf_idf[i]
        for word, count in doc.word_counts.items():
            if word < 0:
                # Negative keys (the -1 bucket filled by get_docs) tally
                # out-of-vocabulary tokens. As a list index they would
                # overwrite the last vocabulary column, and for an empty
                # document they would divide 0 by 0.
                continue
            row[word] = count / doc.total * math.log(n_docs / doc_freq[word])

    return tf_idf

In [None]:
# Document frequencies are computed from the training documents only.
doc_freq = get_doc_freq(vocab, docs_train)

In [None]:
# Dense TF-IDF rows for the training documents.
tf_idf_train = get_tf_idf(vocab, docs_train, doc_freq)

In [None]:
# Dense TF-IDF rows for the validation documents, using training document frequencies.
# NOTE(review): get_tf_idf takes the IDF numerator from len(docs), i.e. the
# validation-set size here, while doc_freq was counted on the training set, so
# these IDF weights differ from the ones used for tf_idf_train.
tf_idf_val = get_tf_idf(vocab, docs_val, doc_freq)

In [None]:
# (rows, columns) of the training TF-IDF matrix.
len(tf_idf_train), len(tf_idf_train[0])

In [None]:
# (rows, columns) of the validation TF-IDF matrix.
len(tf_idf_val), len(tf_idf_val[0])

### Classification

In [None]:
def get_scorer(threshold=0.1, confidence=1.01):
    """Create a scorer that rewards confident hits and penalises confident misses.

    A sample is "answered" only when its highest probability exceeds
    ``threshold`` and is greater than ``confidence`` times the runner-up
    probability. An answered sample contributes +1 when the top class equals
    the true label and -1 otherwise; an unanswered sample contributes 0.

    Args:
        threshold: Minimum top probability required to answer.
        confidence: Required ratio between the best and the second-best
            probability.

    Returns:
        A function ``scorer(y, y_probas)`` returning the mean per-sample
        score. ``y_probas[i]`` holds the class probabilities of sample ``i``,
        positioned by integer label. Empty input scores 0.0.
    """
    def scorer(y, y_probas):
        n_samples = len(y)
        if n_samples == 0:
            # Nothing to score; avoid dividing by zero below.
            return 0.0

        score = 0.
        for i in range(n_samples):
            row = np.asarray(y_probas[i])
            ranked = np.sort(row)
            if ranked[-1] > threshold and ranked[-1] > confidence * ranked[-2]:
                # argmax returns one scalar label even when the maximum is
                # tied (possible with confidence < 1), so the comparison
                # below never sees a multi-element array.
                predicted = int(np.argmax(row))
                score += 1 if predicted == y[i] else -1

        return score / n_samples

    return scorer

#### SVD

In [None]:
# Convert the dense list-of-lists TF-IDF tables to SciPy CSR sparse matrices.
matrix_train = csr_matrix(tf_idf_train)
matrix_val = csr_matrix(tf_idf_val)

In [None]:
# Fit a 1024-component truncated SVD on the sparse training matrix (seeded for
# reproducibility) and show the shape of the learned components.
svd = TruncatedSVD(n_components=1024, n_iter=10, random_state=42)
svd.fit(matrix_train)
svd.components_.shape

In [None]:
# Project both splits through the fitted SVD and standardise the components.
# The sparse matrices the SVD was fitted on are used, instead of the dense
# list-of-lists tables, so no full dense copy is built for the projection.
# The scaler is fitted on the training projection only.
scaler = StandardScaler()

X_train = scaler.fit_transform(svd.transform(matrix_train))

X_val = scaler.transform(svd.transform(matrix_val))

#### Logistic regression

In [None]:
# Fit a seeded logistic regression on the scaled SVD features and get the
# per-class probabilities for the validation documents.
clf = LogisticRegression(random_state=0).fit(X_train, Y_train)
y_probas = clf.predict_proba(X_val)

In [None]:
# Validation score: a prediction counts only when the top probability exceeds
# 0.1 and is more than 1.01 times the runner-up.
score = get_scorer(0.1, 1.01)(Y_val, y_probas)
score