In [None]:
import multiprocessing as mp
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import re
import math
import logging
import pickle

from scipy.special import softmax
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from simpletransformers.classification import ClassificationModel, ClassificationArgs
from pathlib import Path

In [None]:
# Fixed seed so the stochastic steps of this notebook (token dropout in
# filter_texts, the train/val split, bootstrap resampling) are repeatable
# after Restart & Run All. All of them draw from NumPy's global generator
# when no explicit random state is passed.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Prefer the first CUDA device when one is available, otherwise use the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Leave two cores free, but never report fewer than one.
cpu_to_use = max(1, mp.cpu_count() - 2)
print(f"Cores to use: {cpu_to_use}")

## Read data

In [None]:
df = pd.read_csv("data/train.csv")
test_df = pd.read_csv("data/test.csv")

## Tokenize and lemmatize with Natasha

In [None]:
from natasha import (
    Segmenter,
    MorphVocab,
    
    NewsEmbedding,
    NewsMorphTagger,

    Doc
)


# Shared natasha objects, created once and reused by filter_texts for every document.
# Passed to Doc.segment.
segmenter = Segmenter()
# Passed to token.lemmatize.
morph_vocab = MorphVocab()

# Passed to Doc.tag_morph; built on top of the news embedding.
morph_tagger = NewsMorphTagger(NewsEmbedding())

In [None]:
def filter_texts(df, drop_rate=None):
    """Lemmatize every text in ``df['text']`` and keep only word-like lemmas.

    Each text is segmented and morph-tagged with the module-level natasha
    objects and every token is lemmatized. A lemma is kept when it matches
    lowercase Cyrillic/Latin letter runs optionally joined by hyphens.

    Args:
        df: Frame with a ``text`` column.
        drop_rate: When truthy, one random number is drawn per token and the
            token is additionally discarded when that number is non-zero and
            does not exceed ``drop_rate``.

    Returns:
        A list holding one list of kept lemmas per input text.
    """
    lemma_pattern = re.compile('[а-яa-z]+(-[а-яa-z]+)*$')
    docs_tokens = []

    for text in tqdm(df['text'].tolist()):
        doc = Doc(text)
        doc.segment(segmenter)
        doc.tag_morph(morph_tagger)

        # One draw per token (including tokens the pattern later rejects).
        drop_flags = None
        if drop_rate:
            draws = np.random.rand(len(doc.tokens))
            drop_flags = (draws != 0) & (draws <= drop_rate)

        kept = []
        for position, token in enumerate(doc.tokens):
            # Every token is lemmatized, even one that ends up discarded.
            token.lemmatize(morph_vocab)
            if not lemma_pattern.match(token.lemma):
                continue
            if drop_flags is None or not drop_flags[position]:
                kept.append(token.lemma)

        docs_tokens.append(kept)

    return docs_tokens

## Evaluation helpers

In [None]:
def get_scorer(threshold=0.20, confidence=1.2):
    """Build a scorer that rewards confident correct answers and penalizes wrong ones.

    Args:
        threshold: A row is answered only when its largest probability is
            strictly greater than this value.
        confidence: The largest probability must also be strictly greater than
            ``confidence`` times the second largest one.

    Returns:
        A function ``scorer(y, y_probas)``. Every answered row contributes +1
        when the most probable class equals the true label and -1 otherwise;
        unanswered rows contribute 0. The result is the sum divided by
        ``len(y)``, or ``0.0`` when ``y`` is empty.
    """
    def scorer(y, y_probas):
        n_samples = len(y)
        if n_samples == 0:
            # Nothing to score; avoid dividing by zero.
            return 0.0

        score = 0.
        for i in range(n_samples):
            row = np.asarray(y_probas[i])
            ranked = np.sort(row)
            top = ranked[-1]
            # A single-class row has no runner-up, so only the threshold applies.
            beats_runner_up = ranked.size < 2 or top > confidence * ranked[-2]
            if top > threshold and beats_runner_up:
                # argmax gives one scalar label even when several classes share
                # the top probability (possible when confidence < 1).
                label = int(np.argmax(row))
                score += 1 if label == y[i] else -1

        return score / n_samples

    return scorer

In [None]:
def evaluate(raw_outputs, eval_df, coefs=None, out_dict=False):
    """Score one model's raw outputs against the labels in ``eval_df``.

    Args:
        raw_outputs: One array per evaluated text; each array is summed over
            axis 0 to get a single row of class scores
            (presumably one row per sliding window — TODO confirm).
        eval_df: Frame with a ``label`` column aligned with ``raw_outputs``.
        coefs: Optional multipliers applied to the softmax probabilities.
        out_dict: Forwarded to ``classification_report`` as ``output_dict``.

    Returns:
        ``(score, report)``: the value of ``get_scorer(0.0, 1.)`` and the
        classification report for the arg-max class of the (optionally
        weighted) probabilities.
    """
    summed_outputs = np.array([output.sum(axis=0) for output in raw_outputs])

    probs = softmax(summed_outputs, axis=1)
    if coefs is not None:
        probs = probs * coefs

    labels = eval_df['label'].to_list()
    score = get_scorer(0.0, 1.)(labels, probs)

    # Derive the predicted classes from the probabilities scored above instead
    # of relying on a `predictions` variable that is not defined in this function.
    predictions = probs.argmax(axis=1)
    report = classification_report(labels, predictions, output_dict=out_dict)

    return score, report

## Create Dataset for simpletransformers

In [None]:
def group_df(df):
    """Collapse the rows of ``df`` into one row per ``oid`` with an integer label.

    Args:
        df: Frame with ``oid``, ``category`` and ``text`` columns.

    Returns:
        ``(grouped_df, cat_dict)`` where ``grouped_df`` has a 0..n-1 index and
        the columns ``text`` (the group's non-null texts joined with spaces)
        and ``label`` (integer code of the group's minimal category), and
        ``cat_dict`` maps each integer code back to its category value.
    """
    grouped_df = (
        df.groupby('oid')
        .agg(
            # The string alias avoids the deprecated "builtin callable" path.
            label=('category', 'min'),
            # Skip missing texts so str.join never receives a NaN.
            text=('text', lambda texts: ' '.join(texts.dropna().astype(str))),
        )
        .reset_index(drop=True)
    )

    labels = grouped_df['label'].astype('category')
    cat_dict = dict(enumerate(labels.cat.categories))
    # Build the final frame in one step rather than assigning into a column subset.
    grouped_df = grouped_df[['text']].assign(label=labels.cat.codes)

    return grouped_df, cat_dict

### Group data and map labels

In [None]:
gdf, cat_dict = group_df(df)
num_classes = len(cat_dict)
gdf.head()

### Train-val split

In [None]:
# Stratified 80/20 split. A fixed random_state keeps the validation set stable
# across runs, so cached frames and saved checkpoints stay comparable.
train_df, eval_df = train_test_split(
    gdf, test_size=0.2, stratify=gdf['label'], random_state=42
)
len(train_df), len(eval_df)

### Normalize val

In [None]:
docs_words = filter_texts(eval_df, drop_rate=None)

In [None]:
# Replace the raw text with the space-joined lemmas. `assign` builds a new frame
# instead of writing into the slice returned by train_test_split.
# NOTE(review): the train and test texts are additionally passed through
# filter_words, the validation text is not — confirm this is intended.
eval_df = eval_df.assign(text=[" ".join(words) for words in docs_words])
eval_df.head(3)

### Augment train data

In [None]:
# filter tokens (valid words)
docs_words = filter_texts(train_df, drop_rate=None)
docs_words_dropped = filter_texts(train_df, drop_rate=0.5)

In [None]:
# Compare document lengths before and after token dropout. Transparency and a
# legend keep both distributions readable when they overlap.
fig, ax = plt.subplots()
ax.hist([len(words) for words in docs_words], alpha=0.6, label="all valid tokens")
ax.hist([len(words) for words in docs_words_dropped], alpha=0.6, label="after token dropout")
ax.set(title="Tokens per training document", xlabel="Number of tokens", ylabel="Documents")
ax.legend();

### Spam words

In [None]:
# Frequency of every lemma across the un-dropped training documents.
vocab = {}
for doc_lemmas in docs_words:
    for lemma in doc_lemmas:
        vocab[lemma] = vocab.get(lemma, 0) + 1

In [None]:
# Order the vocabulary from most to least frequent and preview the top entries.
by_count_ascending = sorted(vocab.items(), key=lambda item: item[1])
vocab = dict(by_count_ascending[::-1])
list(vocab.items())[:10]

In [None]:
# Every vocabulary entry that contains the substring "token" is treated as spam.
p = re.compile('.*token.*')
SPAM = {word for word in vocab if p.match(word)}

In [None]:
useless = ['в', 'и', 'на', 'с', 'быть', 'что', 'я', 'по', 'это', 'весь', 'он', 'мы', 'за', 'тот', 'для', 'а', 'из', 
    'но', 'который', 'как', 'этот', 'к', 'у', 'о', 'от', 'до', 'уже', 'еще', 'чтобы', 'кто', 'или', 'только', 'такой', 
    'при', 'когда', 'же', 'бы', 'также', 'какой', 'то', 'даже', 'под', 'ли', 'вот', 'потому', 'чем', 'перед', 'пока', 'там']

In [None]:
# The hand-picked stop words are filtered together with the pattern-based spam.
SPAM.update(useless)

In [None]:
def filter_words(words):
    """Return the items of ``words``, in order, that are not in the SPAM set."""
    return [word for word in words if word not in SPAM]

### Build train DataFrame

In [None]:
# Two variants of every training document: all valid lemmas, and the randomly
# thinned lemmas. `assign` creates new frames, so nothing is written into the
# slice returned by train_test_split.
train_full = train_df.assign(
    text=[" ".join(filter_words(words)) for words in docs_words]
)
train_dropped = train_df.assign(
    text=[" ".join(filter_words(words)) for words in docs_words_dropped]
)
train_df = pd.concat([train_full, train_dropped], ignore_index=True)

### Save train and val DataFrames

In [None]:
# Persist both frames (index included) and the code -> category mapping.
for frame, filepath in (
    (train_df, Path('cache_dfs/train_df.csv')),
    (eval_df, Path('cache_dfs/eval_df.csv')),
):
    filepath.parent.mkdir(parents=True, exist_ok=True)
    frame.to_csv(filepath, index=True)

with open('cache_dfs/cats.pkl', 'wb') as f:
    pickle.dump(cat_dict, f)

### Load train and val DataFrames

In [None]:
# A document whose tokens were all filtered out is written as an empty field
# and read back as NaN; restore the empty string so every text stays a str.
train_df = pd.read_csv('cache_dfs/train_df.csv', index_col=0).fillna({'text': ''})
eval_df = pd.read_csv('cache_dfs/eval_df.csv', index_col=0).fillna({'text': ''})

# This pickle is produced by the save cell of this notebook; do not point it
# at files from an untrusted source.
with open('cache_dfs/cats.pkl', 'rb') as f:
    cat_dict = pickle.load(f)

In [None]:
train_df.head(3)

In [None]:
eval_df.head(3)

## Bootstrap

In [None]:
n = 6

In [None]:
subsets = []

In [None]:
# Append n bootstrap replicates: same size as train_df, drawn with replacement.
subsets.extend(
    train_df.sample(len(train_df), replace=True) for _ in range(n)
)

In [None]:
len(subsets)

# Transformers

In [None]:
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

In [None]:
models = []

In [None]:
load = True

### RuBERT-conversational

In [None]:
num_distinct_models = 2
strides = [0.8, 0.4]
ckpt = ["checkpoint-1476-epoch-3", "checkpoint-792-epoch-3"]

In [None]:
for i in range(num_distinct_models):
    # Training options for this variant; only the stride differs between variants.
    training_options = dict(
        train_batch_size=128,
        learning_rate=3e-5,
        warmup_ratio=0.1,
        num_train_epochs=3,
        overwrite_output_dir=True,
        sliding_window=True,
        stride=strides[i],
        weight_decay=1e-8,
        output_dir=f"models/rubert-base-cased-conversational/last{i}",
    )
    model_args = ClassificationArgs(**training_options)

    # With `load` set, start from this variant's saved checkpoint; otherwise
    # from the base model directory.
    checkpoint_suffix = f"/last{i}/{ckpt[i]}" if load else ""
    model = ClassificationModel(
        "bert",
        f"models/rubert-base-cased-conversational" + checkpoint_suffix,
        num_labels=num_classes,
        args=model_args,
    )
    models.append(model)

### RuBERT-sber-large

In [None]:
num_distinct_models = 2
strides = [0.8, 0.4]
ckpt = ["checkpoint-11814-epoch-3", "checkpoint-6339-epoch-3"]

In [None]:
for i in range(num_distinct_models):
    # Training options for this variant; only the stride differs between variants.
    training_options = dict(
        train_batch_size=16,
        learning_rate=3e-5,
        warmup_ratio=0.1,
        num_train_epochs=3,
        overwrite_output_dir=True,
        sliding_window=True,
        stride=strides[i],
        weight_decay=1e-8,
        output_dir=f'models/sberbank_ruBert_large/last{i}',
    )
    model_args = ClassificationArgs(**training_options)

    # With `load` set, start from this variant's saved checkpoint; otherwise
    # from the base model directory.
    checkpoint_suffix = f"/last{i}/{ckpt[i]}" if load else ""
    model = ClassificationModel(
        "bert",
        f"models/sberbank_ruBert_large" + checkpoint_suffix,
        num_labels=num_classes,
        args=model_args,
    )
    models.append(model)

### XLM-RoBERTa-large

In [None]:
num_distinct_models = 2
strides = [0.8, 0.4]
ckpt = ["checkpoint-5378-epoch-2", "checkpoint-2842-epoch-2"]

In [None]:
for i in range(num_distinct_models):
    # Training options for this variant; only the stride differs between variants.
    training_options = dict(
        train_batch_size=32,
        learning_rate=3e-5,
        warmup_ratio=0.1,
        num_train_epochs=2,
        overwrite_output_dir=True,
        sliding_window=True,
        stride=strides[i],
        weight_decay=1e-8,
        output_dir=f'models/xlm-roberta-large-qa-multilingual-finedtuned-ru/last{i}',
    )
    model_args = ClassificationArgs(**training_options)

    # With `load` set, start from this variant's saved checkpoint; otherwise
    # from the base model directory.
    checkpoint_suffix = f"/last{i}/{ckpt[i]}" if load else ""
    model = ClassificationModel(
        "xlmroberta",
        f"models/xlm-roberta-large-qa-multilingual-finedtuned-ru" + checkpoint_suffix,
        num_labels=num_classes,
        args=model_args,
    )
    models.append(model)

### Sber RuBERT

In [None]:
num_distinct_models = 1
strides = [0.8, 0.4]
ckpt = ["checkpoint-792-epoch-3"]

In [None]:
for i in range(num_distinct_models):
    # Training options for this variant; the stride is taken from `strides`.
    training_options = dict(
        train_batch_size=128,
        learning_rate=3e-5,
        warmup_ratio=0.1,
        num_train_epochs=3,
        overwrite_output_dir=True,
        sliding_window=True,
        stride=strides[i],
        weight_decay=1e-8,
        output_dir=f"models/sber-rubert/last{i}",
    )
    model_args = ClassificationArgs(**training_options)

    # With `load` set, start from this variant's saved checkpoint; otherwise
    # from the base model directory.
    checkpoint_suffix = f"/last{i}/{ckpt[i]}" if load else ""
    model = ClassificationModel(
        "bert",
        f"models/sber-rubert" + checkpoint_suffix,
        num_labels=num_classes,
        args=model_args,
    )
    models.append(model)

### RuBERT sentence

In [None]:
num_distinct_models = 1
strides = [0.8, 0.4]
ckpt = ["checkpoint-798-epoch-3"]

In [None]:
for i in range(num_distinct_models):
    # Training options for this variant; the stride is taken from `strides`.
    training_options = dict(
        train_batch_size=128,
        learning_rate=3e-5,
        warmup_ratio=0.1,
        num_train_epochs=3,
        overwrite_output_dir=True,
        sliding_window=True,
        stride=strides[i],
        weight_decay=1e-8,
        output_dir=f"models/rubert-sentence/last{i}",
    )
    model_args = ClassificationArgs(**training_options)

    # With `load` set, start from this variant's saved checkpoint; otherwise
    # from the base model directory.
    checkpoint_suffix = f"/last{i}/{ckpt[i]}" if load else ""
    model = ClassificationModel(
        "bert",
        f"models/rubert-sentence" + checkpoint_suffix,
        num_labels=num_classes,
        args=model_args,
    )
    models.append(model)

### RuBERT-base-cased

In [None]:
num_distinct_models = 1
strides = [0.8, 0.4]
ckpt = ["checkpoint-816-epoch-3"]

In [None]:
for i in range(num_distinct_models):
    # Training options for this variant; the stride is taken from `strides`.
    training_options = dict(
        train_batch_size=128,
        learning_rate=3e-5,
        warmup_ratio=0.1,
        num_train_epochs=3,
        overwrite_output_dir=True,
        sliding_window=True,
        stride=strides[i],
        weight_decay=1e-8,
        output_dir=f"models/rubert-base-cased/last{i}",
    )
    model_args = ClassificationArgs(**training_options)

    # With `load` set, start from this variant's saved checkpoint; otherwise
    # from the base model directory.
    checkpoint_suffix = f"/last{i}/{ckpt[i]}" if load else ""
    model = ClassificationModel(
        "bert",
        f"models/rubert-base-cased" + checkpoint_suffix,
        num_labels=num_classes,
        args=model_args,
    )
    models.append(model)

In [None]:
print("Models: ")
for i, model in enumerate(models):
    print(f"{i : >2}. {model.config._name_or_path}")

## Train

In [None]:
def agg_preds(preds, num_classes):
    """Combine per-model class predictions by majority vote.

    Args:
        preds: One sequence of integer class predictions per model, all
            covering the same samples in the same order.
        num_classes: Number of classes; predictions index into a vote list of
            this length.

    Returns:
        ``(voted_preds, ties)``: the winning class per sample, with ``-1`` for
        samples where the top vote count is shared, and the number of such
        ties. ``([], 0)`` when ``preds`` is empty.
    """
    if len(preds) == 0:
        return [], 0

    voted_preds = []
    ties = 0
    for i in range(len(preds[0])):
        votes = [0] * num_classes
        for model_preds in preds:
            votes[model_preds[i]] += 1

        best = max(votes)
        # Counting the maximum also works for a single class, where there is
        # no runner-up to compare against.
        if votes.count(best) > 1:
            print(f"tie {i}: {votes}")
            ties += 1
            voted_preds.append(-1)
        else:
            voted_preds.append(votes.index(best))
    return voted_preds, ties

In [None]:
def agg_probas(probas, num_classes, tie_margin=1.01):
    """Combine per-model class probabilities by summing them per class.

    Args:
        probas: One ``samples x classes`` table of probabilities per model.
        num_classes: Number of classes to sum over.
        tie_margin: A sample counts as a tie when the largest summed
            probability is smaller than ``tie_margin`` times the second
            largest one.

    Returns:
        ``(voted_preds, ties)``: the class with the largest summed probability
        per sample, with ``-1`` for ties, and the number of ties.
        ``([], 0)`` when ``probas`` is empty.
    """
    if len(probas) == 0:
        return [], 0

    voted_preds = []
    ties = 0
    for i in range(len(probas[0])):
        votes = [0] * num_classes
        for model_probas in probas:
            for t in range(num_classes):
                votes[t] += model_probas[i][t]

        ranked = sorted(votes)
        # With a single class there is no runner-up, hence no possible tie.
        if num_classes > 1 and ranked[-1] < ranked[-2] * tie_margin:
            print(f"tie {i}: {votes}")
            ties += 1
            voted_preds.append(-1)
        else:
            voted_preds.append(votes.index(ranked[-1]))
    return voted_preds, ties

In [None]:
for i, model in enumerate(models):
    # `subsets` holds only `n` bootstrap samples while `models` can hold more
    # entries; draw an extra sample on demand instead of failing with IndexError.
    if i >= len(subsets):
        subsets.append(train_df.sample(len(train_df), replace=True))
    model.train_model(subsets[i])
    # Release cached GPU memory before the next model is trained.
    torch.cuda.empty_cache()

## Eval

In [None]:
# Per-model validation results, in the same order as `models`.
scores = []  # value returned by evaluate() for each model
preds = []   # predicted labels of each model
raws = []    # raw outputs of each model
for model in models:
    # predict returns the predicted labels and the raw outputs for the given texts.
    predictions, raw_outputs = model.predict(eval_df['text'].to_list())
    # Only the score is kept; the classification report is discarded.
    score, _ = evaluate(raw_outputs, eval_df)
    raws.append(raw_outputs)
    preds.append(predictions)
    scores.append(score)

In [None]:
voted_preds, ties = agg_preds(preds, num_classes)

In [None]:
# probas = []
# for i in range(len(raws)):
#     median_outputs = np.array([output.sum(axis=0) for output in raws[i]])
#     softs = softmax(median_outputs, axis=1)
#     probas.append(softs)
# voted_preds, ties = agg_probas(probas, num_classes)

In [None]:
labels_real = np.array(eval_df['label'].tolist())
labels_pred = np.array(voted_preds)

In [None]:
# Ensemble score: +1 per correct vote, -1 per wrong vote. Ties are predicted
# as -1, so they are counted among the mismatches; adding `ties` back makes
# each of them contribute 0.
hits = np.count_nonzero(labels_real == labels_pred)
misses = np.count_nonzero(labels_real != labels_pred)
score = (hits - misses + ties) / len(eval_df)
score

In [None]:
# Use a dedicated loop variable so the ensemble `score` computed above is not overwritten.
for model_score in scores:
    print(model_score)

## Create submission

In [None]:
# One row per oid: all of its texts joined with spaces.
test_df = (
    pd.read_csv('data/test.csv')
    .groupby('oid')
    .agg({'text': lambda texts: ' '.join(texts)})
)
# Remember the oids before the index is replaced by 0..n-1.
oids = test_df.index
test_df.index = range(len(test_df))
test_df.head()

In [None]:
# Same normalization as the training text: lemmatize, keep word-like lemmas, drop SPAM.
test_valid_words = filter_texts(test_df, drop_rate=None)
test_df['text'] = [" ".join(filter_words(words)) for words in test_valid_words]
# Preview only; avoid rendering the whole frame.
test_df.head()

In [None]:
# Predictions and raw outputs of every model on the test texts, in `models` order.
preds_test, raws_test = [], []
test_texts = test_df['text'].to_list()
for model in models:
    predictions, raw_outputs = model.predict(test_texts)
    preds_test.append(predictions)
    raws_test.append(raw_outputs)

In [None]:
voted_preds, ties = agg_preds(preds_test, num_classes)

In [None]:
# probas_test = []
# for i in range(len(raws_test)):
#     median_outputs = np.array([output.sum(axis=0) for output in raws_test[i]])
#     softs = softmax(median_outputs, axis=1)
#     probas_test.append(softs)
        
# voted_preds, ties = agg_probas(probas_test, num_classes)

In [None]:
csv = pd.DataFrame({'oid':oids, 'category':voted_preds})

In [None]:
# Keep only rows with a resolved vote (ties are marked -1). `.copy()` makes the
# result an independent frame, so assigning to its columns later does not write
# into a slice. The former `csv.set_index('oid')` call was dropped: its result
# was discarded, so it had no effect.
csv = csv[csv['category'] > -1].copy()
csv

In [None]:
csv['category'] = csv['category'].map(lambda cat: cat_dict[cat])

In [None]:
filepath = Path('cache_dfs/submission.csv')  
filepath.parent.mkdir(parents=True, exist_ok=True)  
csv.to_csv(filepath, index=False)