In [1]:
from typing import List, Tuple

In [2]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
import spacy
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def load_sib200_ru() -> Tuple[Tuple[List[str], List[int]], Tuple[List[str], List[int]], Tuple[List[str], List[int]], List[str]]:
    """Load the Russian subset of SIB-200 and encode its categories as integers.

    The 'train', 'validation' and 'test' splits of the 'rus_Cyrl' configuration
    of 'Davlan/sib200' are fetched with `load_dataset`. Category names are
    sorted alphabetically and every label is replaced by the position of its
    name in that sorted list.

    Returns:
        A 4-tuple `(train, val, test, categories)`, where each of the first
        three items is a pair `(texts, label_indices)` and `categories` is the
        sorted list of category names seen in the training split.

    Raises:
        RuntimeError: if the validation or the test split contains a category
            that never occurs in the training split.
    """
    def load_split(split_name: str):
        # list() guarantees plain Python lists, matching the declared return
        # type whatever sequence type the dataset's column access yields.
        subset = load_dataset('Davlan/sib200', 'rus_Cyrl', split=split_name)
        return list(subset['text']), list(subset['category'])

    X_train, train_labels = load_split('train')
    X_val, val_labels = load_split('validation')
    X_test, test_labels = load_split('test')

    # Every evaluation label must be known from training, otherwise it could
    # not be mapped to an index below.
    known_categories = set(train_labels)
    for split_title, labels in (('validation', val_labels), ('test', test_labels)):
        unknown_categories = set(labels) - known_categories
        if unknown_categories:
            err_msg = f'The categories {unknown_categories} are represented in the {split_title} set, but they are not represented in the training set.'
            raise RuntimeError(err_msg)

    categories = sorted(known_categories)
    # A dict lookup replaces a linear `list.index` scan for every sample.
    category_to_index = {name: idx for idx, name in enumerate(categories)}
    y_train = [category_to_index[it] for it in train_labels]
    y_val = [category_to_index[it] for it in val_labels]
    y_test = [category_to_index[it] for it in test_labels]
    return (X_train, y_train), (X_val, y_val), (X_test, y_test), categories

In [4]:
def normalize_text(s: str, nlp_pipeline: spacy.Language) -> str:
    """Convert a raw text into a space-separated string of normalized tokens.

    The text is processed by `nlp_pipeline`. Punctuation tokens are dropped,
    number-like tokens are replaced by the placeholder '<NUM>', and every other
    token is replaced by its lower-cased lemma.

    Args:
        s: the text to normalize.
        nlp_pipeline: a callable pipeline that turns the text into tokens
            exposing `is_punct`, `like_num` and `lemma_`.

    Returns:
        The normalized tokens joined by single spaces; an empty string when no
        token survives the punctuation filter.
    """
    normalized_tokens = []
    for token in nlp_pipeline(s):
        if token.is_punct:
            continue
        if token.like_num:
            normalized_tokens.append('<NUM>')
        else:
            normalized_tokens.append(token.lemma_.lower())
    # Joining an empty list already yields '', so no special case is needed.
    return ' '.join(normalized_tokens)

In [5]:
# Each *_data value is a (texts, label_indices) pair; classes_list holds the
# sorted category names that the label indices refer to.
train_data, val_data, test_data, classes_list = load_sib200_ru()

In [6]:
# Show the category names in the order used for the integer labels.
print(f'Categories: {classes_list}')

Categories: ['entertainment', 'geography', 'health', 'politics', 'science/technology', 'sports', 'travel']


In [7]:
# Sanity check: the training split must have as many texts as labels.
print(len(train_data[0]), len(train_data[1]), sep='\n')

701
701


In [8]:
# Sanity check: the validation split must have as many texts as labels.
print(len(val_data[0]), len(val_data[1]), sep='\n')

99
99


In [9]:
# Sanity check: the test split must have as many texts as labels.
print(len(test_data[0]), len(test_data[1]), sep='\n')

204
204


In [11]:
# spaCy pipeline passed to normalize_text(); the 'ru_core_news_sm' model must
# be installed in the kernel's environment beforehand.
nlp = spacy.load('ru_core_news_sm')

In [12]:
# First training text, before normalization.
print(train_data[0][0])

Турция с трёх сторон окружена морями: на западе — Эгейским, на севере — Чёрным и на юге — Средиземным.


In [13]:
# The same training text after normalize_text(): lower-cased lemmas,
# punctuation removed, number-like tokens replaced by '<NUM>'.
print(normalize_text(train_data[0][0], nlp))

турция с <NUM> сторона окружить морями на запад эгейским на север чёрный и на юг средиземный


In [14]:
# First validation text, before normalization.
print(val_data[0][0])

Если увеличить расстояние для бега с четверти до половины мили, скорость становится не так важна, тогда как выносливость превращается в абсолютную необходимость.


In [15]:
# The same validation text after normalize_text().
print(normalize_text(val_data[0][0], nlp))

если увеличить расстояние для бег с <NUM> до <NUM> миля скорость становиться не так важный тогда как выносливость превращаться в абсолютный необходимость


In [16]:
# First test text, before normalization.
print(test_data[0][0])

Мутация вносит новую генетическую вариацию, в то время как отбор убирает её из набора проявляющихся вариаций.


In [17]:
# The same test text after normalize_text().
print(normalize_text(test_data[0][0], nlp))

мутация вносить новый генетический вариация в тот время как отбор убирать её из набор проявляться вариация


In [18]:
# Upper bound on a term's document frequency, later passed to TfidfVectorizer:
# 1 minus a fifth of the uniform class prior (1 / number of categories).
# NOTE(review): the 0.2 factor is an unexplained constant — confirm its rationale.
class_probability = 1.0 / len(classes_list)
max_df = 1.0 - 0.2 * class_probability
print(f'Maximal document frequency of term is {max_df}.')

Maximal document frequency of term is 0.9714285714285714.


In [19]:
# TF-IDF features over the normalized text, followed by logistic regression.
classifier = Pipeline(steps=[
    # The pattern is a raw string: '\w' inside a plain literal is an invalid
    # escape sequence and makes Python emit a warning when the cell is compiled.
    # r'\w+' also keeps one-character tokens.
    ('vectorizer', TfidfVectorizer(token_pattern=r'\w+', max_df=max_df, min_df=1)),
    # NOTE(review): max_iter=100 may be too few iterations for 'saga' at the
    # larger C values explored by the grid search — check for convergence warnings.
    ('cls', LogisticRegression(solver='saga', max_iter=100, random_state=42))
])

  ('vectorizer', TfidfVectorizer(token_pattern='\w+', max_df=max_df, min_df=1)),


In [20]:
# Grid search over the n-gram range of the vectorizer and the regularization
# strength/penalty of the classifier: 3 * 5 * 2 = 30 candidates, each scored
# by macro-averaged F1 with 5-fold cross-validation.
cv = GridSearchCV(
    estimator=classifier,
    param_grid={
        'vectorizer__ngram_range': [(1, 1), (1, 2), (1, 3)],
        'cls__C': [1e-1, 1, 10, 100, 1000],
        'cls__penalty': ['l1', 'l2']
    },
    scoring='f1_macro',
    cv=5,
    # refit=True retrains the best candidate on the full training data, which
    # is what makes cv.predict() and cv.best_estimator_ available afterwards.
    refit=True,
    n_jobs=-1,
    verbose=True
)

In [21]:
# Normalize every training text, then run the grid search on the result.
cv.fit([normalize_text(it, nlp) for it in train_data[0]], train_data[1])

Fitting 5 folds for each of 30 candidates, totalling 150 fits




In [22]:
# Hyper-parameter combination that achieved the best cross-validated score.
print('Best parameters:', cv.best_params_, sep='\n')

Best parameters:
{'cls__C': 1000, 'cls__penalty': 'l2', 'vectorizer__ngram_range': (1, 1)}


In [23]:
# Mean cross-validated macro F1 of the best hyper-parameter combination.
print('Best F1-macro:', cv.best_score_, sep='\n')

Best F1-macro:
0.6451646278967693


In [24]:
# Number of distinct terms kept by the vectorizer of the refitted best pipeline.
print(f'Vocabulary size is {len(cv.best_estimator_.named_steps["vectorizer"].vocabulary_)}.')

Vocabulary size is 4359.


In [26]:
# Evaluate on the held-out test split: the texts get the same normalization as
# the training data, and the report shows per-category precision/recall/F1.
y_pred = cv.predict([normalize_text(it, nlp) for it in test_data[0]])
print(classification_report(y_true=test_data[1], y_pred=y_pred, target_names=classes_list))

                    precision    recall  f1-score   support

     entertainment       0.89      0.42      0.57        19
         geography       0.64      0.53      0.58        17
            health       0.47      0.41      0.44        22
          politics       0.78      0.83      0.81        30
science/technology       0.65      0.78      0.71        51
            sports       0.87      0.80      0.83        25
            travel       0.62      0.70      0.66        40

          accuracy                           0.68       204
         macro avg       0.70      0.64      0.66       204
      weighted avg       0.69      0.68      0.68       204

