In [None]:
import pandas as pd

# Read the source dataset from the Excel workbook in the working directory.
dataset_path = "final_dataset_limited_600_per_category.xlsx"
df = pd.read_excel(dataset_path)

In [None]:
# Discard rows in which title, overview and text are all missing
# (how='all' keeps a row as long as at least one of the three is present).
text_columns = ['title', 'overview', 'text']
df = df.dropna(how='all', subset=text_columns)

In [None]:
# Replace remaining missing values in the text fields with empty strings.
text_columns = ['title', 'overview', 'text']
df[text_columns] = df[text_columns].fillna('')

# Join the three fields, separated by single spaces, into one model input.
title_col, overview_col, body_col = (df[name] for name in text_columns)
df['fulltext'] = title_col + ' ' + overview_col + ' ' + body_col

In [None]:
from google.colab import drive

# Mount Google Drive inside the Colab VM.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

In [None]:
# !pip install datasets
# !pip install --upgrade transformers

### Data Analytics

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import BertTokenizer

# Initialise the multilingual BERT tokenizer.
# NOTE(review): nothing in this cell uses it, and the data-preparation cell
# re-binds `tokenizer` to the ruBERT tokenizer before it is used.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

# Class balance: share of each category (normalize=True returns fractions, not counts).
category_counts = df['category'].value_counts(normalize=True)

# Number of rows whose full text repeats an earlier row.
duplicate_count = df.duplicated(subset=['fulltext']).sum()

# Category distribution visualization.
fig, ax = plt.subplots(figsize=(10, 5))
category_counts.plot(kind='bar', ax=ax, rot=45)
ax.set_title('Category Distribution')
# The plotted values are normalized shares, so the axis must not be labelled "Count".
ax.set_ylabel('Share of samples')
ax.set_xlabel('Category')
fig.tight_layout()
plt.show()

# Print the main findings.
print("=== –ê–Ω–∞–ª–∏–∑ —Ç–µ–∫—Å—Ç–∞ –¥–ª—è BERT ===")
print(f"–û–±—â–µ–µ –∫–æ–ª–∏—á–µ—Å—Ç–≤–æ —Ç–µ–∫—Å—Ç–æ–≤: {len(df)}")
print(f"–ö–æ–ª–∏—á–µ—Å—Ç–≤–æ —É–Ω–∏–∫–∞–ª—å–Ω—ã—Ö –∫–∞—Ç–µ–≥–æ—Ä–∏–π: {df['category'].nunique()}")
print(f"–ö–æ–ª–∏—á–µ—Å—Ç–≤–æ –¥—É–±–ª–∏–∫–∞—Ç–æ–≤ –ø–æ –ø–æ–ª–Ω–æ–º—É —Ç–µ–∫—Å—Ç—É: {duplicate_count}")
print("\n=== –¢–æ–ø-10 –∫–∞—Ç–µ–≥–æ—Ä–∏–π –ø–æ –∫–æ–ª–∏—á–µ—Å—Ç–≤—É:")
print(df['category'].value_counts().head(10))

### Data preparation

In [None]:
# !pip install datasets

In [None]:
from transformers import BertTokenizer
from datasets import Dataset
from sklearn.model_selection import train_test_split
import pandas as pd

# Load the tokenizer that matches the checkpoint to be fine-tuned.
model_name = 'DeepPavlov/rubert-base-cased'
tokenizer = BertTokenizer.from_pretrained(model_name)

# (Re)build the fulltext column: title, overview and text joined by spaces,
# with missing fields treated as empty strings.
df['fulltext'] = (
    df['title'].fillna('') + ' ' +
    df['overview'].fillna('') + ' ' +
    df['text'].fillna('')
)

# Train/test split (80/20) that preserves the class proportions.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['category']
)

# Label dictionaries; ids follow the order in which categories first appear in df.
label2id = {label: idx for idx, label in enumerate(df['category'].unique())}
id2label = {v: k for k, v in label2id.items()}

# Encode the labels. `.assign` returns new frames instead of writing into the
# frames handed back by train_test_split, which avoids SettingWithCopyWarning.
train_df = train_df.assign(label=train_df['category'].map(label2id))
test_df = test_df.assign(label=test_df['category'].map(label2id))

# Convert to HuggingFace Datasets. preserve_index=False keeps the shuffled
# pandas index from being stored as an extra `__index_level_0__` column.
train_dataset = Dataset.from_pandas(train_df[['fulltext', 'label']], preserve_index=False)
eval_dataset = Dataset.from_pandas(test_df[['fulltext', 'label']], preserve_index=False)

# Tokenization
def tokenize(example):
    """Encode the ``fulltext`` field with the module-level ``tokenizer``.

    Inputs are truncated to 512 tokens and padded to exactly that length.
    Returns whatever the tokenizer call returns for ``example['fulltext']``.
    """
    encoding_options = dict(truncation=True, padding='max_length', max_length=512)
    return tokenizer(example['fulltext'], **encoding_options)

# Tokenize both splits in batched mode (train first, then eval).
train_dataset, eval_dataset = (
    split.map(tokenize, batched=True)
    for split in (train_dataset, eval_dataset)
)


### ruBERT model

In [None]:
!pip install -U transformers

In [None]:
# Sanity check: show which transformers version is actually importable.
import transformers
print(transformers.__version__)

In [None]:
# Diagnostic: print the module that TrainingArguments is defined in.
from transformers import TrainingArguments
print(TrainingArguments.__module__)


In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, classification_report
import matplotlib.pyplot as plt
from transformers import BertForSequenceClassification, TrainingArguments, Trainer

from transformers import TrainerCallback  # base class for the epoch-level hook below

os.environ["WANDB_DISABLED"] = "true"

# Training-run settings.
max_epochs = 10
patience = 2  # epochs without macro-F1 improvement tolerated before stopping
best_model_dir = './best_model_final'
final_model_dir = './final_model_early_stopped'

# Label ids in order, with their category names as strings.
label_ids = list(range(len(id2label)))
label_names = [str(id2label[i]) for i in label_ids]

# Load the model. Passing the label maps stores the category names in the saved
# config, so a reloaded model reports real categories instead of LABEL_0, LABEL_1, ...
model = BertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(label2id),
    id2label=dict(zip(label_ids, label_names)),
    label2id={name: idx for idx, name in zip(label_ids, label_names)},
)

# Training arguments for ONE continuous run.
# The previous version built a new Trainer for every epoch, which discarded the
# optimizer state and restarted the warmup / linear-decay schedule each epoch.
# `eval_strategy` is the argument name used by recent transformers releases
# (the notebook upgrades transformers in an earlier cell).
base_training_args = dict(
    output_dir='./results',
    num_train_epochs=max_epochs,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_dir='./logs',
    logging_steps=50,
    report_to='none',
    eval_strategy='epoch',
    save_strategy='no',
    seed=42,
    max_grad_norm=1.0,
    learning_rate=2e-5,
    warmup_steps=100
)
training_args = TrainingArguments(**base_training_args)


def compute_metrics(eval_pred):
    """Return accuracy and macro F1 for one evaluation pass and print a report.

    `eval_pred.predictions` holds the logits and `eval_pred.label_ids` the
    reference labels. The Trainer prefixes the returned keys with "eval_".
    """
    preds = np.argmax(eval_pred.predictions, axis=1)
    labels = eval_pred.label_ids
    acc = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average='macro')
    print(f"‚úÖ Accuracy: {acc:.4f}, Macro F1: {f1:.4f}")
    # Explicit `labels` keeps target_names aligned even if a class is absent
    # from both the references and the predictions.
    print(classification_report(
        labels, preds, labels=label_ids, target_names=label_names, zero_division=0
    ))
    return {'accuracy': acc, 'macro_f1': f1}


class MacroF1Tracker(TrainerCallback):
    """Track per-epoch metrics, save the best model and stop early on macro F1."""

    def __init__(self, model, tokenizer, save_dir, patience):
        self.model = model
        self.tokenizer = tokenizer
        self.save_dir = save_dir
        self.patience = patience
        self.acc_history = []
        self.f1_history = []
        self.best_acc = 0.0
        self.best_f1 = 0.0
        self.no_improve_epochs = 0
        self.epochs_started = 0

    def on_epoch_begin(self, args, state, control, **kwargs):
        """Print the epoch header."""
        self.epochs_started += 1
        print(f"\nüìò –≠–ø–æ—Ö–∞ {self.epochs_started}/{int(args.num_train_epochs)}")

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        """Record metrics after each evaluation; save on improvement, else count towards patience."""
        if not metrics or 'eval_macro_f1' not in metrics:
            return
        acc = metrics['eval_accuracy']
        f1 = metrics['eval_macro_f1']
        self.acc_history.append(acc)
        self.f1_history.append(f1)

        # Early stopping on macro F1.
        if f1 > self.best_f1:
            self.best_f1 = f1
            self.best_acc = acc
            self.no_improve_epochs = 0
            self.model.save_pretrained(self.save_dir)
            self.tokenizer.save_pretrained(self.save_dir)
            print("üíæ –õ—É—á—à–∞—è –º–æ–¥–µ–ª—å —Å–æ—Ö—Ä–∞–Ω–µ–Ω–∞ –≤ ./best_model_final")
        else:
            self.no_improve_epochs += 1
            print(f"‚ö†Ô∏è  –ù–µ—Ç —É–ª—É—á—à–µ–Ω–∏—è F1 ({self.no_improve_epochs}/{self.patience})")

        if self.no_improve_epochs >= self.patience:
            print("‚õî Early stopping: F1 –Ω–µ —É–ª—É—á—à–∞–µ—Ç—Å—è.")
            control.should_training_stop = True


tracker = MacroF1Tracker(model, tokenizer, best_model_dir, patience)

# NOTE(review): model selection and early stopping use `eval_dataset`, which is
# the test split, so the reported scores are optimistic. Carve a validation
# split out of the training data if an unbiased test score is needed.
# The tokenizer is not passed to the Trainer: the inputs are already padded to a
# fixed length during tokenization, and the tokenizer is saved explicitly below.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
    callbacks=[tracker],
)

trainer.train()

# Expose the results under the names used by the rest of the notebook.
acc_history = tracker.acc_history
f1_history = tracker.f1_history
best_acc = tracker.best_acc
best_f1 = tracker.best_f1
no_improve_epochs = tracker.no_improve_epochs

# Save the final model (weights as of the last completed epoch).
model.save_pretrained(final_model_dir)
tokenizer.save_pretrained(final_model_dir)
print("üéØ –§–∏–Ω–∞–ª—å–Ω–∞—è –º–æ–¥–µ–ª—å —Å–æ—Ö—Ä–∞–Ω–µ–Ω–∞ –≤ ./final_model_early_stopped")

# Plot accuracy and macro F1 per epoch.
epochs_axis = range(1, len(acc_history) + 1)
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(epochs_axis, acc_history, label='Accuracy')
ax.plot(epochs_axis, f1_history, label='Macro F1')
ax.set_xlabel('Epoch')
ax.set_ylabel('Score')
ax.set_title('Accuracy & Macro F1 per Epoch')
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()


In [None]:
# Copy both saved model directories into the mounted Google Drive folder.
!cp -r ./best_model_final /content/drive/MyDrive/
!cp -r ./final_model_early_stopped /content/drive/MyDrive/

### Поиск ключевых слов для каждой категории (keyword extraction per category)

In [None]:
!pip install natasha
import nltk
nltk.download('stopwords')

In [None]:
import pandas as pd
import re
from collections import Counter
from nltk.corpus import stopwords
from natasha import MorphVocab, Doc, NewsEmbedding, NewsMorphTagger, Segmenter
from tqdm import tqdm
import nltk

# Register tqdm's progress_apply on pandas objects.
tqdm.pandas()

# ======= Natasha components =======
segmenter = Segmenter()
emb = NewsEmbedding()
morph_tagger = NewsMorphTagger(emb)
morph_vocab = MorphVocab()

# ======= Stop words =======
# NLTK's Russian list combined with a hand-picked set of extra words.
russian_stopwords = stopwords.words("russian")
custom_stopwords = {
    '—Ç–∞–∫–∂–µ', '–∫–æ—Ç–æ—Ä—ã–π', '–∫–æ—Ç–æ—Ä—ã–µ', '–Ω–∞–ø—Ä–∏–º–µ—Ä', '–≥–æ–¥–∞',
    '–±—É–¥–µ—Ç', '–¥–∞–Ω–Ω—ã–π', '–¥–∞–ª–µ–µ', '–Ω—É–∂–Ω–æ', '–º–æ–∂–µ—Ç','–Ω–æ–≤—ã–π', '–º–æ—á—å',
    '—Ä–æ—Å—Å–∏—è', '—Ä–æ—Å—Å–∏–π—Å–∫–∏–π', '–∞–º–µ—Ä–∏–∫–∞–Ω—Å–∫–∏–π', '—É–∫—Ä–∞–∏–Ω—Å–∫–∏–π'
}
stop_words = custom_stopwords | set(russian_stopwords)

# ======= Cleaning and lemmatization =======
# Everything except Cyrillic letters (А-Я, а-я, Ё, ё) and spaces is replaced by
# a space. The class is written with \u escapes so the pattern cannot be broken
# by a mis-encoded source file.
_NON_CYRILLIC_RE = re.compile(r'[^\u0410-\u042F\u0430-\u044F\u0401\u0451 ]')

# Parts of speech that never become keywords. The Natasha tagger reports
# Universal Dependencies tags (CCONJ / SCONJ / PART); the older 'CONJ' and
# 'PRCL' spellings are kept so nothing that was filtered before slips through.
_EXCLUDED_POS = frozenset({
    'CONJ', 'CCONJ', 'SCONJ', 'ADP', 'PRCL', 'PART', 'INTJ', 'PROPN',
})


def preprocess(text):
    """Turn raw text into a list of keyword-candidate lemmas.

    Steps: keep only Cyrillic letters and spaces, drop words that start with a
    capital letter, lowercase, tag with Natasha, discard excluded parts of
    speech and tokens of three characters or fewer, lemmatize, then discard
    stop words and lemmas of three characters or fewer.

    Args:
        text: Any value; it is converted with ``str()`` first.

    Returns:
        list[str]: Lemmas in order of appearance (duplicates preserved).
    """
    cleaned = _NON_CYRILLIC_RE.sub(' ', str(text))
    # Capitalised words are dropped as a cheap proper-name filter; this also
    # removes ordinary sentence-initial words.
    words = [word for word in cleaned.split() if not word[0].isupper()]
    if not words:
        # Nothing left to analyse; skip the tagger entirely.
        return []

    doc = Doc(' '.join(words).lower())
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)

    lemmas = []
    for token in doc.tokens:
        if len(token.text) <= 3 or token.pos in _EXCLUDED_POS:
            continue
        token.lemmatize(morph_vocab)
        lemma = token.lemma
        if len(lemma) > 3 and lemma not in stop_words:
            lemmas.append(lemma)
    return lemmas

# ======= Apply the preprocessing (with a progress bar) =======
df['lemmas'] = df['text'].progress_apply(preprocess)

# ======= Count words per category: keep the 20 most frequent lemmas =======
theme_word_counts = {}
for theme, group in df.groupby('category'):
    lemma_freq = Counter(
        lemma
        for row_lemmas in group['lemmas']
        for lemma in row_lemmas
    )
    theme_word_counts[theme] = lemma_freq.most_common(20)

# ======= Print the results =======
for theme, top_words in theme_word_counts.items():
    print(f"\n–ö–∞—Ç–µ–≥–æ—Ä–∏—è: {theme}")
    for word, count in top_words:
        print(f"  {word}: {count}")
