In [None]:
import pandas as pd

# Source spreadsheet; read_excel resolves the name against the current working directory
dataset_path = "final_dataset_limited_600_per_category.xlsx"
df = pd.read_excel(dataset_path)

In [None]:
# Discard only the rows in which title, overview and text are ALL missing
text_columns = ['title', 'overview', 'text']
df = df.dropna(how='all', subset=text_columns)

In [None]:
# Replace the remaining NaNs with empty strings so the concatenation below never yields NaN
text_columns = ['title', 'overview', 'text']
df[text_columns] = df[text_columns].fillna('')

# One space-separated string per row: title, then overview, then body text
title, overview, body = (df[column] for column in text_columns)
df['fulltext'] = title + ' ' + overview + ' ' + body

In [None]:
# Mount Google Drive at /content/drive (Colab only).
# The trained model directories are copied to /content/drive/MyDrive/ further down.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# !pip install datasets
# !pip install --upgrade transformers

### Data Analytics

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import BertTokenizer

# Initialise the multilingual BERT tokenizer.
# NOTE(review): nothing else in this cell uses it, and the data-preparation cell
# rebinds `tokenizer` to the ruBERT tokenizer before any text is tokenized.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

# Class balance: normalize=True yields the FRACTION of rows per category, not a count
category_counts = df['category'].value_counts(normalize=True)

# Number of rows whose full text repeats an earlier row
duplicate_count = df.duplicated(subset=['fulltext']).sum()

# Category distribution visualization
plt.figure(figsize=(10, 5))
category_counts.plot(kind='bar')
plt.title('Category Distribution')
# The bars are fractions (see normalize=True above), so the axis must not say "Count"
plt.ylabel('Share of samples')
plt.xlabel('Category')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Print the headline numbers
print("=== Анализ текста для BERT ===")
print(f"Общее количество текстов: {len(df)}")
print(f"Количество уникальных категорий: {df['category'].nunique()}")
print(f"Количество дубликатов по полному тексту: {duplicate_count}")
print("\n=== Топ-10 категорий по количеству:")
print(df['category'].value_counts().head(10))

### Data preparation

In [None]:
# !pip install datasets

In [None]:
from transformers import BertTokenizer
from datasets import Dataset
from sklearn.model_selection import train_test_split
import pandas as pd

# Name of the checkpoint to fine-tune, and the tokenizer that belongs to it
model_name = 'DeepPavlov/rubert-base-cased'
tokenizer = BertTokenizer.from_pretrained(model_name)

# Build the fulltext column: title, overview and text joined by single spaces
# (missing fields become empty strings)
df['fulltext'] = (
    df['title'].fillna('') + ' ' +
    df['overview'].fillna('') + ' ' +
    df['text'].fillna('')
)

# Train/test split that keeps the class proportions identical in both parts
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['category']
)

# Label dictionaries; ids follow the order in which categories first appear in df
label2id = {label: idx for idx, label in enumerate(df['category'].unique())}
id2label = {v: k for k, v in label2id.items()}

# Encode the labels. `.assign` returns new frames, so nothing is written into the
# objects handed back by train_test_split (avoids pandas' SettingWithCopyWarning).
train_df = train_df.assign(label=train_df['category'].map(label2id))
test_df = test_df.assign(label=test_df['category'].map(label2id))

# Convert to HuggingFace Datasets. After the shuffled split the pandas index is no
# longer a plain range; preserve_index=False keeps it from being stored as an
# extra `__index_level_0__` column.
train_dataset = Dataset.from_pandas(train_df[['fulltext', 'label']], preserve_index=False)
eval_dataset = Dataset.from_pandas(test_df[['fulltext', 'label']], preserve_index=False)

# Tokenization
def tokenize(example):
    """Tokenize the `fulltext` field of a batch with the module-level `tokenizer`.

    Every sequence is truncated and padded to exactly 512 tokens.
    """
    texts = example['fulltext']
    return tokenizer(texts, max_length=512, padding='max_length', truncation=True)

train_dataset = train_dataset.map(function=tokenize, batched=True)
eval_dataset = eval_dataset.map(function=tokenize, batched=True)


### ruBERT model

In [None]:
!pip install -U transformers

In [None]:
# Diagnostic: print the version of the transformers package this kernel has imported
import transformers
print(transformers.__version__)

In [None]:
# Diagnostic: print the name of the module in which TrainingArguments is defined
from transformers import TrainingArguments
print(TrainingArguments.__module__)


In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, classification_report
import matplotlib.pyplot as plt
from transformers import BertForSequenceClassification, TrainingArguments, Trainer

from transformers import TrainerCallback  # used only by the early-stopping hook below

os.environ["WANDB_DISABLED"] = "true"

# Upper bound on the number of epochs; early stopping may end the run sooner
max_epochs = 10

# Load the model
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=len(label2id))

# Training arguments for ONE continuous run over all epochs.
# Building a fresh Trainer per epoch (num_train_epochs=1) would re-create the optimizer
# and the LR scheduler every time, i.e. restart the 100-step warmup and the decay to
# zero each epoch and throw away the AdamW moment estimates.
base_training_args = dict(
    output_dir='./results',
    num_train_epochs=max_epochs,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    eval_strategy='epoch',  # evaluate after every epoch (`evaluation_strategy` before transformers 4.41)
    logging_dir='./logs',
    logging_steps=50,
    report_to='none',
    save_strategy='no',
    seed=42,
    max_grad_norm=1.0,
    learning_rate=2e-5,
    warmup_steps=100
)

# Metric histories (one entry per evaluated epoch)
acc_history = []
f1_history = []
best_acc = 0.0
best_f1 = 0.0

# Early stopping
patience = 2
no_improve_epochs = 0

# Fixed label order for the report, so it stays aligned with target_names even if
# some class is missing from the predictions
all_label_ids = list(range(len(id2label)))
target_names = [id2label[i] for i in all_label_ids]


def compute_metrics(eval_pred):
    """Compute accuracy and macro F1 for one evaluation pass and print a per-class report.

    The Trainer stores the returned values under the keys `eval_accuracy` and `eval_f1`.
    """
    preds = np.argmax(eval_pred.predictions, axis=1)
    labels = eval_pred.label_ids

    acc = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average='macro')

    print(f"✅ Accuracy: {acc:.4f}, Macro F1: {f1:.4f}")
    print(classification_report(
        labels, preds,
        labels=all_label_ids,
        target_names=target_names,
        zero_division=0,
    ))
    return {'accuracy': acc, 'f1': f1}


class MacroF1EarlyStopping(TrainerCallback):
    """Track macro F1 per epoch, save the best model and stop after `patience` stale epochs."""

    def __init__(self, patience):
        self.patience = patience
        self.epoch = 0
        self.best_f1 = 0.0
        self.best_acc = 0.0
        self.no_improve_epochs = 0

    def on_epoch_begin(self, args, state, control, **kwargs):
        self.epoch += 1
        print(f"\n📘 Эпоха {self.epoch}/{max_epochs}")

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        metrics = metrics or {}
        if 'eval_f1' not in metrics:
            # Evaluation ran without compute_metrics; nothing to compare
            return

        acc, f1 = metrics['eval_accuracy'], metrics['eval_f1']
        acc_history.append(acc)
        f1_history.append(f1)

        # Early stopping on F1
        if f1 > self.best_f1:
            self.best_f1 = f1
            self.best_acc = acc
            self.no_improve_epochs = 0
            model.save_pretrained('./best_model_final')
            tokenizer.save_pretrained('./best_model_final')
            print("💾 Лучшая модель сохранена в ./best_model_final")
        else:
            self.no_improve_epochs += 1
            print(f"⚠️  Нет улучшения F1 ({self.no_improve_epochs}/{self.patience})")

        if self.no_improve_epochs >= self.patience:
            print("⛔ Early stopping: F1 не улучшается.")
            # The Trainer checks this flag once the epoch's evaluation has finished
            control.should_training_stop = True


training_args = TrainingArguments(**base_training_args)
early_stopping = MacroF1EarlyStopping(patience)

# A single Trainer for the whole run; it reshuffles the training data every epoch itself
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

trainer.train()

# Expose the final counters under the notebook-level names
best_acc = early_stopping.best_acc
best_f1 = early_stopping.best_f1
no_improve_epochs = early_stopping.no_improve_epochs

# Save the final model (weights as of the last trained epoch, not necessarily the best)
model.save_pretrained('./final_model_early_stopped')
tokenizer.save_pretrained('./final_model_early_stopped')
print("🎯 Финальная модель сохранена в ./final_model_early_stopped")

# Plot
plt.figure(figsize=(10, 5))
plt.plot(range(1, len(acc_history) + 1), acc_history, label='Accuracy')
plt.plot(range(1, len(f1_history) + 1), f1_history, label='Macro F1')
plt.xlabel('Epoch')
plt.ylabel('Score')
plt.title('Accuracy & Macro F1 per Epoch')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()


In [None]:
# Copy both saved model directories to Google Drive.
# Requires Drive to be mounted at /content/drive (see the drive.mount cell).
!cp -r ./best_model_final /content/drive/MyDrive/
!cp -r ./final_model_early_stopped /content/drive/MyDrive/

### Поиск ключевых слов для каждой категории

In [None]:
!pip install natasha
import nltk
nltk.download('stopwords')

In [None]:
import pandas as pd
import re
from collections import Counter
from nltk.corpus import stopwords
from natasha import MorphVocab, Doc, NewsEmbedding, NewsMorphTagger, Segmenter
from tqdm import tqdm
import nltk

# Register tqdm with pandas (provides Series.progress_apply)
tqdm.pandas()

# ======= Natasha components =======
morph_vocab = MorphVocab()
segmenter = Segmenter()
emb = NewsEmbedding()
morph_tagger = NewsMorphTagger(emb)

# ======= Stop words =======
russian_stopwords = stopwords.words("russian")
custom_stopwords = {
    'также', 'который', 'которые', 'например', 'года',
    'будет', 'данный', 'далее', 'нужно', 'может', 'новый', 'мочь',
    'россия', 'российский', 'американский', 'украинский',
}
# Union of the NLTK list and the hand-picked additions
stop_words = custom_stopwords.union(russian_stopwords)

# ======= Cleaning and lemmatization =======
def preprocess(text):
    """Turn raw text into a list of filtered Russian lemmas.

    Steps: replace every character that is not a Cyrillic letter or a space with a
    space, drop words that start with an uppercase letter, lowercase the rest, tag
    it with Natasha and keep lemmas that are longer than 3 characters, are not
    stop words and do not belong to the skipped parts of speech.
    """
    cyrillic_only = re.sub(r'[^А-Яа-яЁё ]', ' ', str(text))
    # Words starting with a capital letter are dropped before lowercasing
    kept_words = [w for w in cyrillic_only.split() if not w[0].isupper()]

    doc = Doc(' '.join(kept_words).lower())
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)

    skipped_pos = ('CONJ', 'ADP', 'PRCL', 'INTJ', 'PROPN')
    lemmas = []
    for token in doc.tokens:
        if token.pos in skipped_pos or len(token.text) <= 3:
            continue
        token.lemmatize(morph_vocab)
        lemma = token.lemma
        if lemma in stop_words or len(lemma) <= 3:
            continue
        lemmas.append(lemma)
    return lemmas

# ======= Lemmatize every article body =======
df['lemmas'] = df['text'].progress_apply(preprocess)

# ======= 20 most frequent lemmas per category =======
theme_word_counts = {}

for theme, group in df.groupby('category'):
    word_freq = Counter()
    for lemmas in group['lemmas']:
        word_freq.update(lemmas)
    theme_word_counts[theme] = word_freq.most_common(20)

# ======= Print the results =======
for theme in theme_word_counts:
    print(f"\nКатегория: {theme}")
    for word, count in theme_word_counts[theme]:
        print(f"  {word}: {count}")
