In [None]:
import pandas as pd

# Load the parsed dataset from the Excel file in the working directory
df = pd.read_excel("parsed_data.xlsx")

In [None]:
# Inspect the frame exactly as it was read from the Excel file
df

In [None]:
# Drop rows in which title, overview and text are all missing (how='all');
# a row with at least one of the three present is kept.
df = df.dropna(subset=['title', 'overview', 'text'], how='all')

In [None]:
# Inspect the frame after rows without any text were dropped
df

In [None]:
# Missing text parts become empty strings so that string concatenation
# below never meets a NaN.
text_columns = ['title', 'overview', 'text']
df[text_columns] = df[text_columns].fillna('')

# One text field per row: title, overview and body separated by single spaces
df['fulltext'] = (
    df['title']
    + ' ' + df['overview']
    + ' ' + df['text']
)

In [None]:
# Inspect the frame with the new `fulltext` column
df

In [None]:
# Mount Google Drive at /content/drive (the google.colab module is only
# available inside Colab); the trained models are copied there later on.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# !pip install datasets
# !pip install --upgrade transformers

### Data Analytics

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import BertTokenizer

# Initialise the multilingual BERT tokenizer.
# NOTE(review): `tokenizer` is not used anywhere in this cell and is rebound
# to the ruBERT tokenizer in the data-preparation cell below.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

# Class balance: normalize=True yields the share of each category
# (fractions summing to 1), not absolute counts.
category_counts = df['category'].value_counts(normalize=True)

# Number of rows whose full text repeats an earlier row
duplicate_count = df.duplicated(subset=['fulltext']).sum()

# Category distribution visualization
plt.figure(figsize=(10, 5))
category_counts.plot(kind='bar')
plt.title('Category Distribution')
# The plotted values are fractions, so the axis must not be labelled "Count"
plt.ylabel('Share of texts')
plt.xlabel('Category')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Print the headline figures
print("=== Анализ текста для BERT ===")
print(f"Общее количество текстов: {len(df)}")
print(f"Количество уникальных категорий: {df['category'].nunique()}")
print(f"Количество дубликатов по полному тексту: {duplicate_count}")
print("\n=== Топ-10 категорий по количеству:")
print(df['category'].value_counts().head(10))

### Data preparation

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import pandas as pd
import torch

# Checkpoint name shared by the tokenizer here and the classifier loaded in
# the training cell; this rebinds `tokenizer` from the analytics cell.
model_name = 'DeepPavlov/rubert-base-cased'
tokenizer = BertTokenizer.from_pretrained(model_name)

# Number of rows to draw per category for the training split
sample_counts = {
    'Политика': 1000,
    'Общество': 500,
    'Спорт': 500,
    'Авто': 500,
    'Бизнес': 100,
    'Технологии и медиа': 100,
    'Экономика': 100,
    'Финансы': 100,
    'База знаний': 50
}


def _build_fulltext(frame):
    """Return title, overview and text of `frame` joined by single spaces (NaN -> '')."""
    return (
        frame['title'].fillna('') + ' ' +
        frame['overview'].fillna('') + ' ' +
        frame['text'].fillna('')
    )


# Sample the training rows category by category with a fixed seed. The
# requested size is capped at the number of rows actually available, because
# DataFrame.sample raises ValueError when asked for more rows than exist.
train_parts = []
for cat, n_wanted in sample_counts.items():
    cat_rows = df[df['category'] == cat]
    n_taken = min(n_wanted, len(cat_rows))
    if n_taken < n_wanted:
        print(f"Category {cat!r}: requested {n_wanted} rows, only {len(cat_rows)} available")
    train_parts.append(cat_rows.sample(n=n_taken, random_state=42))
train_df = pd.concat(train_parts)

# Label ids follow the order in which categories first appear in train_df
label2id = {label: idx for idx, label in enumerate(train_df['category'].unique())}
id2label = {v: k for k, v in label2id.items()}

# Every row that was not sampled for training goes to the evaluation split.
# Rows whose category has no label id (a category outside sample_counts or a
# missing category) are removed: mapping them would produce NaN labels, which
# turns the label column into floats and breaks the classification loss.
test_df = df.drop(train_df.index)
test_df = test_df[test_df['category'].isin(list(label2id))].copy()

# Text and labels
train_df['fulltext'] = _build_fulltext(train_df)
test_df['fulltext'] = _build_fulltext(test_df)

# Explicit int64 keeps the labels integral on every platform
train_df['label'] = train_df['category'].map(label2id).astype('int64')
test_df['label'] = test_df['category'].map(label2id).astype('int64')

# Build the datasets; preserve_index=False keeps the pandas index from being
# stored as an extra `__index_level_0__` column.
train_dataset = Dataset.from_pandas(train_df[['fulltext', 'label']], preserve_index=False)
eval_dataset = Dataset.from_pandas(test_df[['fulltext', 'label']], preserve_index=False)

# Tokenization
def tokenize(example, max_length=256):
    """Tokenize the ``fulltext`` field of an example or batch.

    Uses the notebook-level ``tokenizer``. Every sequence is truncated and
    padded to exactly ``max_length`` tokens.

    Args:
        example: Mapping with a ``'fulltext'`` entry (a string, or a list of
            strings when called through ``Dataset.map(..., batched=True)``).
        max_length: Target sequence length; defaults to 256.

    Returns:
        Whatever the tokenizer call returns for the given text.
    """
    return tokenizer(
        example['fulltext'],
        truncation=True,
        padding='max_length',
        max_length=max_length,
    )

# Apply `tokenize` to both splits in batches; the datasets are rebound to
# the results of the map calls.
train_dataset = train_dataset.map(tokenize, batched=True)
eval_dataset = eval_dataset.map(tokenize, batched=True)

### ruBERT model

In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, classification_report
import matplotlib.pyplot as plt
from transformers import BertForSequenceClassification, TrainingArguments, Trainer

# Disable the Weights & Biases integration
os.environ["WANDB_DISABLED"] = "true"

# Load the pretrained checkpoint with one output per category. The label maps
# are stored in the model config so that checkpoints written by
# save_pretrained carry the category names instead of generic LABEL_<i> ids.
model = BertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
)

# Training arguments: each trainer.train() call runs exactly one epoch
# (num_train_epochs=1); the epoch loop further down calls it repeatedly.
# NOTE(review): the learning-rate schedule presumably restarts on every
# train() call — confirm this is intended.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_dir='./logs',
    logging_steps=50,
    report_to='none'
)

# Trainer
# NOTE(review): recent transformers releases appear to rename the `tokenizer`
# argument to `processing_class` — confirm against the installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
)

# Epoch loop
num_epochs = 20
acc_history = []
f1_history = []

best_acc = 0.0
best_f1 = 0.0

# Fixed, complete label order for the per-class report. Without an explicit
# `labels` argument classification_report infers the classes from the data
# and raises ValueError when a class is absent from both the true labels and
# the predictions (their count then differs from len(target_names)).
report_label_ids = list(range(len(id2label)))
report_target_names = [id2label[i] for i in report_label_ids]

for epoch in range(num_epochs):
    print(f"\n Эпоха {epoch+1}/{num_epochs}")
    trainer.train()

    # Predictions on the evaluation split
    predictions = trainer.predict(eval_dataset)
    preds = np.argmax(predictions.predictions, axis=1)
    labels = predictions.label_ids

    # Metrics
    acc = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average='macro')
    acc_history.append(acc)
    f1_history.append(f1)

    print(f" Accuracy: {acc:.4f}, Macro F1: {f1:.4f}")

    # Classification report
    print("\n Classification Report:")
    print(classification_report(
        labels,
        preds,
        labels=report_label_ids,
        target_names=report_target_names,
        zero_division=0,
    ))

    # Save the model only when accuracy AND macro F1 both beat their best
    # values so far; the stored bests are updated only in that case.
    if acc > best_acc and f1 > best_f1:
        best_acc = acc
        best_f1 = f1
        model.save_pretrained('./best_model')
        tokenizer.save_pretrained('./best_model')
        print(" Лучшая модель сохранена в ./best_model")

# Save the final model (state after the last epoch)
model.save_pretrained('./rbc_bert_classifier_new_data')
tokenizer.save_pretrained('./rbc_bert_classifier_new_data')
print(" Финальная модель сохранена в ./rbc_bert_classifier_new_data")

# Plot how accuracy and macro F1 evolved across the epochs
epochs = range(1, num_epochs + 1)
plt.figure(figsize=(10, 5))
for history, series_name in ((acc_history, 'Accuracy'), (f1_history, 'Macro F1')):
    plt.plot(epochs, history, label=series_name)
plt.xlabel('Epoch')
plt.ylabel('Score')
plt.title('Accuracy & F1 per Epoch')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()


In [None]:
# Copy both saved model directories to the mounted Google Drive
!cp -r ./best_model /content/drive/MyDrive/
!cp -r ./rbc_bert_classifier_new_data /content/drive/MyDrive/

### BERTopic. Not used in the final work — just an experiment

In [None]:
# # 📦 Установка зависимостей
# !pip install -q bertopic[visualization]
# !pip install -q sentence-transformers
# !pip install -q umap-learn

In [None]:
import pandas as pd
import re
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
import matplotlib.pyplot as plt

def clean_text(text):
    """Normalise a piece of text for topic modelling.

    The value is converted with ``str()``, lower-cased, stripped of every
    character that is neither a word character nor whitespace, and then has
    each run of whitespace collapsed to a single space.

    Punctuation is removed *before* whitespace is collapsed: deleting a
    free-standing mark (the dash in ``"a - b"``) leaves two adjacent spaces,
    which would otherwise survive in the result.

    Args:
        text: Any value; it is passed through ``str()`` first.

    Returns:
        The cleaned string without leading or trailing whitespace.
    """
    text = str(text).lower()
    text = re.sub(r"[^\w\s]", "", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()

# `data` is not defined by any earlier cell, so on a fresh kernel this cell
# would stop with NameError. Work on a copy of the frame loaded at the top of
# the notebook, which has the title/overview/text/category columns used here.
data = df.copy()


def _majority_category_by(frame, topic_column):
    """Map each value of `topic_column` to the most frequent category of its rows.

    Rows without a category are ignored, so a topic whose rows are all
    unlabeled simply gets no entry instead of raising IndexError on an empty
    value_counts() result.
    """
    labeled = frame.dropna(subset=['category'])
    return (
        labeled.groupby(topic_column)['category']
        .agg(lambda cats: cats.value_counts().index[0])
        .to_dict()
    )


data['full_text'] = data[['title', 'overview', 'text']].fillna('').agg(' '.join, axis=1)
data['clean_text'] = data['full_text'].apply(clean_text)

# Sentence-BERT embeddings, computed once and passed to BERTopic below
embedding_model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
documents = data['clean_text'].tolist()
embeddings = embedding_model.encode(documents, show_progress_bar=True)

# Fit BERTopic on the precomputed embeddings
topic_model = BERTopic(language="multilingual", embedding_model=embedding_model)
topics, probs = topic_model.fit_transform(documents, embeddings)

# Attach the topic of every document to `data`
data['topic'] = topics

# Map every topic to the most frequent true category among its documents
topic_to_category = _majority_category_by(data, 'topic')

# Replace the topic number with the category name
data['predicted_category'] = data['topic'].map(topic_to_category)

# Use the category names as topic labels inside the model
topic_model.set_topic_labels(topic_to_category)

# Reduce the number of topics to 10
topic_model.reduce_topics(documents, nr_topics=10)

# Recompute topics and categories after the reduction; this overwrites the
# `predicted_category` column assigned above.
data['reduced_topic'] = topic_model.topics_
reduced_topic_to_category = _majority_category_by(data, 'reduced_topic')
data['predicted_category'] = data['reduced_topic'].map(reduced_topic_to_category)
topic_model.set_topic_labels(reduced_topic_to_category)

In [None]:
# Visualise the topics of the fitted model
topic_model.visualize_topics()


In [None]:
# Bar charts for at most 10 topics (top_n_topics=10)
topic_model.visualize_barchart(top_n_topics=10)


In [None]:
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Score the topic-derived categories against the true ones.
# NOTE(review): predicted_category is the majority true category of each
# topic, computed on these same rows, so the figures below are in-sample
# rather than held-out.
comparison_df = data.dropna(subset=['category', 'predicted_category'])
y_true = comparison_df['category']
y_pred = comparison_df['predicted_category']
category_labels = y_true.unique()

acc = accuracy_score(y_true, y_pred)
print(f"\n🎯 Accuracy: {acc:.4f}")

# Classification report
print("\n📊 Classification Report:")
print(classification_report(y_true, y_pred, zero_division=0))

# Confusion matrix; rows and columns follow the order of the true categories
cm = confusion_matrix(y_true, y_pred, labels=category_labels)

plt.figure(figsize=(10, 7))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=category_labels,
    yticklabels=category_labels,
)
plt.xlabel('Предсказанная категория')
plt.ylabel('Реальная категория')
plt.title('Матрица ошибок (Confusion Matrix)')
plt.tight_layout()
plt.show()


In [None]:
# Persist the fitted topic model and a per-document table of topics.
# NOTE(review): the CSV stores `topic` (ids assigned before reduce_topics),
# while the saved model reflects the reduced topics — confirm that
# `reduced_topic` was not the intended column.
topic_model.save("bertopic_model")
data[['title', 'topic', 'full_text']].to_csv("texts_with_topics.csv", index=False)