In [None]:
# 1. Install the libraries (if they are not installed yet)
# NOTE(review): sentence-transformers is the only package here without a version pin.
!pip install numpy==1.25.2 --no-cache-dir
!pip install catboost==1.2.5
!pip install sentence-transformers



In [None]:
# Mount Google Drive at /content/drive (the trained model is copied under this path later)
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:


# 2. Импорт библиотек
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import time

# Time formatting helper
def format_time(seconds):
    """Render a duration given in seconds as "<hours>h <minutes>m <seconds>s".

    The input is truncated to an integer with int() before it is split,
    so fractional seconds are dropped rather than rounded.
    """
    whole = int(seconds)
    hours = whole // 3600
    mins = whole % 3600 // 60
    secs = whole % 60
    return "{}h {}m {}s".format(hours, mins, secs)

# Wall-clock start of the whole pipeline; the total run time is reported from it later.
start_total = time.time()

# 3. Load and prepare the data
print("📄 Загрузка данных...")
start = time.time()
df = pd.read_excel("parsed_data.xlsx")
# Keep only rows that have both text fields and a category label.
df = df.dropna(subset=["title", "body", "category"])
# Excel cells that are not text (e.g. a purely numeric title) may be read as
# non-string objects; cast both columns so the concatenation cannot raise TypeError.
df["text"] = df["title"].astype(str) + " " + df["body"].astype(str)
elapsed = time.time() - start
print(f"learn: load_data | total: {format_time(elapsed)} | remaining: 0h 00m 00s\n")

# 4. Embed the texts with RuBERT
print("🔎 Векторизация через RuBERT...")
start = time.time()
# Labels come straight from the "category" column, in row order.
y = df["category"].to_list()
# Encode every row of the combined "text" column with the rubert-tiny checkpoint.
bert = SentenceTransformer("cointegrated/rubert-tiny")
X = bert.encode(df["text"].to_list(), show_progress_bar=True)
elapsed = time.time() - start
print(f"learn: rubert_embed | total: {format_time(elapsed)} | remaining: 0h 00m 00s\n")

# 5. Train the model
print("🧠 Обучение модели...")
start = time.time()
# Hold out 20% of the rows as the final test set (stratified by category).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

# Early stopping picks the best iteration on whatever is passed as eval_set.
# Using the test split for that would let the final report score the model on
# rows that already influenced model selection, so carve a separate validation
# split out of the training rows instead and keep the test split untouched.
y_train_s = pd.Series(y_train)
# A class with a single training row cannot be stratified; such rows stay in
# the fitting part so the split below does not fail on them.
can_split = y_train_s.map(y_train_s.value_counts()).ge(2).to_numpy()
fit_idx, val_idx = train_test_split(
    np.flatnonzero(can_split),
    stratify=y_train_s[can_split],
    test_size=0.125,  # 0.125 of the 80% training part = 10% of all rows
    random_state=42,
)
fit_idx = np.concatenate([fit_idx, np.flatnonzero(~can_split)])
X_fit, y_fit = X_train[fit_idx], y_train_s.iloc[fit_idx].tolist()
X_val, y_val = X_train[val_idx], y_train_s.iloc[val_idx].tolist()

model = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.05,
    depth=10,
    auto_class_weights='Balanced',
    loss_function='MultiClass',
    verbose=100,  # log every 100th iteration
    random_seed=42
)

# Stop when the validation loss has not improved for 150 iterations.
model.fit(X_fit, y_fit, eval_set=(X_val, y_val), early_stopping_rounds=150)
elapsed = time.time() - start
print(f"learn: catboost_train | total: {format_time(elapsed)} | remaining: 0h 00m 00s\n")

# 6. Save the model locally, then copy the file to Google Drive
import shutil

model_file = "catboost_news_model_rubert_1.cbm"
model.save_model(model_file)

# The copy target lives under the mounted Drive folder.
destination_path = "/content/drive/MyDrive/" + model_file
shutil.copy(src=model_file, dst=destination_path)
print(f"Модель сохранена в: {destination_path}")



📄 Загрузка данных...




learn: load_data | total: 0h 0m 4s | remaining: 0h 00m 00s

🔎 Векторизация через RuBERT...


Batches:   0%|          | 0/267 [00:00<?, ?it/s]

learn: rubert_embed | total: 0h 0m 5s | remaining: 0h 00m 00s

🧠 Обучение модели...
0:	learn: 2.1470852	test: 2.1721672	best: 2.1721672 (0)	total: 7.02s	remaining: 1h 56m 49s
100:	learn: 0.5380612	test: 1.4319584	best: 1.4319584 (100)	total: 12m 46s	remaining: 1h 53m 38s
200:	learn: 0.2594908	test: 1.3768473	best: 1.3744774 (186)	total: 25m 25s	remaining: 1h 41m 5s
300:	learn: 0.1572247	test: 1.3921350	best: 1.3742907 (230)	total: 37m 55s	remaining: 1h 28m 4s
Stopped by overfitting detector  (150 iterations wait)

bestTest = 1.374290743
bestIteration = 230

Shrink model to first 231 iterations.
learn: catboost_train | total: 0h 47m 55s | remaining: 0h 00m 00s

Модель сохранена в: /content/drive/MyDrive/catboost_news_model_rubert_1.cbm
📊 Предсказания...


ValueError: 2

In [None]:
print("Предсказания...")
start = time.time()

# Predict a label for every embedded row; squeeze drops the length-1 axes of
# the prediction array before it is stored as a new column.
labels = np.squeeze(model.predict(X))
df["predicted_category"] = labels

elapsed = time.time() - start
print(f"learn: predict_all | total: {format_time(elapsed)} | remaining: 0h 00m 00s\n")

# Write the frame with the added prediction column, then measure the time
# elapsed since start_total was set.
df.to_excel("parsed_data_with_predictions.xlsx", index=False)
total_elapsed = time.time() - start_total

print("Готово: модель обучена, предсказания сохранены в parsed_data_with_predictions.xlsx")
print(f"Общее время выполнения: {format_time(total_elapsed)}")

📊 Предсказания...
learn: predict_all | total: 0h 0m 0s | remaining: 0h 00m 00s

✅ Готово: модель обучена, предсказания сохранены в parsed_data_with_predictions.xlsx
🏁 Общее время выполнения: 0h 57m 22s


In [None]:
from sklearn.metrics import classification_report

# Predictions on the held-out test split; ravel() always yields a 1-D array,
# even when the test split has a single row (squeeze() would give a 0-d array).
y_pred = model.predict(X_test).ravel()

# Per-class report. A class that is never predicted has undefined precision;
# zero_division=0 reports it as 0 (the same number the default prints) without
# emitting an undefined-metric warning for each affected metric.
report = classification_report(y_test, y_pred, digits=4, zero_division=0)
print(report)


                    precision    recall  f1-score   support

              Авто     0.8473    0.8600    0.8536       200
       База знаний     0.0000    0.0000    0.0000         1
            Бизнес     0.4941    0.4828    0.4884        87
          Общество     0.4709    0.6287    0.5385       167
          Политика     0.8304    0.8316    0.8310       677
             Спорт     0.9078    0.8571    0.8817       448
Технологии и медиа     0.6667    0.2381    0.3509        42
           Финансы     0.4359    0.4474    0.4416        38
         Экономика     0.3500    0.3043    0.3256        46

          accuracy                         0.7661      1706
         macro avg     0.5559    0.5167    0.5235      1706
      weighted avg     0.7741    0.7661    0.7663      1706



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
