In [None]:
#Установка библиотек
!pip install numpy==1.25.2 --no-cache-dir
!pip install catboost==1.2.5
!pip install sentence-transformers

import pandas as pd
import joblib
import scipy
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from catboost import CatBoostClassifier
from sklearn.metrics import classification_report

# Load the parsed articles (expects "text", "category" and "word_count" columns).
df = pd.read_excel("parsed_data_with_word_count.xlsx")

# Drop rows that have no article text or no target label.
df_clean = df.dropna(subset=["text", "category"])

# Features (raw text + word count) and target.
X = df_clean[["text", "word_count"]]
y = df_clean["category"]

# Train/test split is done BEFORE vectorization so that the TF-IDF vocabulary
# and IDF weights are learned from the training rows only. Fitting the
# vectorizer on the full dataset leaks test-set statistics into the features
# and makes the reported metrics optimistic.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# TF-IDF vectorization: fit on train, only transform on test.
tfidf = TfidfVectorizer(max_features=10000)
train_tfidf = tfidf.fit_transform(X_train_raw["text"])
test_tfidf = tfidf.transform(X_test_raw["text"])

# Append word_count as one extra column after the TF-IDF features.
# format="csr" keeps the row-oriented sparse layout for the model input.
X_train = scipy.sparse.hstack(
    [train_tfidf, X_train_raw["word_count"].values.reshape(-1, 1)],
    format="csr",
)
X_test = scipy.sparse.hstack(
    [test_tfidf, X_test_raw["word_count"].values.reshape(-1, 1)],
    format="csr",
)

# Train CatBoost with default hyperparameters; log progress every 100 iterations.
model = CatBoostClassifier(verbose=100)
model.fit(X_train, y_train)

# Metrics on the held-out split. zero_division=0 reports 0.0 for classes that
# are never predicted instead of emitting an undefined-metric warning per
# averaged score.
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred, zero_division=0))

# Persist the model weights and the fitted TF-IDF vectorizer; both are needed
# at inference time to rebuild the same feature layout.
model.save_model("catboost_rbc_model.cbm")
joblib.dump(tfidf, "tfidf_vectorizer_rbc.pkl")

Learning rate set to 0.089975
0:	learn: 1.9128244	total: 13s	remaining: 3h 36m 17s
100:	learn: 0.5591442	total: 13m 43s	remaining: 2h 2m 11s
200:	learn: 0.4567284	total: 27m 3s	remaining: 1h 47m 32s
300:	learn: 0.4124836	total: 40m 16s	remaining: 1h 33m 31s
400:	learn: 0.3858358	total: 53m 34s	remaining: 1h 20m 2s
500:	learn: 0.3663308	total: 1h 6m 48s	remaining: 1h 6m 32s
600:	learn: 0.3505624	total: 1h 19m 52s	remaining: 53m 1s
700:	learn: 0.3376871	total: 1h 32m 51s	remaining: 39m 36s
800:	learn: 0.3259395	total: 1h 45m 53s	remaining: 26m 18s
900:	learn: 0.3146018	total: 1h 59m 2s	remaining: 13m 4s
999:	learn: 0.3054815	total: 2h 12m 6s	remaining: 0us


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                    precision    recall  f1-score   support

              Авто       0.97      1.00      0.98       201
       База знаний       0.00      0.00      0.00         4
            Бизнес       0.57      0.44      0.50       104
          Общество       0.73      0.79      0.76       696
          Политика       0.86      0.91      0.89      1461
             Спорт       0.99      0.93      0.96       450
Технологии и медиа       0.78      0.42      0.55        73
           Финансы       0.88      0.61      0.72        49
         Экономика       0.59      0.29      0.39        68

          accuracy                           0.84      3106
         macro avg       0.71      0.60      0.64      3106
      weighted avg       0.84      0.84      0.84      3106



['tfidf_vectorizer_rbc.pkl']