In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.pipeline import FeatureUnion
from scipy.sparse import hstack


In [2]:
df = pd.read_csv('df_andersen_clean_for_LLM.csv')
y = df["recommended_grade"].astype(str)

# текстовые поля для TF-IDF
text_cols = ["course_name", "summary", "category_name", "course_structure"]

# объединяем все тексты в один, так проще
df["text_all"] = df[text_cols].fillna("").agg(" ".join, axis=1)

# категориальные признаки
cat_cols = [] 

In [3]:
"""кодируем, на этот раз нормально"""

# TF-IDF для текста
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1,2))
X_text = tfidf.fit_transform(df["text_all"])

# категориальные признаки
X_cat = []
for col in cat_cols:
    le = LabelEncoder()
    X_cat.append(le.fit_transform(df[col].astype(str)).reshape(-1,1))

if X_cat:
    X_cat = np.hstack(X_cat)
    from scipy.sparse import csr_matrix
    X_cat = csr_matrix(X_cat)
    X = hstack([X_text, X_cat])
else:
    X = X_text

In [4]:
# обучаем 

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

clf = RandomForestClassifier(
    n_estimators=300,
    max_depth=20,
    class_weight="balanced",
    random_state=42,
    n_jobs=-1
)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         All       0.60      0.67      0.64        43
           J       0.72      0.91      0.80       153
           M       0.91      0.20      0.33        50
           S       1.00      0.20      0.33         5
         nan       0.88      0.75      0.81        20

    accuracy                           0.72       271
   macro avg       0.82      0.55      0.58       271
weighted avg       0.75      0.72      0.68       271



In [6]:
df_final = pd.read_excel('aston_origin.xlsx')
df_final["text_all"] = df_final[text_cols].fillna("").agg(" ".join, axis=1)

X_final_text = tfidf.transform(df_final["text_all"])
if X_cat:
    X_final_cat = []
    for i, col in enumerate(cat_cols):
        X_final_cat.append(le.fit_transform(df_final[col].astype(str)).reshape(-1,1))
    X_final_cat = np.hstack(X_final_cat)
    X_final_cat = csr_matrix(X_final_cat)
    X_final = hstack([X_final_text, X_final_cat])
else:
    X_final = X_final_text

df_final["recommended_grade"] = clf.predict(X_final)


In [7]:
df_final['recommended_grade'].value_counts()
#Aston

recommended_grade
J      456
nan     82
M       47
All     29
S        1
Name: count, dtype: int64

Все равно переобучается для определения J

Плюс 82 неопределенных курса 

In [8]:
df['recommended_grade'].value_counts()
#Andersen

recommended_grade
J      764
M      249
All    216
S       22
Name: count, dtype: int64