In [5]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, mean_absolute_error

In [6]:
# Read the reviews CSV (lines pandas cannot parse are skipped) and keep only
# the rows that have both a review text and a rating.
df = pd.read_csv(
    "/content/drive/MyDrive/colab/Video_Games_5_part0.csv",
    on_bad_lines="skip",
).dropna(subset=["reviewText", "overall"])

# Number of reviews per rating value, used to inspect class balance.
class_counts = df["overall"].value_counts()
print(f"Распределение классов:\n{class_counts}")

Распределение классов:
overall
5.0    284435
4.0     88407
3.0     46546
1.0     29138
2.0     22830
Name: count, dtype: int64


In [7]:
# Features are the review texts (as str); targets are the ratings cast to int.
X = df["reviewText"].astype(str)
y = df["overall"].astype(int)

# Share of the full data held out for the test split.
test_share = 0.15
# The validation split is taken from the remaining 85%, so 0.15 / 0.85 of it
# corresponds to roughly 15% of the full data as well.
val_share = 0.15 / 0.85

# First cut: separate the test split, stratified by rating.
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=test_share, stratify=y, random_state=42
)

# Second cut: divide the remainder into train and validation, again stratified.
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=val_share, stratify=y_temp, random_state=42
)
print("Done")

Done


1) TF-IDF + LogReg

In [8]:
# Baseline: TF-IDF over unigrams and bigrams feeding a logistic regression.
baseline_vectorizer = TfidfVectorizer(
    lowercase=True,
    stop_words="english",
    ngram_range=(1, 2),
    min_df=2,     # drop terms that occur in fewer than 2 documents
    max_df=0.95,  # drop terms that occur in more than 95% of documents
)
baseline_classifier = LogisticRegression(
    solver="lbfgs",
    max_iter=300,
    n_jobs=-1,
)

model = Pipeline(steps=[
    ("tfidf", baseline_vectorizer),
    ("clf", baseline_classifier),
])

model.fit(X_train, y_train)

In [9]:
def prediction_report(model, X, y):
    """Print evaluation metrics for ``model`` on the samples ``X`` with labels ``y``.

    Predictions are obtained with ``model.predict(X)`` and compared against
    ``y``. Three things are printed, in this order: the classification report
    (4 decimal digits), the confusion matrix, and the mean absolute error.

    Args:
        model: Fitted estimator exposing ``predict``.
        X: Samples to predict on.
        y: True labels for ``X``.

    Returns:
        None. The results are only printed.
    """
    y_pred = model.predict(X)

    report_text = classification_report(y, y_pred, digits=4)
    print(report_text)

    matrix = confusion_matrix(y, y_pred)
    print("Confusion matrix:\n", matrix)

    # MAE is computed directly on the predicted vs. true label values.
    mae = mean_absolute_error(y, y_pred)
    print("MAE:", mae)

In [10]:
# Print the validation-split metrics (report, confusion matrix, MAE) for `model`.
prediction_report(model, X_val, y_val)

              precision    recall  f1-score   support

           1     0.6283    0.5511    0.5872      4371
           2     0.3925    0.1220    0.1862      3425
           3     0.4365    0.2627    0.3280      6982
           4     0.4664    0.2801    0.3500     13261
           5     0.7497    0.9425    0.8351     42665

    accuracy                         0.6872     70704
   macro avg     0.5347    0.4317    0.4573     70704
weighted avg     0.6408    0.6872    0.6473     70704

Confusion matrix:
 [[ 2409   327   415   179  1041]
 [  664   418   980   388   975]
 [  379   247  1834  1719  2803]
 [  155    51   735  3715  8605]
 [  227    22   238  1965 40213]]
MAE: 0.464683751980086


Дисбаланс классов, из-за которого модель слишком часто предсказывает 5. Добавлю веса классов (class_weight='balanced').

In [11]:
# Same TF-IDF + logistic regression configuration as the baseline, except for
# class_weight="balanced", which weights classes inversely to their frequency.
weighted_vectorizer = TfidfVectorizer(
    lowercase=True,
    stop_words="english",
    ngram_range=(1, 2),
    min_df=2,     # drop terms that occur in fewer than 2 documents
    max_df=0.95,  # drop terms that occur in more than 95% of documents
)
weighted_classifier = LogisticRegression(
    solver="lbfgs",
    max_iter=300,
    class_weight="balanced",
    n_jobs=-1,
)

# Note: this rebinds `model`, replacing the previously fitted pipeline.
model = Pipeline(steps=[
    ("tfidf", weighted_vectorizer),
    ("clf", weighted_classifier),
])

model.fit(X_train, y_train)
prediction_report(model, X_val, y_val)

              precision    recall  f1-score   support

           1     0.4911    0.6346    0.5537      4371
           2     0.2847    0.3130    0.2982      3425
           3     0.3492    0.4055    0.3753      6982
           4     0.3921    0.4716    0.4282     13261
           5     0.8570    0.7479    0.7988     42665

    accuracy                         0.6342     70704
   macro avg     0.4748    0.5145    0.4908     70704
weighted avg     0.6693    0.6342    0.6481     70704

Confusion matrix:
 [[ 2774   833   379   147   238]
 [  859  1072   990   296   208]
 [  645   966  2831  1662   878]
 [  413   444  2151  6254  3999]
 [  957   450  1755  7592 31911]]
MAE: 0.5131251414347138


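Получилось лучше по macro-F1 (0.4908 против 0.4573) и по recall редких классов, хотя accuracy (0.6342 против 0.6872) и MAE (0.5131 против 0.4647) ухудшились. Запущу на тестовой выборке.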

In [17]:
# Print the test-split metrics for `model`.
# NOTE(review): `model` is whichever pipeline was fitted last — presumably the
# class-weighted one; confirm the cells were run in order.
prediction_report(model, X_test, y_test)

              precision    recall  f1-score   support

           1     0.5003    0.6568    0.5680      4371
           2     0.2919    0.3245    0.3073      3424
           3     0.3536    0.4092    0.3794      6982
           4     0.3918    0.4697    0.4272     13261
           5     0.8554    0.7454    0.7966     42666

    accuracy                         0.6346     70704
   macro avg     0.4786    0.5211    0.4957     70704
weighted avg     0.6696    0.6346    0.6483     70704

Confusion matrix:
 [[ 2871   715   389   159   237]
 [  877  1111   921   284   231]
 [  654   954  2857  1630   887]
 [  418   476  2116  6229  4022]
 [  918   550  1797  7598 31803]]
MAE: 0.5162508486082824
