In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, mean_absolute_error

In [3]:
# Load the reviews CSV; malformed lines are skipped (on_bad_lines="skip"),
# then rows lacking either the review text or the label are discarded.
df = pd.read_csv(
    "/content/drive/MyDrive/colab/Video_Games_5_part0.csv",
    on_bad_lines="skip",
).dropna(subset=["reviewText", "overall"])

# Model input is the review text as str; the target is `overall` cast to int.
X = df["reviewText"].astype(str)
y = df["overall"].astype(int)

# Hold out 20% for evaluation; stratify on the label so both parts keep the
# same class proportions, and fix the seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [4]:
# Sanity check: features and labels must contain the same number of rows.
print(len(X), len(y))

471356 471356


1) TF-IDF + LogReg

In [6]:
# Text features: lowercased unigrams and bigrams with English stop words
# removed; terms occurring in fewer than 2 documents or in more than 95% of
# documents are dropped from the vocabulary.
tfidf_vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    stop_words="english",
    lowercase=True,
    min_df=2,
    max_df=0.95,
)

# Classifier: logistic regression fitted with L-BFGS, capped at 300 iterations.
logreg = LogisticRegression(solver="lbfgs", max_iter=300, n_jobs=-1)

# Step names ("tfidf", "clf") are kept so parameters stay addressable as
# tfidf__* / clf__* on the pipeline.
model = Pipeline(steps=[("tfidf", tfidf_vectorizer), ("clf", logreg)])
model.fit(X_train, y_train)

# Evaluate on the held-out part only.
pred = model.predict(X_test)
print(classification_report(y_test, pred, digits=4))

              precision    recall  f1-score   support

           1     0.6255    0.5679    0.5953      5828
           2     0.4387    0.1270    0.1970      4566
           3     0.4442    0.2805    0.3438      9309
           4     0.4738    0.2868    0.3574     17682
           5     0.7530    0.9408    0.8365     56887

    accuracy                         0.6905     94272
   macro avg     0.5471    0.4406    0.4660     94272
weighted avg     0.6471    0.6905    0.6521     94272



In [7]:
# Rows are the true labels, columns are the predicted labels.
conf_mat = confusion_matrix(y_test, pred)
print("Confusion matrix:\n", conf_mat)

Confusion matrix:
 [[ 3310   359   531   229  1399]
 [  959   580  1265   520  1242]
 [  549   295  2611  2237  3617]
 [  195    48  1071  5072 11296]
 [  279    40   400  2646 53522]]


Дисбаланс классов, из-за которого модель слишком часто предсказывает 5

In [8]:
# The labels are integers, so the mean absolute error shows how far off the
# predicted label is on average, not just whether it matched.
mae = mean_absolute_error(y_true=y_test, y_pred=pred)
print("MAE:", mae)

MAE: 0.45918194161575016
