In [18]:
import polars as pl
df = pl.read_parquet("data/grouped_batch_1.parquet")
df.head()

user_id,item_id,action_type_view_description,action_type_to_cart,action_type_page_view,action_type_favorite,action_type_unfavorite,action_type_review_view,action_type_remove,last_status
i64,i64,i32,i32,i32,i32,i32,i32,i32,i8
4721531,103451030,0,0,1,0,0,0,0,0
4233180,141653295,0,0,1,0,0,0,0,0
4602951,193164064,0,1,0,1,1,0,1,0
1729611,105254873,0,0,1,0,0,0,0,0
507430,80041107,0,0,1,0,0,0,0,0


In [19]:
from sklearn.preprocessing import StandardScaler
import numpy as np

feature_cols = [
    "action_type_view_description",
    "action_type_to_cart",
    "action_type_page_view",
    "action_type_favorite",
    "action_type_unfavorite",
    "action_type_review_view",
    "action_type_remove"
]

X = df.select(feature_cols).to_numpy()
y = df["last_status"].to_numpy()

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [21]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)

In [23]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report

model_gb = GradientBoostingClassifier()
model_gb.fit(X_train, y_train)

y_pred_gb = model_gb.predict(X_test)

print(classification_report(y_test, y_pred_gb))

KeyboardInterrupt: 

In [26]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report


model_log_reg = LogisticRegression(
    class_weight={0: 1, 1: 40},     # учитываем дисбаланс
    max_iter=1000,
    C=0.5,                       # сила регуляризации (меньше — сильнее)
    penalty="l2",
    solver="lbfgs"
)
model_log_reg.fit(X_train, y_train)


y_proba = model_log_reg.predict_proba(X_test)[:, 1]


threshold = 0.9
y_pred_custom = (y_proba >= threshold).astype(int)


print(classification_report(y_test, y_pred_custom))

              precision    recall  f1-score   support

           0       1.00      0.98      0.99  14827301
           1       0.35      0.65      0.45    188431

    accuracy                           0.98  15015732
   macro avg       0.67      0.82      0.72  15015732
weighted avg       0.99      0.98      0.98  15015732



In [25]:
y_pred_proba = model_log_reg.predict_proba(X_test)[:, 1]

from sklearn.metrics import f1_score, precision_score, recall_score

best_f1, best_t = 0, 0
for t in [0.1 * i for i in range(1, 10)]:
    y_pred = (y_pred_proba > t).astype(int)
    f1 = f1_score(y_test, y_pred)
    if f1 > best_f1:
        best_f1, best_t = f1, t

print(f"Лучший порог: {best_t:.2f}, F1: {best_f1:.4f}")

Лучший порог: 0.90, F1: 0.4518


In [None]:
from catboost import CatBoostClassifier


model_cat = CatBoostClassifier(
    iterations=1000,
    depth=8,
    learning_rate=0.05,
    loss_function='Logloss',
    eval_metric='F1',
    class_weights=[1, 1],  # можно задать вручную: [w0, w1]
    verbose=100
)
model_cat.fit(X_train, y_train, eval_set=(X_test, y_test))


0:	learn: 0.7000081	test: 0.6998447	best: 0.6998447 (0)	total: 483ms	remaining: 8m 2s
100:	learn: 0.6993736	test: 0.6992311	best: 0.6999405 (1)	total: 33.9s	remaining: 5m 1s
200:	learn: 0.6993764	test: 0.6991920	best: 0.6999405 (1)	total: 1m 7s	remaining: 4m 29s
300:	learn: 0.6996745	test: 0.6994762	best: 0.6999405 (1)	total: 1m 41s	remaining: 3m 54s
400:	learn: 0.6996739	test: 0.6994615	best: 0.6999405 (1)	total: 2m 14s	remaining: 3m 20s
500:	learn: 0.6995886	test: 0.6993561	best: 0.6999405 (1)	total: 2m 47s	remaining: 2m 47s
600:	learn: 0.6994948	test: 0.6992271	best: 0.6999405 (1)	total: 3m 25s	remaining: 2m 16s
700:	learn: 0.6995271	test: 0.6992497	best: 0.6999405 (1)	total: 3m 57s	remaining: 1m 41s
800:	learn: 0.6995538	test: 0.6992742	best: 0.6999405 (1)	total: 4m 30s	remaining: 1m 7s
900:	learn: 0.6995666	test: 0.6992622	best: 0.6999405 (1)	total: 5m 5s	remaining: 33.5s
999:	learn: 0.6996110	test: 0.6993038	best: 0.6999405 (1)	total: 5m 37s	remaining: 0us

bestTest = 0.699940530

<catboost.core.CatBoostClassifier at 0x17a133cb0>

In [None]:
# Предсказания классов (порог по умолчанию 0.5)
y_pred_cat = model_cat.predict(X_test)

# Метрики
print(classification_report(y_test, y_pred_cat))

              precision    recall  f1-score   support

           0       0.70      0.12      0.21   1658610
           1       0.55      0.95      0.70   1883608

    accuracy                           0.57   3542218
   macro avg       0.63      0.54      0.46   3542218
weighted avg       0.62      0.57      0.47   3542218



In [None]:
from catboost import CatBoostClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer, f1_score

# Определяем модель
model_cat = CatBoostClassifier(
    loss_function='Logloss',
    eval_metric='F1',
    class_weights=[1, 10],  # предварительные веса, можно оставить None
    verbose=0,
    random_seed=42
)

# Гиперпараметры для подбора
param_dist = {
    'iterations': [300, 500, 800, 1000],
    'depth': [4, 6, 8, 10],
    'learning_rate': [0.01, 0.03, 0.05, 0.1],
    'l2_leaf_reg': [1, 3, 5, 7, 10],
    'class_weights': [[1, 5], [1, 10], [1, 20]]
}

# F1-мера для миноритарного класса (label=1)
f1_scorer = make_scorer(f1_score, average='binary', pos_label=1)

# RandomizedSearchCV (подбирает случайное подмножество параметров)
random_search = RandomizedSearchCV(
    estimator=model_cat,
    param_distributions=param_dist,
    n_iter=20,  # количество случайных комбинаций
    scoring=f1_scorer,
    cv=3,
    verbose=2,
    n_jobs=-1
)

random_search.fit(X_train, y_train)

print("Лучшие параметры:", random_search.best_params_)
print("Лучший F1-score:", random_search.best_score_)


Fitting 3 folds for each of 20 candidates, totalling 60 fits


Traceback (most recent call last):
  File [35m"/Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/multiprocessing/resource_tracker.py"[0m, line [35m295[0m, in [35mmain[0m
    raise ValueError(
        f'Cannot register {name} for automatic cleanup: '
        f'unknown resource type {rtype}')
[1;35mValueError[0m: [35mCannot register /var/folders/r0/jlw4c4zs6c540l3_z3d9ch840000gn/T/joblib_memmapping_folder_27980_970ea0c049fc4ffe8ecdac596e9464bb_506577d01ecd409bb9e597bc7c7be11c for automatic cleanup: unknown resource type folder[0m
Traceback (most recent call last):
  File [35m"/Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/multiprocessing/resource_tracker.py"[0m, line [35m295[0m, in [35mmain[0m
    raise ValueError(
        f'Cannot register {name} for automatic cleanup: '
        f'unknown resource type {rtype}')
[1;35mValueError[0m: [35mCannot register /loky-27980-fd_fppxo for automatic cleanup: unknown resource type semlock[0m
Trac