# Модель на основе CatBoostClassifier (лучший результат на валидационной и тестовой выборках)

In [None]:
import warnings

warnings.filterwarnings("ignore")

In [None]:
from _funcs import transform_frame, feature_creator, image_path

from tqdm.notebook import tqdm

import numpy as np
import pandas as pd

from catboost import CatBoostClassifier, Pool, cv

import optuna
from optuna.samplers import TPESampler

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.feature_selection import f_classif

In [None]:
# Load the raw training table, then run it through the project's two
# preprocessing helpers (transform_frame first, feature_creator second).
df = pd.read_csv(filepath_or_buffer="ml_ozon_counterfeit_train.csv", encoding="utf-8")
df_upd = transform_frame(df)
df_upd = feature_creator(df_upd)

# Display the resulting column set as the cell output.
df_upd.columns

In [None]:
# The plotting libraries are not imported anywhere else in this notebook;
# without these two lines the cell fails with NameError on `plt` / `sns`.
import matplotlib.pyplot as plt
import seaborn as sns

# Pairwise correlation of the remaining columns once the text and string
# categorical columns are removed (they cannot be correlated directly).
cr = df_upd.drop(
    columns=["brand_name", "description", "name_rus", "CommercialTypeName4"]
).corr()

# Diverging colour map centred at zero, so the colour encodes the sign of
# the correlation.
plt.figure(figsize=(10, 8))
sns.heatmap(cr, annot=False, cmap="coolwarm", center=0)
plt.title("Correlation Heatmap")
plt.tight_layout()
plt.show()

In [None]:
# Univariate screening: ANOVA F-test of every remaining feature against
# the target column "resolution".
y = df_upd["resolution"]
X = df_upd.drop(
    labels=[
        "brand_name",
        "description",
        "name_rus",
        "CommercialTypeName4",
        "resolution",
    ],
    axis="columns",
)

# F-statistic and p-value per feature, in the column order of X.
f_scores, p_values = f_classif(X, y)

# Collect the scores in a table, highest F-score first.
f_test_df = pd.DataFrame(
    {"feature": list(X.columns), "f_score": f_scores, "p_value": p_values}
)
f_test_df = f_test_df.sort_values(by="f_score", ascending=False)

print(f_test_df)

In [None]:
# Columns CatBoost should treat as categorical and as free text.
cats = ["brand_name", "CommercialTypeName4", "SellerID"]
texts = ["description", "name_rus"]

# Target, and the feature matrix without the target and the columns
# excluded from modelling.
y = df_upd["resolution"]
X = df_upd.drop(
    labels=[
        "resolution",
        "comments_published_count",
        "rating_amount",
        "videos_published_count",
        "name_excl_quest_count",
        "photos_published_count",
        "name_length",
    ],
    axis="columns",
)

# Stratified three-way split: hold out 20% of the rows, then cut that
# hold-out in half, giving 80% train / 10% validation / 10% test.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=567,
    stratify=y,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=112,
    stratify=y_temp,
)

In [None]:
# CatBoost pools for each split. Only the train and validation pools carry
# labels; the test and hold-out pools are built for prediction only.
train_pool = Pool(data=X_train, label=y_train, cat_features=cats, text_features=texts)
val_pool = Pool(data=X_val, label=y_val, cat_features=cats, text_features=texts)
test_pool = Pool(data=X_test, cat_features=cats, text_features=texts)
temp_pool = Pool(data=X_temp, cat_features=cats, text_features=texts)

In [None]:
# Baseline hyperparameters used for the cross-validation run.
parameters = dict(
    iterations=1000,
    learning_rate=0.03,
    depth=6,
    l2_leaf_reg=3,
    random_strength=1,
    bagging_temperature=0.8,
    border_count=254,
    loss_function="Logloss",
    eval_metric="F1",
    random_seed=42,
    verbose=False,
    early_stopping_rounds=50,
)

# Stratified, shuffled 5-fold cross-validation on the training pool.
cv_data = cv(
    pool=train_pool,
    params=parameters,
    fold_count=5,
    shuffle=True,
    stratified=True,
    partition_random_seed=42,
    plot=False,
)

# Highest fold-averaged test F1 reached at any iteration.
best_f1 = cv_data["test-F1-mean"].max()
print(f"Best CV F1-score: {best_f1}")

In [None]:
def objective(trial: "optuna.trial.Trial") -> float:
    """Train one CatBoost candidate and return its F1 on the validation split.

    Hyperparameters are sampled from ``trial``. The model is fitted on the
    module-level ``train_pool`` with ``val_pool`` as the evaluation set, and
    the returned score is computed from predictions on ``val_pool`` against
    ``y_val``.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        F1 score of the fitted model on the validation split.
    """
    # Imported locally because the notebook's import cell does not bring
    # f1_score into scope; without it every trial ends in NameError.
    from sklearn.metrics import f1_score

    parameters_opt = {
        "iterations": trial.suggest_int("iterations", 500, 2000),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "depth": trial.suggest_int("depth", 4, 10),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1, 10),
        "random_strength": trial.suggest_float("random_strength", 0.5, 2),
        "bagging_temperature": trial.suggest_float("bagging_temperature", 0, 1),
        "grow_policy": trial.suggest_categorical(
            "grow_policy", ["SymmetricTree", "Depthwise", "Lossguide"]
        ),
        "early_stopping_rounds": 50,
        "verbose": False,
        "random_seed": 42,
        "eval_metric": "F1",
        "loss_function": "Logloss",
    }

    # CatBoost accepts min_data_in_leaf only with the Depthwise and Lossguide
    # growing policies; passing it together with SymmetricTree makes fit()
    # fail, so it is sampled only when it is actually usable.
    if parameters_opt["grow_policy"] != "SymmetricTree":
        parameters_opt["min_data_in_leaf"] = trial.suggest_int(
            "min_data_in_leaf", 1, 100
        )

    model = CatBoostClassifier(**parameters_opt)
    model.fit(train_pool, eval_set=val_pool, use_best_model=True, verbose=False)

    # Predict on the prepared pool, as the other evaluation cells do.
    y_pred = model.predict(val_pool)
    return f1_score(y_val, y_pred)


# Hyperparameter search: seeded TPE sampler maximising the objective,
# limited to 15 trials or 7200 seconds (2 hours).
tpe_sampler = TPESampler(seed=42)
study = optuna.create_study(direction="maximize", sampler=tpe_sampler)
study.optimize(objective, n_trials=15, timeout=7200)

print("Best parameters:", study.best_params)
print("Best F1-score:", study.best_value)

In [None]:
# Best hyperparameters from the study, extended with the fixed training
# settings that were not part of the search space.
best_params = {
    **study.best_params,
    "loss_function": "Logloss",
    "eval_metric": "F1",
    "random_seed": 42,
    "early_stopping_rounds": 50,
    "use_best_model": True,
}

# Final model: fitted on the training pool, monitored on the validation
# pool, logging every 100 iterations.
final_model = CatBoostClassifier(**best_params)
final_model.fit(train_pool, eval_set=val_pool, verbose=100)

# Class predictions for the training split and the held-out test split.
y_pred_train = final_model.predict(train_pool)
y_pred_test = final_model.predict(test_pool)

# Train report, a blank line, then the test report.
print(
    classification_report(y_train, y_pred_train),
    "",
    classification_report(y_test, y_pred_test),
    sep="\n",
)

# Probability of the second class (column index 1) for the test split.
y_pred_proba = final_model.predict_proba(test_pool)[:, 1]

In [None]:
# Hyperparameter grid for CatBoost's built-in grid search
# (3 * 3 * 3 * 3 = 81 combinations at a fixed 1000 iterations).
grid = {
    "iterations": [1000],
    "depth": [6, 8, 10],
    "learning_rate": [0.03, 0.1, 0.2],
    "l2_leaf_reg": [3, 5, 7],
    "bagging_temperature": [0, 1, 5],
}

# NOTE(review): the original passed `class_weights=class_weights`, but no
# `class_weights` variable is defined anywhere in this notebook, so the cell
# failed with NameError. CatBoost's built-in balanced weighting is used
# instead; if specific per-class weights were intended, define them and pass
# them through `class_weights`.
model = CatBoostClassifier(
    loss_function="Logloss",
    eval_metric="F1",
    auto_class_weights="Balanced",
    random_seed=343,
    verbose=200,
)

# Stratified, shuffled 3-fold search over the grid on the training pool.
grid_search_result = model.grid_search(
    grid, train_pool, cv=3, stratified=True, shuffle=True, verbose=True
)

In [None]:
# Model with library-default hyperparameters (only the evaluation metric is
# set), keeping the best iteration found on the validation pool.
model = CatBoostClassifier(eval_metric="F1")
model.fit(
    X=train_pool,
    eval_set=val_pool,
    use_best_model=True,
    verbose=100,
)

In [None]:
# Class predictions of the default-parameter model on train and test splits.
pred_train = model.predict(train_pool)
pred_test = model.predict(test_pool)

# Train report, a blank line, then the test report.
print(
    classification_report(y_train, pred_train),
    "",
    classification_report(y_test, pred_test),
    sep="\n",
)

In [None]:
# Load the unlabeled competition test set.
# NOTE(review): the first letter of "сounterfeit" in this file name appears to
# be a Cyrillic "с", unlike the train file name — confirm it matches the file
# on disk before changing it.
dfa = pd.read_csv("ml_ozon_сounterfeit_test.csv", encoding="utf-8")

In [None]:
# Apply the same two preprocessing helpers that were used on the training frame.
dfa_upd = feature_creator(transform_frame(dfa))

In [None]:
# Rescale the test-set discounted price so that its mean equals the
# training-set mean.
# NOTE(review): this relies on statistics of the full test set — confirm
# that is acceptable for the intended use.
new_price = dfa_upd["PriceDiscounted"].mean()
old_price = df_upd["PriceDiscounted"].mean()
dfa_upd["PriceDiscounted"] = dfa_upd["PriceDiscounted"].mul(old_price).div(new_price)

In [None]:
# Feature matrix for the competition test set: the same six feature columns
# that were excluded from the training matrix are dropped here as well.
Xts = dfa_upd.drop(
    labels=[
        "comments_published_count",
        "rating_amount",
        "videos_published_count",
        "name_excl_quest_count",
        "photos_published_count",
        "name_length",
    ],
    axis="columns",
)

# Prediction-only pool with the same categorical/text column declarations.
test_pool_new = Pool(data=Xts, cat_features=cats, text_features=texts)

In [None]:
# Class predictions of `model` for the competition test pool.
pred_test_new = model.predict(test_pool_new)

In [None]:
# Submission table: the ids from the raw test frame next to the predicted
# class, saved without the index column.
write = pd.DataFrame({"id": dfa["id"], "prediction": pred_test_new})
write.to_csv("example.csv", index=False)

In [None]:
# Sum of the prediction column divided by the number of rows, shown as the
# cell output.
write["prediction"].sum() / len(write)

In [None]:
# Decision threshold applied to the predicted probability.
custom_threshold = 0.45

# Probability of the second class (column index 1) for every test row.
probs_final = model.predict_proba(test_pool_new)[:, 1]

# 1 where the probability reaches the threshold, 0 otherwise.
reaches_threshold = probs_final >= custom_threshold
custom_predictions_final = reaches_threshold.astype(int)

In [None]:
# Submission table for the thresholded predictions: ids from the raw test
# frame next to the 0/1 label, saved without the index column.
write = pd.DataFrame({"id": dfa["id"], "prediction": custom_predictions_final})
write.to_csv("examplenew.csv", index=False)

In [None]:
# Sum of the thresholded prediction column divided by the number of rows,
# shown as the cell output.
write["prediction"].sum() / len(write)

In [None]:
# Mean discounted price in the raw train and raw test frames, side by side.
df["PriceDiscounted"].mean(), dfa["PriceDiscounted"].mean()