# AutoML - модели

In [None]:
import warnings

# Install a catch-all "ignore" filter so that no Python warnings are shown
# from this point on.
warnings.simplefilter("ignore")

In [None]:
from _funcs import transform_frame, feature_creator, image_path

from tqdm.notebook import tqdm

import numpy as np
import pandas as pd
from scipy import stats

from catboost import CatBoostClassifier, Pool

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

from autogluon.tabular import TabularPredictor
from autogluon.multimodal import MultiModalPredictor
from autogluon.features import FeatureMetadata

In [None]:
# Load the labelled training table and the test table from the working directory.
train, X_test = map(
    pd.read_csv,
    ("ml_ozon_counterfeit_train.csv", "ml_ozon_counterfeit_test.csv"),
)

In [None]:
# Separate the feature columns from the "resolution" target column.
X_train = train.drop(columns=["resolution"])
y_train = train["resolution"]

# Cell output: how many rows have a missing brand name.
X_train["brand_name"].isna().sum()

In [None]:
# Stratified hold-out: 20% of the labelled rows become the validation set.
X_train_new, X_val, y_train_new, y_val = train_test_split(
    X_train,
    y_train,
    stratify=y_train,
    random_state=42,
    test_size=0.2,
)

# Run both partitions through the project feature pipeline
# (training partition first, then validation — same order as before).
X_train_new, X_val = (
    feature_creator(transform_frame(part)) for part in (X_train_new, X_val)
)

# --- Assemble train / validation frames ---
# Indices are reset on both sides so features and target are concatenated
# positionally rather than aligned on the shuffled original index.
train_data, val_data = (
    pd.concat(
        [features.reset_index(drop=True), target.reset_index(drop=True)],
        axis=1,
    )
    for features, target in ((X_train_new, y_train_new), (X_val, y_val))
)

# Feature names grouped by the type they should be treated as.
# Keys are type labels, values are the columns belonging to that type.
feature_metadata_dict = {
    "float": [
        "rating_1_count",
        "rating_2_count",
        "rating_3_count",
        "rating_4_count",
        "rating_5_count",
        "comments_published_count",
        "photos_published_count",
        "videos_published_count",
        "PriceDiscounted",
        "item_count_fake_returns7",
        "item_count_fake_returns30",
        "item_count_fake_returns90",
        "item_count_sales7",
        "item_count_sales30",
        "item_count_sales90",
        "item_count_returns7",
        "item_count_returns30",
        "item_count_returns90",
        "GmvTotal7",
        "GmvTotal30",
        "GmvTotal90",
        "ExemplarAcceptedCountTotal7",
        "ExemplarAcceptedCountTotal30",
        "ExemplarAcceptedCountTotal90",
        "OrderAcceptedCountTotal7",
        "OrderAcceptedCountTotal30",
        "OrderAcceptedCountTotal90",
        "ExemplarReturnedCountTotal7",
        "ExemplarReturnedCountTotal30",
        "ExemplarReturnedCountTotal90",
        "ExemplarReturnedValueTotal7",
        "ExemplarReturnedValueTotal30",
        "ExemplarReturnedValueTotal90",
        "ItemVarietyCount",
        "ItemAvailableCount",
        "is_description",
        "name_word_count",  # 'SellerID' was removed from this group
        "desc_word_count",
        "name_length",
        "name_caps_count",
        "name_caps_ratio",
        "desc_unique_ratio",
        "return_to_sales_ratio_30",
        "return_value_to_gmv_ratio_30",
        "suspicious_return_ratio_30",
        "order_to_sales_ratio_30",
        "avg_return_value_30",
        "share_return_count30",
        "share_fake_sales30",
        "rating_amount",
        "mean_rating",
        "rating_var",
        "median_price_discount",
        "mean_price_30",
        "log_time_seller",
        "log_time_item",
        "mean_seller_rating",
        "mean_count_items",
    ],
    "category": [
        "SellerID",
        "brand_name",
        "CommercialTypeName4",
    ],  # 'SellerID' is listed only here
    "text": ["name_rus", "description"],
}

# Build the FeatureMetadata object.
# FeatureMetadata expects `type_map_raw` as {feature_name: raw_type}, i.e. the
# inverse of the grouping above. Passing the grouped dict directly would
# register three bogus features named "float", "category" and "text".
type_map_raw = {}
for raw_type in ("float", "category"):
    type_map_raw.update(dict.fromkeys(feature_metadata_dict[raw_type], raw_type))

# "text" is a special type in AutoGluon, not a raw one: text columns are
# declared with raw type "object" and tagged through the special-type map.
type_map_raw.update(dict.fromkeys(feature_metadata_dict["text"], "object"))

feature_metadata = FeatureMetadata(
    type_map_raw=type_map_raw,
    type_group_map_special={"text": list(feature_metadata_dict["text"])},
)

# `transform_frame_old` / `feature_creator_old` are not brought into scope by
# the file's imports, so this step raised NameError on a clean top-to-bottom
# run. Import them explicitly.
# NOTE(review): assumes `_funcs` exports these legacy helpers — confirm.
from _funcs import feature_creator_old, transform_frame_old

# Re-split from the raw frames: X_train_new / X_val were overwritten with
# engineered features earlier. The arguments and random_state are identical
# to the earlier split, so the same rows land in each partition.
X_train_new, X_val, y_train_new, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)
X_train_new = feature_creator_old(transform_frame_old(X_train_new))
X_val = feature_creator_old(transform_frame_old(X_val))

# --- Assemble train / validation frames ---
# Reset indices so features and target are joined positionally; these
# assignments replace the train_data / val_data built earlier.
train_data = pd.concat(
    [X_train_new.reset_index(drop=True), y_train_new.reset_index(drop=True)], axis=1
)
val_data = pd.concat(
    [X_val.reset_index(drop=True), y_val.reset_index(drop=True)], axis=1
)

# Per-model overrides.
# NOTE(review): `hyperparameters` is not passed to `fit()` below, so it
# currently has no effect on training.
hyperparameters = dict(
    NN_TORCH=dict(num_epochs=5),
    GBM=dict(num_boost_round=1000),
    XGB=dict(n_estimators=1000),
)

# --- AutoML classification ---
# Binary classifier for the "resolution" label, selected by F1.
fit_options = {
    "train_data": train_data,  # the training portion of the data
    "presets": "best_quality",
    "time_limit": 42_000,
    "num_bag_folds": 5,
    "num_stack_levels": 0,
    "use_bag_holdout": True,
}
predictor = TabularPredictor(
    eval_metric="f1",
    problem_type="binary",
    label="resolution",
).fit(**fit_options)

# --- Evaluation and predictions on the hold-out frame ---
# The validation frame is deliberately not handed to fit(); the original
# author's note says this is to avoid overfitting to it.
# The tuple is evaluated left to right: evaluate() first, then predict().
performance, y_pred = (
    predictor.evaluate(val_data),
    predictor.predict(val_data),
)

# Cell output: the metrics returned by evaluate().
performance