In [12]:
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.base import clone
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge 
from sklearn.preprocessing import OneHotEncoder, StandardScaler 
from sklearn.preprocessing import OneHotEncoder, StandardScaler  # :contentReference[oaicite:6]{index=6}

def rmse_metric(y_true, y_pred):
    """Root mean squared error between ``y_true`` and ``y_pred``.

    Computed as the square root of ``mean_squared_error(y_true, y_pred)``.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

# --- Configuration -----------------------------------------------------------
RANDOM_STATE = 42
TRAIN_PATH = "Video_Games.csv"
TEST_PATH = "Video_Games_Test.csv"
TARGET = "JP_Sales"

# --- Load the training data and separate target from features ----------------
train_df = pd.read_csv(TRAIN_PATH)
y = train_df[TARGET].astype(float)

# Remove the target and, when present, 'Name' in a single pass.
# 'Name' is almost unique per row, so it is usually reasonable to drop it.
X = train_df.drop(columns=[TARGET, "Name"], errors="ignore").copy()

# User_Score sometimes holds a non-numeric string ('tbd'); coerce such values to NaN.
if "User_Score" in X.columns:
    X["User_Score"] = pd.to_numeric(X["User_Score"], errors="coerce")

# IMPORTANT: Platform/Genre/Publisher/Developer/Rating are deliberately NOT
# dropped; they stay in X as categorical (string-valued) columns.
print("X shape:", X.shape)


X shape: (11703, 13)


In [29]:
from catboost import CatBoostRegressor

def make_cat_features_indices(X_df: pd.DataFrame):
    """Locate the categorical (string-like) columns of ``X_df``.

    A column is treated as categorical when its dtype is ``object``, a pandas
    ``StringDtype`` or a ``CategoricalDtype``. Checking only for ``object``
    would miss columns that were cast with ``.astype("string")`` or
    ``.astype("category")``, and those would then be handed to the model as if
    they were numeric.

    Parameters
    ----------
    X_df : pd.DataFrame
        Feature frame to inspect. It is not modified.

    Returns
    -------
    cat_cols : list
        Labels of the categorical columns, in frame order.
    cat_idx : list of int
        Zero-based positions of the same columns, in the same order.
    """
    string_like = (pd.StringDtype, pd.CategoricalDtype)

    cat_cols, cat_idx = [], []
    # Walk columns and dtypes together so positions come straight from the
    # enumeration instead of a per-column label lookup.
    for pos, (col, dtype) in enumerate(zip(X_df.columns, X_df.dtypes)):
        if pd.api.types.is_object_dtype(dtype) or isinstance(dtype, string_like):
            cat_cols.append(col)
            cat_idx.append(pos)
    return cat_cols, cat_idx

def cv_oof_catboost(X_df: pd.DataFrame, y: pd.Series, cv: KFold):
    """Cross-validate a CatBoost MAE regressor and collect out-of-fold predictions.

    For every split produced by ``cv`` a fresh ``CatBoostRegressor`` is trained
    on the training part, with the validation part supplied as ``eval_set`` for
    early stopping, and then used to predict that same validation part.
    Predictions are clipped at zero before being stored and scored.

    NOTE(review): because early stopping monitors the same fold that is scored,
    the reported fold metrics may be optimistic — confirm whether that is
    acceptable for model comparison.

    Parameters
    ----------
    X_df : pd.DataFrame
        Feature frame; categorical columns are found with
        ``make_cat_features_indices``.
    y : pd.Series
        Target, positionally aligned with ``X_df``.
    cv : KFold
        Splitter providing ``split(X_df, y)``.

    Returns
    -------
    oof : np.ndarray
        Out-of-fold predictions, one per row of ``y``.
    fold_scores : list of (mae, rmse)
        Validation metrics of each fold.
    best_iters : list
        Value of ``model.get_best_iteration()`` for each fold.
    """
    cat_cols, cat_idx = make_cat_features_indices(X_df)

    # One shared configuration; a new model is built from it in every fold.
    catboost_params = dict(
        loss_function="MAE",
        iterations=20000,
        learning_rate=0.025,
        depth=9,
        l2_leaf_reg=6,
        random_strength=1.0,
        bootstrap_type="Bayesian",
        bagging_temperature=1.0,
        random_seed=RANDOM_STATE,
        verbose=False,
    )

    oof = np.zeros(len(y), dtype=float)
    fold_scores, best_iters = [], []

    for fold, (tr_idx, va_idx) in enumerate(cv.split(X_df, y), 1):
        y_tr, y_va = y.iloc[tr_idx], y.iloc[va_idx]
        X_tr, X_va = X_df.iloc[tr_idx].copy(), X_df.iloc[va_idx].copy()

        # Missing categorical values become an explicit string level (more robust).
        for part in (X_tr, X_va):
            for c in cat_cols:
                part[c] = part[c].fillna("__MISSING__")

        model = CatBoostRegressor(**catboost_params)
        model.fit(
            X_tr, y_tr,
            cat_features=cat_idx,
            eval_set=(X_va, y_va),
            use_best_model=True,
            early_stopping_rounds=500,
        )

        # Predictions are floored at zero before scoring.
        pred = np.clip(model.predict(X_va), 0, None)
        oof[va_idx] = pred

        mae = mean_absolute_error(y_va, pred)
        rmse = rmse_metric(y_va, pred)
        best_iter = model.get_best_iteration()
        fold_scores.append((mae, rmse))
        best_iters.append(best_iter)

        print(f"[CatBoost][fold {fold}] MAE={mae:.6f} RMSE={rmse:.6f} best_iter={best_iter}")

    return oof, fold_scores, best_iters


In [30]:


def cv_oof_sklearn(estimator: Pipeline, X_df: pd.DataFrame, y: pd.Series, cv: KFold, label: str = "Ridge"):
    """Cross-validate a scikit-learn estimator and collect out-of-fold predictions.

    For each split from ``cv`` a fresh ``clone`` of ``estimator`` is fitted on
    the training rows and used to predict the validation rows. Predictions are
    clipped at zero before being stored and scored.

    Parameters
    ----------
    estimator : Pipeline
        Unfitted estimator; it is cloned per fold, so the passed object is
        never fitted here.
    X_df : pd.DataFrame
        Feature frame.
    y : pd.Series
        Target, positionally aligned with ``X_df``.
    cv : KFold
        Splitter providing ``split(X_df, y)``.
    label : str, default "Ridge"
        Tag shown in the per-fold log line. The default keeps the previous
        output, while letting other estimators be logged under their own name.

    Returns
    -------
    oof : np.ndarray
        Out-of-fold predictions, one per row of ``y``.
    fold_scores : list of (mae, rmse)
        Validation metrics of each fold.
    """
    oof = np.zeros(len(y), dtype=float)
    fold_scores = []

    for fold, (tr_idx, va_idx) in enumerate(cv.split(X_df, y), 1):
        y_va = y.iloc[va_idx]

        est = clone(estimator)
        est.fit(X_df.iloc[tr_idx], y.iloc[tr_idx])

        # Predictions are floored at zero before scoring.
        pred = np.clip(est.predict(X_df.iloc[va_idx]), 0, None)
        oof[va_idx] = pred

        mae = mean_absolute_error(y_va, pred)
        # rmse_metric is used instead of mean_squared_error(..., squared=False).
        rmse = rmse_metric(y_va, pred)
        fold_scores.append((mae, rmse))

        print(f"[{label}][fold {fold}] MAE={mae:.6f} RMSE={rmse:.6f}")

    return oof, fold_scores


def summarize(scores, name):
    """Print the mean and standard deviation of per-fold MAE and RMSE.

    ``scores`` is a sequence of ``(mae, rmse)`` pairs; ``name`` is the label
    placed at the start of the printed line. Nothing is returned.
    """
    maes = np.fromiter((pair[0] for pair in scores), dtype=float)
    rmses = np.fromiter((pair[1] for pair in scores), dtype=float)

    mae_part = f"MAE mean={maes.mean():.6f} std={maes.std():.6f}"
    rmse_part = f"RMSE mean={rmses.mean():.6f} std={rmses.std():.6f}"
    print(f"{name}: {mae_part} | {rmse_part}")


In [31]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge
from sklearn.preprocessing import OneHotEncoder, StandardScaler

def make_ridge_pipe(X_df, alpha=2.0, min_freq=50):
    """Build an unfitted preprocessing + Ridge pipeline for ``X_df``'s columns.

    Columns with dtype object/category/bool are treated as categorical
    (most-frequent imputation, then one-hot encoding); every other column is
    treated as numeric (median imputation, then standard scaling). Add integer
    code columns to the categorical list by hand if they encode categories.

    Parameters
    ----------
    X_df : pd.DataFrame
        Frame used only to decide which columns are categorical or numeric.
    alpha : float, default 2.0
        Passed to ``Ridge(alpha=...)``.
    min_freq : int, default 50
        Passed to ``OneHotEncoder(min_frequency=...)`` when that argument is
        accepted.

    Returns
    -------
    Pipeline
        Steps ``"pre"`` (ColumnTransformer) and ``"model"`` (Ridge).
    """
    cat_cols = X_df.select_dtypes(include=["object", "category", "bool"]).columns.tolist()
    num_cols = [col for col in X_df.columns if col not in cat_cols]

    # min_frequency is not available in every sklearn version -> plain encoder fallback.
    try:
        encoder = OneHotEncoder(handle_unknown="ignore", min_frequency=min_freq)
    except TypeError:
        encoder = OneHotEncoder(handle_unknown="ignore")

    numeric_steps = [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
    categorical_steps = [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("ohe", encoder),
    ]

    preprocessor = ColumnTransformer(
        transformers=[
            ("num", Pipeline(numeric_steps), num_cols),
            ("cat", Pipeline(categorical_steps), cat_cols),
        ],
        remainder="drop",
    )

    steps = [("pre", preprocessor), ("model", Ridge(alpha=alpha))]
    return Pipeline(steps)


In [32]:
# One shuffled 5-fold splitter, shared so both models are scored on the same folds.
cv = KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

# Out-of-fold predictions of the Ridge pipeline.
ridge_pipe = make_ridge_pipe(X, alpha=2.0, min_freq=50)
oof_ridge, ridge_scores = cv_oof_sklearn(ridge_pipe, X, y, cv)
summarize(ridge_scores, "Ridge")

# Out-of-fold predictions of CatBoost.
oof_cb, cb_scores, cb_best_iters = cv_oof_catboost(X, y, cv)
summarize(cb_scores, "CatBoost")

# Pick the ensemble weight by MAE on the OOF predictions.
# w is the CatBoost share; (1 - w) goes to Ridge.
weights = np.linspace(0.0, 1.0, num=201)
best = {"w": None, "mae": np.inf, "rmse": np.inf}

for w in weights:
    ens = (1 - w) * oof_ridge + w * oof_cb
    mae, rmse = mean_absolute_error(y, ens), rmse_metric(y, ens)
    # Strict comparison: on ties the earliest (smallest) weight is kept.
    if best["mae"] > mae:
        best = {"w": float(w), "mae": float(mae), "rmse": float(rmse)}

print("Best ensemble (by MAE):", best)


[Ridge][fold 1] MAE=0.080631 RMSE=0.204857
[Ridge][fold 2] MAE=0.086742 RMSE=0.258789
[Ridge][fold 3] MAE=0.081969 RMSE=0.219856
[Ridge][fold 4] MAE=0.086440 RMSE=0.288133
[Ridge][fold 5] MAE=0.083084 RMSE=0.249227
Ridge: MAE mean=0.083773 std=0.002430 | RMSE mean=0.244172 std=0.029355
[CatBoost][fold 1] MAE=0.045963 RMSE=0.178358 best_iter=741
[CatBoost][fold 2] MAE=0.055877 RMSE=0.228941 best_iter=1358
[CatBoost][fold 3] MAE=0.049261 RMSE=0.192108 best_iter=1298
[CatBoost][fold 4] MAE=0.055732 RMSE=0.250366 best_iter=692
[CatBoost][fold 5] MAE=0.056192 RMSE=0.235533 best_iter=370
CatBoost: MAE mean=0.052605 std=0.004211 | RMSE mean=0.217061 std=0.027248
Best ensemble (by MAE): {'w': 1.0, 'mae': 0.05260450844678675, 'rmse': 0.21876032939569653}


In [None]:
from sklearn.model_selection import train_test_split

# --- Load the test set and apply the same feature preparation as for train ---
test_df = pd.read_csv(TEST_PATH)

# Drop the target (if the file happens to contain it) and 'Name', as in train.
test_X = test_df.drop(columns=[TARGET, "Name"], errors="ignore").copy()

# User_Score -> numeric; non-numeric strings become NaN.
if "User_Score" in test_X.columns:
    test_X["User_Score"] = pd.to_numeric(test_X["User_Score"], errors="coerce")

# Force exactly the training columns, in the training order.
test_X = test_X.reindex(columns=X.columns)

# 1) Ridge fitted on the full training set.
#    Predict before the categorical columns of test_X are recast below.
ridge_pipe_final = make_ridge_pipe(X, alpha=2.0, min_freq=50)
ridge_pipe_final.fit(X, y)
pred_ridge = np.clip(ridge_pipe_final.predict(test_X), 0, None)

# 2) Final CatBoost, with a 10% hold-out used only for early stopping.
X_tr, X_va, y_tr, y_va = train_test_split(X, y, test_size=0.10, random_state=RANDOM_STATE)
# Work on explicit copies so the column assignments below never touch X.
X_tr, X_va = X_tr.copy(), X_va.copy()

cat_cols = [c for c in ["Platform","Genre","Publisher","Developer","Rating"] if c in X_tr.columns]
for c in cat_cols:
    # Categorical values as strings, with missing values as an explicit level.
    X_tr[c] = X_tr[c].astype("string").fillna("__MISSING__")
    X_va[c] = X_va[c].astype("string").fillna("__MISSING__")
    test_X[c] = test_X[c].astype("string").fillna("__MISSING__")

cb_final = CatBoostRegressor(
    loss_function="MAE",
    iterations=30000,
    learning_rate=0.025,
    depth=9,
    l2_leaf_reg=6,
    random_strength=1.0,
    bootstrap_type="Bayesian",
    bagging_temperature=1.0,
    random_seed=RANDOM_STATE,
    verbose=False,
)

# Pass the categorical columns by name. The previous `cat_idx` was never
# defined at notebook level (it exists only as a local inside
# cv_oof_catboost), so this cell raised NameError on a fresh kernel.
cb_final.fit(
    X_tr, y_tr,
    cat_features=cat_cols,
    eval_set=(X_va, y_va),
    use_best_model=True,
    early_stopping_rounds=500
)

# Column sanity report.
# NOTE(review): test_X was reindexed to X.columns above, so both lists are
# empty by construction; compare against test_df.columns if a real check of
# the raw test file is wanted.
print("Train cols:", list(X.columns))
print("Test cols :", list(test_X.columns))

extra = sorted(set(test_X.columns) - set(X.columns))
missing = sorted(set(X.columns) - set(test_X.columns))
print("Extra in test:", extra)
print("Missing in test:", missing)




Train cols: ['Platform', 'Year_of_Release', 'Genre', 'Publisher', 'NA_Sales', 'EU_Sales', 'Other_Sales', 'Critic_Score', 'Critic_Count', 'User_Score', 'User_Count', 'Developer', 'Rating']
Test cols : ['Platform', 'Year_of_Release', 'Genre', 'Publisher', 'NA_Sales', 'EU_Sales', 'Other_Sales', 'Critic_Score', 'Critic_Count', 'User_Score', 'User_Count', 'Developer', 'Rating']
Extra in test: []
Missing in test: []


In [26]:
# CatBoost predictions for the test set, floored at zero.
pred_cb = np.clip(cb_final.predict(test_X), 0, None)

# 3) Ensemble: w is the CatBoost weight selected on the OOF predictions.
w = best["w"]
pred_ens = np.clip((1 - w) * pred_ridge + w * pred_cb, 0, None)

# 4) Submission: use the file's own Id column when present, otherwise 1..N.
if "Id" in test_df.columns:
    submission_ids = test_df["Id"]
else:
    submission_ids = np.arange(1, len(test_df) + 1)
sub = pd.DataFrame({"Id": submission_ids, "JP_Sales": pred_ens})

sub.to_csv("sub.csv", index=False)
sub.head()


Unnamed: 0,Id,JP_Sales
0,1,0.069099
1,2,0.0
2,3,0.000793
3,4,0.001018
4,5,0.000515


In [None]:
import time
import numpy as np
import optuna

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from catboost import CatBoostRegressor

RANDOM_STATE = 42

# IMPORTANT: X must not contain Id; the test set is handled separately.
# Categorical features (the main ones in this dataset), kept in the listed order.
known_categoricals = ("Platform", "Genre", "Publisher", "Developer", "Rating")
cat_cols = [c for c in known_categoricals if c in X.columns]

# A single fixed hold-out keeps each tuning trial fast.
X_tr, X_va, y_tr, y_va = train_test_split(
    X, y, test_size=0.15, random_state=RANDOM_STATE
)

# Make sure categorical columns are strings with missing values filled in.
for frame in (X_tr, X_va):
    for c in cat_cols:
        frame[c] = frame[c].astype("string").fillna("__MISSING__")

def objective(trial: optuna.Trial) -> float:
    """Optuna objective: hold-out MAE of a CatBoost model with sampled hyper-parameters.

    Trains on the notebook-level ``X_tr``/``y_tr`` with ``X_va``/``y_va`` as the
    early-stopping ``eval_set``, then returns the MAE of the zero-clipped
    predictions on that same hold-out.

    Parameters
    ----------
    trial : optuna.Trial
        Trial object used to sample the tunable hyper-parameters.

    Returns
    -------
    float
        Mean absolute error on the hold-out (lower is better).
    """
    # Tunable part. The suggest_* calls stay in their original order.
    tuned = {
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.13),
        "depth": trial.suggest_int("depth", 6, 10),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 2.0, 30.0, log=True),
        "random_strength": trial.suggest_float("random_strength", 0.0, 3.0),
        "bagging_temperature": trial.suggest_float("bagging_temperature", 0.0, 3.0),
        "one_hot_max_size": trial.suggest_int("one_hot_max_size", 2, 20),
        "max_ctr_complexity": trial.suggest_int("max_ctr_complexity", 1, 4),
    }

    # Fixed part.
    fixed = {
        "loss_function": "MAE",
        "eval_metric": "MAE",
        # GPU training; with several GPUs use e.g. "0:1".
        "task_type": "GPU",
        "devices": "0",
        # Upper bound on trees; the actual length is decided by early stopping.
        "iterations": 20000,
        "random_seed": RANDOM_STATE,
        "verbose": False,
        "allow_writing_files": False,
    }

    model = CatBoostRegressor(**fixed, **tuned)
    # No callbacks are passed: per the original author's note they are not
    # allowed when training on GPU.
    model.fit(
        X_tr, y_tr,
        cat_features=cat_cols,
        eval_set=(X_va, y_va),
        use_best_model=True,
        early_stopping_rounds=300,
    )

    va_pred = np.clip(model.predict(X_va), 0, None)
    return mean_absolute_error(y_va, va_pred)

TIME_BUDGET_MIN = 40

# Seed the sampler so the sequence of suggested hyper-parameters is repeatable.
# An unseeded study proposes different trials on every run.
# NOTE(review): the number of trials that fit into the time budget can still
# differ between runs, and GPU training may not be bit-for-bit deterministic.
sampler = optuna.samplers.TPESampler(seed=RANDOM_STATE)
study = optuna.create_study(direction="minimize", sampler=sampler)
study.optimize(
    objective,
    timeout=TIME_BUDGET_MIN * 60,  # seconds
    n_trials=10_000,               # effectively unbounded; the timeout is the real limit
    n_jobs=1                       # one process per single GPU
)

print("Best MAE:", study.best_value)
print("Best params:", study.best_params)


[I 2026-01-03 16:56:36,581] A new study created in memory with name: no-name-5d5910eb-ca2b-4762-9787-e48089ce5d52


Default metric period is 5 because MAE is/are not implemented for GPU
[I 2026-01-03 16:56:43,886] Trial 0 finished with value: 0.04738677754569058 and parameters: {'learning_rate': 0.05036938347560503, 'depth': 6, 'l2_leaf_reg': 8.393750825598254, 'random_strength': 0.9727182547490472, 'bagging_temperature': 2.2695951103585745, 'one_hot_max_size': 11, 'max_ctr_complexity': 2}. Best is trial 0 with value: 0.04738677754569058.
Default metric period is 5 because MAE is/are not implemented for GPU
[I 2026-01-03 16:57:17,081] Trial 1 finished with value: 0.04589400961988025 and parameters: {'learning_rate': 0.02815592888411366, 'depth': 9, 'l2_leaf_reg': 13.891904008565607, 'random_strength': 1.0817576370453468, 'bagging_temperature': 1.5437781112478863, 'one_hot_max_size': 20, 'max_ctr_complexity': 4}. Best is trial 1 with value: 0.04589400961988025.
Default metric period is 5 because MAE is/are not implemented for GPU
[I 2026-01-03 16:57:48,454] Trial 2 finished with value: 0.053037977722

Best MAE: 0.04589400961988025
Best params: {'learning_rate': 0.02815592888411366, 'depth': 9, 'l2_leaf_reg': 13.891904008565607, 'random_strength': 1.0817576370453468, 'bagging_temperature': 1.5437781112478863, 'one_hot_max_size': 20, 'max_ctr_complexity': 4}
