# Video Games JP_Sales: исправленная версия (Name + franchise features)


In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error

from scipy import sparse
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge

from catboost import CatBoostRegressor

RANDOM_STATE = 42
TRAIN_PATH = "Video_Games.csv"
TEST_PATH  = "Video_Games_Test.csv"
TARGET = "JP_Sales"

def rmse(y_true, y_pred):
    return float(np.sqrt(mean_squared_error(y_true, y_pred)))

train_df = pd.read_csv(TRAIN_PATH)
test_df  = pd.read_csv(TEST_PATH)

# на случай "Id " / " id" / пробелов
train_df.columns = train_df.columns.str.strip()
test_df.columns  = test_df.columns.str.strip()

assert TARGET in train_df.columns, f"'{TARGET}' not found in train"

y = train_df[TARGET].astype(float)
X_train_raw = train_df.drop(columns=[TARGET]).copy()
X_test_raw  = test_df.copy()

def find_id_col(df):
    for c in df.columns:
        if str(c).strip().lower() == "id":
            return c
    return None

id_col_train = find_id_col(X_train_raw)
id_col_test  = find_id_col(X_test_raw)

if id_col_test is not None:
    test_ids = X_test_raw[id_col_test].values
else:
    test_ids = np.arange(1, len(X_test_raw) + 1)

if id_col_train is not None:
    X_train_raw.drop(columns=[id_col_train], inplace=True)
if id_col_test is not None:
    X_test_raw.drop(columns=[id_col_test], inplace=True)

X_all = pd.concat([X_train_raw, X_test_raw], axis=0, ignore_index=True)

print("Train rows:", len(X_train_raw), "Test rows:", len(X_test_raw), "All rows:", len(X_all))
print("Columns:", list(X_all.columns))

Train rows: 11703 Test rows: 5016 All rows: 16719
Columns: ['Name', 'Platform', 'Year_of_Release', 'Genre', 'Publisher', 'NA_Sales', 'EU_Sales', 'Other_Sales', 'Critic_Score', 'Critic_Count', 'User_Score', 'User_Count', 'Developer', 'Rating']


In [2]:
print(X_train_raw.head(10))
print(X_test_raw.head(10))
print('*' * 10)


                                  Name Platform  Year_of_Release  \
0                      Rapala Trophies      PSP           2006.0   
1              New Super Mario Bros. U     WiiU           2012.0   
2                               Robots      PS2           2005.0   
3                       Hamster Club 3      GBA           2002.0   
4                         Formula 1 06      PS2           2006.0   
5                     My Ballet Studio      Wii           2009.0   
6                           EVE Online       PC           2003.0   
7  S.T.A.L.K.E.R.: Shadow of Chernobyl       PC           2007.0   
8                      Madden NFL 2003       XB           2002.0   
9              Shin Super Robot Taisen       PS           1996.0   

          Genre                    Publisher  NA_Sales  EU_Sales  Other_Sales  \
0        Sports                   Activision      0.04      0.00         0.00   
1      Platform                     Nintendo      2.30      1.34         0.32   
2       

In [3]:
print("X_all")
print(X_all.head(10))

X_all
                                  Name Platform  Year_of_Release  \
0                      Rapala Trophies      PSP           2006.0   
1              New Super Mario Bros. U     WiiU           2012.0   
2                               Robots      PS2           2005.0   
3                       Hamster Club 3      GBA           2002.0   
4                         Formula 1 06      PS2           2006.0   
5                     My Ballet Studio      Wii           2009.0   
6                           EVE Online       PC           2003.0   
7  S.T.A.L.K.E.R.: Shadow of Chernobyl       PC           2007.0   
8                      Madden NFL 2003       XB           2002.0   
9              Shin Super Robot Taisen       PS           1996.0   

          Genre                    Publisher  NA_Sales  EU_Sales  Other_Sales  \
0        Sports                   Activision      0.04      0.00         0.00   
1      Platform                     Nintendo      2.30      1.34         0.32   
2 

In [4]:
import re
import pandas as pd

EDITION_RE = re.compile(
    r"\b(remaster(ed)?|hd|definitive|ultimate|complete|collector'?s|"
    r"game of the year|goty|gold|deluxe|premium|special|limited|edition|"
    r"director'?s cut|anniversary|bundle|collection)\b",
    flags=re.IGNORECASE
)

ROMAN_RE = re.compile(r"\b(i{1,3}|iv|v|vi{0,3}|ix|x|xi|xii|xiii|xiv|xv)\b", flags=re.IGNORECASE)

def normalize_name(s: pd.Series) -> pd.Series:
    s = s.astype("string").fillna("__MISSING__").str.lower()
    # unify separators
    s = s.str.replace(r"[™®©]", "", regex=True)
    s = s.str.replace(r"[\(\)\[\]\{\}]", " ", regex=True)
    s = s.str.replace(r"[/:;,\.\!\?\|\\]", " ", regex=True)
    s = s.str.replace(r"[-_]+", " ", regex=True)
    s = s.str.replace(r"\s+", " ", regex=True).str.strip()
    return s.replace("", "__MISSING__")

def split_base(s: pd.Series) -> pd.Series:
    # base before ':' or long dash patterns (common subtitle separators)
    s2 = s.str.replace(r"\s*:\s*", " : ", regex=True)
    # split on ":" or " - " (keep left)
    base = s2.str.split(r"\s:\s|\s-\s|\s—\s", n=1, expand=True)[0]
    base = base.str.strip()
    return base.replace("", "__MISSING__")

def franchise_key(s: pd.Series) -> pd.Series:
    s = s.copy()
    s = s.str.replace(EDITION_RE, " ", regex=True)
    s = s.str.replace(ROMAN_RE, " ", regex=True)
    s = s.str.replace(r"\b\d+\b", " ", regex=True)          # sequel numbers / years in title
    s = s.str.replace(r"\s+", " ", regex=True).str.strip()
    return s.replace("", "__MISSING__")

# usage (на X_all)
if "Name" in X_all.columns:
    X_all["Name_norm"] = normalize_name(X_all["Name"])
    X_all["Name_base"] = split_base(X_all["Name_norm"])
    X_all["Franchise_key"] = franchise_key(X_all["Name_base"])


def add_name_flags(df):
    s = df["Name_norm"].astype("string")
    df["name_len"] = s.str.len().fillna(0).astype(int)
    df["name_words"] = s.str.split().str.len().fillna(0).astype(int)
    df["has_colon_or_dash"] = s.str.contains(r"\s:\s|\s-\s|\s—\s", regex=True).astype(int)
    df["has_digit"] = s.str.contains(r"\d").astype(int)
    df["has_roman"] = s.str.contains(ROMAN_RE, regex=True).astype(int)
    df["has_edition_word"] = s.str.contains(EDITION_RE, regex=True).astype(int)
    return df

if "Name_norm" in X_all.columns:
    X_all = add_name_flags(X_all)


  df["has_roman"] = s.str.contains(ROMAN_RE, regex=True).astype(int)
  df["has_edition_word"] = s.str.contains(EDITION_RE, regex=True).astype(int)


In [5]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold

def add_oof_mean_count_features(
    X: pd.DataFrame,
    y,
    X_test: pd.DataFrame,
    keys,
    n_splits=5,
    seed=42,
    alpha=5.0,          # smoothing strength; 0 = без сглаживания
    fill_value="__MISSING__"
):
    """
    Добавляет OOF признаки:
      - <key>__jp_mean  : OOF mean (optionally smoothed)
      - <key>__jp_cnt   : OOF count
    и для test — статистики по full train.
    """
    if isinstance(y, (pd.Series, pd.DataFrame)):
        y = np.asarray(y).reshape(-1)
    else:
        y = np.asarray(y).reshape(-1)

    assert len(X) == len(y), "X and y must have same length"

    X = X.reset_index(drop=True).copy()
    X_test = X_test.reset_index(drop=True).copy()

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)

    global_mean = float(np.mean(y))

    def _compute_for_key(key_col: str):
        # гарантируем отсутствие NA в ключе
        X[key_col] = X[key_col].astype("string").fillna(fill_value)
        X_test[key_col] = X_test[key_col].astype("string").fillna(fill_value)

        oof_mean = np.full(len(X), global_mean, dtype=float)
        oof_cnt  = np.zeros(len(X), dtype=float)

        for tr_idx, va_idx in kf.split(X):
            tr_keys = X.loc[tr_idx, key_col].values

            stats = (
                pd.DataFrame({key_col: tr_keys, "y": y[tr_idx]})
                .groupby(key_col)["y"]
                .agg(["mean", "count"])
            )

            va_keys = X.loc[va_idx, key_col]
            m = va_keys.map(stats["mean"])
            c = va_keys.map(stats["count"])

            m = m.fillna(global_mean).astype(float).values
            c = c.fillna(0).astype(float).values

            if alpha and alpha > 0:
                # smoothed mean: (m*c + global*alpha)/(c+alpha)
                m = (m * c + global_mean * alpha) / (c + alpha)

            oof_mean[va_idx] = m
            oof_cnt[va_idx]  = c

        # full-train stats for test
        full_stats = (
            pd.DataFrame({key_col: X[key_col].values, "y": y})
            .groupby(key_col)["y"]
            .agg(["mean", "count"])
        )

        te_m = X_test[key_col].map(full_stats["mean"]).fillna(global_mean).astype(float).values
        te_c = X_test[key_col].map(full_stats["count"]).fillna(0).astype(float).values
        if alpha and alpha > 0:
            te_m = (te_m * te_c + global_mean * alpha) / (te_c + alpha)

        X[f"{key_col}__jp_mean"] = oof_mean
        X[f"{key_col}__jp_cnt"]  = oof_cnt
        X_test[f"{key_col}__jp_mean"] = te_m
        X_test[f"{key_col}__jp_cnt"]  = te_c

    for k in keys:
        if k in X.columns and k in X_test.columns:
            _compute_for_key(k)
        else:
            print(f"SKIP '{k}': not present in both X and X_test")

    return X, X_test


In [6]:
# PREPROCESS ON ALL DATA (train+test), THEN SPLIT BACK
# =========================

def make_ohe(min_freq: int):
    """Совместимость со sklearn: sparse_output (новый) vs sparse (старый)."""
    try:
        return OneHotEncoder(handle_unknown="ignore", min_frequency=min_freq, sparse_output=True)
    except TypeError:
        return OneHotEncoder(handle_unknown="ignore", min_frequency=min_freq, sparse=True)

def preprocess_all_data(
    X_all_df: pd.DataFrame,
    text_cols,
    cat_cols,
    num_cols,
    min_freq: int = 50,
    tfidf_max_features: int = 80000,
):
    mats = []

    # 1) NUM
    if len(num_cols) > 0:
        num_imputer = SimpleImputer(strategy="median")
        num_scaler = StandardScaler(with_mean=False)
        num_data = num_scaler.fit_transform(num_imputer.fit_transform(X_all_df[num_cols]))
        mats.append(sparse.csr_matrix(num_data))
        print(f"NUM: {len(num_cols)} cols -> {num_data.shape}")

    # 2) CAT (ИСПРАВЛЕНО!)
    if len(cat_cols) > 0:
        cat_df = X_all_df[cat_cols].fillna("__MISSING__").astype(str)
        ohe = make_ohe(min_freq)
        cat_ohe = ohe.fit_transform(cat_df)
        mats.append(cat_ohe.tocsr())
        print(f"CAT: {len(cat_cols)} cols -> OHE shape {cat_ohe.shape}")

    # 3) TEXT (TF-IDF)
    def _flatten_1d(x):
        arr = np.asarray(x).ravel().astype(str)
        return np.where((arr == 'nan') | (arr == 'None') | (arr == '<NA>'), '', arr)

    for c in text_cols:
        # ИСПРАВЛЕНИЕ: fillna ПЕРЕД astype
        text_data = X_all_df[c].fillna("").astype(str).values
        text_data = _flatten_1d(text_data.reshape(-1, 1))
        
        tfidf = TfidfVectorizer(
            analyzer="char",
            ngram_range=(3, 5),
            min_df=2,
            max_features=tfidf_max_features,
        )
        tfidf_mat = tfidf.fit_transform(text_data)
        mats.append(tfidf_mat.tocsr())
        print(f"TEXT '{c}': TF-IDF shape {tfidf_mat.shape}, vocab={len(tfidf.vocabulary_)}")

    # stack
    X_proc = sparse.hstack(mats, format="csr")
    print(f"TOTAL features: {X_proc.shape[1]}")
    return X_proc

# Выбираем колонки
text_cols = [c for c in ["Name", "Name_root"] if c in X_all.columns]

cat_cols_all = X_all.select_dtypes(include=["object", "category", "bool", "string"]).columns.tolist()
cat_cols = [c for c in cat_cols_all if c not in set(text_cols)]

num_cols = [c for c in X_all.columns if c not in set(cat_cols) and c not in set(text_cols)]

print("text_cols:", text_cols)
print("cat_cols:", cat_cols)
print("num_cols:", num_cols)

X_all_proc = preprocess_all_data(
    X_all,
    text_cols=text_cols,
    cat_cols=cat_cols,
    num_cols=num_cols,
    min_freq=50,
    tfidf_max_features=80000,
)

n_train = len(X_train_raw)
X_train_proc = X_all_proc[:n_train]
X_test_proc  = X_all_proc[n_train:]

print("X_train_proc:", X_train_proc.shape, "X_test_proc:", X_test_proc.shape)


text_cols: ['Name']
cat_cols: ['Platform', 'Genre', 'Publisher', 'Developer', 'Rating', 'Name_norm', 'Name_base', 'Franchise_key']
num_cols: ['Year_of_Release', 'NA_Sales', 'EU_Sales', 'Other_Sales', 'Critic_Score', 'Critic_Count', 'User_Score', 'User_Count', 'name_len', 'name_words', 'has_colon_or_dash', 'has_digit', 'has_roman', 'has_edition_word']
NUM: 14 cols -> (16719, 14)
CAT: 8 cols -> OHE shape (16719, 123)
TEXT 'Name': TF-IDF shape (16719, 64589), vocab=64589
TOTAL features: 64726
X_train_proc: (11703, 64726) X_test_proc: (5016, 64726)


In [7]:
# =========================
# CV: RIDGE (on processed)
# =========================
cv = KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

def cv_oof_ridge(X_proc, y_series: pd.Series, cv, alpha: float = 2.0):
    oof = np.zeros(len(y_series), dtype=float)
    fold_scores = []
    for fold, (tr_idx, va_idx) in enumerate(cv.split(X_proc, y_series), 1):
        m = Ridge(alpha=alpha)
        m.fit(X_proc[tr_idx], y_series.iloc[tr_idx])
        pred = np.clip(m.predict(X_proc[va_idx]), 0, None)
        oof[va_idx] = pred
        mae = float(mean_absolute_error(y_series.iloc[va_idx], pred))
        r = rmse(y_series.iloc[va_idx], pred)
        fold_scores.append((mae, r))
        print(f"[Ridge][fold {fold}] MAE={mae:.6f} RMSE={r:.6f}")
    maes = np.array([s[0] for s in fold_scores])
    rmses = np.array([s[1] for s in fold_scores])
    print(f"Ridge: MAE mean={maes.mean():.6f} std={maes.std():.6f} | RMSE mean={rmses.mean():.6f} std={rmses.std():.6f}")
    return oof, fold_scores

oof_ridge, ridge_scores = cv_oof_ridge(X_train_proc, y, cv, alpha=2.0)


[Ridge][fold 1] MAE=0.075268 RMSE=0.188657
[Ridge][fold 2] MAE=0.078290 RMSE=0.232864
[Ridge][fold 3] MAE=0.074903 RMSE=0.194910
[Ridge][fold 4] MAE=0.080201 RMSE=0.263954
[Ridge][fold 5] MAE=0.075977 RMSE=0.217855
Ridge: MAE mean=0.076928 std=0.002016 | RMSE mean=0.219648 std=0.027252


In [8]:
# =========================
# CV: CATBOOST (CPU only, without Name text cols)
# =========================
def make_catboost_frame(df: pd.DataFrame) -> pd.DataFrame:
    # CatBoost нормально принимает pandas.DataFrame. Для категорий лучше string + fillna.
    out = df.copy()
    for c in out.columns:
        if str(out[c].dtype) in ("object", "category", "string", "bool"):
            out[c] = out[c].astype("string").fillna("__MISSING__")
    return out

def cv_oof_catboost(X_df: pd.DataFrame, y_series: pd.Series, cv):
    # Удаляем текстовые Name фичи из CatBoost (их обрабатывает Ridge)
    drop_cols = [c for c in ["Name", "Name_root"] if c in X_df.columns]
    X_cb = X_df.drop(columns=drop_cols).reset_index(drop=True)
    X_cb = make_catboost_frame(X_cb)

    cat_cols = X_cb.select_dtypes(include=["object","category","string","bool"]).columns.tolist()
    cat_idx  = [X_cb.columns.get_loc(c) for c in cat_cols]

    oof = np.zeros(len(y_series), dtype=float)
    fold_scores = []
    best_iters = []

    for fold, (tr_idx, va_idx) in enumerate(cv.split(X_cb, y_series), 1):
        X_tr, X_va = X_cb.iloc[tr_idx], X_cb.iloc[va_idx]
        y_tr, y_va = y_series.iloc[tr_idx], y_series.iloc[va_idx]

        model = CatBoostRegressor(
            loss_function="MAE",
            eval_metric="MAE",
            iterations=20000,
            learning_rate=0.03,
            depth=8,
            l2_leaf_reg=6.0,
            random_seed=RANDOM_STATE,
            # CPU only (стабильнее, без требований к CUDA)
            task_type="CPU",
            # регуляризация
            subsample=0.8,
            rsm=0.8,
            bootstrap_type="Bernoulli",
            # контроль
            verbose=200,
        )

        model.fit(
            X_tr, y_tr,
            cat_features=cat_idx,
            eval_set=(X_va, y_va),
            use_best_model=True,
            early_stopping_rounds=500,
        )

        pred = np.clip(model.predict(X_va), 0, None)
        oof[va_idx] = pred

        mae = float(mean_absolute_error(y_va, pred))
        r = rmse(y_va, pred)
        fold_scores.append((mae, r))
        best_iters.append(int(model.get_best_iteration()))
        print(f"[CatBoost][fold {fold}] MAE={mae:.6f} RMSE={r:.6f} best_iter={best_iters[-1]}")

    maes = np.array([s[0] for s in fold_scores])
    rmses = np.array([s[1] for s in fold_scores])
    print(f"CatBoost: MAE mean={maes.mean():.6f} std={maes.std():.6f} | RMSE mean={rmses.mean():.6f} std={rmses.std():.6f}")

    return oof, fold_scores, best_iters

oof_cb, cb_scores, cb_best_iters = cv_oof_catboost(X_train_raw, y, cv)


0:	learn: 0.0821066	test: 0.0652163	best: 0.0652163 (0)	total: 55.1ms	remaining: 18m 22s
200:	learn: 0.0506573	test: 0.0476315	best: 0.0476312 (199)	total: 2.19s	remaining: 3m 35s
400:	learn: 0.0448817	test: 0.0462926	best: 0.0462823 (395)	total: 4.61s	remaining: 3m 45s
600:	learn: 0.0414196	test: 0.0460048	best: 0.0459483 (596)	total: 7.09s	remaining: 3m 49s
800:	learn: 0.0390945	test: 0.0460211	best: 0.0458778 (669)	total: 10.9s	remaining: 4m 20s
1000:	learn: 0.0373430	test: 0.0462556	best: 0.0458778 (669)	total: 14.9s	remaining: 4m 42s
Stopped by overfitting detector  (500 iterations wait)

bestTest = 0.04587777651
bestIteration = 669

Shrink model to first 670 iterations.
[CatBoost][fold 1] MAE=0.045701 RMSE=0.179136 best_iter=669
0:	learn: 0.0752631	test: 0.0916652	best: 0.0916652 (0)	total: 12.5ms	remaining: 4m 10s
200:	learn: 0.0478139	test: 0.0602956	best: 0.0602956 (200)	total: 2.45s	remaining: 4m
400:	learn: 0.0421138	test: 0.0566446	best: 0.0566446 (400)	total: 5.17s	remaini

In [9]:
# =========================
# ENSEMBLE WEIGHT (grid search on OOF)
# =========================
weights = np.linspace(0, 1, 201)  # w = доля CatBoost
best = {"w": None, "mae": np.inf, "rmse": np.inf}

for w in weights:
    ens = w * oof_cb + (1 - w) * oof_ridge
    mae = float(mean_absolute_error(y, ens))
    r = rmse(y, ens)
    if mae < best["mae"]:
        best = {"w": float(w), "mae": mae, "rmse": r}

print("Best ensemble:", best)


Best ensemble: {'w': 1.0, 'mae': 0.05169282280828424, 'rmse': 0.21068200548958355}


In [10]:
# =========================
# FINAL FIT + PREDICT TEST
# =========================
# Ridge final on all processed train
ridge_final = Ridge(alpha=2.0)
ridge_final.fit(X_train_proc, y)
pred_ridge = np.clip(ridge_final.predict(X_test_proc), 0, None)

# CatBoost final: fit with holdout for early stopping
drop_cols = [c for c in ["Name", "Name_root"] if c in X_train_raw.columns]
X_cb_full = X_train_raw.drop(columns=drop_cols).reset_index(drop=True)
X_cb_test = X_test_raw.drop(columns=drop_cols).reset_index(drop=True)

X_cb_full = make_catboost_frame(X_cb_full)
X_cb_test = make_catboost_frame(X_cb_test)

cat_cols = X_cb_full.select_dtypes(include=["object","category","string","bool"]).columns.tolist()
cat_idx  = [X_cb_full.columns.get_loc(c) for c in cat_cols]

X_tr, X_va, y_tr, y_va = train_test_split(
    X_cb_full, y, test_size=0.15, random_state=RANDOM_STATE
)

cb_final = CatBoostRegressor(
    loss_function="MAE",
    eval_metric="MAE",
    iterations=30000,
    learning_rate=0.03,
    depth=8,
    l2_leaf_reg=6.0,
    random_seed=RANDOM_STATE,
    task_type="CPU",
    subsample=0.8,
    rsm=0.8,
    bootstrap_type="Bernoulli",
    verbose=200,
)

cb_final.fit(
    X_tr, y_tr,
    cat_features=cat_idx,
    eval_set=(X_va, y_va),
    use_best_model=True,
    early_stopping_rounds=500,
)

pred_cb = np.clip(cb_final.predict(X_cb_test), 0, None)

w = best["w"] if best["w"] is not None else 0.5
pred_ens = np.clip(w * pred_cb + (1 - w) * pred_ridge, 0, None)

print(f"Pred stats: ridge_mean={pred_ridge.mean():.4f}, cb_mean={pred_cb.mean():.4f}, ens_mean={pred_ens.mean():.4f}, w={w}")


0:	learn: 0.0813002	test: 0.0625222	best: 0.0625222 (0)	total: 28.8ms	remaining: 14m 23s
200:	learn: 0.0495531	test: 0.0453115	best: 0.0453115 (200)	total: 2.25s	remaining: 5m 34s
400:	learn: 0.0441989	test: 0.0439883	best: 0.0439749 (398)	total: 4.49s	remaining: 5m 31s
600:	learn: 0.0410714	test: 0.0435741	best: 0.0435383 (592)	total: 6.83s	remaining: 5m 34s
800:	learn: 0.0388561	test: 0.0434920	best: 0.0434107 (762)	total: 9.11s	remaining: 5m 31s
1000:	learn: 0.0371144	test: 0.0434901	best: 0.0434107 (762)	total: 11.4s	remaining: 5m 30s
1200:	learn: 0.0358232	test: 0.0434639	best: 0.0433869 (1136)	total: 13.7s	remaining: 5m 29s
1400:	learn: 0.0345856	test: 0.0432990	best: 0.0432972 (1399)	total: 16.1s	remaining: 5m 28s
1600:	learn: 0.0336457	test: 0.0431706	best: 0.0431656 (1578)	total: 18.4s	remaining: 5m 27s
1800:	learn: 0.0327350	test: 0.0430233	best: 0.0430156 (1731)	total: 20.8s	remaining: 5m 25s
2000:	learn: 0.0320434	test: 0.0431308	best: 0.0430156 (1731)	total: 23.2s	remainin

In [None]:
# =========================
# OPTUNA HYPERPARAMETER TUNING (для мощного сервера)
# =========================
import optuna
import json
import time
import os
from datetime import datetime
from optuna.samplers import TPESampler

# Файл для сохранения лучших параметров
BEST_PARAMS_FILE = "optuna_best_params.json"
STUDY_NAME = "catboost_jp_sales"
N_TRIALS = 2000  # Увеличьте для более тщательного поиска
SAVE_INTERVAL_SEC = 600  # 10 минут

# Глобальные переменные для отслеживания времени
last_save_time = time.time()

def save_best_params(study, trial=None):
    """Сохраняет лучшие параметры в JSON."""
    global last_save_time
    
    current_time = time.time()
    
    # Сохраняем каждые 10 минут или при завершении trial
    if current_time - last_save_time >= SAVE_INTERVAL_SEC or trial is None:
        if study.best_trial is not None:
            result = {
                "timestamp": datetime.now().isoformat(),
                "best_value": study.best_value,
                "best_params": study.best_params,
                "n_trials_completed": len(study.trials),
                "best_trial_number": study.best_trial.number,
            }
            
            with open(BEST_PARAMS_FILE, 'w') as f:
                json.dump(result, f, indent=2)
            
            print(f"\n[{datetime.now().strftime('%H:%M:%S')}] Saved best params: MAE={study.best_value:.6f}")
            last_save_time = current_time

def objective(trial):
    """Функция оптимизации для Optuna."""
    
    # Сначала выбираем тип bootstrap
    bootstrap_type = trial.suggest_categorical('bootstrap_type', ['Bayesian', 'Bernoulli', 'MVS'])
    
    # Базовые гиперпараметры
    params = {
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.15, log=True),
        'depth': trial.suggest_int('depth', 4, 10),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1.0, 20.0),
        'rsm': trial.suggest_float('rsm', 0.5, 1.0),  # colsample_bylevel
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 50),
        'random_strength': trial.suggest_float('random_strength', 0.0, 10.0),
        'bootstrap_type': bootstrap_type,
    }
    
    # Параметры, зависящие от типа bootstrap
    if bootstrap_type == 'Bayesian':
        # Bayesian использует bagging_temperature, НЕ subsample
        params['bagging_temperature'] = trial.suggest_float('bagging_temperature', 0.0, 5.0)
    elif bootstrap_type in ['Bernoulli', 'MVS']:
        # Bernoulli/MVS используют subsample, НЕ bagging_temperature
        params['subsample'] = trial.suggest_float('subsample', 0.5, 1.0)
    
    # Подготовка данных (как в основном коде)
    drop_cols = [c for c in ["Name", "Name_root"] if c in X_train_raw.columns]
    X_cb = X_train_raw.drop(columns=drop_cols).reset_index(drop=True)
    X_cb = make_catboost_frame(X_cb)
    
    cat_cols = X_cb.select_dtypes(include=["object","category","string","bool"]).columns.tolist()
    cat_idx  = [X_cb.columns.get_loc(c) for c in cat_cols]
    
    # CV для оценки
    cv_inner = KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
    fold_maes = []
    
    for fold, (tr_idx, va_idx) in enumerate(cv_inner.split(X_cb), 1):
        X_tr, X_va = X_cb.iloc[tr_idx], X_cb.iloc[va_idx]
        y_tr, y_va = y.iloc[tr_idx], y.iloc[va_idx]
        
        model = CatBoostRegressor(
            loss_function="MAE",
            eval_metric="MAE",
            iterations=10000,
            **params,
            random_seed=RANDOM_STATE,
            task_type="CPU",
            verbose=0,
        )
        
        model.fit(
            X_tr, y_tr,
            cat_features=cat_idx,
            eval_set=(X_va, y_va),
            use_best_model=True,
            early_stopping_rounds=200,
        )
        
        pred = np.clip(model.predict(X_va), 0, None)
        mae = float(mean_absolute_error(y_va, pred))
        fold_maes.append(mae)
        
        trial.report(np.mean(fold_maes), fold)
        if trial.should_prune():
            raise optuna.TrialPruned()
    
    return np.mean(fold_maes)

# Создаём study с возможностью продолжения
storage = f"sqlite:///{STUDY_NAME}.db"
sampler = TPESampler(seed=RANDOM_STATE)
pruner = optuna.pruners.MedianPruner(n_startup_trials=10, n_warmup_steps=2)

study = optuna.create_study(
    study_name=STUDY_NAME,
    storage=storage,
    load_if_exists=True,  # Продолжить если прервано
    direction="minimize",
    sampler=sampler,
    pruner=pruner,
)

print(f"Starting Optuna optimization: {N_TRIALS} trials")
print(f"Best params will be saved to '{BEST_PARAMS_FILE}' every {SAVE_INTERVAL_SEC//60} minutes")
print(f"Study stored in '{STUDY_NAME}.db' (can resume if interrupted)")
print("="*60)

# Запуск оптимизации с callback
study.optimize(
    objective,
    n_trials=N_TRIALS,
    callbacks=[save_best_params],
    show_progress_bar=True,
    timeout= 600,
)

# Финальное сохранение
save_best_params(study)

print("\n" + "="*60)
print("OPTIMIZATION COMPLETE!")
print(f"Best MAE: {study.best_value:.6f}")
print(f"Best params: {json.dumps(study.best_params, indent=2)}")


[I 2026-01-05 21:11:22,665] Using an existing study with name 'catboost_jp_sales' instead of creating a new one.


Starting Optuna optimization: 2000 trials
Best params will be saved to 'optuna_best_params.json' every 10 minutes
Study stored in 'catboost_jp_sales.db' (can resume if interrupted)


  0%|          | 0/2000 [00:00<?, ?it/s]

In [None]:
# =========================
# ВИЗУАЛИЗАЦИЯ РЕЗУЛЬТАТОВ OPTUNA
# =========================
try:
    import optuna.visualization as vis
    
    # История оптимизации
    fig1 = vis.plot_optimization_history(study)
    fig1.show()
    
    # Важность параметров
    fig2 = vis.plot_param_importances(study)
    fig2.show()
    
    # Slice plot
    fig3 = vis.plot_slice(study)
    fig3.show()
    
except ImportError:
    print("Install plotly for visualizations: pip install plotly")
    print("\nTop 10 trials:")
    trials_df = study.trials_dataframe()
    print(trials_df.nsmallest(10, 'value')[['number', 'value', 'params_learning_rate', 'params_depth', 'params_l2_leaf_reg']])


In [None]:
# =========================
# ФИНАЛЬНОЕ ОБУЧЕНИЕ С ЛУЧШИМИ ПАРАМЕТРАМИ
# =========================

# Загружаем лучшие параметры из файла или study
if os.path.exists(BEST_PARAMS_FILE):
    with open(BEST_PARAMS_FILE, 'r') as f:
        saved_result = json.load(f)
    best_params = saved_result['best_params']
    print(f"Loaded best params from {BEST_PARAMS_FILE}")
else:
    best_params = study.best_params
    print("Using params from study object")

print(f"Best params: {json.dumps(best_params, indent=2)}")

# Подготовка данных
drop_cols = [c for c in ["Name", "Name_root"] if c in X_train_raw.columns]
X_cb_full = X_train_raw.drop(columns=drop_cols).reset_index(drop=True)
X_cb_test = X_test_raw.drop(columns=drop_cols).reset_index(drop=True)

X_cb_full = make_catboost_frame(X_cb_full)
X_cb_test = make_catboost_frame(X_cb_test)

cat_cols = X_cb_full.select_dtypes(include=["object","category","string","bool"]).columns.tolist()
cat_idx  = [X_cb_full.columns.get_loc(c) for c in cat_cols]

X_tr, X_va, y_tr, y_va = train_test_split(
    X_cb_full, y, test_size=0.15, random_state=RANDOM_STATE
)

# Финальная модель с найденными параметрами
cb_optuna_final = CatBoostRegressor(
    loss_function="MAE",
    eval_metric="MAE",
    iterations=30000,  # Больше итераций для финала
    **best_params,
    random_seed=RANDOM_STATE,
    task_type="CPU",
    bootstrap_type="Bayesian" if best_params.get('bagging_temperature', 0) > 0 else "Bernoulli",
    verbose=200,
)

cb_optuna_final.fit(
    X_tr, y_tr,
    cat_features=cat_idx,
    eval_set=(X_va, y_va),
    use_best_model=True,
    early_stopping_rounds=500,
)

pred_cb_optuna = np.clip(cb_optuna_final.predict(X_cb_test), 0, None)

# Ensemble с Ridge
w = best["w"] if best["w"] is not None else 0.5
pred_ens_optuna = np.clip(w * pred_cb_optuna + (1 - w) * pred_ridge, 0, None)

print(f"\nPred stats: cb_mean={pred_cb_optuna.mean():.4f}, ens_mean={pred_ens_optuna.mean():.4f}, w={w}")

# Сохраняем submission
sub_optuna = pd.DataFrame({"Id": test_ids, "JP_Sales": pred_ens_optuna})
sub_optuna.to_csv("sub_optuna.csv", index=False)
print("Saved submission to sub_optuna.csv")


In [None]:
# =========================
# SUBMISSION
# =========================
sub = pd.DataFrame({"Id": test_ids, "JP_Sales": pred_ens})
sub.to_csv("sub_with_name.csv", index=False)
sub.head(10)
