In [None]:
import os
import polars as pl
import pandas as pd
import numpy as np
import tqdm
import time
import gc
from math import isclose
import matplotlib.pyplot as plt
from sklearn.model_selection import GroupKFold
from sklearn.metrics import roc_auc_score
from catboost import CatBoostRanker, Pool

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the pre-engineered train/test feature tables (paths are relative to the working directory).
train = pd.read_parquet("train_fe_180.parquet")
test = pd.read_parquet("test_fe_180.parquet")

In [10]:
train.head()

Unnamed: 0,ts_timestamp,search_term_normalized,content_id_hashed,session_id,clicked,ordered,c_search_imp_mean,c_search_clk_mean,c_search_imp_std,c_search_clk_std,...,filterable_z_in_leaf,search_ctr_bs_sess_minmax,u_search_imp_std_sess_minmax,c_tterm_clk_mean_sess_minmax,u_tterm_lift_sess_minmax,click_to_order_rate_sess_minmax,c_tterm_clk_std_sess_minmax,search_ctr_sess_minmax,price_diff_from_session_med_sess_minmax,u_search_imp_mean_sess_minmax
0,1752166,a_elbise_kesim,ae9b536d26e602f4,train_4fd3705b497bbe4f,1.0,0.0,0.00289,5.4e-05,0.001449,4.2e-05,...,0.0,0.007812,0.0,1.0,0.0,0.073975,1.0,0.076355,0.274414,0.0
1,1752166,a_elbise_kesim,91b8a11e4c8e0ded,train_4fd3705b497bbe4f,0.0,0.0,0.010849,0.000461,0.007557,0.00032,...,0.333252,0.127441,0.0,1.0,0.0,0.017548,1.0,0.413574,0.317627,0.0
2,1752166,a_elbise_kesim,1a78e29f4bcc58b0,train_4fd3705b497bbe4f,0.0,0.0,0.003828,0.00011,0.002554,7.4e-05,...,0.333252,0.025894,0.0,1.0,0.0,0.144531,1.0,0.215942,0.734375,0.0
3,1752166,a_elbise_kesim,604a4c8b13d798a4,train_4fd3705b497bbe4f,0.0,0.0,0.009636,0.000293,0.009766,0.000353,...,0.0,0.051208,0.0,1.0,0.0,0.148193,1.0,0.241821,0.187866,0.0
4,1752166,a_elbise_kesim,038680d15f693ae2,train_4fd3705b497bbe4f,0.0,0.0,0.010582,0.000409,0.003056,0.000113,...,-0.166626,0.102356,0.0,1.0,0.0,0.037598,1.0,0.358643,0.674805,0.0


In [11]:
%%time
if True:

    def reduce_mem_usage(df: pd.DataFrame, use_categorical=True, verbose=True):
        """Downcast the columns of ``df`` in place to smaller dtypes and return it.

        * Integer columns are cast to the first of int8/int16/int32 whose range
          contains the column's min and max.
        * Float columns are cast to float32 when their finite values fit the
          float32 range. They are deliberately NOT routed through float16: a
          float16 round trip keeps only ~3 significant digits and turns any
          magnitude above 65504 into ``inf``.
        * Object columns become ``category`` when ``use_categorical`` is true and
          fewer than half of the values are distinct.
        * Every other dtype (bool, datetime, category, ...) is left untouched.

        Parameters
        ----------
        df : DataFrame that is modified in place.
        use_categorical : convert repetitive object columns to ``category``.
        verbose : print the memory footprint before and after.

        Returns
        -------
        The same ``df`` object.
        """
        def _mem_mb():
            return df.memory_usage(deep=True).sum() / 1024**2

        start_mem = _mem_mb()
        f32_max = np.finfo(np.float32).max

        for col in df.columns:
            col_type = df[col].dtype
            if pd.api.types.is_integer_dtype(col_type):
                # A nullable integer column holding NA cannot be cast to a NumPy int.
                if df[col].isna().any():
                    continue
                c_min, c_max = df[col].min(), df[col].max()
                for t in (np.int8, np.int16, np.int32):
                    if c_min >= np.iinfo(t).min and c_max <= np.iinfo(t).max:
                        df[col] = df[col].astype(t)
                        break
            elif pd.api.types.is_float_dtype(col_type):
                c_min, c_max = df[col].min(), df[col].max()
                # NaN (all-missing column) and +/-inf are representable in float32;
                # only finite values beyond the float32 range would be corrupted.
                fits_f32 = all(
                    pd.isna(v) or np.isinf(v) or abs(v) <= f32_max
                    for v in (c_min, c_max)
                )
                if fits_f32 and col_type != np.float32:
                    df[col] = df[col].astype(np.float32)
            elif use_categorical and col_type == "object":
                # Very effective for frequently repeated strings.
                num_total = len(df[col])
                # Guard against empty columns (would divide by zero).
                if num_total and df[col].nunique(dropna=False) / num_total < 0.5:
                    df[col] = df[col].astype("category")

        end_mem = _mem_mb()
        if verbose:
            saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
            print(f"Mem {start_mem:.1f} MB → {end_mem:.1f} MB ({saved_pct:.1f}% kazanç)")
        return df

    train = reduce_mem_usage(train)
    test  = reduce_mem_usage(test)

Mem 2678.3 MB → 2678.3 MB (0.0% kazanç)
Mem 2879.3 MB → 2879.3 MB (0.0% kazanç)
CPU times: user 23.5 s, sys: 834 ms, total: 24.3 s
Wall time: 24.3 s


In [12]:
# --- Build feature lists (train/test aligned) ---

LABEL_COLS = ["clicked", "ordered"]   # targets; only in train
GROUP_COLS = ["session_id"]
TIME_COLS = ["ts_timestamp"]
# Columns that must never be fed to the model as features (extend as needed).
EXCLUDE = {*LABEL_COLS, *GROUP_COLS, *TIME_COLS}

# Candidate features: columns present in BOTH train and test, minus the exclusions.
common_cols = [c for c in train.columns if c not in EXCLUDE and c in test.columns]

# categorical dtypes
def is_cat(s: pd.Series) -> bool:
    """Return True when ``s`` should be treated as a categorical feature.

    A series qualifies when its dtype is pandas ``category``, plain ``object``,
    or any pandas string extension dtype.

    Uses ``isinstance`` checks on the dtype instead of
    ``pd.api.types.is_categorical_dtype``, which newer pandas releases deprecate,
    and instead of comparing ``str(dtype)`` to ``"string"``, which only matches
    string dtypes that happen to print with exactly that name.
    """
    dtype = s.dtype
    return (
        isinstance(dtype, (pd.CategoricalDtype, pd.StringDtype))
        or pd.api.types.is_object_dtype(dtype)
    )

# Auto-detect categorical columns among the shared candidates, then merge them with
# any `cat_features` list that already exists in the kernel namespace.
# NOTE(review): the `globals()` branch makes the result depend on kernel state; on a
# fresh kernel only the `else` branch can run. Both `cat_features` and `num_features`
# are also reassigned by explicit literal lists in the CONFIG cell further down.
cat_features_auto = [c for c in common_cols if is_cat(train[c])]
if "cat_features" in globals():
    cat_features = sorted({c for c in (cat_features_auto + cat_features) if c in common_cols})
else:
    cat_features = sorted(cat_features_auto)

# Numeric features: the remaining shared columns whose train dtype is numeric.
num_features = sorted([c for c in common_cols if c not in cat_features and pd.api.types.is_numeric_dtype(train[c])])

# Quick summary of how many features of each kind were detected.
print(f"#num_features={len(num_features)}  #cat_features={len(cat_features)}")

#num_features=157  #cat_features=20


In [None]:
# ---------------- CONFIG ----------------
num_features = [
    'affinity_lift',
    'attr_option_per_type',
    'attr_type_z_in_leaf',
    'attribute_type_count',
    'c_cart_mean',
    'c_click_cv',
    'c_click_mean',
    'c_fav_mean',
    'c_order_mean',
    'c_search_clk_mean',
    'c_search_clk_std',
    'c_search_cv',
    'c_search_imp_mean',
    'c_search_imp_std',
    'c_tterm_clk_mean',
    'c_tterm_clk_mean_sess_minmax',
    'c_tterm_clk_std',
    'c_tterm_clk_std_sess_minmax',
    'c_tterm_imp_mean',
    'c_tterm_imp_std',
    'category_vs_user_pref_ratio',
    'click_to_order_bs',
    'click_to_order_rate',
    'click_to_order_rate_sess_minmax',
    'content_rate_avg',
    'content_rate_count',
    'content_review_count',
    'content_review_wth_media_count',
    'dayofweek',
    'disc_z_in_l1',
    'disc_z_in_leaf',
    'discount_diff_from_session_max',
    'discount_norm_in_session',
    'discount_rank_in_category',
    'discount_rank_in_session',
    'discount_ratio',
    'discount_vs_user_pref_proxy',
    'discounted_price',
    'filterable_label_count',
    'filterable_z_in_leaf',
    'global_l1_click_sum',
    'global_l1_order_rate',
    'global_l2_click_sum',
    'global_l2_order_rate',
    'leaf_cart_per_click',
    'leaf_cart_rate_lift',
    'leaf_cart_sum',
    'leaf_click_share',
    'leaf_click_sum',
    'leaf_clicks_per_item',
    'leaf_cvr',
    'leaf_cvr_lift',
    'leaf_engagement_per_click',
    'leaf_fav_per_click',
    'leaf_fav_rate_lift',
    'leaf_fav_sum',
    'leaf_item_count',
    'leaf_order_per_cart',
    'leaf_order_per_fav',
    'leaf_order_share',
    'leaf_order_sum',
    'leaf_orders_per_item',
    'leaf_tilt',
    'media_review_share',
    'merchant_count',
    'merchant_z_in_leaf',
    'original_price',
    'price_diff_from_session_med',
    'price_diff_from_session_med_sess_minmax',
    'price_norm_in_session',
    'price_rank_in_category',
    'price_rank_in_session',
    'price_vs_user_pref_proxy',
    'price_z_in_l1',
    'price_z_in_leaf',
    'quality_norm_in_session',
    'quality_rank_in_session',
    'quality_z_in_l1',
    'quality_z_in_leaf',
    'rate_diff_from_session_max',
    'rate_weighted',
    'same_l1_count_in_session',
    'same_l1_share_in_session',
    'same_leaf_count_in_session',
    'same_leaf_share_in_session',
    'scarcity_index',
    'search_ctr',
    'search_ctr_bs',
    'search_ctr_bs_sess_minmax',
    'search_ctr_sess_minmax',
    'selling_price',
    'session_category_entropy',
    'session_category_match_ratio',
    'session_price_diff_from_avg',
    'session_price_iqr',
    'session_size',
    't_clk_mean',
    't_clk_std',
    't_ctr_global',
    't_imp_mean',
    't_imp_std',
    'tc_lift_vs_content',
    'tc_lift_vs_term',
    'tc_term_ctr',
    'total_attribute_option_count',
    'u_c_click_rate',
    'u_c_order_rate',
    'u_cart_mean',
    'u_cart_std',
    'u_click_cv',
    'u_click_mean',
    'u_click_std',
    'u_fav_mean',
    'u_fav_std',
    'u_l1_click_mean',
    'u_l1_order_mean',
    'u_l1_order_rate',
    'u_leaf_click_mean',
    'u_leaf_click_rate',
    'u_leaf_order_mean',
    'u_order_cv',
    'u_order_mean',
    'u_order_std',
    'u_search_clk_mean',
    'u_search_clk_std',
    'u_search_imp_mean',
    'u_search_imp_std',
    'u_tterm_clk_mean',
    'u_tterm_clk_std',
    'u_tterm_ctr',
    'u_tterm_cv',
    'u_tterm_imp_mean',
    'u_tterm_imp_std',
    'u_tterm_lift',
    'uf_u_clk_mean',
    'uf_u_clk_std',
    'uf_u_imp_mean',
    'uf_u_imp_std',
    'uf_uc_clk_mean',
    'uf_uc_clk_std',
    'uf_uc_imp_mean',
    'uf_uc_imp_std',
    'ufs_u_cart_mean',
    'ufs_u_cart_std',
    'ufs_u_click_mean',
    'ufs_u_click_std',
    'ufs_u_fav_mean',
    'ufs_u_fav_std',
    'ufs_u_order_mean',
    'ufs_u_order_std',
    'ufs_uc_cart_mean',
    'ufs_uc_cart_std',
    'ufs_uc_click_mean',
    'ufs_uc_click_std',
    'ufs_uc_fav_mean',
    'ufs_uc_fav_std',
    'ufs_uc_order_mean',
    'ufs_uc_order_std',
    'user_age_years',
    'user_birth_year',
    'user_item_cart_to_order_rate',
    'user_item_click_to_cart_rate',
    'user_item_click_to_order_rate',
    'user_tenure_in_days']

cat_features = [
    'c_order_mean_bin',
    'c_order_mean_bin__x__user_gender',
    'content_id_hashed',
    'cv_tags',
    'discount_ratio_bin',
    'discount_ratio_bin__x__c_order_mean_bin',
    'discount_ratio_bin__x__selling_price_bin',
    'discount_ratio_bin__x__user_gender',
    'l1_hour',
    'l1_searchterm',
    'l2_hour',
    'leaf_category_name',
    'leaf_searchterm',
    'level1_category_name',
    'level2_category_name',
    'search_term_normalized',
    'selling_price_bin',
    'selling_price_bin__x__c_order_mean_bin',
    'selling_price_bin__x__user_gender',
    'user_gender']

# Combined model input list: numeric features first, then categorical ones.
feature_cols = num_features + cat_features

# Label and group/id column names.
label_clicked = "clicked"
label_ordered = "ordered"
group_col     = "session_id"
id_col        = "content_id_hashed"

# ---------------- Safety checks ----------------
# Drop configured features that are missing from train OR test (handy during fast
# iteration). A feature absent from either frame cannot be used for both fitting
# and scoring, so it is removed from every feature list to keep them in sync.
_missing_in_train = [c for c in feature_cols if c not in train.columns]
if _missing_in_train:
    print("Uyarı: train'de eksik bulunan kolonlar atılıyor:", _missing_in_train)
_missing_in_test = [c for c in feature_cols if c in train.columns and c not in test.columns]
if _missing_in_test:
    print("Uyarı: test'te eksik bulunan kolonlar atılıyor:", _missing_in_test)
if _missing_in_train or _missing_in_test:
    _unavailable = set(_missing_in_train) | set(_missing_in_test)
    feature_cols = [c for c in feature_cols if c not in _unavailable]
    # Keep the per-kind lists consistent with the filtered feature list.
    cat_features = [c for c in cat_features if c in feature_cols]
    num_features = [c for c in num_features if c in feature_cols]

Uyarı: train'de eksik bulunan kolonlar atılıyor: ['global_l2_click_sum', 'global_l2_order_rate', 'leaf_cart_per_click', 'leaf_cart_rate_lift', 'leaf_cart_sum', 'leaf_click_share', 'leaf_click_sum', 'leaf_clicks_per_item', 'leaf_cvr', 'leaf_cvr_lift', 'leaf_engagement_per_click', 'leaf_fav_per_click', 'leaf_fav_rate_lift', 'leaf_fav_sum', 'leaf_item_count', 'leaf_order_per_cart', 'leaf_order_per_fav', 'leaf_order_share', 'leaf_order_sum', 'leaf_orders_per_item', 'leaf_tilt']


In [16]:
%%time
# Optional correlation filter (currently disabled by the `if False:` guard): drops
# numeric features that are almost perfectly correlated with another feature.
if False:
    # ---- Settings ----
    corr_thresh = 0.99
    protect = {group_col, label_clicked, label_ordered}
    # Also protect id_col when it is defined:
    if 'id_col' in globals():
        protect.add(id_col)

    # Only consider columns that are in the feature list and numeric
    usable_feats = [c for c in feature_cols if c not in protect]
    num_feats = [c for c in usable_feats if pd.api.types.is_numeric_dtype(train[c])]

    # --- Correlation (computed on train only, so no leakage from test) ---
    corr = train[num_feats].corr().abs()

    # Upper-triangle method: a column is dropped when it correlates above the
    # threshold with any column that precedes it in `num_feats`.
    upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
    to_drop = [col for col in upper.columns if (upper[col] > corr_thresh).any()]

    print(f"[CorrFilter] threshold={corr_thresh} | "
          f"numeric_feats={len(num_feats)} | drop_count={len(to_drop)}")

    # Short summary (top 20 by correlation) of which pairs caused a drop:
    dropped_pairs = []
    for col in to_drop:
        partners = upper.index[upper[col] > corr_thresh].tolist()
        for p in partners:
            dropped_pairs.append((p, col, float(corr.loc[p, col])))
    dropped_pairs = sorted(dropped_pairs, key=lambda x: -x[2])[:20]
    for a,b,v in dropped_pairs:
        print(f" drop '{b}' (kept '{a}') corr={v:.3f}")

    # --- Remove from train/test (in place) ---
    train.drop(columns=[c for c in to_drop if c in train.columns], inplace=True)
    test.drop(columns=[c for c in to_drop if c in test.columns], inplace=True)

    # --- Update the feature lists ---
    feature_cols = [c for c in feature_cols if c not in to_drop]
    if 'cat_features' in globals():
        cat_features = [c for c in cat_features if c not in to_drop]

    print(f"[CorrFilter] feature_cols -> {len(feature_cols)} cols after drop")

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 6.91 µs


In [17]:
if True:
    # 1) Find constant columns in train (a single distinct value, counting NaN as a value).
    const_cols = [col for col in train.columns if train[col].nunique(dropna=False) <= 1]

    print("Constant columns:", const_cols)

    # 2) Drop them from both frames. `const_cols` is derived from train only, and
    #    train carries columns that test does not (e.g. the labels), so tolerate
    #    names that are absent from test instead of raising KeyError.
    train = train.drop(columns=const_cols)
    test  = test.drop(columns=const_cols, errors="ignore")

Constant columns: ['session_elapsed_sec', 'hour', 'session_elapsed_min', 'popularity_ratio_recent', 'order_trend', 'cart_to_order_bs', 'fav_to_order_bs', 'u_search_imp_std_sess_minmax', 'u_tterm_lift_sess_minmax', 'u_search_imp_mean_sess_minmax']


In [18]:
# 3) Remove the constant columns from the feature lists as well, so the lists stay
#    consistent with the columns that remain in train/test.
feature_cols = [c for c in feature_cols if c not in const_cols]
cat_features = [c for c in cat_features if c not in const_cols]

In [19]:
# Moved to preprocessing (the block below is disabled by `if False:`).
if False:
    # Remove columns with leakage risk from both the DataFrames and the lists
    leaky_cols = [
        "item_price_med_ordered",
        "item_disc_med_ordered",
        "price_gap_to_item_pref",
        "disc_gap_to_item_pref",
    ]

    # 1) Drop from train / test (in place)
    for df in (train, test):
        df.drop(columns=[c for c in leaky_cols if c in df.columns], inplace=True, errors="ignore")

    # 2) Remove from feature_cols and num_features
    feature_cols = [c for c in feature_cols if c not in leaky_cols]
    num_features = [c for c in num_features if c not in leaky_cols]

# Placeholder written in place of missing categorical values (train & test).
UNK = "unknown"

# Only the categoricals that the model will use and that exist in both frames
cat_cols = [c for c in cat_features if c in feature_cols and c in train.columns and c in test.columns]

def fill_cat_nans(df, cols, unk=UNK):
    """Convert the given columns of ``df`` to plain strings, in place.

    Each column in ``cols`` is cast to ``object``, its missing entries are
    replaced by ``unk``, and the result is cast to ``str``. The same ``df``
    object is returned.
    """
    for name in cols:
        df[name] = df[name].astype("object")
        present = pd.notna(df[name])
        df[name] = df[name].where(present, unk).astype(str)
    return df

# Apply the missing-value fill to the categorical feature columns of both frames.
train = fill_cat_nans(train, cat_cols)
test  = fill_cat_nans(test,  cat_cols)

# (optional) sanity check: list any categorical column that still contains NaN
print("train NaN kalan cat cols:", [c for c in cat_cols if train[c].isna().any()])
print("test NaN kalan cat cols:",  [c for c in cat_cols if test[c].isna().any()])

train NaN kalan cat cols: []
test NaN kalan cat cols: []


In [26]:
# ---------------- METRIC ----------------
W_CLICK, W_ORDER = 0.3, 0.7

def _auc_safe(y, s):
    if y.sum() == 0 or y.sum() == len(y):
        return None
    return roc_auc_score(y, s)

def session_mean_auc_pd(df, y_col, score_col):
    """Mean per-session ROC AUC of ``score_col`` against ``y_col``.

    Rows are grouped by the module-level ``group_col``. Sessions whose labels sum
    to 0 or to the session size are skipped. Returns ``(mean_auc, n_sessions)``,
    with ``mean_auc`` being NaN when no session contributed.
    """
    session_aucs = []
    for _, session in df.groupby(group_col, sort=False):
        labels = session[y_col]
        n_pos = labels.sum()
        if n_pos == 0 or n_pos == len(labels):
            continue
        value = _auc_safe(labels.to_numpy(), session[score_col].to_numpy())
        if value is not None:
            session_aucs.append(value)
    mean_auc = float(np.mean(session_aucs)) if session_aucs else np.nan
    return (mean_auc, len(session_aucs))

def trendyol_cv_metric(df, score_col="score"):
    """Weighted blend of the session-mean click AUC and order AUC of ``score_col``.

    Returns ``(final, click_mean, order_mean, n_click, n_order)`` where
    ``final = W_CLICK * click_mean + W_ORDER * order_mean`` and the two counts are
    the numbers of sessions that contributed to each mean.
    """
    per_label = [
        session_mean_auc_pd(df, label, score_col)
        for label in (label_clicked, label_ordered)
    ]
    (click_mean, n_click), (order_mean, n_order) = per_label
    final = W_CLICK * click_mean + W_ORDER * order_mean
    return final, click_mean, order_mean, n_click, n_order

# ---------------- MODEL (CatBoostRanker, pairwise) ----------------
# Settings shared by every ranker below; each parameter set overrides or extends these.
_RANKER_BASE_PARAMS = dict(
    depth=6,
    loss_function="PairLogitPairwise",
    eval_metric="QueryAUC",
    random_seed=42,
    verbose=250,
    task_type="GPU",
    devices="0",
    border_count=128,
    one_hot_max_size=2,
)

# --- CLICK rankers ---
clicked_params_1 = dict(
    _RANKER_BASE_PARAMS,
    iterations=5000,
    learning_rate=0.02,
    max_ctr_complexity=3,
)

clicked_params_2 = dict(
    _RANKER_BASE_PARAMS,
    iterations=5000,
    learning_rate=0.025,
    max_ctr_complexity=1,
    l2_leaf_reg=29.7,
    min_data_in_leaf=651,
)

clicked_params_3 = dict(
    _RANKER_BASE_PARAMS,
    depth=7,
    iterations=2000,
    learning_rate=0.03,
    max_ctr_complexity=1,
    l2_leaf_reg=20.6,
    min_data_in_leaf=628,
)

# --- ORDER rankers ---
ordered_params_1 = dict(
    _RANKER_BASE_PARAMS,
    iterations=2000,
    learning_rate=0.02,
    l2_leaf_reg=35.5,
    max_ctr_complexity=2,
    min_data_in_leaf=2264,
)

ordered_params_2 = dict(
    _RANKER_BASE_PARAMS,
    iterations=3000,
    learning_rate=0.007,
    max_ctr_complexity=3,
)

ordered_params_3 = dict(
    _RANKER_BASE_PARAMS,
    iterations=2000,
    learning_rate=0.034747183605270626,
    l2_leaf_reg=3.314322202814573,
    max_ctr_complexity=2,
    min_data_in_leaf=2336,
)

def _make_pool(df: pd.DataFrame, y_col: str):
    """Build a CatBoost ``Pool`` from ``df`` for the label column ``y_col``.

    Features come from the module-level ``feature_cols``; categorical columns from
    ``cat_features``. Labels are coerced to numeric, with unparsable or missing
    values becoming 0, and cast to int. Group ids are integer codes assigned per
    call in order of first appearance of ``group_col`` values, so they are only
    meaningful within the returned pool.
    """
    features = df[feature_cols]
    labels = pd.to_numeric(df[y_col], errors="coerce").fillna(0).astype(int).values
    group_codes, _ = pd.factorize(df[group_col], sort=False)
    return Pool(
        data=features,
        label=labels,
        group_id=group_codes.astype(np.int64),
        cat_features=cat_features,
    )

def train_catboost_ranker(tr_df: pd.DataFrame, va_df: pd.DataFrame, y_col: str, params: dict):
    """Fit a ``CatBoostRanker`` on ``tr_df`` using ``va_df`` as the evaluation set.

    Both frames are converted with ``_make_pool`` for label ``y_col``. The model is
    created from ``params`` and fitted with ``use_best_model=True``. The pools are
    released and garbage-collected before the fitted model is returned.
    """
    fit_pool = _make_pool(tr_df, y_col)
    eval_pool = _make_pool(va_df, y_col)

    ranker = CatBoostRanker(**params)
    ranker.fit(fit_pool, eval_set=eval_pool, use_best_model=True)

    # Free the pools eagerly; they can be large.
    del fit_pool, eval_pool
    gc.collect()
    return ranker

In [27]:
def _prepare_group_slices_for_va(va_df: pd.DataFrame):
    """Compute an ordering that makes the rows of each session contiguous.

    Returns
    -------
    order_idx : stable argsort of the rows by session code
    starts, ends : per-session ``[start, end)`` bounds into the reordered rows,
        indexed by session code (codes follow the sorted unique session values)
    n_groups : number of sessions
    """
    session_codes, _ = pd.factorize(va_df[group_col], sort=True)
    order_idx = np.argsort(session_codes, kind='mergesort')
    sorted_codes = session_codes[order_idx]

    n_groups = (int(sorted_codes.max()) + 1) if len(sorted_codes) else 0
    sizes = np.bincount(sorted_codes, minlength=n_groups)

    # Running total of session sizes gives the exclusive end of each slice;
    # subtracting the size gives its start.
    ends = np.cumsum(sizes)
    starts = ends - sizes
    return order_idx, starts, ends, n_groups

def weight_sweep_indexed(
    y_click_sorted, y_order_sorted, p_click_sorted, p_order_sorted,
    starts, ends,
    w_center=0.50, delta=0.2, step=0.02,
    metr_w_click=0.3, metr_w_order=0.7,
    fold=None
):
    """Grid-search the click/order blend weight on session-contiguous arrays.

    For every weight ``w`` in ``[w_center - delta, w_center + delta]`` (clipped to
    [0, 1], spaced by ``step``) the blended score ``s = w*p_click + (1-w)*p_order``
    is evaluated with a per-session ``roc_auc_score`` against the click labels and
    against the order labels. The objective is
    ``metr_w_click * mean_click_auc + metr_w_order * mean_order_auc``.

    Parameters
    ----------
    y_click_sorted, y_order_sorted : label arrays, reordered so that each
        session occupies one contiguous slice.
    p_click_sorted, p_order_sorted : prediction arrays in the same row order.
    starts, ends : per-session ``[start, end)`` slice bounds.
    w_center, delta, step : definition of the weight grid.
    metr_w_click, metr_w_order : weights of the two AUCs in the objective.
    fold : label used in the printed table; nothing is printed per weight when
        ``None``.

    Returns
    -------
    dict with the grid (``grid_w``, ``grid_final``, ``grid_cauc``, ``grid_oauc``)
    and the best entry (``best_w_click``, ``best_w_order``, ``best_final``,
    ``best_cauc``, ``best_oauc``, ``n_click_used``, ``n_order_used``).
    If the objective is NaN for every weight (no session with mixed labels for at
    least one of the targets), the grid point closest to ``w_center`` is reported
    with NaN metrics instead of raising.
    """
    w_list = np.arange(w_center - delta, w_center + delta + 1e-9, step, dtype=np.float32)
    w_list = w_list[(w_list >= 0.0) & (w_list <= 1.0)]
    if w_list.size == 0:
        w_list = np.array([w_center], dtype=np.float32)

    # Which sessions can be scored depends only on the labels, not on the weight,
    # so determine the usable slices once instead of once per grid point.
    click_slices, order_slices = [], []
    for g in range(len(starts)):
        a, b = int(starts[g]), int(ends[g])
        size = b - a
        if size <= 1:
            continue
        # AUC needs both classes present in the session.
        if 0 < y_click_sorted[a:b].sum() < size:
            click_slices.append((a, b))
        if 0 < y_order_sorted[a:b].sum() < size:
            order_slices.append((a, b))

    if fold is not None:
        print(f"\n[Fold {fold}] Weight sweep around {w_center:.2f}")
        print(" w_click  w_order   FINAL      ClickAUC   OrderAUC   (n_click, n_order)")

    finals, cauc_list, oauc_list, nC_used, nO_used = [], [], [], [], []

    for w in w_list:
        s = w * p_click_sorted + (1.0 - w) * p_order_sorted

        aucs_c = [roc_auc_score(y_click_sorted[a:b], s[a:b]) for a, b in click_slices]
        aucs_o = [roc_auc_score(y_order_sorted[a:b], s[a:b]) for a, b in order_slices]

        click_mean = (np.mean(aucs_c) if len(aucs_c) else np.nan)
        order_mean = (np.mean(aucs_o) if len(aucs_o) else np.nan)
        final = metr_w_click * click_mean + metr_w_order * order_mean

        finals.append(final); cauc_list.append(click_mean); oauc_list.append(order_mean)
        nC_used.append(len(aucs_c)); nO_used.append(len(aucs_o))

        if fold is not None:
            print(f" {w:6.2f}   {1-w:6.2f}   {final:8.5f}   "
                  f"{click_mean:9.5f}   {order_mean:9.5f}   "
                  f"({len(aucs_c):5d}, {len(aucs_o):5d})")

    finals = np.asarray(finals, dtype=np.float64)
    cauc_list = np.asarray(cauc_list, dtype=np.float64)
    oauc_list = np.asarray(oauc_list, dtype=np.float64)

    if np.isnan(finals).all():
        # np.nanargmax raises on an all-NaN array; fall back to the grid point
        # nearest the requested centre so callers still get a usable weight.
        print("[WARN] weight sweep: objective is NaN for every weight; "
              "falling back to the weight closest to w_center.")
        best_idx = int(np.argmin(np.abs(w_list - w_center)))
    else:
        best_idx = int(np.nanargmax(finals))

    return {
        "grid_w": w_list,
        "grid_final": finals,
        "grid_cauc": cauc_list,
        "grid_oauc": oauc_list,
        "best_w_click": float(w_list[best_idx]),
        "best_w_order": float(1.0 - w_list[best_idx]),
        "best_final": float(finals[best_idx]),
        "best_cauc": float(cauc_list[best_idx]),
        "best_oauc": float(oauc_list[best_idx]),
        "n_click_used": int(nC_used[best_idx]),
        "n_order_used": int(nO_used[best_idx]),
    }

In [28]:
gc.collect()

127

In [None]:
%%time
# ============================== TIME-BASED TRAIN → VAL (Multi-HP blend ready) ==============================

# Put the columns of both frames in alphabetical order.
train = train.reindex(sorted(train.columns), axis=1)
test  = test.reindex(sorted(test.columns), axis=1)

# ---------------- 1) Split ----------------
# Hold out the rows with the largest ts_timestamp as validation; train on the rest.
ts_vals = np.sort(pd.unique(train["ts_timestamp"]))   # ascending order
assert ts_vals.size >= 2, "En az 2 farklı ts_timestamp olmalı."
val_ts   = ts_vals[-1]          # largest value (the last day, per the author's note)
train_ts = ts_vals[:-1]         # all remaining values

print(f"[TimeSplit] train_ts={train_ts.tolist()} | val_ts={val_ts}")

tr_df_full = train[train["ts_timestamp"].isin(train_ts)].copy()
va_df      = train[train["ts_timestamp"] == val_ts].copy()

# ---------------- 2) Session flags ----------------
# Per training session: does it contain at least one click / at least one order?
_tmp = tr_df_full[[group_col, label_clicked, label_ordered]].copy()
_tmp = _tmp[_tmp[group_col].notna()]
for lbl in (label_clicked, label_ordered):
    _tmp[lbl] = pd.to_numeric(_tmp[lbl], errors="coerce").fillna(0.0)

sess_flags = _tmp.groupby(group_col, sort=False)[[label_clicked, label_ordered]].max()
has_click = sess_flags[label_clicked] > 0
has_order = sess_flags[label_ordered] > 0
both_sessions       = sess_flags.index[has_click & has_order]
click_only_sessions = sess_flags.index[has_click & ~has_order]

# UNDERSAMPLING OFF: the sampling ratio below is 1, so k equals the number of
# click-only sessions and every one of them is kept (in shuffled order).
rng = np.random.RandomState(2025)
if len(click_only_sessions) > 0:
    k = max(1, int(round(1 * len(click_only_sessions))))
    sampled_click_only = rng.choice(click_only_sessions, size=k, replace=False)
else:
    sampled_click_only = np.array([], dtype=object)

# ORDER training set: only sessions with BOTH a click and an order
# (falls back to the full training frame when there are none).
if len(both_sessions) == 0:
    print("[WARN] Train'de BOTH session yok; ORDER modeline tüm train verildi.")
    tr_df_order = tr_df_full.copy()
else:
    tr_df_order = tr_df_full[tr_df_full[group_col].isin(both_sessions)].copy()

# CLICK training set: BOTH ∪ click-only sessions. Sessions without any click are
# therefore excluded unless the keep-list is empty.
click_keep_sessions = np.concatenate([both_sessions, sampled_click_only])
tr_df_click = (tr_df_full if click_keep_sessions.size == 0
               else tr_df_full[tr_df_full[group_col].isin(click_keep_sessions)].copy())

# VALIDATION (for ORDER): restrict to sessions containing at least one order;
# if that leaves nothing, use the whole validation frame.
va_order_sess_mask = (va_df.groupby(group_col, sort=False)[label_ordered].max() > 0)
va_df_order_eval = va_df[va_df[group_col].isin(va_order_sess_mask.index[va_order_sess_mask])].copy()
if va_df_order_eval.empty:
    va_df_order_eval = va_df.copy()

# Summary of the row/session counts of each training subset and of validation.
print("\n" + "="*80)
print(f"[OOT] ORDER-train: {len(tr_df_order)} rows / {tr_df_order[group_col].nunique()} sessions (BOTH)")
print(f"[OOT] CLICK-train: {len(tr_df_click)} rows / {tr_df_click[group_col].nunique()} sessions")
print(f"[OOT] valid      : {len(va_df)} rows / {va_df[group_col].nunique()} sessions")
print(f"[OOT] feature count: {len(feature_cols)}")

# ---------------- 3) List the parameter sets ----------------
clicked_param_list = [clicked_params_1, clicked_params_2, clicked_params_3]
ordered_param_list = [ordered_params_1, ordered_params_2, ordered_params_3]

# ---------------- 4) Train the models (several per family) ----------------
# CLICK rankers are evaluated during fitting on the full validation frame.
print("\n[OOT] Training CLICK rankers (multi-HP)...")
m_click_list = []
for i, p in enumerate(clicked_param_list, 1):
    print(f"  - CLICK model #{i} with params idx={i}")
    m = train_catboost_ranker(tr_df_click, va_df, label_clicked, p)
    m_click_list.append(m)

# ORDER rankers are evaluated during fitting on the order-restricted validation frame.
print("\n[OOT] Training ORDER rankers (multi-HP)...")
m_order_list = []
for i, p in enumerate(ordered_param_list, 1):
    print(f"  - ORDER model #{i} with params idx={i}")
    m = train_catboost_ranker(tr_df_order, va_df_order_eval, label_ordered, p)
    m_order_list.append(m)

# ---------------- 5) Per-model predictions on validation (as matrices) ----------------
# `va_df` is already an independent copy created at split time, so it is not
# copied again here.

# Predict on the whole validation frame; the final metric is computed on it.
# Prediction uses only the features and group ids, which are identical for both
# targets, so a single Pool is built and shared. Both names are kept so later
# code that refers to either one keeps working.
va_pool_click_all = _make_pool(va_df, label_clicked)
va_pool_order_all = va_pool_click_all

# Matrices: [n_rows, n_models]
P_click_mat = np.column_stack([m.predict(va_pool_click_all) for m in m_click_list]).astype(np.float32)
P_order_mat = np.column_stack([m.predict(va_pool_order_all) for m in m_order_list]).astype(np.float32)

# (Optional) z-score each model's predictions before blending to balance their
# scales (disabled by default).
NORM_BEFORE_BLEND = False
if NORM_BEFORE_BLEND:
    def _z(a):
        # Column-wise standardisation; the epsilon avoids division by zero.
        a = a.astype(np.float32)
        mu = a.mean(axis=0, keepdims=True)
        sd = a.std(axis=0, keepdims=True) + 1e-8
        return (a - mu) / sd
    P_click_mat = _z(P_click_mat)
    P_order_mat = _z(P_order_mat)

# ---------------- 6) Blend within each family (equal weights for now) ----------------
# -> These two weight vectors are ready to be OPTIMISED later (grid/optuna/coordinate descent)
C_WEIGHTS = np.full(P_click_mat.shape[1], 1.0 / P_click_mat.shape[1], dtype=np.float32)  # click family
O_WEIGHTS = np.full(P_order_mat.shape[1], 1.0 / P_order_mat.shape[1], dtype=np.float32)  # order family

# The weights of each family must be non-negative and sum to 1.
assert isclose(C_WEIGHTS.sum(), 1.0, rel_tol=1e-6)
assert isclose(O_WEIGHTS.sum(), 1.0, rel_tol=1e-6)
assert (C_WEIGHTS >= 0).all() and (O_WEIGHTS >= 0).all()

# Blended prediction per family
va_df["p_click"] = P_click_mat @ C_WEIGHTS
va_df["p_order"] = P_order_mat @ O_WEIGHTS

# (Optional) also report every model individually (diagnostic; disabled by `if False:`)
if False:
  def _session_auc(df, y_col, s_col):
      # Mean per-session ROC AUC of `s_col` vs `y_col`; returns (mean, n_sessions).
      aucs = []
      for _, g in df.groupby(group_col, sort=False):
          y = pd.to_numeric(g[y_col], errors="coerce").fillna(0)
          s = pd.to_numeric(g[s_col], errors="coerce").fillna(0.0)
          if y.sum() == 0 or y.sum() == len(y):  # skip single-class sessions
              continue
          try:
              aucs.append(roc_auc_score(y, s))
          except Exception:
              # NOTE(review): any scoring error is silently ignored here, which can
              # hide real problems; consider narrowing the exception type.
              pass
      return float(np.mean(aucs)) if aucs else np.nan, len(aucs)

  print("\n[Diag] Tekil model AUC'leri (valid @ full day):")
  for j in range(P_click_mat.shape[1]):
      va_df[f"p_click_{j+1}"] = P_click_mat[:, j]
      auc, n = _session_auc(va_df, label_clicked, f"p_click_{j+1}")
      print(f"  CLICK#{j+1}: ClickAUC={auc:.5f} on {n}")
  for j in range(P_order_mat.shape[1]):
      va_df[f"p_order_{j+1}"] = P_order_mat[:, j]
      auc, n = _session_auc(va_df, label_ordered, f"p_order_{j+1}")
      print(f"  ORDER#{j+1}: OrderAUC={auc:.5f} on {n}")

# ---------------- 7) BASELINE (fixed click/order weights) ----------------
# Combine the two family predictions with the metric's own weights and score them.
print("\n[OOT] Predict & score (baseline blend inside families)...")
va_df["score"] = W_CLICK * va_df["p_click"] + W_ORDER * va_df["p_order"]
base_final, base_auc_c, base_auc_o, base_n_c, base_n_o = trendyol_cv_metric(va_df, "score")
print(f"[OOT] BASE  -> w_click={W_CLICK:.2f}, w_order={W_ORDER:.2f} | "
      f"FINAL={base_final:.5f} | ClickAUC={base_auc_c:.5f} on {base_n_c} | "
      f"OrderAUC={base_auc_o:.5f} on {base_n_o}")

# ---------------- 8) EXISTING click–order WEIGHT SWEEP ----------------
# Reorder labels and predictions so each session is contiguous, then sweep the
# click weight of the final blend.
order_idx, starts, ends, n_groups = _prepare_group_slices_for_va(va_df)
y_click_sorted = pd.to_numeric(va_df[label_clicked], errors="coerce").fillna(0).to_numpy(np.int8)[order_idx]
y_order_sorted = pd.to_numeric(va_df[label_ordered], errors="coerce").fillna(0).to_numpy(np.int8)[order_idx]
p_click_sorted = va_df["p_click"].to_numpy(np.float32)[order_idx]
p_order_sorted = va_df["p_order"].to_numpy(np.float32)[order_idx]

res = weight_sweep_indexed(
    y_click_sorted, y_order_sorted,
    p_click_sorted, p_order_sorted,
    starts, ends,
    w_center=0.50, delta=0.24, step=0.02,   # grid spans 0.26..0.74 in steps of 0.02
    metr_w_click=float(W_CLICK), metr_w_order=float(W_ORDER),
    fold="OOT"
)

print(f"[OOT] BEST -> w_click={res['best_w_click']:.2f}, w_order={res['best_w_order']:.2f}, "
      f"FINAL={res['best_final']:.5f} | ClickAUC={res['best_cauc']:.5f} | "
      f"OrderAUC={res['best_oauc']:.5f}")

# ---------------- 9) Suggested global weights (from this single split) ----------------
suggested_w_click = float(res["best_w_click"])
suggested_w_order = float(1.0 - suggested_w_click)
print(f"Suggested GLOBAL weights -> w_click={suggested_w_click:.3f}, w_order={suggested_w_order:.3f}")

# (optional) re-score validation with the suggested weights; this overwrites the
# baseline values stored in va_df["score"].
va_df["score"] = suggested_w_click*va_df["p_click"] + suggested_w_order*va_df["p_order"]
sg_final, sg_auc_c, sg_auc_o, sg_n_c, sg_n_o = trendyol_cv_metric(va_df, "score")
print(f"[OOT] SUGGESTED -> FINAL={sg_final:.5f} | ClickAUC={sg_auc_c:.5f} on {sg_n_c} | "
      f"OrderAUC={sg_auc_o:.5f} on {sg_n_o}")

# Release memory held by temporaries from this cell.
gc.collect()

[TimeSplit] train_ts=[1751821, 1751994] | val_ts=1752166

[OOT] ORDER-train: 687240 rows / 4468 sessions (BOTH)
[OOT] CLICK-train: 1796756 rows / 14208 sessions
[OOT] valid      : 977049 rows / 7594 sessions
[OOT] feature count: 163

[OOT] Training CLICK rankers (multi-HP)...
  - CLICK model #1 with params idx=1


Default metric period is 5 because QueryAUC is/are not implemented for GPU
Metric QueryAUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.6227392	best: 0.6227392 (0)	total: 259ms	remaining: 21m 34s
250:	test: 0.6553269	best: 0.6553269 (250)	total: 24s	remaining: 7m 34s
500:	test: 0.6581813	best: 0.6581813 (500)	total: 48.4s	remaining: 7m 14s
750:	test: 0.6598675	best: 0.6598675 (750)	total: 1m 13s	remaining: 6m 54s
1000:	test: 0.6609063	best: 0.6609408 (985)	total: 1m 38s	remaining: 6m 33s
1250:	test: 0.6619559	best: 0.6619559 (1250)	total: 2m 3s	remaining: 6m 11s
1500:	test: 0.6625149	best: 0.6625149 (1500)	total: 2m 29s	remaining: 5m 48s
1750:	test: 0.6626333	best: 0.6626559 (1720)	total: 2m 54s	remaining: 5m 23s
2000:	test: 0.6627696	best: 0.6627755 (1935)	total: 3m 19s	remaining: 4m 58s
2250:	test: 0.6631051	best: 0.6631051 (2250)	total: 3m 44s	remaining: 4m 34s
2500:	test: 0.6632627	best: 0.6633663 (2470)	total: 4m 9s	remaining: 4m 8s
2750:	test: 0.6632823	best: 0.6634202 (2570)	total: 4m 33s	remaining: 3m 43s
3000:	test: 0.6635320	best: 0.6635741 (2985)	total: 4m 59s	remaining: 3m 19s
3250:	test: 0.66371

Default metric period is 5 because QueryAUC is/are not implemented for GPU
Metric QueryAUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.6227401	best: 0.6227401 (0)	total: 255ms	remaining: 21m 15s
250:	test: 0.6561637	best: 0.6561637 (250)	total: 24.1s	remaining: 7m 36s
500:	test: 0.6589360	best: 0.6589360 (500)	total: 48.5s	remaining: 7m 15s
750:	test: 0.6605293	best: 0.6605293 (750)	total: 1m 13s	remaining: 6m 56s
1000:	test: 0.6615269	best: 0.6615710 (980)	total: 1m 39s	remaining: 6m 36s
1250:	test: 0.6621237	best: 0.6621880 (1215)	total: 2m 4s	remaining: 6m 12s
1500:	test: 0.6623386	best: 0.6624028 (1440)	total: 2m 29s	remaining: 5m 48s
1750:	test: 0.6625289	best: 0.6625289 (1750)	total: 2m 54s	remaining: 5m 23s
2000:	test: 0.6627704	best: 0.6627747 (1960)	total: 3m 19s	remaining: 4m 59s
2250:	test: 0.6628548	best: 0.6628548 (2250)	total: 3m 44s	remaining: 4m 34s
2500:	test: 0.6629495	best: 0.6630097 (2340)	total: 4m 9s	remaining: 4m 9s
2750:	test: 0.6631332	best: 0.6631378 (2675)	total: 4m 34s	remaining: 3m 44s
3000:	test: 0.6630910	best: 0.6632610 (2920)	total: 4m 59s	remaining: 3m 19s
3250:	test: 0.663

Default metric period is 5 because QueryAUC is/are not implemented for GPU
Metric QueryAUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.6352415	best: 0.6352415 (0)	total: 373ms	remaining: 12m 24s
250:	test: 0.6571803	best: 0.6571803 (250)	total: 52.6s	remaining: 6m 6s
500:	test: 0.6605489	best: 0.6605489 (500)	total: 1m 46s	remaining: 5m 17s
750:	test: 0.6615368	best: 0.6615368 (750)	total: 2m 40s	remaining: 4m 26s
1000:	test: 0.6622563	best: 0.6622593 (975)	total: 3m 34s	remaining: 3m 33s
1250:	test: 0.6625804	best: 0.6626603 (1235)	total: 4m 27s	remaining: 2m 40s
1500:	test: 0.6628248	best: 0.6628248 (1500)	total: 5m 21s	remaining: 1m 46s
1750:	test: 0.6631191	best: 0.6632252 (1645)	total: 6m 15s	remaining: 53.3s
1999:	test: 0.6630697	best: 0.6632785 (1805)	total: 7m 8s	remaining: 0us
bestTest = 0.6632784989
bestIteration = 1805
Shrink model to first 1806 iterations.

[OOT] Training ORDER rankers (multi-HP)...
  - ORDER model #1 with params idx=1


Default metric period is 5 because QueryAUC is/are not implemented for GPU
Metric QueryAUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.6808090	best: 0.6808090 (0)	total: 74.4ms	remaining: 2m 28s
250:	test: 0.7270517	best: 0.7270517 (250)	total: 10.5s	remaining: 1m 13s
500:	test: 0.7306283	best: 0.7306283 (500)	total: 20.9s	remaining: 1m 2s
750:	test: 0.7327651	best: 0.7327651 (750)	total: 31.3s	remaining: 52s
1000:	test: 0.7332534	best: 0.7333963 (990)	total: 41.6s	remaining: 41.5s
1250:	test: 0.7334585	best: 0.7341077 (1170)	total: 52s	remaining: 31.1s
1500:	test: 0.7345226	best: 0.7345783 (1485)	total: 1m 2s	remaining: 20.7s
1750:	test: 0.7344974	best: 0.7348494 (1650)	total: 1m 12s	remaining: 10.3s
1999:	test: 0.7336996	best: 0.7348494 (1650)	total: 1m 23s	remaining: 0us
bestTest = 0.7348494115
bestIteration = 1650
Shrink model to first 1651 iterations.
  - ORDER model #2 with params idx=2


Default metric period is 5 because QueryAUC is/are not implemented for GPU
Metric QueryAUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.6806883	best: 0.6806883 (0)	total: 72.3ms	remaining: 3m 36s
250:	test: 0.7188341	best: 0.7189114 (245)	total: 10.5s	remaining: 1m 55s
500:	test: 0.7245328	best: 0.7245328 (500)	total: 20.9s	remaining: 1m 44s
750:	test: 0.7275784	best: 0.7275784 (750)	total: 31.3s	remaining: 1m 33s
1000:	test: 0.7288931	best: 0.7289630 (990)	total: 41.7s	remaining: 1m 23s
1250:	test: 0.7297116	best: 0.7297116 (1250)	total: 52.1s	remaining: 1m 12s
1500:	test: 0.7306038	best: 0.7306038 (1500)	total: 1m 2s	remaining: 1m 2s
1750:	test: 0.7312246	best: 0.7313305 (1675)	total: 1m 12s	remaining: 51.9s
2000:	test: 0.7316485	best: 0.7316610 (1975)	total: 1m 23s	remaining: 41.5s
2250:	test: 0.7317707	best: 0.7319117 (2105)	total: 1m 33s	remaining: 31.1s
2500:	test: 0.7322071	best: 0.7323863 (2465)	total: 1m 43s	remaining: 20.7s
2750:	test: 0.7320683	best: 0.7323863 (2465)	total: 1m 54s	remaining: 10.3s
2999:	test: 0.7325531	best: 0.7325773 (2880)	total: 2m 4s	remaining: 0us
bestTest = 0.7325772646
best

Default metric period is 5 because QueryAUC is/are not implemented for GPU
Metric QueryAUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.6806894	best: 0.6806894 (0)	total: 74.4ms	remaining: 2m 28s
250:	test: 0.7299193	best: 0.7299835 (240)	total: 10.5s	remaining: 1m 13s
500:	test: 0.7323595	best: 0.7323595 (500)	total: 20.9s	remaining: 1m 2s
750:	test: 0.7334609	best: 0.7334609 (750)	total: 31.2s	remaining: 52s
1000:	test: 0.7322297	best: 0.7334609 (750)	total: 41.6s	remaining: 41.5s
1250:	test: 0.7309826	best: 0.7334609 (750)	total: 52s	remaining: 31.1s
1500:	test: 0.7305223	best: 0.7334609 (750)	total: 1m 2s	remaining: 20.7s
1750:	test: 0.7281598	best: 0.7334609 (750)	total: 1m 12s	remaining: 10.3s
1999:	test: 0.7272532	best: 0.7334609 (750)	total: 1m 22s	remaining: 0us
bestTest = 0.7334608863
bestIteration = 750
Shrink model to first 751 iterations.

[OOT] Predict & score (baseline blend inside families)...
[OOT] BASE  -> w_click=0.30, w_order=0.70 | FINAL=0.70419 | ClickAUC=0.63535 on 7594 | OrderAUC=0.73370 on 2421

[Fold OOT] Weight sweep around 0.50
 w_click  w_order   FINAL      ClickAUC   OrderAUC   

0

In [30]:
if False:

    def plot_cb_fi(model, pool=None, top_n=20, method="PredictionValuesChange", title="Feature Importance"):
        """
        Plot the `top_n` largest CatBoost feature importances and return them.

        The importance table is built in two steps:

        1. Ask the model for a prettified table and locate its name and
           importance columns case-insensitively. "Feature Id" / "Importances"
           are recognised alongside the other spellings.
        2. If that call fails, or the columns cannot be identified, fall back
           to the raw importance values. They are named from
           `model.feature_names_`, or `f0..fN` when those names are missing or
           have the wrong length.

        Parameters
        ----------
        model : object exposing `get_feature_importance(type=..., data=..., prettified=...)`
        pool : optional
            Forwarded as `data=` to BOTH the prettified and the fallback call.
            Left out of the call entirely when None.
        top_n : int
            Number of rows kept after sorting by importance (descending).
        method : str
            Passed as `type=` to the model and used as the x-axis label.
        title : str
            Figure title.

        Returns
        -------
        pandas.DataFrame
            Columns "feature" and "importance", sorted by importance
            descending, at most `top_n` rows.
        """
        # Shared keyword arguments, so the fallback sees exactly the same
        # request (including the pool) as the prettified attempt.
        fi_kwargs = {"type": method}
        if pool is not None:
            fi_kwargs["data"] = pool

        # 1) Try the prettified output first (CatBoost supplies the names).
        fi_df = None
        try:
            fi_pretty = model.get_feature_importance(prettified=True, **fi_kwargs)
        except Exception:
            # Deliberate best effort: if the prettified call is not supported
            # or fails, the raw call below is tried (and may raise for real).
            fi_pretty = None

        pretty_cols = getattr(fi_pretty, "columns", None)
        if pretty_cols is not None:
            # Case-insensitive lookup; the first column wins on a clash.
            by_lower = {}
            for col in pretty_cols:
                by_lower.setdefault(str(col).strip().lower(), col)
            name_col = next(
                (by_lower[k] for k in ("feature id", "feature name", "feature") if k in by_lower),
                None,
            )
            imp_col = next(
                (by_lower[k] for k in ("importances", "feature importance", "importance") if k in by_lower),
                None,
            )
            if name_col is not None and imp_col is not None:
                fi_df = fi_pretty[[name_col, imp_col]].rename(
                    columns={name_col: "feature", imp_col: "importance"}
                )

        # 2) Fallback: raw importances + model.feature_names_ (or f0..fN).
        if fi_df is None:
            imps = model.get_feature_importance(**fi_kwargs)
            names = getattr(model, "feature_names_", None)
            if names is None or len(names) != len(imps):
                names = [f"f{i}" for i in range(len(imps))]
            fi_df = pd.DataFrame({"feature": list(names), "importance": imps})

        fi_top = fi_df.sort_values("importance", ascending=False).head(top_n)

        plt.figure(figsize=(8, 6))
        plt.barh(fi_top["feature"], fi_top["importance"])
        plt.gca().invert_yaxis()  # largest importance at the top of the chart
        plt.title(title)
        plt.xlabel(method)
        plt.tight_layout()
        plt.show()
        return fi_top

    # Usage (option A = PredictionValuesChange: you do not have to pass data,
    # so `pool` is left at its default of None)
    fi_click = plot_cb_fi(m_click, method="PredictionValuesChange", top_n=20, title="m_click FI (PVC)")
    fi_order = plot_cb_fi(m_order, method="PredictionValuesChange", top_n=20, title="m_order FI (PVC)")
    # Both tables are printed only after both charts have been drawn.
    print(fi_click, fi_order, sep="\n")

In [31]:
if False:

    # "Zero importance" threshold: an importance whose absolute value is <= EPS is treated as 0
    EPS = 1e-12

    def _fi_df(model):
        """CatBoost FI'yi (PredictionValuesChange) 'feature'/'importance' kolonlarıyla döndürür."""
        fi = model.get_feature_importance(type="PredictionValuesChange", prettified=True)
        cols_lower = [c.lower() for c in fi.columns]
        # feature kolonu
        if "feature" in cols_lower:
            fcol = fi.columns[cols_lower.index("feature")]
        elif "feature name" in cols_lower:
            fcol = fi.columns[cols_lower.index("feature name")]
        elif "feature id" in cols_lower:
            fcol = fi.columns[cols_lower.index("feature id")]
        else:
            fcol = fi.columns[0]
        # importance kolonu
        if "importance" in cols_lower:
            icol = fi.columns[cols_lower.index("importance")]
        elif "importances" in cols_lower:
            icol = fi.columns[cols_lower.index("importances")]
        else:
            icol = fi.columns[1]

        out = fi[[fcol, icol]].copy()
        out.columns = ["feature", "importance"]

        # Eğer "feature" integer id ise isimlere map et
        if out["feature"].dtype != object and "feature_cols" in globals():
            names = list(feature_cols)
            out["feature"] = out["feature"].astype(int).map(lambda i: names[i] if 0 <= i < len(names) else f"f_{i}")
        return out

    # Importance tables for the click and order models
    fi_click_df = _fi_df(m_click)
    fi_order_df = _fi_df(m_order)

    # Feature universe: keep the order of feature_cols when it is available,
    # otherwise use the sorted union of the features seen in either table
    if "feature_cols" in globals():
        base_feats = list(feature_cols)
    else:
        base_feats = sorted(set(fi_click_df["feature"]).union(fi_order_df["feature"]))

    # Combine into one table: one row per feature, with the summed importance
    # from each model left-joined on (features absent from a model become 0.0)
    fi_all = (
        pd.DataFrame({"feature": base_feats})
        .merge(
            fi_click_df.groupby("feature", as_index=False)["importance"].sum()
                       .rename(columns={"importance": "imp_click"}),
            on="feature", how="left",
        )
        .merge(
            fi_order_df.groupby("feature", as_index=False)["importance"].sum()
                       .rename(columns={"importance": "imp_order"}),
            on="feature", how="left",
        )
    )
    fi_all[["imp_click", "imp_order"]] = fi_all[["imp_click", "imp_order"]].fillna(0.0)

    # Features whose importance is 0 (or ~0, within EPS) in both models
    mask_zero_both = fi_all[["imp_click", "imp_order"]].abs().le(EPS).all(axis=1)
    zero_both = fi_all["feature"][mask_zero_both].tolist()

    # Split into categorical vs numeric (everything is numeric if cat_features is undefined)
    cat_set = set(cat_features) if "cat_features" in globals() else set()
    zero_cat = [name for name in zero_both if name in cat_set]
    zero_num = [name for name in zero_both if name not in cat_set]

    # Report
    print(f"[Zero FI in BOTH models] {len(zero_both)} / {len(base_feats)} features")
    print(f"- CATEGORICAL ({len(zero_cat)}):", zero_cat, sep="\n")
    print(f"\n- NUMERIC ({len(zero_num)}):", zero_num, sep="\n")