In [None]:
import polars as pl
import pandas as pd
import numpy as np
from tqdm import tqdm
import time
import gc
from sklearn.metrics import roc_auc_score
from catboost import CatBoostRanker, Pool

import warnings
warnings.filterwarnings("ignore")

In [None]:
train = pd.read_parquet("train_fe_180.parquet")
test  = pd.read_parquet("test_fe_180.parquet")

In [4]:
train.head()

Unnamed: 0,ts_timestamp,search_term_normalized,content_id_hashed,session_id,clicked,ordered,c_search_imp_mean,c_search_clk_mean,c_search_imp_std,c_search_clk_std,...,filterable_z_in_leaf,search_ctr_bs_sess_minmax,u_search_imp_std_sess_minmax,c_tterm_clk_mean_sess_minmax,u_tterm_lift_sess_minmax,click_to_order_rate_sess_minmax,c_tterm_clk_std_sess_minmax,search_ctr_sess_minmax,price_diff_from_session_med_sess_minmax,u_search_imp_mean_sess_minmax
0,1752166,a_elbise_kesim,ae9b536d26e602f4,train_4fd3705b497bbe4f,1.0,0.0,0.00289,5.4e-05,0.001449,4.2e-05,...,0.0,0.007812,0.0,1.0,0.0,0.073975,1.0,0.076355,0.274414,0.0
1,1752166,a_elbise_kesim,91b8a11e4c8e0ded,train_4fd3705b497bbe4f,0.0,0.0,0.010849,0.000461,0.007557,0.00032,...,0.333252,0.127441,0.0,1.0,0.0,0.017548,1.0,0.413574,0.317627,0.0
2,1752166,a_elbise_kesim,1a78e29f4bcc58b0,train_4fd3705b497bbe4f,0.0,0.0,0.003828,0.00011,0.002554,7.4e-05,...,0.333252,0.025894,0.0,1.0,0.0,0.144531,1.0,0.215942,0.734375,0.0
3,1752166,a_elbise_kesim,604a4c8b13d798a4,train_4fd3705b497bbe4f,0.0,0.0,0.009636,0.000293,0.009766,0.000353,...,0.0,0.051208,0.0,1.0,0.0,0.148193,1.0,0.241821,0.187866,0.0
4,1752166,a_elbise_kesim,038680d15f693ae2,train_4fd3705b497bbe4f,0.0,0.0,0.010582,0.000409,0.003056,0.000113,...,-0.166626,0.102356,0.0,1.0,0.0,0.037598,1.0,0.358643,0.674805,0.0


In [None]:
%%time
if True:

    def reduce_mem_usage(df: pd.DataFrame, use_categorical=True, verbose=True):
        start_mem = df.memory_usage(deep=True).sum() / 1024**2
        for col in df.columns:
            col_type = df[col].dtype
            if pd.api.types.is_numeric_dtype(col_type):
                c_min, c_max = df[col].min(), df[col].max()
                if pd.api.types.is_integer_dtype(col_type):
                    for t in (np.int8, np.int16, np.int32):
                        if c_min >= np.iinfo(t).min and c_max <= np.iinfo(t).max:
                            df[col] = df[col].astype(t); break
                else:
                    for t in (np.float16, np.float32):
                        df[col] = df[col].astype(t)
                        # İsterseniz hataya karşı: df[col] = pd.to_numeric(df[col], downcast="float")
            elif use_categorical and df[col].dtype == "object":
                # Sık tekrar eden metinlerde çok etkili
                num_unique = df[col].nunique(dropna=False)
                num_total  = len(df[col])
                if num_unique / num_total < 0.5:
                    df[col] = df[col].astype("category")
        end_mem = df.memory_usage(deep=True).sum() / 1024**2
        if verbose:
            print(f"Mem {start_mem:.1f} MB → {end_mem:.1f} MB ({100*(start_mem-end_mem)/start_mem:.1f}% kazanç)")
        return df

        train = reduce_mem_usage(train)
        test  = reduce_mem_usage(test)        

CPU times: user 8 µs, sys: 0 ns, total: 8 µs
Wall time: 10.7 µs


In [None]:
# ---------------- CONFIG ----------------
num_features = [
     'affinity_lift',
     'attr_option_per_type',
     'attr_type_z_in_leaf',
     'attribute_type_count',
     'c_cart_mean',
     'c_click_cv',
     'c_click_mean',
     'c_fav_mean',
     'c_order_mean',
     'c_search_clk_mean',
     'c_search_clk_std',
     'c_search_cv',
     'c_search_imp_mean',
     'c_search_imp_std',
     'c_tterm_clk_mean',
     'c_tterm_clk_mean_sess_minmax',
     'c_tterm_clk_std',
     'c_tterm_clk_std_sess_minmax',
     'c_tterm_imp_mean',
     'c_tterm_imp_std',
     'cart_to_order_bs',
     'category_vs_user_pref_ratio',
     'click_to_order_bs',
     'click_to_order_rate',
     'click_to_order_rate_sess_minmax',
     'content_rate_avg',
     'content_rate_count',
     'content_review_count',
     'content_review_wth_media_count',
     'dayofweek',
     'disc_gap_to_item_pref',
     'disc_z_in_l1',
     'disc_z_in_leaf',
     'discount_diff_from_session_max',
     'discount_norm_in_session',
     'discount_rank_in_category',
     'discount_rank_in_session',
     'discount_ratio',
     'discount_vs_user_pref_proxy',
     'discounted_price',
     'fav_to_order_bs',
     'filterable_label_count',
     'filterable_z_in_leaf',
     'global_l1_click_sum',
     'global_l1_order_rate',
     'hour',
     'item_disc_med_ordered',
     'item_price_med_ordered',
     'media_review_share',
     'merchant_count',
     'merchant_z_in_leaf',
     'order_trend',
     'original_price',
     'popularity_ratio_recent',
     'price_diff_from_session_med',
     'price_diff_from_session_med_sess_minmax',
     'price_gap_to_item_pref',
     'price_norm_in_session',
     'price_rank_in_category',
     'price_rank_in_session',
     'price_vs_user_pref_proxy',
     'price_z_in_l1',
     'price_z_in_leaf',
     'quality_norm_in_session',
     'quality_rank_in_session',
     'quality_z_in_l1',
     'quality_z_in_leaf',
     'rate_diff_from_session_max',
     'rate_weighted',
     'same_l1_count_in_session',
     'same_l1_share_in_session',
     'same_leaf_count_in_session',
     'same_leaf_share_in_session',
     'scarcity_index',
     'search_ctr',
     'search_ctr_bs',
     'search_ctr_bs_sess_minmax',
     'search_ctr_sess_minmax',
     'selling_price',
     'session_category_entropy',
     'session_category_match_ratio',
     'session_elapsed_min',
     'session_elapsed_sec',
     'session_price_diff_from_avg',
     'session_price_iqr',
     'session_size',
     't_clk_mean',
     't_clk_std',
     't_ctr_global',
     't_imp_mean',
     't_imp_std',
     'tc_lift_vs_content',
     'tc_lift_vs_term',
     'tc_term_ctr',
     'total_attribute_option_count',
     'u_c_click_rate',
     'u_c_order_rate',
     'u_cart_mean',
     'u_cart_std',
     'u_click_cv',
     'u_click_mean',
     'u_click_std',
     'u_fav_mean',
     'u_fav_std',
     'u_l1_click_mean',
     'u_l1_order_mean',
     'u_l1_order_rate',
     'u_leaf_click_mean',
     'u_leaf_click_rate',
     'u_leaf_order_mean',
     'u_order_cv',
     'u_order_mean',
     'u_order_std',
     'u_search_clk_mean',
     'u_search_clk_std',
     'u_search_imp_mean',
     'u_search_imp_mean_sess_minmax',
     'u_search_imp_std',
     'u_search_imp_std_sess_minmax',
     'u_tterm_clk_mean',
     'u_tterm_clk_std',
     'u_tterm_ctr',
     'u_tterm_cv',
     'u_tterm_imp_mean',
     'u_tterm_imp_std',
     'u_tterm_lift',
     'u_tterm_lift_sess_minmax',
     'uf_u_clk_mean',
     'uf_u_clk_std',
     'uf_u_imp_mean',
     'uf_u_imp_std',
     'uf_uc_clk_mean',
     'uf_uc_clk_std',
     'uf_uc_imp_mean',
     'uf_uc_imp_std',
     'ufs_u_cart_mean',
     'ufs_u_cart_std',
     'ufs_u_click_mean',
     'ufs_u_click_std',
     'ufs_u_fav_mean',
     'ufs_u_fav_std',
     'ufs_u_order_mean',
     'ufs_u_order_std',
     'ufs_uc_cart_mean',
     'ufs_uc_cart_std',
     'ufs_uc_click_mean',
     'ufs_uc_click_std',
     'ufs_uc_fav_mean',
     'ufs_uc_fav_std',
     'ufs_uc_order_mean',
     'ufs_uc_order_std',
     'user_age_years',
     'user_birth_year',
     'user_item_cart_to_order_rate',
     'user_item_click_to_cart_rate',
     'user_item_click_to_order_rate',
     'user_tenure_in_days']

cat_features = [
     'c_order_mean_bin',
     'c_order_mean_bin__x__user_gender',
     'content_id_hashed',
     'cv_tags',
     'discount_ratio_bin',
     'discount_ratio_bin__x__c_order_mean_bin',
     'discount_ratio_bin__x__selling_price_bin',
     'discount_ratio_bin__x__user_gender',
     'l1_hour',
     'l1_searchterm',
     'l2_hour',
     'leaf_category_name',
     'leaf_searchterm',
     'level1_category_name',
     'level2_category_name',
     'search_term_normalized',
     'selling_price_bin',
     'selling_price_bin__x__c_order_mean_bin',
     'selling_price_bin__x__user_gender',
     'user_gender']

# Listeye ekle
feature_cols = num_features + cat_features

# Label & grup/id kolonları
label_clicked = "clicked"
label_ordered = "ordered"
group_col     = "session_id"
id_col        = "content_id_hashed"

# ---------------- Güvenlik kontrolleri ----------------
# train/test içinde olmayan feature'ları ayıkla (özellikle hızlı iterasyonlarda faydalı)
_missing_in_train = [c for c in feature_cols if c not in train.columns]
if _missing_in_train:
    print("Uyarı: train'de eksik bulunan kolonlar atılıyor:", _missing_in_train)
    feature_cols = [c for c in feature_cols if c in train.columns]
    # cat_features'ı da filtrele
    cat_features = [c for c in cat_features if c in feature_cols]

In [7]:
%%time
if False:
    # ---- Ayarlar ----
    corr_thresh = 0.99
    protect = {group_col, label_clicked, label_ordered}
    # varsa id_col'ü de koru:
    if 'id_col' in globals():
        protect.add(id_col)
    
    # Sadece feature'lar içinde ve numeric olanları kullan
    usable_feats = [c for c in feature_cols if c not in protect]
    num_feats = [c for c in usable_feats if pd.api.types.is_numeric_dtype(train[c])]
    
    # --- Korelasyon (train, leakage yok) ---
    corr = train[num_feats].corr().abs()
    
    # Üst üçgen yöntemi: soldakini tut, sağdakileri at
    upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
    to_drop = [col for col in upper.columns if (upper[col] > corr_thresh).any()]
    
    print(f"[CorrFilter] threshold={corr_thresh} | "
          f"numeric_feats={len(num_feats)} | drop_count={len(to_drop)}")
    
    # İstersen hangi çiftlerden dolayı atıldığına kısa özet:
    dropped_pairs = []
    for col in to_drop:
        partners = upper.index[upper[col] > corr_thresh].tolist()
        for p in partners:
            dropped_pairs.append((p, col, float(corr.loc[p, col])))
    dropped_pairs = sorted(dropped_pairs, key=lambda x: -x[2])[:20]
    for a,b,v in dropped_pairs:
        print(f" drop '{b}' (kept '{a}') corr={v:.3f}")
    
    # --- Train/Test'ten at ---
    train.drop(columns=[c for c in to_drop if c in train.columns], inplace=True)
    test.drop(columns=[c for c in to_drop if c in test.columns], inplace=True)
    
    # --- Listeleri güncelle ---
    feature_cols = [c for c in feature_cols if c not in to_drop]
    if 'cat_features' in globals():
        cat_features = [c for c in cat_features if c not in to_drop]
    
    print(f"[CorrFilter] feature_cols -> {len(feature_cols)} cols after drop")

CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 8.34 µs


In [8]:
if True:
    # 1) Train setindeki sabit (constant) kolonları bul
    const_cols = [col for col in train.columns if train[col].nunique(dropna=False) <= 1]

    print("Constant columns:", const_cols)

    # 2) Train ve Test setlerinden bu kolonları düşür
    train = train.drop(columns=const_cols)
    test  = test.drop(columns=const_cols)

    # 3) Feature listelerinden de düşür
    feature_cols = [c for c in feature_cols if c not in const_cols]
    cat_features = [c for c in cat_features if c not in const_cols]

Constant columns: ['session_elapsed_sec', 'hour', 'session_elapsed_min', 'popularity_ratio_recent', 'order_trend', 'cart_to_order_bs', 'fav_to_order_bs', 'u_search_imp_std_sess_minmax', 'u_tterm_lift_sess_minmax', 'u_search_imp_mean_sess_minmax']


In [None]:
# NaN kategorikleri "unk" ile doldur (train & test)
UNK = "unknown"

# Sadece modelde kullanılacak ve mevcut olan kategorikler
cat_cols = [c for c in cat_features if c in feature_cols and c in train.columns and c in test.columns]

def fill_cat_nans(df, cols, unk=UNK):
    for c in cols:
        # hepsini string/object yap ve NaN'ları "unk" ile doldur
        df[c] = df[c].astype("object")
        df[c] = df[c].where(pd.notna(df[c]), unk).astype(str)
    return df

train = fill_cat_nans(train, cat_cols)
test  = fill_cat_nans(test,  cat_cols)

# (opsiyonel) kontrol
print("train NaN kalan cat cols:", [c for c in cat_cols if train[c].isna().any()])
print("test NaN kalan cat cols:",  [c for c in cat_cols if test[c].isna().any()])

train NaN kalan cat cols: []
test NaN kalan cat cols: []


In [10]:
# ---------------- METRIC ----------------
W_CLICK, W_ORDER = 0.3, 0.7

def _auc_safe(y, s):
    if y.sum() == 0 or y.sum() == len(y):
        return None
    return roc_auc_score(y, s)

def session_mean_auc_pd(df, y_col, score_col):
    aucs = []
    for _, g in df.groupby(group_col, sort=False):
        if g[y_col].sum() == 0 or g[y_col].sum() == len(g[y_col]):
            continue
        auc = _auc_safe(g[y_col].to_numpy(), g[score_col].to_numpy())
        if auc is not None:
            aucs.append(auc)
    return (float(np.mean(aucs)) if aucs else np.nan, len(aucs))

def trendyol_cv_metric(df, score_col="score"):
    click_mean, n_click = session_mean_auc_pd(df, label_clicked, score_col)
    order_mean, n_order = session_mean_auc_pd(df, label_ordered, score_col)
    final = W_CLICK * click_mean + W_ORDER * order_mean
    return final, click_mean, order_mean, n_click, n_order

# ---------------- MODEL (CatBoostRanker, pairwise) ----------------
clicked_params_1 = dict(
    depth=6,
    iterations=5000,
    learning_rate=0.02,
    loss_function="PairLogitPairwise",
    eval_metric="QueryAUC",
    random_seed=42,
    verbose=250,
    od_type="Iter",
    task_type="GPU",
    devices="0",
    border_count=128,
    one_hot_max_size=2,
    max_ctr_complexity=3,
)

clicked_params_2 = dict(
    depth=6,
    iterations=5000,
    learning_rate=0.025,
    loss_function="PairLogitPairwise",
    eval_metric="QueryAUC",
    random_seed=42,
    verbose=250,
    od_type="Iter",
    task_type="GPU",
    devices="0",
    border_count=128,
    one_hot_max_size=2,
    max_ctr_complexity=1,
    l2_leaf_reg=29.7,
    min_data_in_leaf=651,
)

clicked_params_3 = dict(
    depth=7,
    iterations=2000,
    learning_rate=0.03,
    loss_function="PairLogitPairwise",
    eval_metric="QueryAUC",
    random_seed=42,
    verbose=250,
    od_type="Iter",
    task_type="GPU",
    devices="0",
    border_count=128,
    one_hot_max_size=2,
    max_ctr_complexity=1,
    l2_leaf_reg=20.6,
    min_data_in_leaf=628,
)

ordered_params_1 = dict(
    depth=6,
    iterations=2000,
    learning_rate=0.02,
    l2_leaf_reg=35.5,
    loss_function="PairLogitPairwise",
    eval_metric="QueryAUC",
    random_seed=42,
    verbose=250,
    od_type="Iter",
    # od_wait=1000,
    task_type="GPU",
    devices="0",
    border_count=128,
    one_hot_max_size=2,
    max_ctr_complexity=2,
    min_data_in_leaf=2264,
)

ordered_params_2 = dict(
    depth=6,
    iterations=3000,
    learning_rate=0.007,
    loss_function="PairLogitPairwise",
    eval_metric="QueryAUC",
    random_seed=42,
    verbose=250,
    od_type="Iter",
    task_type="GPU",
    devices="0",
    border_count=128,
    one_hot_max_size=2,
    max_ctr_complexity=3,
)

ordered_params_3 = dict(
    depth=6,
    iterations=2000,
    learning_rate=0.034747183605270626,
    l2_leaf_reg=3.314322202814573,
    loss_function="PairLogitPairwise",
    eval_metric="QueryAUC",
    random_seed=42,
    verbose=250,
    od_type="Iter",
    task_type="GPU",
    devices="0",
    border_count=128,
    one_hot_max_size=2,
    max_ctr_complexity=2,
    min_data_in_leaf=2336,
)

def _make_pool(df: pd.DataFrame, y_col: str):
    X = df[feature_cols]
    y = pd.to_numeric(df[y_col], errors="coerce").fillna(0).astype(int).values
    # group_id: stabil int koda çevir (fold içinde)
    grp = pd.factorize(df[group_col], sort=False)[0].astype(np.int64)
    return Pool(data=X, label=y, group_id=grp, cat_features=cat_features)

def train_catboost_ranker(tr_df: pd.DataFrame, va_df: pd.DataFrame, y_col: str, params: dict):
    train_pool = _make_pool(tr_df, y_col)
    valid_pool = _make_pool(va_df, y_col)
    model = CatBoostRanker(**params)
    model.fit(train_pool, eval_set=valid_pool, use_best_model=True)
    del train_pool, valid_pool; gc.collect()
    return model

In [11]:
gc.collect()

0

In [12]:
train = train.reindex(sorted(train.columns), axis=1)
test  = test.reindex(sorted(test.columns), axis=1)

In [None]:
# %%time
print("[FullTrain] Validasyon ve weight opt yok; tüm train ile eğitim.")
tr_df_full = train.copy()

# --- ORDER için BOTH (click & order >=1) session filtresi ---
_tmp = tr_df_full[[group_col, label_clicked, label_ordered]].copy()
_tmp = _tmp[_tmp[group_col].notna()]
for lbl in (label_clicked, label_ordered):
    _tmp[lbl] = pd.to_numeric(_tmp[lbl], errors="coerce").fillna(0.0)

sess_flags = _tmp.groupby(group_col, sort=False)[[label_clicked, label_ordered]].max()
has_click = sess_flags[label_clicked] > 0
has_order = sess_flags[label_ordered] > 0
both_sessions = sess_flags.index[has_click & has_order]

# ORDER train: yalnızca BOTH; yoksa tüm train
if len(both_sessions) == 0:
    print("[WARN] Train'de BOTH session yok; ORDER için tüm train kullanılacak.")
    tr_df_order = tr_df_full.copy()
else:
    tr_df_order = tr_df_full[tr_df_full[group_col].isin(both_sessions)].copy()

# CLICK train: TÜM TRAIN (isteğin doğrultusunda)
tr_df_click = tr_df_full

print("\n" + "="*80)
print(f"[FullTrain] ORDER-train: {len(tr_df_order)} rows / {tr_df_order[group_col].nunique()} sessions (BOTH)")
print(f"[FullTrain] CLICK-train: {len(tr_df_click)} rows / {tr_df_click[group_col].nunique()} sessions")
print(f"[FullTrain] feature count: {len(feature_cols)}")

# --- Param listeleri (mevcut olanlar) ---
clicked_param_list = [clicked_params_1, clicked_params_2, clicked_params_3]
ordered_param_list = [ordered_params_1, ordered_params_2, ordered_params_3]

# --- Eğitim (eval_set olarak train'i veriyoruz; use_best_model gereği) ---
print("\n[FullTrain] ORDER modelleri eğitiliyor...")
m_order_list = []
for i, p in enumerate(ordered_param_list, 1):
    print(f"  - ORDER model #{i}")
    m = train_catboost_ranker(tr_df_order, tr_df_order, label_ordered, p)
    m_order_list.append(m)

print("\n[FullTrain] CLICK modelleri eğitiliyor (tüm train)...")
m_click_list = []
for i, p in enumerate(clicked_param_list, 1):
    print(f"  - CLICK model #{i}")
    m = train_catboost_ranker(tr_df_click, tr_df_click, label_clicked, p)
    m_click_list.append(m)

print("\n[FullTrain] Tamamlandı. m_order_list =", len(m_order_list), "| m_click_list =", len(m_click_list))
gc.collect()

[FullTrain] Validasyon ve weight opt yok; tüm train ile eğitim.

[FullTrain] ORDER-train: 1065737 rows / 6889 sessions (BOTH)
[FullTrain] CLICK-train: 2773805 rows / 21802 sessions
[FullTrain] feature count: 163

[FullTrain] ORDER modelleri eğitiliyor...
  - ORDER model #1


Default metric period is 5 because QueryAUC is/are not implemented for GPU
Metric QueryAUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.6896508	best: 0.6896508 (0)	total: 13.3s	remaining: 7h 24m 7s
250:	test: 0.7391155	best: 0.7391155 (250)	total: 44.1s	remaining: 5m 7s
500:	test: 0.7540437	best: 0.7540437 (500)	total: 1m 14s	remaining: 3m 43s
750:	test: 0.7681395	best: 0.7681395 (750)	total: 1m 44s	remaining: 2m 54s
1000:	test: 0.7812747	best: 0.7812747 (1000)	total: 2m 14s	remaining: 2m 14s
1250:	test: 0.7941812	best: 0.7941812 (1250)	total: 2m 44s	remaining: 1m 38s
1500:	test: 0.8056633	best: 0.8056633 (1500)	total: 3m 14s	remaining: 1m 4s
1750:	test: 0.8164134	best: 0.8164134 (1750)	total: 3m 44s	remaining: 32s
1999:	test: 0.8271290	best: 0.8271290 (1999)	total: 4m 14s	remaining: 0us
bestTest = 0.8271290263
bestIteration = 1999
  - ORDER model #2


Default metric period is 5 because QueryAUC is/are not implemented for GPU
Metric QueryAUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.6896680	best: 0.6896680 (0)	total: 183ms	remaining: 9m 8s
250:	test: 0.7269110	best: 0.7269110 (250)	total: 31.7s	remaining: 5m 46s
500:	test: 0.7345434	best: 0.7345434 (500)	total: 1m 2s	remaining: 5m 10s
750:	test: 0.7404849	best: 0.7404849 (750)	total: 1m 32s	remaining: 4m 38s
1000:	test: 0.7460914	best: 0.7461001 (998)	total: 2m 3s	remaining: 4m 6s
1250:	test: 0.7512926	best: 0.7512926 (1250)	total: 2m 33s	remaining: 3m 34s
1500:	test: 0.7566021	best: 0.7566021 (1500)	total: 3m 4s	remaining: 3m 3s
1750:	test: 0.7617807	best: 0.7617807 (1750)	total: 3m 34s	remaining: 2m 32s
2000:	test: 0.7667326	best: 0.7667326 (2000)	total: 4m 4s	remaining: 2m 2s
2250:	test: 0.7717345	best: 0.7717345 (2250)	total: 4m 34s	remaining: 1m 31s
2500:	test: 0.7768698	best: 0.7768763 (2497)	total: 5m 4s	remaining: 1m
2750:	test: 0.7819768	best: 0.7819768 (2750)	total: 5m 34s	remaining: 30.3s
2999:	test: 0.7865133	best: 0.7865133 (2999)	total: 6m 4s	remaining: 0us
bestTest = 0.7865133279
bestIter

Default metric period is 5 because QueryAUC is/are not implemented for GPU
Metric QueryAUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.6897120	best: 0.6897120 (0)	total: 184ms	remaining: 6m 8s
250:	test: 0.7509989	best: 0.7509989 (250)	total: 30.8s	remaining: 3m 34s
500:	test: 0.7763441	best: 0.7763441 (500)	total: 1m	remaining: 3m 2s
750:	test: 0.7991615	best: 0.7991615 (750)	total: 1m 30s	remaining: 2m 31s
1000:	test: 0.8205502	best: 0.8205502 (1000)	total: 2m	remaining: 2m
1250:	test: 0.8403042	best: 0.8403042 (1250)	total: 2m 30s	remaining: 1m 30s
1500:	test: 0.8572657	best: 0.8572657 (1500)	total: 3m	remaining: 1m
1750:	test: 0.8724322	best: 0.8724322 (1750)	total: 3m 30s	remaining: 29.9s
1999:	test: 0.8856979	best: 0.8856979 (1999)	total: 4m	remaining: 0us
bestTest = 0.8856979476
bestIteration = 1999

[FullTrain] CLICK modelleri eğitiliyor (tüm train)...
  - CLICK model #1


Default metric period is 5 because QueryAUC is/are not implemented for GPU
Metric QueryAUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.6230746	best: 0.6230746 (0)	total: 826ms	remaining: 1h 8m 47s
250:	test: 0.6577208	best: 0.6577208 (250)	total: 1m 31s	remaining: 28m 45s
500:	test: 0.6618751	best: 0.6618751 (500)	total: 2m 57s	remaining: 26m 38s
750:	test: 0.6639404	best: 0.6639404 (750)	total: 4m 25s	remaining: 25m 2s
1000:	test: 0.6656858	best: 0.6656858 (1000)	total: 5m 52s	remaining: 23m 26s
1250:	test: 0.6673056	best: 0.6673123 (1248)	total: 7m 18s	remaining: 21m 55s
1500:	test: 0.6687926	best: 0.6687926 (1500)	total: 8m 46s	remaining: 20m 26s
1750:	test: 0.6701692	best: 0.6701692 (1750)	total: 10m 13s	remaining: 18m 57s
2000:	test: 0.6715627	best: 0.6715637 (1999)	total: 11m 40s	remaining: 17m 29s
2250:	test: 0.6730433	best: 0.6730433 (2250)	total: 13m 8s	remaining: 16m 2s
2500:	test: 0.6744355	best: 0.6744383 (2499)	total: 14m 35s	remaining: 14m 34s
2750:	test: 0.6757721	best: 0.6757721 (2750)	total: 16m 2s	remaining: 13m 7s
3000:	test: 0.6771768	best: 0.6771910 (2999)	total: 17m 30s	remaining: 11m 

Default metric period is 5 because QueryAUC is/are not implemented for GPU
Metric QueryAUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.6230745	best: 0.6230745 (0)	total: 822ms	remaining: 1h 8m 29s
250:	test: 0.6590857	best: 0.6590857 (250)	total: 1m 30s	remaining: 28m 25s
500:	test: 0.6628952	best: 0.6628971 (496)	total: 2m 56s	remaining: 26m 28s
750:	test: 0.6652147	best: 0.6652147 (750)	total: 4m 23s	remaining: 24m 50s
1000:	test: 0.6672829	best: 0.6672848 (997)	total: 5m 49s	remaining: 23m 17s
1250:	test: 0.6691932	best: 0.6691932 (1250)	total: 7m 16s	remaining: 21m 48s
1500:	test: 0.6708338	best: 0.6708354 (1499)	total: 8m 43s	remaining: 20m 20s
1750:	test: 0.6725164	best: 0.6725164 (1750)	total: 10m 10s	remaining: 18m 53s
2000:	test: 0.6741791	best: 0.6741791 (2000)	total: 11m 38s	remaining: 17m 26s
2250:	test: 0.6758610	best: 0.6758610 (2250)	total: 13m 6s	remaining: 16m
2500:	test: 0.6775785	best: 0.6775785 (2500)	total: 14m 33s	remaining: 14m 33s
2750:	test: 0.6792067	best: 0.6792067 (2750)	total: 16m 1s	remaining: 13m 5s
3000:	test: 0.6808457	best: 0.6808540 (2997)	total: 17m 29s	remaining: 11m 38s

Default metric period is 5 because QueryAUC is/are not implemented for GPU
Metric QueryAUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.6260579	best: 0.6260579 (0)	total: 1.13s	remaining: 37m 36s
250:	test: 0.6614238	best: 0.6614357 (249)	total: 2m 44s	remaining: 19m 9s
500:	test: 0.6659540	best: 0.6659540 (500)	total: 5m 28s	remaining: 16m 22s
750:	test: 0.6695255	best: 0.6695255 (750)	total: 8m 11s	remaining: 13m 37s
1000:	test: 0.6725159	best: 0.6725159 (1000)	total: 10m 55s	remaining: 10m 54s
1250:	test: 0.6760446	best: 0.6760447 (1249)	total: 13m 40s	remaining: 8m 11s
1500:	test: 0.6792816	best: 0.6792816 (1500)	total: 16m 25s	remaining: 5m 27s
1750:	test: 0.6826921	best: 0.6826921 (1750)	total: 19m 10s	remaining: 2m 43s
1999:	test: 0.6859977	best: 0.6859977 (1999)	total: 21m 54s	remaining: 0us
bestTest = 0.6859976784
bestIteration = 1999

[FullTrain] Tamamlandı. m_order_list = 3 | m_click_list = 3


0

In [None]:
# %%time
# ============================== SUBMISSION (Lists → Weight Blend) ==============================
# ---------------- PARAM ----------------
# Bunları istediğin gibi set edebilirsin:
W_CLICK, W_ORDER = 0.4, 0.6
group_col = "session_id"
id_col    = "content_id_hashed"
SUB_PATH  = "submission.csv"

# ---------------- TQDM Timer Wrapper ----------------
def timed_step(desc, func, *args, **kwargs):
    start = time.time()
    out = func(*args, **kwargs)
    tqdm.write(f"{desc} tamamlandı. Süre: {time.time() - start:.2f} sn")
    return out

# ---------------- GÜVENLİK KONTROLLERİ ----------------
assert 'test' in globals(), "test DataFrame'i yok."
assert 'feature_cols' in globals(), "feature_cols tanımlı değil."
assert 'cat_features' in globals(), "cat_features tanımlı değil."
assert 'm_click_list' in globals() and len(m_click_list) > 0, "m_click_list boş."
assert 'm_order_list' in globals() and len(m_order_list) > 0, "m_order_list boş."

# ---------------- 1) Tahminler (ENSEMBLE → eşit ağırlık) ----------------
test = test.copy()

# CatBoostRanker için tek Pool (inference'ta label yok)
test_pool = Pool(test[feature_cols], cat_features=cat_features)

def _pred(m, pool):
    return m.predict(pool)

# CLICK ailesi: tüm modeller → pred matris → ortalama
click_preds = []
for i, m in enumerate(m_click_list, 1):
    p = timed_step(f"CLICK model #{i} predict", _pred, m, test_pool)
    click_preds.append(p)
test["p_click"] = np.mean(np.column_stack(click_preds), axis=1).astype(np.float32)
tqdm.write(f"CLICK ensemble tamam: {len(click_preds)} model, equal-weight blend.")

# ORDER ailesi: tüm modeller → pred matris → ortalama
order_preds = []
for i, m in enumerate(m_order_list, 1):
    p = timed_step(f"ORDER model #{i} predict", _pred, m, test_pool)
    order_preds.append(p)
test["p_order"] = np.mean(np.column_stack(order_preds), axis=1).astype(np.float32)
tqdm.write(f"ORDER ensemble tamam: {len(order_preds)} model, equal-weight blend.")

# 2) Ağırlıklı final skor (senin w_click / w_order)
test["score"] = W_CLICK*test["p_click"] + W_ORDER*test["p_order"]
tqdm.write(f"Final skor hesaplandı → w_click={W_CLICK}, w_order={W_ORDER}")

# ---------------- 3) Pandas → Polars ----------------
test_pl = timed_step("Pandas → Polars", pl.from_pandas, test[[group_col, id_col, "score"]])

# ---------------- 4) Polars ile sıralama + grup birleştirme ----------------
submission_pl = timed_step(
    "Polars sort & groupby",
    lambda: (
        test_pl
        .sort([group_col, "score"], descending=[False, True])  # session asc, score desc
        .group_by(group_col, maintain_order=True)
        .agg(pl.col(id_col).cast(pl.Utf8).str.join(" ").alias("prediction"))
    )
)

# ---------------- 5) Polars → Pandas ve kontroller ----------------
submission = timed_step("Polars → Pandas", submission_pl.to_pandas)

tqdm.write("Kontroller yapılıyor...")
assert submission[group_col].nunique() == test[group_col].nunique(), "Eksik session var."
cnt_test = test.groupby(group_col)[id_col].size().to_numpy()
cnt_sub  = submission["prediction"].str.split().map(len).to_numpy()
assert (cnt_test == cnt_sub).all(), "Bazı sessionlarda item sayısı uyuşmuyor!"

# ---------------- 6) Kaydet ----------------
submission.rename(columns={group_col: "session_id"}, inplace=True)
submission.to_csv(SUB_PATH, index=False)
tqdm.write(f"Submission kaydedildi: {SUB_PATH}\n→ w_click={W_CLICK}, w_order={W_ORDER}")

CLICK model #1 predict tamamlandı. Süre: 7.51 sn
CLICK model #2 predict tamamlandı. Süre: 7.65 sn
CLICK model #3 predict tamamlandı. Süre: 4.20 sn
CLICK ensemble tamam: 3 model, equal-weight blend.
ORDER model #1 predict tamamlandı. Süre: 4.00 sn
ORDER model #2 predict tamamlandı. Süre: 4.97 sn
ORDER model #3 predict tamamlandı. Süre: 4.03 sn
ORDER ensemble tamam: 3 model, equal-weight blend.
Final skor hesaplandı → w_click=0.4, w_order=0.6
Pandas → Polars tamamlandı. Süre: 0.42 sn
Polars sort & groupby tamamlandı. Süre: 0.50 sn
Polars → Pandas tamamlandı. Süre: 0.14 sn
Kontroller yapılıyor...
Submission kaydedildi: /kaggle/working/submission.csv
→ w_click=0.4, w_order=0.6


In [15]:
submission.head()

Unnamed: 0,session_id,prediction
0,test_0001ff614df60933,89b1a5be8f804fe7 8565816477ed3fc5 6a878e9d3348...
1,test_00041895a35c4813,8f2de3f4559ae502 39ad83a925c864b2 e95b82f98246...
2,test_00058d4dc9727758,fe92918996a556d8 532f78e69e290328 f243ca14ab5e...
3,test_00093c8fc0637123,6d39569e6babf7d5 7abdaed8d6ad93f6 5103e2d3a09e...
4,test_000a319a9cf10e66,2829821e4fc763fc 5994dc6ce37bb774 7e5620113e28...
