In [None]:
import os, gc
import polars as pl
import pandas as pd
import numpy as np
import tqdm
from math import log
from itertools import combinations

In [None]:
%%time

pl.enable_string_cache()
BASE_PATH = "/kaggle/input/trendyol-e-ticaret-hackathonu-2025-kaggle/data"

# ---------------- LAZY LOAD (ts_date / ts_timestamp) ----------------
train_lazy = (
    pl.scan_parquet(f"{BASE_PATH}/train_sessions.parquet")
    .with_columns([
        pl.col("ts_hour").cast(pl.Datetime),
        pl.col("ts_hour").dt.date().alias("ts_date"),
        pl.col("ts_hour").dt.epoch(time_unit="s").alias("ts_timestamp"),
    ])
    .select([
        pl.col("ts_hour"),
        pl.col("ts_date"),
        pl.col("ts_timestamp"),
        pl.col("search_term_normalized").cast(pl.Categorical),
        pl.col("user_id_hashed").cast(pl.Categorical),
        pl.col("content_id_hashed").cast(pl.Categorical),
        pl.col("session_id").cast(pl.Categorical),
        pl.col("clicked").cast(pl.Int8),
        pl.col("ordered").cast(pl.Int8),
    ])
)

test_lazy = (
    pl.scan_parquet(f"{BASE_PATH}/test_sessions.parquet")
    .with_columns([
        pl.col("ts_hour").cast(pl.Datetime),
        pl.col("ts_hour").dt.date().alias("ts_date"),
        pl.col("ts_hour").dt.epoch(time_unit="s").alias("ts_timestamp"),
    ])
    .select([
        pl.col("ts_hour"),
        pl.col("ts_date"),
        pl.col("ts_timestamp"),
        pl.col("search_term_normalized").cast(pl.Categorical),
        pl.col("user_id_hashed").cast(pl.Categorical),
        pl.col("content_id_hashed").cast(pl.Categorical),
        pl.col("session_id").cast(pl.Categorical),
    ])
)

# ---------------- content/search_log ----------------
content_search_agg = (
    pl.scan_parquet(f"{BASE_PATH}/content/search_log.parquet")
    .select([
        pl.col("content_id_hashed").cast(pl.Categorical),
        pl.col("total_search_impression").cast(pl.Float32),
        pl.col("total_search_click").cast(pl.Float32),
    ])
    .group_by("content_id_hashed")
    .agg([
        pl.col("total_search_impression").mean().alias("c_search_imp_mean"),
        pl.col("total_search_click").mean().alias("c_search_clk_mean"),
        pl.col("total_search_impression").std().alias("c_search_imp_std"),
        pl.col("total_search_click").std().alias("c_search_clk_std"),
    ])
)

# ---------------- content/sitewide_log ----------------
content_sitewide_agg = (
    pl.scan_parquet(f"{BASE_PATH}/content/sitewide_log.parquet")
    .select([
        pl.col("content_id_hashed").cast(pl.Categorical),
        pl.col("total_click").cast(pl.Float32),
        pl.col("total_cart").cast(pl.Float32),
        pl.col("total_fav").cast(pl.Float32),
        pl.col("total_order").cast(pl.Float32),
    ])
    .group_by("content_id_hashed")
    .agg([
        pl.col("total_click").mean().alias("c_click_mean"),
        pl.col("total_cart").mean().alias("c_cart_mean"),
        pl.col("total_fav").mean().alias("c_fav_mean"),
        pl.col("total_order").mean().alias("c_order_mean"),
    ])
)

# ---------------- content/price_rate_review_data ----------------
price_rate_review = (
    pl.scan_parquet(f"{BASE_PATH}/content/price_rate_review_data.parquet")
    .with_columns(pl.col("update_date").cast(pl.Datetime).dt.date().alias("ts_date"))
    .select([
        pl.col("content_id_hashed").cast(pl.Categorical),
        pl.col("ts_date"),
        pl.col("original_price").cast(pl.Float32),
        pl.col("selling_price").cast(pl.Float32),
        pl.col("discounted_price").cast(pl.Float32),
        pl.col("content_review_count").cast(pl.Float32),
        pl.col("content_review_wth_media_count").cast(pl.Float32),
        pl.col("content_rate_count").cast(pl.Float32),
        pl.col("content_rate_avg").cast(pl.Float32),
    ])
    .group_by(["content_id_hashed", "ts_date"])
    .agg([
        pl.col("original_price").mean().alias("original_price"),
        pl.col("selling_price").mean().alias("selling_price"),
        pl.col("discounted_price").mean().alias("discounted_price"),
        pl.col("content_review_count").mean().alias("content_review_count"),
        pl.col("content_review_wth_media_count").mean().alias("content_review_wth_media_count"),
        pl.col("content_rate_count").mean().alias("content_rate_count"),
        pl.col("content_rate_avg").mean().alias("content_rate_avg"),
    ])
)

# ---------------- content/metadata ----------------
content_meta = (
    pl.scan_parquet(f"{BASE_PATH}/content/metadata.parquet")
    .select([
        pl.col("content_id_hashed").cast(pl.Categorical),
        pl.col("level1_category_name").cast(pl.Categorical),
        pl.col("level2_category_name").cast(pl.Categorical),
        pl.col("leaf_category_name").cast(pl.Categorical),
        pl.col("attribute_type_count").cast(pl.Float32),
        pl.col("total_attribute_option_count").cast(pl.Float32),
        pl.col("merchant_count").cast(pl.Float32),
        pl.col("filterable_label_count").cast(pl.Float32),
        pl.col("content_creation_date").cast(pl.Datetime).dt.date().alias("content_creation_date"),
        pl.col("cv_tags").cast(pl.Utf8),
    ])
    .with_columns([
        (pl.col("total_attribute_option_count") / pl.when(pl.col("attribute_type_count") > 0)
           .then(pl.col("attribute_type_count")).otherwise(None)
        ).alias("attr_option_per_type")
    ])
)

# ---------------- user/metadata ----------------
user_meta = (
    pl.scan_parquet(f"{BASE_PATH}/user/metadata.parquet")
    .select([
        pl.col("user_id_hashed").cast(pl.Categorical),
        pl.col("user_gender").cast(pl.Categorical),
        pl.col("user_birth_year").cast(pl.Float32),
        pl.col("user_tenure_in_days").cast(pl.Int32),
    ])
)

# ---------------- user/fashion_search_log ----------------
# user x content
user_fashion_uc = (
    pl.scan_parquet(f"{BASE_PATH}/user/fashion_search_log.parquet")
    .select([
        pl.col("user_id_hashed").cast(pl.Categorical),
        pl.col("content_id_hashed").cast(pl.Categorical),
        pl.col("total_search_impression").cast(pl.Float32),
        pl.col("total_search_click").cast(pl.Float32),
    ])
    .group_by(["user_id_hashed", "content_id_hashed"])
    .agg([
        pl.col("total_search_impression").mean().alias("uf_uc_imp_mean"),
        pl.col("total_search_click").mean().alias("uf_uc_clk_mean"),
        pl.col("total_search_impression").std().alias("uf_uc_imp_std"),
        pl.col("total_search_click").std().alias("uf_uc_clk_std"),
    ])
)
# user düzeyi
user_fashion_u = (
    pl.scan_parquet(f"{BASE_PATH}/user/fashion_search_log.parquet")
    .select([
        pl.col("user_id_hashed").cast(pl.Categorical),
        pl.col("total_search_impression").cast(pl.Float32),
        pl.col("total_search_click").cast(pl.Float32),
    ])
    .group_by("user_id_hashed")
    .agg([
        pl.col("total_search_impression").mean().alias("uf_u_imp_mean"),
        pl.col("total_search_click").mean().alias("uf_u_clk_mean"),
        pl.col("total_search_impression").std().alias("uf_u_imp_std"),
        pl.col("total_search_click").std().alias("uf_u_clk_std"),
    ])
)

# ---------------- (NEW) user/search_log ----------------
user_search_u = (
    pl.scan_parquet(f"{BASE_PATH}/user/search_log.parquet")
    .select([
        pl.col("user_id_hashed").cast(pl.Categorical),
        pl.col("total_search_impression").cast(pl.Float32),
        pl.col("total_search_click").cast(pl.Float32),
    ])
    .group_by("user_id_hashed")
    .agg([
        pl.col("total_search_impression").mean().alias("u_search_imp_mean"),
        pl.col("total_search_click").mean().alias("u_search_clk_mean"),
        pl.col("total_search_impression").std().alias("u_search_imp_std"),
        pl.col("total_search_click").std().alias("u_search_clk_std"),
    ])
)

# ---------------- (NEW) user/sitewide_log ----------------
user_sitewide_u = (
    pl.scan_parquet(f"{BASE_PATH}/user/sitewide_log.parquet")
    .select([
        pl.col("user_id_hashed").cast(pl.Categorical),
        pl.col("total_click").cast(pl.Float32),
        pl.col("total_cart").cast(pl.Float32),
        pl.col("total_fav").cast(pl.Float32),
        pl.col("total_order").cast(pl.Float32),
    ])
    .group_by("user_id_hashed")
    .agg([
        pl.col("total_click").mean().alias("u_click_mean"),
        pl.col("total_cart").mean().alias("u_cart_mean"),
        pl.col("total_fav").mean().alias("u_fav_mean"),
        pl.col("total_order").mean().alias("u_order_mean"),
        pl.col("total_click").std().alias("u_click_std"),
        pl.col("total_cart").std().alias("u_cart_std"),
        pl.col("total_fav").std().alias("u_fav_std"),
        pl.col("total_order").std().alias("u_order_std"),
    ])
)

# ---------------- (NEW) user/top_terms_log ----------------
user_top_terms_us = (
    pl.scan_parquet(f"{BASE_PATH}/user/top_terms_log.parquet")
    .select([
        pl.col("user_id_hashed").cast(pl.Categorical),
        pl.col("search_term_normalized").cast(pl.Categorical),
        pl.col("total_search_impression").cast(pl.Float32),
        pl.col("total_search_click").cast(pl.Float32),
    ])
    .group_by(["user_id_hashed", "search_term_normalized"])
    .agg([
        pl.col("total_search_impression").mean().alias("u_tterm_imp_mean"),
        pl.col("total_search_click").mean().alias("u_tterm_clk_mean"),
        pl.col("total_search_impression").std().alias("u_tterm_imp_std"),
        pl.col("total_search_click").std().alias("u_tterm_clk_std"),
    ])
)

# ---------------- (NEW) content/top_terms_log ----------------
content_top_terms_cs = (
    pl.scan_parquet(f"{BASE_PATH}/content/top_terms_log.parquet")
    .select([
        pl.col("content_id_hashed").cast(pl.Categorical),
        pl.col("search_term_normalized").cast(pl.Categorical),
        pl.col("total_search_impression").cast(pl.Float32),
        pl.col("total_search_click").cast(pl.Float32),
    ])
    .group_by(["content_id_hashed", "search_term_normalized"])
    .agg([
        pl.col("total_search_impression").mean().alias("c_tterm_imp_mean"),
        pl.col("total_search_click").mean().alias("c_tterm_clk_mean"),
        pl.col("total_search_impression").std().alias("c_tterm_imp_std"),
        pl.col("total_search_click").std().alias("c_tterm_clk_std"),
    ])
)

# ---------------- (NEW) term/search_log ----------------
term_search_t = (
    pl.scan_parquet(f"{BASE_PATH}/term/search_log.parquet")
    .select([
        pl.col("search_term_normalized").cast(pl.Categorical),
        pl.col("total_search_impression").cast(pl.Float32),
        pl.col("total_search_click").cast(pl.Float32),
    ])
    .group_by("search_term_normalized")
    .agg([
        pl.col("total_search_impression").mean().alias("t_imp_mean"),
        pl.col("total_search_click").mean().alias("t_clk_mean"),
        pl.col("total_search_impression").std().alias("t_imp_std"),
        pl.col("total_search_click").std().alias("t_clk_std"),
    ])
)

# ---------------- (NEW) user/fashion_sitewide_log ----------------
# user x content
user_fashion_sitewide_uc = (
    pl.scan_parquet(f"{BASE_PATH}/user/fashion_sitewide_log.parquet")
    .select([
        pl.col("user_id_hashed").cast(pl.Categorical),
        pl.col("content_id_hashed").cast(pl.Categorical),
        pl.col("total_click").cast(pl.Float32),
        pl.col("total_cart").cast(pl.Float32),
        pl.col("total_fav").cast(pl.Float32),
        pl.col("total_order").cast(pl.Float32),
    ])
    .group_by(["user_id_hashed", "content_id_hashed"])
    .agg([
        pl.col("total_click").mean().alias("ufs_uc_click_mean"),
        pl.col("total_cart").mean().alias("ufs_uc_cart_mean"),
        pl.col("total_fav").mean().alias("ufs_uc_fav_mean"),
        pl.col("total_order").mean().alias("ufs_uc_order_mean"),
        pl.col("total_click").std().alias("ufs_uc_click_std"),
        pl.col("total_cart").std().alias("ufs_uc_cart_std"),
        pl.col("total_fav").std().alias("ufs_uc_fav_std"),
        pl.col("total_order").std().alias("ufs_uc_order_std"),
    ])
)
# user düzeyi
user_fashion_sitewide_u = (
    pl.scan_parquet(f"{BASE_PATH}/user/fashion_sitewide_log.parquet")
    .select([
        pl.col("user_id_hashed").cast(pl.Categorical),
        pl.col("total_click").cast(pl.Float32),
        pl.col("total_cart").cast(pl.Float32),
        pl.col("total_fav").cast(pl.Float32),
        pl.col("total_order").cast(pl.Float32),
    ])
    .group_by("user_id_hashed")
    .agg([
        pl.col("total_click").mean().alias("ufs_u_click_mean"),
        pl.col("total_cart").mean().alias("ufs_u_cart_mean"),
        pl.col("total_fav").mean().alias("ufs_u_fav_mean"),
        pl.col("total_order").mean().alias("ufs_u_order_mean"),
        pl.col("total_click").std().alias("ufs_u_click_std"),
        pl.col("total_cart").std().alias("ufs_u_cart_std"),
        pl.col("total_fav").std().alias("ufs_u_fav_std"),
        pl.col("total_order").std().alias("ufs_u_order_std"),
    ])
)

# ============ USER × CATEGORY (L1 ve LEAF) ============
# user_fashion_sitewide_log (user×content) + content_meta (content→kategori) birleşimiyle
user_cat_l1 = (
    pl.scan_parquet(f"{BASE_PATH}/user/fashion_sitewide_log.parquet")
    .select([
        pl.col("user_id_hashed").cast(pl.Categorical),
        pl.col("content_id_hashed").cast(pl.Categorical),
        pl.col("total_click").cast(pl.Float32),
        pl.col("total_order").cast(pl.Float32),
    ])
    .join(  # content → L1
        pl.scan_parquet(f"{BASE_PATH}/content/metadata.parquet")
          .select([
              pl.col("content_id_hashed").cast(pl.Categorical),
              pl.col("level1_category_name").cast(pl.Categorical),
              pl.col("leaf_category_name").cast(pl.Categorical),
          ]),
        on="content_id_hashed", how="left"
    )
    .group_by(["user_id_hashed","level1_category_name"])
    .agg([
        pl.col("total_click").mean().alias("u_l1_click_mean"),
        pl.col("total_order").mean().alias("u_l1_order_mean"),
    ])
)

user_cat_leaf = (
    pl.scan_parquet(f"{BASE_PATH}/user/fashion_sitewide_log.parquet")
    .select([
        pl.col("user_id_hashed").cast(pl.Categorical),
        pl.col("content_id_hashed").cast(pl.Categorical),
        pl.col("total_click").cast(pl.Float32),
        pl.col("total_order").cast(pl.Float32),
    ])
    .join(
        pl.scan_parquet(f"{BASE_PATH}/content/metadata.parquet")
          .select([
              pl.col("content_id_hashed").cast(pl.Categorical),
              pl.col("leaf_category_name").cast(pl.Categorical),
          ]),
        on="content_id_hashed", how="left"
    )
    .group_by(["user_id_hashed","leaf_category_name"])
    .agg([
        pl.col("total_click").mean().alias("u_leaf_click_mean"),
        pl.col("total_order").mean().alias("u_leaf_order_mean"),
    ])
)

# ============ GLOBAL L1 ORDER RATE (kategori bazında) ============
global_l1_order = (
    pl.scan_parquet(f"{BASE_PATH}/content/sitewide_log.parquet")
    .select([
        pl.col("content_id_hashed").cast(pl.Categorical),
        pl.col("total_click").cast(pl.Float32),
        pl.col("total_order").cast(pl.Float32),
    ])
    .join(
        pl.scan_parquet(f"{BASE_PATH}/content/metadata.parquet")
          .select([
              pl.col("content_id_hashed").cast(pl.Categorical),
              pl.col("level1_category_name").cast(pl.Categorical),
          ]),
        on="content_id_hashed", how="left"
    )
    .group_by("level1_category_name")
    .agg([
        # oranı sağlam olsun diye sum üzerinden hesaplayalım
        (pl.col("total_order").sum() / (pl.col("total_click").sum() + 1e-6)).alias("global_l1_order_rate"),
        pl.col("total_click").sum().alias("global_l1_click_sum"),  # opsiyonel: ağırlıklar için
    ])
)

# ---------------- LEFT JOIN'LER (tamamı) ----------------
def build_joined(sessions_lazy):
    return (
        sessions_lazy
        .join(content_search_agg, on="content_id_hashed", how="left")
        .join(content_sitewide_agg, on="content_id_hashed", how="left")
        .join(price_rate_review, on=["content_id_hashed", "ts_date"], how="left")
        .join(content_meta, on="content_id_hashed", how="left")
        .join(user_meta, on="user_id_hashed", how="left")
        .join(user_fashion_uc, on=["user_id_hashed", "content_id_hashed"], how="left")
        .join(user_fashion_u, on="user_id_hashed", how="left")
        # NEW joins:
        .join(user_search_u, on="user_id_hashed", how="left")
        .join(user_sitewide_u, on="user_id_hashed", how="left")
        .join(user_top_terms_us, on=["user_id_hashed","search_term_normalized"], how="left")
        .join(content_top_terms_cs, on=["content_id_hashed","search_term_normalized"], how="left")
        .join(term_search_t, on="search_term_normalized", how="left")
        .join(user_fashion_sitewide_uc, on=["user_id_hashed","content_id_hashed"], how="left")
        .join(user_fashion_sitewide_u, on="user_id_hashed", how="left")
        # NEW: user × kategori ve global kategori oranları
        .join(user_cat_l1,  on=["user_id_hashed","level1_category_name"], how="left")
        .join(user_cat_leaf,on=["user_id_hashed","leaf_category_name"],  how="left")
        .join(global_l1_order, on="level1_category_name", how="left")
        .with_columns([
            # Ürün yaşı (gün)
            # (pl.col("ts_date") - pl.col("content_creation_date")).dt.days().alias("item_age_days"),
            # Kullanıcı yaşı (yıl)
            (pl.col("ts_date").dt.year() - pl.col("user_birth_year")).cast(pl.Float32).alias("user_age_years"),
        ])
        .fill_null(-1.0)   # sayısallar -1; (kategorik kolonlarda null yoksa sorun olmaz)
    )

train_joined_lazy = build_joined(train_lazy)
test_joined_lazy  = build_joined(test_lazy)

# ---------------- COLLECT (streaming) → PANDAS ----------------
train_pl = train_joined_lazy.collect(streaming=True)
test_pl  = test_joined_lazy.collect(streaming=True)

train = train_pl.to_pandas()
test  = test_pl.to_pandas()

# ---------------- Pandas tarafında session_elapsed hesaplama ----------------
session_start_map = train.groupby("session_id")["ts_timestamp"].transform("min")
train["session_elapsed_sec"] = train["ts_timestamp"] - session_start_map

session_start_map_test = test.groupby("session_id")["ts_timestamp"].transform("min")
test["session_elapsed_sec"] = test["ts_timestamp"] - session_start_map_test

# ---------------- RAM temizliği ----------------
to_del = [
    "train_pl","test_pl","train_joined_lazy","test_joined_lazy",
    "content_search_agg","content_sitewide_agg","price_rate_review","content_meta",
    "user_meta","user_fashion_uc","user_fashion_u","user_search_u","user_sitewide_u",
    "user_top_terms_us","content_top_terms_cs","term_search_t",
    "user_fashion_sitewide_uc","user_fashion_sitewide_u",
    "train_lazy","test_lazy"
]
for v in to_del:
    if v in locals(): del globals()[v]
gc.collect()

print("Pandas shapes:", train.shape, test.shape)



Pandas shapes: (2773805, 93) (2988697, 91)
CPU times: user 18min 23s, sys: 3min 31s, total: 21min 54s
Wall time: 8min 8s


In [None]:
%%time
if True:

    def reduce_mem_usage(df: pd.DataFrame, use_categorical=True, verbose=True):
        start_mem = df.memory_usage(deep=True).sum() / 1024**2
        for col in df.columns:
            col_type = df[col].dtype
            if pd.api.types.is_numeric_dtype(col_type):
                c_min, c_max = df[col].min(), df[col].max()
                if pd.api.types.is_integer_dtype(col_type):
                    for t in (np.int8, np.int16, np.int32):
                        if c_min >= np.iinfo(t).min and c_max <= np.iinfo(t).max:
                            df[col] = df[col].astype(t); break
                else:
                    for t in (np.float16, np.float32):
                        df[col] = df[col].astype(t)
                        # İsterseniz hataya karşı: df[col] = pd.to_numeric(df[col], downcast="float")
            elif use_categorical and df[col].dtype == "object":
                # Sık tekrar eden metinlerde çok etkili
                num_unique = df[col].nunique(dropna=False)
                num_total  = len(df[col])
                if num_unique / num_total < 0.5:
                    df[col] = df[col].astype("category")
        end_mem = df.memory_usage(deep=True).sum() / 1024**2
        if verbose:
            print(f"Mem {start_mem:.1f} MB → {end_mem:.1f} MB ({100*(start_mem-end_mem)/start_mem:.1f}% kazanç)")
        return df

        train = reduce_mem_usage(train)
        test  = reduce_mem_usage(test)        

CPU times: user 9 µs, sys: 2 µs, total: 11 µs
Wall time: 15.5 µs


In [5]:
%%time
def add_basic_features(df: pd.DataFrame) -> pd.DataFrame:
    eps = 1e-6  # sıfıra bölmeyi önlemek için

    # 1) Uygulanan indirim oranı
    df["discount_ratio"] = (df["original_price"] - df["discounted_price"]) / (df["original_price"] + eps)
    df.loc[df["original_price"] <= 0, "discount_ratio"] = 0  # geçersiz fiyatlarda 0

    # 2) Search CTR
    df["search_ctr"] = df["c_search_clk_mean"] / (df["c_search_imp_mean"] + eps)

    # 3) Click-to-Order rate
    df["click_to_order_rate"] = df["c_order_mean"] / (df["c_click_mean"] + eps)

    # 4) Hour of day
    df["hour"] = pd.to_datetime(df["ts_hour"]).dt.hour.astype(np.int8)

    # 5) Day of week
    df["dayofweek"] = pd.to_datetime(df["ts_hour"]).dt.dayofweek.astype(np.int8)

    # 6) Session elapsed sec zaten var, ama minutes versiyonunu ekleyelim
    df["session_elapsed_min"] = (df["session_elapsed_sec"] / 60).astype(np.float32)

    return df

# Train & Test'e uygula
train = add_basic_features(train)
test  = add_basic_features(test)

print("Yeni train shape:", train.shape)
print("Yeni test shape:", test.shape)

Yeni train shape: (2773805, 99)
Yeni test shape: (2988697, 97)
CPU times: user 883 ms, sys: 184 ms, total: 1.07 s
Wall time: 880 ms


In [6]:
%%time
eps = 1e-6

def add_extra_features(df: pd.DataFrame) -> pd.DataFrame:
    # ---------------- USER–ITEM conversion rates ----------------
    df["user_item_click_to_cart_rate"] = df["ufs_uc_cart_mean"] / (df["ufs_uc_click_mean"] + eps)
    df["user_item_cart_to_order_rate"] = df["ufs_uc_order_mean"] / (df["ufs_uc_cart_mean"] + eps)
    df["user_item_click_to_order_rate"] = df["ufs_uc_order_mean"] / (df["ufs_uc_click_mean"] + eps)

    # ---------------- Popularity trends ----------------
    # clicks_last_7d, clicks_last_30d gibi kolonlar yoksa, bunları daha önce aggregate ile eklemen lazım.
    if "clicks_last_7d" in df.columns and "clicks_last_30d" in df.columns:
        df["popularity_ratio_recent"] = df["clicks_last_7d"] / (df["clicks_last_30d"] + eps)
    else:
        df["popularity_ratio_recent"] = -1  # placeholder

    if "orders_last_3d" in df.columns and "orders_last_14d" in df.columns:
        df["order_trend"] = df["orders_last_3d"] / (df["orders_last_14d"] + eps)
    else:
        df["order_trend"] = -1

    # ---------------- Session size ----------------
    session_sizes = df.groupby("session_id")["content_id_hashed"].transform("count")
    df["session_size"] = session_sizes

    # ---------------- Session price difference from average ----------------
    session_avg_price = df.groupby("session_id")["selling_price"].transform("mean")
    df["session_price_diff_from_avg"] = df["selling_price"] - session_avg_price

    # ---------------- Session category match ratio ----------------
    session_cat_ratio = (
        df.groupby("session_id")["leaf_category_name"]
          .transform(lambda x: x.map(x.value_counts(normalize=True)).astype(np.float32))
    )
    df["session_category_match_ratio"] = session_cat_ratio

    # ---------------- Discount rank in category ----------------
    df["discount_ratio"] = (df["original_price"] - df["discounted_price"]) / (df["original_price"] + eps)
    df["discount_rank_in_category"] = (
        df.groupby("leaf_category_name")["discount_ratio"].rank(method="dense", ascending=False)
    )

    # ---------------- Price rank in category ----------------
    df["price_rank_in_category"] = (
        df.groupby("leaf_category_name")["selling_price"].rank(method="dense", ascending=True)
    )

    return df

# Train ve test'e uygula
train = add_extra_features(train)
test = add_extra_features(test)



CPU times: user 46.3 s, sys: 501 ms, total: 46.8 s
Wall time: 46.6 s


In [7]:
%%time
if True:

    eps = 1e-6
    
    def add_user_item_category_term_features(df: pd.DataFrame) -> pd.DataFrame:
        df = df.copy()
    
        # ---------- User–Item geçmiş oranları ----------
        df["u_c_click_rate"] = df["ufs_uc_click_mean"] / (df["u_click_mean"] + eps)
        df["u_c_order_rate"] = df["ufs_uc_order_mean"] / (df["u_order_mean"] + eps)
    
        # ---------- User–Category affinity ----------
        # Kategori içi dönüştürme oranı (user'ın L1 kategorideki order/click oranı)
        df["u_l1_order_rate"] = df["u_l1_order_mean"] / (df["u_l1_click_mean"] + eps)
        # Leaf için click eğilimi (click yoğunluğu)
        df["u_leaf_click_rate"] = df["u_leaf_click_mean"]  # istersen normalize edebilirsin
        # Global L1'e göre lift
        df["affinity_lift"] = df["u_l1_order_rate"] / (df["global_l1_order_rate"] + eps)
    
        # ---------- User–Term affinity ----------
        df["u_tterm_ctr"]   = df["u_tterm_clk_mean"] / (df["u_tterm_imp_mean"] + eps)
        df["t_ctr_global"]  = df["t_clk_mean"] / (df["t_imp_mean"] + eps)
        df["u_tterm_lift"]  = df["u_tterm_ctr"] / (df["t_ctr_global"] + eps)
    
        # ---------- Kalite puanı ----------
        df["rate_weighted"]      = df["content_rate_avg"] * np.log1p(df["content_rate_count"])
        df["media_review_share"] = df["content_review_wth_media_count"] / (df["content_review_count"] + eps)
    
        return df
    
    
    def add_session_relative_features(df: pd.DataFrame) -> pd.DataFrame:
        df = df.copy()
    
        # Robust median bazlı farklar
        g = df.groupby("session_id", sort=False)
    
        # price median farkı
        df["price_diff_from_session_med"] = df["selling_price"] - g["selling_price"].transform("median")
    
        # discount_ratio (gerekirse varsa; yoksa önce hesapla)
        if "discount_ratio" not in df.columns:
            df["discount_ratio"] = (df["original_price"] - df["discounted_price"]) / (df["original_price"] + 1e-6)
            df.loc[df["original_price"] <= 0, "discount_ratio"] = 0.0
    
        # kalite puanı (rate_weighted) yoksa ekle
        if "rate_weighted" not in df.columns:
            df["rate_weighted"] = df["content_rate_avg"] * np.log1p(df["content_rate_count"])
    
        # session içi max’a göre farklar
        df["discount_diff_from_session_max"] = g["discount_ratio"].transform("max") - df["discount_ratio"]
        df["rate_diff_from_session_max"]     = g["rate_weighted"].transform("max") - df["rate_weighted"]
    
        return df
    
    
    # ---- Train & Test'e uygula ----
    train = add_user_item_category_term_features(train)
    test  = add_user_item_category_term_features(test)
    
    train = add_session_relative_features(train)
    test  = add_session_relative_features(test)

  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


CPU times: user 4.39 s, sys: 3.46 s, total: 7.85 s
Wall time: 7.55 s




In [8]:
# Pandas'a aldıktan sonra (train ve test hazır)
for df in (train, test):
    df["ts_timestamp"] = pd.to_datetime(df["ts_hour"]).astype("int64") // 10**9
    df.drop(columns=["ts_hour"], inplace=True)
    df.drop(columns=["ts_date"], inplace=True)
    # int64 formatında, saniye bazında epoch

In [9]:
%%time
# ---------------- NaN fix for categorical features ----------------
UNK = "unknown"

# Kategorik kolonlar (grup/id'i FE'de kullanmıyoruz)
cat_features = [
    "search_term_normalized",
    # "user_id_hashed",
    "content_id_hashed",
    "level1_category_name",
    "level2_category_name",
    "leaf_category_name",
    "user_gender",
]

def prep_cat_features(df, cat_cols):
    df = df.copy()
    for c in cat_cols:
        if c not in df.columns:
            continue
        # önce object'a çevir
        df[c] = df[c].astype("object")
        # NaN ve -1/-1.0 gibi sentinel değerleri unknown yap
        mask_nan = pd.isna(df[c])
        mask_neg = df[c].isin([-1, -1.0, "-1", "-1.0"])
        if mask_nan.any() or mask_neg.any():
            df.loc[mask_nan | mask_neg, c] = UNK
        # string dtype'a sabitle
        df[c] = df[c].astype("string")
    return df

# train/test'i hazırla (CV/fit'ten ÖNCE!)
train = prep_cat_features(train, cat_features)
test  = prep_cat_features(test,  cat_features)

CPU times: user 7.34 s, sys: 1.75 s, total: 9.09 s
Wall time: 9.09 s


In [10]:
%time
if True:
    # Cross feature tanımları
    cross_defs = [
        ("leaf_searchterm", "leaf_category_name", "search_term_normalized"),
        ("l1_searchterm", "level1_category_name", "search_term_normalized"),
        ("l1_hour", "level1_category_name", "hour"),
        ("l2_hour", "level2_category_name", "hour"),
    ]
    
    def add_cross_features_pandas(df, cross_defs):
        for new_col, col1, col2 in cross_defs:
            df[new_col] = df[col1].astype(str) + "_" + df[col2].astype(str)
            df[new_col] = df[new_col].astype("category")
        return df
    
    # Train ve test için çalıştır
    train = add_cross_features_pandas(train, cross_defs)
    test  = add_cross_features_pandas(test, cross_defs)
    
    # cat_features listesine yeni kolonları ekle
    new_cat_cols = [c[0] for c in cross_defs]
    cat_features.extend(new_cat_cols)

CPU times: user 3 µs, sys: 1e+03 ns, total: 4 µs
Wall time: 7.87 µs


In [None]:
%%time
if True:
    
    # -------- helpers --------
    def safe_qcut(s: pd.Series, q=10, name="bin", duplicates="drop"):
        """qcut için güvenli sarmalayıcı: NaN'ları korur, tekil dağılımda tek bin döner."""
        s_no_nan = s.dropna()
        if s_no_nan.nunique() <= 1:
            # Tek değer varsa tek bin
            out = pd.Series(index=s.index, dtype="object")
            out.loc[~s.isna()] = f"{name}_ALL"
            return out.astype("category")
        try:
            b = pd.qcut(s_no_nan, q=q, duplicates=duplicates)
        except ValueError:
            # Çok fazla tekrar veya q bölünemiyorsa, unique quantile sayısına düş
            uniq = min(q, max(2, s_no_nan.nunique()))
            b = pd.qcut(s_no_nan, q=uniq, duplicates=duplicates)
        out = pd.Series(index=s.index, dtype="object")
        out.loc[s_no_nan.index] = b.astype(str).values
        return out.astype("category")
    
    def to_cat_str(series, fill_value="unknown"):
        return series.astype("object").fillna(fill_value).astype(str).astype("category")
    
    # -------- 1) Train + Test birleştir --------
    train["is_train"] = 1
    test["is_train"]  = 0
    full_df = pd.concat([train, test], ignore_index=True)
    
    # -------- 2) Binlenecek kolonlar (q=10) --------
    # bin_features = ["discount_ratio", "selling_price", "c_order_mean"]
    bin_features = [
        "discount_ratio",
        "selling_price",
        "c_order_mean",
    ]
    bin_suffixes = []
    
    for col in bin_features:
        bcol = f"{col}_bin"
        full_df[bcol] = safe_qcut(full_df[col], q=10, name=col)
        # qcut dışındaki NaN'ları string kategori yap
        full_df[bcol] = to_cat_str(full_df[bcol])
        bin_suffixes.append(bcol)
    
    # -------- 3) user_gender ekle --------
    # not: orijinal kolonda NaN olabilir → unknown ile doldur
    full_df["user_gender"] = to_cat_str(full_df["user_gender"])
    
    # -------- 4) İkili kombinasyonlar --------
    combo_bases = bin_suffixes + ["user_gender"]  # tüm binned + gender
    
    pair_names = []
    for a, b in combinations(combo_bases, 2):
        cname = f"{a}__x__{b}"
        # string birleştir, NaN yok; kategoriye çevir
        full_df[cname] = (full_df[a].astype(str) + "|" + full_df[b].astype(str)).astype("category")
        pair_names.append(cname)
    
    # -------- 5) Train/Test'e geri ayır --------
    train = full_df[full_df["is_train"] == 1].drop(columns=["is_train"])
    test  = full_df[full_df["is_train"] == 0].drop(columns=["is_train"])
    
    # -------- 6) Config’e ekle --------
    new_cat_features = bin_suffixes + ["user_gender"] + pair_names
    
    for f in new_cat_features:
        if f not in cat_features:
            cat_features.append(f)
    
    print(f"Added categorical features: {len(new_cat_features)} "
          f"(bins={len(bin_suffixes)}, pairs={len(pair_names)} + user_gender)")

Added categorical features: 10 (bins=3, pairs=6 + user_gender)
CPU times: user 36.3 s, sys: 24.4 s, total: 1min
Wall time: 1min


In [12]:
%%time
eps = 1e-6
M_BAYES = 100.0  # smoothing katsayısı (50–300 arası deneyebilirsin)

# ---- Term prior CTR (satır-bazlı) ----
prior_ctr_term = train["t_clk_mean"] / (train["t_imp_mean"] + eps)
prior_ctr_overall = np.nan_to_num((train["t_clk_mean"].sum())/(train["t_imp_mean"].sum()+eps), nan=0.0)
# Geçersiz prior'ları overall ile doldur
def _safe_prior_ctr_term(s):
    s = s.copy()
    bad = ~np.isfinite(s) | (s<=0)
    s[bad] = prior_ctr_overall
    return s

# ---- Global priors (skaler) ----
prior_click2order = (train["c_order_mean"].sum()) / (train["c_click_mean"].sum() + eps)
prior_cart2order  = (train["c_order_mean"].sum()) / (train["c_cart_mean"].sum()  + eps)
prior_fav2order   = (train["c_order_mean"].sum()) / (train["c_fav_mean"].sum()   + eps)

def add_bayesian_features(df):
    df = df.copy()
    # search CTR (ürün search tabanlı) — term prior ile smooth
    prior = _safe_prior_ctr_term(df["t_clk_mean"]/ (df["t_imp_mean"]+eps))
    df["search_ctr_bs"] = (df["c_search_clk_mean"] + M_BAYES*prior) / (df["c_search_imp_mean"] + M_BAYES)

    # click -> order conversion
    df["click_to_order_bs"] = (df["c_order_mean"] + M_BAYES*prior_click2order) / (df["c_click_mean"] + M_BAYES)

    # cart -> order, fav -> order
    df["cart_to_order_bs"] = (df["c_order_mean"] + M_BAYES*prior_cart2order) / (df["c_cart_mean"] + M_BAYES)
    df["fav_to_order_bs"]  = (df["c_order_mean"] + M_BAYES*prior_fav2order)  / (df["c_fav_mean"] + M_BAYES)
    return df

train = add_bayesian_features(train)
test  = add_bayesian_features(test)

CPU times: user 4.24 s, sys: 2.46 s, total: 6.69 s
Wall time: 6.47 s


In [13]:
%%time
if True:
    # Ürün bazında medyan sipariş fiyatı/indirimi (train fit)
    ordered_mask = train["ordered"] == 1
    global_price_med_ordered = train.loc[ordered_mask, "selling_price"].median()
    global_disc_med_ordered  = train.loc[ordered_mask, "discount_ratio"].median()
    
    item_price_med_map = train.loc[ordered_mask].groupby("content_id_hashed")["selling_price"].median()
    item_disc_med_map  = train.loc[ordered_mask].groupby("content_id_hashed")["discount_ratio"].median()
    
    def add_item_price_prefs(df):
        df = df.copy()
        df["item_price_med_ordered"] = df["content_id_hashed"].map(item_price_med_map).fillna(global_price_med_ordered)
        df["item_disc_med_ordered"]  = df["content_id_hashed"].map(item_disc_med_map).fillna(global_disc_med_ordered)
        df["price_gap_to_item_pref"] = df["selling_price"] - df["item_price_med_ordered"]
        df["disc_gap_to_item_pref"]  = df["discount_ratio"] - df["item_disc_med_ordered"]
        return df
    
    train = add_item_price_prefs(train)
    test  = add_item_price_prefs(test)

CPU times: user 5.43 s, sys: 1.07 s, total: 6.5 s
Wall time: 6.38 s


In [14]:
%%time
if True:
    # rate_weighted yoksa ekle
    if "rate_weighted" not in train.columns:
        train["rate_weighted"] = train["content_rate_avg"] * np.log1p(train["content_rate_count"])
        test ["rate_weighted"] = test ["content_rate_avg"] * np.log1p(test ["content_rate_count"])
    
    # Leaf bazında median ve IQR (train'de fit)
    def _iqr(s):
        q75 = s.quantile(0.75)
        q25 = s.quantile(0.25)
        return (q75 - q25)
    
    leaf_grp = train.groupby("leaf_category_name", sort=False)
    leaf_price_med = leaf_grp["selling_price"].median()
    leaf_price_iqr = leaf_grp["selling_price"].apply(_iqr)
    leaf_disc_med  = leaf_grp["discount_ratio"].median()
    leaf_disc_iqr  = leaf_grp["discount_ratio"].apply(_iqr)
    leaf_qual_med  = leaf_grp["rate_weighted"].median()
    leaf_qual_iqr  = leaf_grp["rate_weighted"].apply(_iqr)
    
    # Global yedekler
    g_price_med, g_price_iqr = train["selling_price"].median(), _iqr(train["selling_price"])
    g_disc_med,  g_disc_iqr  = train["discount_ratio"].median(), _iqr(train["discount_ratio"])
    g_qual_med,  g_qual_iqr  = train["rate_weighted"].median(), _iqr(train["rate_weighted"])
    
    def add_category_normalized(df):
        df = df.copy()
        # map ile leaf istatistiklerini çek
        lp_med = df["leaf_category_name"].map(leaf_price_med).fillna(g_price_med)
        lp_iqr = df["leaf_category_name"].map(leaf_price_iqr).fillna(g_price_iqr) + eps
        ld_med = df["leaf_category_name"].map(leaf_disc_med).fillna(g_disc_med)
        ld_iqr = df["leaf_category_name"].map(leaf_disc_iqr).fillna(g_disc_iqr) + eps
        lq_med = df["leaf_category_name"].map(leaf_qual_med).fillna(g_qual_med)
        lq_iqr = df["leaf_category_name"].map(leaf_qual_iqr).fillna(g_qual_iqr) + eps
    
        df["price_z_in_leaf"]   = (df["selling_price"] - lp_med) / lp_iqr
        df["disc_z_in_leaf"]    = (df["discount_ratio"] - ld_med) / ld_iqr
        df["quality_z_in_leaf"] = (df["rate_weighted"]   - lq_med) / lq_iqr
        return df
    
    train = add_category_normalized(train)
    test  = add_category_normalized(test)

  diff_b_a = subtract(b, a)
  diff_b_a = subtract(b, a)
  diff_b_a = subtract(b, a)
  diff_b_a = subtract(b, a)
  diff_b_a = subtract(b, a)
  subtract(b, diff_b_a * (1 - t), out=lerp_interpolation, where=t >= 0.5,
  diff_b_a = subtract(b, a)
  diff_b_a = subtract(b, a)
  diff_b_a = subtract(b, a)
  diff_b_a = subtract(b, a)
  diff_b_a = subtract(b, a)
  diff_b_a = subtract(b, a)
  diff_b_a = subtract(b, a)
  diff_b_a = subtract(b, a)
  diff_b_a = subtract(b, a)
  diff_b_a = subtract(b, a)
  diff_b_a = subtract(b, a)
  diff_b_a = subtract(b, a)
  diff_b_a = subtract(b, a)
  diff_b_a = subtract(b, a)
  diff_b_a = subtract(b, a)
  diff_b_a = subtract(b, a)
  diff_b_a = subtract(b, a)
  diff_b_a = subtract(b, a)
  diff_b_a = subtract(b, a)
  subtract(b, diff_b_a * (1 - t), out=lerp_interpolation, where=t >= 0.5,
  diff_b_a = subtract(b, a)
  diff_b_a = subtract(b, a)
  diff_b_a = subtract(b, a)
  diff_b_a = subtract(b, a)
  diff_b_a = subtract(b, a)
  diff_b_a = subtract(b, a)
  diff_b_a =

CPU times: user 19.5 s, sys: 11.2 s, total: 30.7 s
Wall time: 30.8 s


In [15]:
%%time
if True:
    def add_cv_features(df):
        df = df.copy()
        # Content & user & user-term seviyelerinde örnek CV'ler
        df["c_search_cv"] = df["c_search_imp_std"] / (df["c_search_imp_mean"] + eps)
        df["c_click_cv"]  = df["c_click_mean"].where(df["c_click_mean"]>0, np.nan)  # örnek: click'in std'ı yoksa geç
        # Eğer click std yoksa, sitewide'dan bir std yok; o yüzden sadece search için kullandık.
        df["u_click_cv"]  = df["u_click_std"] / (df["u_click_mean"] + eps)
        df["u_order_cv"]  = df["u_order_std"] / (df["u_order_mean"] + eps)
        df["u_tterm_cv"]  = df["u_tterm_clk_std"] / (df["u_tterm_imp_mean"] + eps)
        # NaN'ları 0'a çek
        for c in ["c_search_cv","u_click_cv","u_order_cv","u_tterm_cv"]:
            df[c] = df[c].replace([np.inf, -np.inf], np.nan).fillna(0.0).astype(np.float32)
        return df
    
    train = add_cv_features(train)
    test  = add_cv_features(test)

CPU times: user 3.07 s, sys: 1.48 s, total: 4.55 s
Wall time: 4.41 s


In [16]:
%%time
if True:

    eps = 1e-6
    
    def _session_minmax(df, col, session_col="session_id"):
        g = df.groupby(session_col, sort=False)
        vmin = g[col].transform("min")
        vmax = g[col].transform("max")
        return ((df[col] - vmin) / (vmax - vmin + eps)).astype("float32")
    
    def add_session_rank_and_norm(df: pd.DataFrame) -> pd.DataFrame:
        df = df.copy()
        if "rate_weighted" not in df.columns:
            df["rate_weighted"] = df["content_rate_avg"] * np.log1p(df["content_rate_count"])
    
        g = df.groupby("session_id", sort=False)
        df["price_rank_in_session"]    = g["selling_price"].rank(method="dense", ascending=True).astype("int16")
        df["discount_rank_in_session"] = g["discount_ratio"].rank(method="dense", ascending=False).astype("int16")
        df["quality_rank_in_session"]  = g["rate_weighted"].rank(method="dense", ascending=False).astype("int16")
    
        df["price_norm_in_session"]    = _session_minmax(df, "selling_price")
        df["discount_norm_in_session"] = _session_minmax(df, "discount_ratio")
        df["quality_norm_in_session"]  = _session_minmax(df, "rate_weighted")
        return df
    
    train = add_session_rank_and_norm(train)
    test  = add_session_rank_and_norm(test)

CPU times: user 8.03 s, sys: 1.95 s, total: 9.98 s
Wall time: 9.83 s


In [17]:
%%time
if True:
    M_PREF = 50.0  # smoothing
    eps = 1e-6
    
    def add_user_l1_pref_ratio(df):
        df = df.copy()
        # Kullanıcı L1 dönüşüm eğilimi: order/click (loglardan geldiği için güvenli)
        # Not: 0 bölünme için güvence
        u_l1_order_rate = df["u_l1_order_mean"] / (df["u_l1_click_mean"] + eps)
    
        # Kullanıcının genel order yoğunluğu (user bazında)
        u_order_rate_all = df["u_order_mean"]  # zaten user/sitewide_log'tan mean(order)
    
        # Prior: global L1 dönüşümü (content/sitewide_log + metadata’dan hesaplamıştın)
        prior_l1 = df.get("global_l1_order_rate", pd.Series(0.1, index=df.index, dtype="float32"))
    
        # Bayes harman: (kullanıcı L1 rate + prior) / (1 + 1) yerine,
        # kullanıcı genel yoğunluğu ile prior’u M_PREF ile smooth edelim
        # (u_l1_order_rate * (u_order_rate_all weight)) ~ sezgisel; strictly prob değil ama pratikte iyi çalışır.
        # Güvenli bir form:
        # ( u_l1_order_rate * M_PREF + prior_l1 ) / (M_PREF + 1)
        df["category_vs_user_pref_ratio"] = (
            (u_l1_order_rate.fillna(0).astype("float32") * M_PREF + prior_l1.astype("float32"))
            / (M_PREF + 1.0)
        ).astype("float32")
    
        return df
    
    train = add_user_l1_pref_ratio(train)
    test  = add_user_l1_pref_ratio(test)

CPU times: user 2.94 s, sys: 1.69 s, total: 4.62 s
Wall time: 4.52 s


In [18]:
%%time
if True:
    def add_user_price_discount_proxies(df):
        df = df.copy()
        # price_z_in_leaf ve disc_z_in_leaf’i önceki bloklarında üretmiştin
        for need in ["price_z_in_leaf", "disc_z_in_leaf", "category_vs_user_pref_ratio"]:
            if need not in df.columns:
                df[need] = 0.0  # güvenli fallback
    
        df["price_vs_user_pref_proxy"]    = (df["price_z_in_leaf"] / (1.0 + df["category_vs_user_pref_ratio"])).astype("float32")
        df["discount_vs_user_pref_proxy"] = (df["disc_z_in_leaf"]  / (1.0 + df["category_vs_user_pref_ratio"])).astype("float32")
        return df
    
    train = add_user_price_discount_proxies(train)
    test  = add_user_price_discount_proxies(test)


CPU times: user 2.85 s, sys: 1.92 s, total: 4.77 s
Wall time: 4.71 s


In [19]:
%%time
if True:    
    eps = 1e-6
    
    def add_session_diversity_features(df: pd.DataFrame) -> pd.DataFrame:
        df = df.copy()
        g = df.groupby("session_id", sort=False)
    
        # Aynı kategoride kaç aday var? ve payı
        df["same_leaf_count_in_session"] = g["leaf_category_name"].transform(lambda s: s.map(s.value_counts())).astype("int16")
        df["same_l1_count_in_session"]   = g["level1_category_name"].transform(lambda s: s.map(s.value_counts())).astype("int16")
        df["same_leaf_share_in_session"] = (df["same_leaf_count_in_session"] / (df["session_size"] + eps)).astype("float32")
        df["same_l1_share_in_session"]   = (df["same_l1_count_in_session"]   / (df["session_size"] + eps)).astype("float32")
    
        # Entropy (leaf bazında)
        def _entropy(arr):
            vc = arr.value_counts(normalize=True)
            return float(-(vc * np.log(vc + eps)).sum())
    
        ent = g["leaf_category_name"].transform(_entropy).astype("float32")
        df["session_category_entropy"] = ent
    
        # Fiyat aralığı (IQR)
        q75 = g["selling_price"].transform(lambda s: s.quantile(0.75))
        q25 = g["selling_price"].transform(lambda s: s.quantile(0.25))
        df["session_price_iqr"] = (q75 - q25).astype("float32")
    
        return df
    
    train = add_session_diversity_features(train)
    test  = add_session_diversity_features(test)

CPU times: user 2min 49s, sys: 2.38 s, total: 2min 52s
Wall time: 2min 51s


In [20]:
%%time
if True:
    def add_term_content_lifts(df):
        df = df.copy()
        tc_ctr = df["c_tterm_clk_mean"] / (df["c_tterm_imp_mean"] + eps)
        t_ctr  = df["t_clk_mean"]       / (df["t_imp_mean"] + eps)
        c_ctr  = df["c_search_clk_mean"]/ (df["c_search_imp_mean"] + eps)
    
        df["tc_term_ctr"]      = tc_ctr.replace([np.inf, -np.inf], np.nan).fillna(0).astype("float32")
        df["tc_lift_vs_term"]  = (tc_ctr / (t_ctr + eps)).replace([np.inf, -np.inf], np.nan).fillna(0).astype("float32")
        df["tc_lift_vs_content"]= (tc_ctr / (c_ctr + eps)).replace([np.inf, -np.inf], np.nan).fillna(0).astype("float32")
        return df
    
    train = add_term_content_lifts(train)
    test  = add_term_content_lifts(test)

CPU times: user 3.58 s, sys: 2.48 s, total: 6.06 s
Wall time: 5.87 s


In [21]:
%%time
if True:
    def add_l1_norms(df):
        df = df.copy()
        if "rate_weighted" not in df.columns:
            df["rate_weighted"] = df["content_rate_avg"] * np.log1p(df["content_rate_count"])
        g = df.groupby("level1_category_name", sort=False)
        med_p = g["selling_price"].transform("median")
        iqr_p = (g["selling_price"].transform(lambda s: s.quantile(0.75)) - g["selling_price"].transform(lambda s: s.quantile(0.25))) + eps
        med_d = g["discount_ratio"].transform("median")
        iqr_d = (g["discount_ratio"].transform(lambda s: s.quantile(0.75)) - g["discount_ratio"].transform(lambda s: s.quantile(0.25))) + eps
        med_q = g["rate_weighted"].transform("median")
        iqr_q = (g["rate_weighted"].transform(lambda s: s.quantile(0.75)) - g["rate_weighted"].transform(lambda s: s.quantile(0.25))) + eps
    
        df["price_z_in_l1"]   = ((df["selling_price"] - med_p) / iqr_p).astype("float32")
        df["disc_z_in_l1"]    = ((df["discount_ratio"] - med_d) / iqr_d).astype("float32")
        df["quality_z_in_l1"] = ((df["rate_weighted"] - med_q) / iqr_q).astype("float32")
        return df
    
    train = add_l1_norms(train)
    test  = add_l1_norms(test)

  diff_b_a = subtract(b, a)
  diff_b_a = subtract(b, a)
  diff_b_a = subtract(b, a)
  diff_b_a = subtract(b, a)
  diff_b_a = subtract(b, a)
  diff_b_a = subtract(b, a)
  diff_b_a = subtract(b, a)
  diff_b_a = subtract(b, a)
  diff_b_a = subtract(b, a)
  diff_b_a = subtract(b, a)
  diff_b_a = subtract(b, a)
  diff_b_a = subtract(b, a)


CPU times: user 7.65 s, sys: 2.86 s, total: 10.5 s
Wall time: 10.3 s


In [22]:
%%time
if True:
    def add_merchant_attribute_signals(df):
        df = df.copy()
        df["scarcity_index"] = (1.0 / (1.0 + df["merchant_count"].fillna(0))).astype("float32")
    
        g = df.groupby("leaf_category_name", sort=False)
        for col, out in [
            ("merchant_count","merchant_z_in_leaf"),
            ("attribute_type_count","attr_type_z_in_leaf"),
            ("filterable_label_count","filterable_z_in_leaf"),
        ]:
            med = g[col].transform("median")
            iqr = (g[col].transform(lambda s: s.quantile(0.75)) - g[col].transform(lambda s: s.quantile(0.25))) + eps
            df[out] = ((df[col] - med) / iqr).astype("float32")
        return df
    
    train = add_merchant_attribute_signals(train)
    test  = add_merchant_attribute_signals(test)

CPU times: user 13.7 s, sys: 2.63 s, total: 16.3 s
Wall time: 16 s


In [23]:
%%time
if True:
    def add_session_minmax(df, cols, session_col="session_id"):
        g = df.groupby(session_col, sort=False)
        for c in cols:
            if c not in df.columns: 
                continue
            vmin = g[c].transform("min")
            vmax = g[c].transform("max")
            df[f"{c}_sess_minmax"] = ((df[c] - vmin) / (vmax - vmin + eps)).astype("float32")
        return df
    
    target_cols = [
        "search_ctr_bs",
        "u_search_imp_std",
        "c_tterm_clk_mean",
        "u_tterm_lift",
        "click_to_order_rate",
        "c_tterm_clk_std",
        "search_ctr",
        "price_diff_from_session_med",
        "u_search_imp_mean",
    ]
    
    train = add_session_minmax(train, target_cols)
    test  = add_session_minmax(test,  target_cols)

CPU times: user 1.8 s, sys: 18.1 ms, total: 1.82 s
Wall time: 1.37 s


In [24]:
if True:
    train = reduce_mem_usage(train)
    test  = reduce_mem_usage(test)

  return arr.astype(dtype, copy=True)
  return arr.astype(dtype, copy=True)
  return arr.astype(dtype, copy=True)
  return arr.astype(dtype, copy=True)
  return arr.astype(dtype, copy=True)
  return arr.astype(dtype, copy=True)
  return arr.astype(dtype, copy=True)
  return arr.astype(dtype, copy=True)
  return arr.astype(dtype, copy=True)
  return arr.astype(dtype, copy=True)
  return arr.astype(dtype, copy=True)
  return arr.astype(dtype, copy=True)
  return arr.astype(dtype, copy=True)
  return arr.astype(dtype, copy=True)
  return arr.astype(dtype, copy=True)


Mem 4965.5 MB → 2866.7 MB (42.3% kazanç)


  return arr.astype(dtype, copy=True)
  return arr.astype(dtype, copy=True)
  return arr.astype(dtype, copy=True)
  return arr.astype(dtype, copy=True)
  return arr.astype(dtype, copy=True)
  return arr.astype(dtype, copy=True)
  return arr.astype(dtype, copy=True)
  return arr.astype(dtype, copy=True)
  return arr.astype(dtype, copy=True)
  return arr.astype(dtype, copy=True)
  return arr.astype(dtype, copy=True)
  return arr.astype(dtype, copy=True)
  return arr.astype(dtype, copy=True)
  return arr.astype(dtype, copy=True)
  return arr.astype(dtype, copy=True)
  return arr.astype(dtype, copy=True)
  return arr.astype(dtype, copy=True)


Mem 5357.1 MB → 3082.3 MB (42.5% kazanç)


In [25]:
train.shape, test.shape

((2773805, 183), (2988697, 183))

In [26]:
# user_id_hashed tamamen drop
if "user_id_hashed" in train.columns:
    train = train.drop(columns=["user_id_hashed"])
if "user_id_hashed" in test.columns:
    test = test.drop(columns=["user_id_hashed"])

# cat_features listesinden de çıkar
cat_features = [c for c in cat_features if c != "user_id_hashed"]

In [27]:
cat_features

['search_term_normalized',
 'content_id_hashed',
 'level1_category_name',
 'level2_category_name',
 'leaf_category_name',
 'user_gender',
 'leaf_searchterm',
 'l1_searchterm',
 'l1_hour',
 'l2_hour',
 'discount_ratio_bin',
 'selling_price_bin',
 'c_order_mean_bin',
 'discount_ratio_bin__x__selling_price_bin',
 'discount_ratio_bin__x__c_order_mean_bin',
 'discount_ratio_bin__x__user_gender',
 'selling_price_bin__x__c_order_mean_bin',
 'selling_price_bin__x__user_gender',
 'c_order_mean_bin__x__user_gender']

In [None]:
# --- Build feature lists (train/test aligned) ---

LABEL_COLS = ["clicked", "ordered"]          # only in train
GROUP_COLS = ["session_id"]
TIME_COLS  = ["ts_timestamp"]
EXCLUDE    = set(LABEL_COLS + GROUP_COLS + TIME_COLS + [])  # add more if needed

# columns that exist in BOTH train & test
common_cols = [c for c in train.columns if c in test.columns and c not in EXCLUDE]

# categorical dtypes
def is_cat(s: pd.Series) -> bool:
    return (
        pd.api.types.is_categorical_dtype(s) or
        pd.api.types.is_object_dtype(s) or
        str(s.dtype) == "string"
    )

# auto-detect cats; merge with any existing cat_features you built earlier
cat_features_auto = [c for c in common_cols if is_cat(train[c])]
if "cat_features" in globals():
    cat_features = sorted({c for c in (cat_features_auto + cat_features) if c in common_cols})
else:
    cat_features = sorted(cat_features_auto)

# numerics = remaining numeric columns
num_features = sorted([c for c in common_cols if c not in cat_features and pd.api.types.is_numeric_dtype(train[c])])

print(f"#num_features={len(num_features)}  #cat_features={len(cat_features)}")

#num_features=157  #cat_features=20


  pd.api.types.is_categorical_dtype(s) or


In [29]:
num_features

['affinity_lift',
 'attr_option_per_type',
 'attr_type_z_in_leaf',
 'attribute_type_count',
 'c_cart_mean',
 'c_click_cv',
 'c_click_mean',
 'c_fav_mean',
 'c_order_mean',
 'c_search_clk_mean',
 'c_search_clk_std',
 'c_search_cv',
 'c_search_imp_mean',
 'c_search_imp_std',
 'c_tterm_clk_mean',
 'c_tterm_clk_mean_sess_minmax',
 'c_tterm_clk_std',
 'c_tterm_clk_std_sess_minmax',
 'c_tterm_imp_mean',
 'c_tterm_imp_std',
 'cart_to_order_bs',
 'category_vs_user_pref_ratio',
 'click_to_order_bs',
 'click_to_order_rate',
 'click_to_order_rate_sess_minmax',
 'content_rate_avg',
 'content_rate_count',
 'content_review_count',
 'content_review_wth_media_count',
 'dayofweek',
 'disc_gap_to_item_pref',
 'disc_z_in_l1',
 'disc_z_in_leaf',
 'discount_diff_from_session_max',
 'discount_norm_in_session',
 'discount_rank_in_category',
 'discount_rank_in_session',
 'discount_ratio',
 'discount_vs_user_pref_proxy',
 'discounted_price',
 'fav_to_order_bs',
 'filterable_label_count',
 'filterable_z_in_leaf

In [30]:
cat_features

['c_order_mean_bin',
 'c_order_mean_bin__x__user_gender',
 'content_id_hashed',
 'cv_tags',
 'discount_ratio_bin',
 'discount_ratio_bin__x__c_order_mean_bin',
 'discount_ratio_bin__x__selling_price_bin',
 'discount_ratio_bin__x__user_gender',
 'l1_hour',
 'l1_searchterm',
 'l2_hour',
 'leaf_category_name',
 'leaf_searchterm',
 'level1_category_name',
 'level2_category_name',
 'search_term_normalized',
 'selling_price_bin',
 'selling_price_bin__x__c_order_mean_bin',
 'selling_price_bin__x__user_gender',
 'user_gender']

In [31]:
# Parquet hızlı, sıkıştırma default snappy
train.to_parquet("/kaggle/working/train_fe_180.parquet", index=False)
test.to_parquet("/kaggle/working/test_fe_180.parquet", index=False)