# Data Preparation for Teknofest Trendyol E-Commerce-Hackathon

**Step 1: Create base features from the raw data**

In [None]:
# Step 1
from __future__ import annotations

import argparse
from pathlib import Path
from typing import List, Optional, Tuple

import pandas as pd

# Thread-count constant; no code visible in this file reads it.
NUM_THREADS = 4
# Turn on pandas copy-on-write for the whole process. In a shared notebook
# kernel this setting also applies to cells executed after this one.
pd.options.mode.copy_on_write = True


def _print(msg: str):
    """Write *msg* to stdout behind this step's fixed log tag."""
    tag = "[buid_dataset]"
    print(f"{tag} {msg}")


def read_parquet(path: Path, columns: Optional[List[str]] = None, fast_limit: Optional[int] = None) -> pd.DataFrame:
    """Load a parquet file, optionally keeping only its first rows.

    Args:
        path: Parquet file to read.
        columns: Column subset forwarded to ``pd.read_parquet``.
        fast_limit: When set and the frame is longer than this, only the
            first ``fast_limit`` rows are returned (as a copy). The whole
            file is still read before truncation.

    Returns:
        The loaded (and possibly truncated) frame.
    """
    try:
        frame = pd.read_parquet(path, engine="pyarrow", columns=columns)
    except Exception:
        # Best-effort retry that lets pandas choose the engine itself.
        frame = pd.read_parquet(path, columns=columns)

    needs_cut = fast_limit is not None and len(frame) > fast_limit
    if not needs_cut:
        return frame
    return frame.iloc[:fast_limit].copy()


def agg_sum_rate(df: pd.DataFrame, num_col: str, den_col: str, out_rate: str, alpha: float = 1.0) -> pd.DataFrame:
    """Return a copy of *df* with an additively smoothed ratio column.

    The new column ``out_rate`` holds
    ``(df[num_col] + alpha) / (df[den_col] + alpha)``; *df* is not modified.
    """
    smoothed_num = df[num_col] + alpha
    smoothed_den = df[den_col] + alpha
    result = df.copy()
    result[out_rate] = smoothed_num / smoothed_den
    return result


def compute_content_static(root: Path, fast: bool = False) -> pd.DataFrame:
    """Load per-content static metadata, one row per ``content_id_hashed``.

    Count columns are coerced to numeric (unparseable values become NaN) and
    a missing category name becomes ``"UNKNOWN"``. ``fast`` is accepted for
    signature parity with the other loaders but is not used here.
    """
    count_fields = (
        "merchant_count",
        "attribute_type_count",
        "total_attribute_option_count",
        "filterable_label_count",
    )
    source = root / "data" / "content" / "metadata.parquet"
    meta = read_parquet(source, columns=["content_id_hashed", "leaf_category_name", *count_fields])

    for field in count_fields:
        if field in meta:
            meta[field] = pd.to_numeric(meta[field], errors="coerce")

    if "leaf_category_name" in meta:
        category = meta["leaf_category_name"].astype("string")
        meta["leaf_category_name"] = category.fillna("UNKNOWN")

    return meta.drop_duplicates("content_id_hashed")


# ------------------------- Time-aware aggregates ------------------------------

def _ensure_ts_hour(df: pd.DataFrame, col: str) -> pd.DataFrame:
    """Return a copy of *df* with ``col`` floored to the hour in UTC.

    Values that cannot be parsed become NaT. An extra ``ts_hour_ns`` column
    carries the same timestamps as int64, intended as the as-of join key.
    """
    hourly = pd.to_datetime(df[col], utc=True, errors="coerce").dt.floor("h")
    result = df.copy()
    result[col] = hourly
    # The integer is expressed in the resolution of the datetime dtype, i.e.
    # nanoseconds since the epoch when that dtype is datetime64[ns, UTC].
    # NOTE(review): confirm every input resolves to ns so keys stay comparable.
    result["ts_hour_ns"] = hourly.astype("int64")
    return result


def compute_content_price_timeaware(root: Path, fast: bool = False) -> pd.DataFrame:
    """Load the price/rating/review snapshots with an hourly time key.

    Every snapshot row is kept so an as-of merge can pick the last one known
    at a given hour. With ``fast`` only the first 400,000 rows are used.
    The result is sorted by content id, then hour.
    """
    metric_fields = ["selling_price", "content_review_count", "content_rate_count", "content_rate_avg"]
    source = root / "data" / "content" / "price_rate_review_data.parquet"
    row_cap = 400_000 if fast else None

    snapshots = read_parquet(source, columns=["content_id_hashed", "update_date", *metric_fields], fast_limit=row_cap)
    snapshots = _ensure_ts_hour(snapshots.rename(columns={"update_date": "ts_hour"}), "ts_hour")

    for field in metric_fields:
        if field in snapshots:
            snapshots[field] = pd.to_numeric(snapshots[field], errors="coerce")

    # Ordered per content and time for the later merge_asof.
    return snapshots.sort_values(["content_id_hashed", "ts_hour"])


def compute_content_sitewide_timeaware(root: Path, fast: bool = False) -> pd.DataFrame:
    """Build running sitewide click/cart/fav/order totals per content and hour.

    Log rows are summed per (content, hour) and accumulated per content in
    time order. Three smoothed rates (order, cart, fav over clicks, alpha=1)
    are derived from the running totals. With ``fast`` only the first
    2,000,000 log rows are read.
    """
    counters = ["total_click", "total_cart", "total_fav", "total_order"]
    key_cols = ["content_id_hashed", "ts_hour", "ts_hour_ns"]
    source = root / "data" / "content" / "sitewide_log.parquet"
    row_cap = 2_000_000 if fast else None

    log = read_parquet(source, columns=["content_id_hashed", "date", *counters], fast_limit=row_cap)
    log = _ensure_ts_hour(log.rename(columns={"date": "ts_hour"}), "ts_hour")

    # One row per content/hour, ordered so the cumulative sum follows time.
    hourly = log.groupby(key_cols, as_index=False).sum(numeric_only=True)
    hourly = hourly.sort_values(["content_id_hashed", "ts_hour_ns"])

    running = hourly.groupby("content_id_hashed")[counters].cumsum()
    running.columns = [
        "content_sw_total_click",
        "content_sw_total_cart",
        "content_sw_total_fav",
        "content_sw_total_order",
    ]
    out = pd.concat([hourly[key_cols], running], axis=1)

    rate_specs = (
        ("content_sw_total_order", "content_sitewide_avg_order_rate"),
        ("content_sw_total_cart", "content_sitewide_avg_cart_rate"),
        ("content_sw_total_fav", "content_sitewide_avg_fav_rate"),
    )
    for numerator, rate_name in rate_specs:
        out = agg_sum_rate(out, numerator, "content_sw_total_click", rate_name, alpha=1.0)
    return out


def compute_content_search_timeaware(root: Path, fast: bool = False) -> pd.DataFrame:
    """Build running search impression/click totals and CTR per content and hour.

    Rows are summed per (content, hour), accumulated per content in time
    order, and a smoothed click-through rate (alpha=1) is added. With
    ``fast`` only the first 2,000,000 log rows are read.
    """
    key_cols = ["content_id_hashed", "ts_hour", "ts_hour_ns"]
    source = root / "data" / "content" / "search_log.parquet"
    row_cap = 2_000_000 if fast else None

    log = read_parquet(
        source,
        columns=["content_id_hashed", "date", "total_search_impression", "total_search_click"],
        fast_limit=row_cap,
    )
    log = _ensure_ts_hour(log.rename(columns={"date": "ts_hour"}), "ts_hour")

    hourly = log.groupby(key_cols, as_index=False).sum(numeric_only=True)
    hourly = hourly.sort_values(["content_id_hashed", "ts_hour_ns"])  # time order for cumsum/asof

    per_content = hourly.groupby("content_id_hashed")
    out = hourly[key_cols].copy()
    out["content_search_total_impression"] = per_content["total_search_impression"].cumsum()
    out["content_search_total_click"] = per_content["total_search_click"].cumsum()

    return agg_sum_rate(
        out,
        "content_search_total_click",
        "content_search_total_impression",
        "content_search_ctr",
        alpha=1.0,
    )


def compute_term_content_timeaware(root: Path, fast: bool = False) -> pd.DataFrame:
    """Build running impression/click totals and CTR per (content, term, hour).

    Rows are summed per (content, search term, hour) and accumulated per
    (content, search term) pair in time order; a smoothed CTR (alpha=1) is
    added. With ``fast`` only the first 2,000,000 log rows are read.
    """
    pair_cols = ["content_id_hashed", "search_term_normalized"]
    key_cols = [*pair_cols, "ts_hour", "ts_hour_ns"]
    source = root / "data" / "content" / "top_terms_log.parquet"
    row_cap = 2_000_000 if fast else None

    log = read_parquet(
        source,
        columns=["content_id_hashed", "search_term_normalized", "date", "total_search_impression", "total_search_click"],
        fast_limit=row_cap,
    )
    log = _ensure_ts_hour(log.rename(columns={"date": "ts_hour"}), "ts_hour")
    log["search_term_normalized"] = log["search_term_normalized"].astype("string")

    hourly = log.groupby(key_cols, as_index=False).sum(numeric_only=True)
    hourly = hourly.sort_values([*pair_cols, "ts_hour_ns"])  # time order for cumsum/asof

    per_pair = hourly.groupby(pair_cols)
    out = hourly[key_cols].copy()
    out["term_content_total_impression"] = per_pair["total_search_impression"].cumsum()
    out["term_content_total_click"] = per_pair["total_search_click"].cumsum()

    return agg_sum_rate(
        out,
        "term_content_total_click",
        "term_content_total_impression",
        "term_content_search_ctr",
        alpha=1.0,
    )


def compute_user_metadata(root: Path, fast: bool = False) -> pd.DataFrame:
    """Load per-user metadata, one row per ``user_id_hashed``.

    Missing gender becomes ``"UNKNOWN"``; birth year and tenure are coerced
    to numeric (unparseable values become NaN). ``fast`` is accepted for
    signature parity but is not used here.
    """
    source = root / "data" / "user" / "metadata.parquet"
    users = read_parquet(
        source,
        columns=["user_id_hashed", "user_gender", "user_birth_year", "user_tenure_in_days"],
    )

    gender = users["user_gender"].astype("string")
    users["user_gender"] = gender.fillna("UNKNOWN")
    for field in ("user_birth_year", "user_tenure_in_days"):
        users[field] = pd.to_numeric(users[field], errors="coerce")

    return users.drop_duplicates("user_id_hashed")


def compute_user_sitewide_timeaware(root: Path, fast: bool = False) -> pd.DataFrame:
    """Build running sitewide click/cart/fav/order totals per user and hour.

    Log rows are summed per (user, hour) and accumulated per user in time
    order. Three smoothed rates (order, cart, fav over clicks, alpha=1) are
    derived from the running totals. With ``fast`` only the first 1,000,000
    log rows are read.
    """
    counters = ["total_click", "total_cart", "total_fav", "total_order"]
    key_cols = ["user_id_hashed", "ts_hour", "ts_hour_ns"]
    source = root / "data" / "user" / "sitewide_log.parquet"
    row_cap = 1_000_000 if fast else None

    log = read_parquet(source, columns=["user_id_hashed", "ts_hour", *counters], fast_limit=row_cap)
    log = _ensure_ts_hour(log, "ts_hour")

    hourly = log.groupby(key_cols, as_index=False).sum(numeric_only=True)
    hourly = hourly.sort_values(["user_id_hashed", "ts_hour_ns"])  # time order for cumsum/asof

    running = hourly.groupby("user_id_hashed")[counters].cumsum()
    running.columns = [
        "user_sw_total_click",
        "user_sw_total_cart",
        "user_sw_total_fav",
        "user_sw_total_order",
    ]
    out = pd.concat([hourly[key_cols], running], axis=1)

    rate_specs = (
        ("user_sw_total_order", "user_sitewide_avg_order_rate"),
        ("user_sw_total_cart", "user_sitewide_avg_cart_rate"),
        ("user_sw_total_fav", "user_sitewide_avg_fav_rate"),
    )
    for numerator, rate_name in rate_specs:
        out = agg_sum_rate(out, numerator, "user_sw_total_click", rate_name, alpha=1.0)
    return out


def compute_user_search_timeaware(root: Path, fast: bool = False) -> pd.DataFrame:
    """Build running search impression/click totals and CTR per user and hour.

    Rows are summed per (user, hour), accumulated per user in time order,
    and a smoothed CTR (alpha=1) is added. With ``fast`` only the first
    500,000 log rows are read.
    """
    key_cols = ["user_id_hashed", "ts_hour", "ts_hour_ns"]
    source = root / "data" / "user" / "search_log.parquet"
    row_cap = 500_000 if fast else None

    log = read_parquet(
        source,
        columns=["user_id_hashed", "ts_hour", "total_search_impression", "total_search_click"],
        fast_limit=row_cap,
    )
    log = _ensure_ts_hour(log, "ts_hour")

    hourly = log.groupby(key_cols, as_index=False).sum(numeric_only=True)
    hourly = hourly.sort_values(["user_id_hashed", "ts_hour_ns"])  # time order for cumsum/asof

    per_user = hourly.groupby("user_id_hashed")
    out = hourly[key_cols].copy()
    out["user_search_total_impression"] = per_user["total_search_impression"].cumsum()
    out["user_search_total_click"] = per_user["total_search_click"].cumsum()

    return agg_sum_rate(
        out,
        "user_search_total_click",
        "user_search_total_impression",
        "user_search_ctr",
        alpha=1.0,
    )


def compute_term_search_timeaware(root: Path, fast: bool = False) -> pd.DataFrame:
    """Build running search impression/click totals and CTR per term and hour.

    Rows are summed per (search term, hour), accumulated per term in time
    order, and a smoothed CTR (alpha=1) is added. With ``fast`` only the
    first 400,000 log rows are read.
    """
    key_cols = ["search_term_normalized", "ts_hour", "ts_hour_ns"]
    source = root / "data" / "term" / "search_log.parquet"
    row_cap = 400_000 if fast else None

    log = read_parquet(
        source,
        columns=["search_term_normalized", "ts_hour", "total_search_impression", "total_search_click"],
        fast_limit=row_cap,
    )
    log = _ensure_ts_hour(log, "ts_hour")
    log["search_term_normalized"] = log["search_term_normalized"].astype("string")

    hourly = log.groupby(key_cols, as_index=False).sum(numeric_only=True)
    hourly = hourly.sort_values(["search_term_normalized", "ts_hour_ns"])  # time order for cumsum/asof

    per_term = hourly.groupby("search_term_normalized")
    out = hourly[key_cols].copy()
    out["term_search_total_impression"] = per_term["total_search_impression"].cumsum()
    out["term_search_total_click"] = per_term["total_search_click"].cumsum()

    return agg_sum_rate(
        out,
        "term_search_total_click",
        "term_search_total_impression",
        "term_search_ctr",
        alpha=1.0,
    )


# ----------------------------- Sessions load ----------------------------------

def load_sessions(root: Path, train: bool, debug_n: Optional[int] = None) -> pd.DataFrame:
    """Load the train or test session rows and add basic calendar features.

    Args:
        root: Project root; the file is read from ``root / "data"``.
        train: Selects ``train_sessions.parquet`` (with the ``clicked`` and
            ``ordered`` columns) instead of ``test_sessions.parquet``.
        debug_n: When set and the frame has more rows than this, only rows of
            the first ``max(1, debug_n // 50)`` distinct sessions are kept,
            so sessions are never cut in half.

    Returns:
        The session frame with ``ts_hour`` floored to the hour in UTC and
        added ``hour_of_day`` / ``day_of_week`` columns.
    """
    file_name = "train_sessions.parquet" if train else "test_sessions.parquet"
    wanted = [
        "ts_hour",
        "search_term_normalized",
        "user_id_hashed",
        "content_id_hashed",
        "session_id",
    ]
    if train:
        wanted.extend(["clicked", "ordered"])

    sessions = read_parquet(root / "data" / file_name, columns=wanted)

    if debug_n is not None and len(sessions) > debug_n:
        session_budget = max(1, debug_n // 50)
        kept_ids = sessions["session_id"].drop_duplicates().iloc[:session_budget]
        sessions = sessions[sessions["session_id"].isin(kept_ids)].copy()

    hour = pd.to_datetime(sessions["ts_hour"], utc=True, errors="coerce").dt.floor("h")
    sessions["ts_hour"] = hour
    sessions["search_term_normalized"] = sessions["search_term_normalized"].astype("string")
    sessions["hour_of_day"] = hour.dt.hour
    sessions["day_of_week"] = hour.dt.dayofweek
    return sessions


def build_enriched(root: Path, outdir: Path, debug_n: Optional[int] = None, fast: bool = False) -> Tuple[Path, Path, List[str]]:
    """Join sessions with static and time-aware aggregates and write them out.

    Args:
        root: Project root containing the ``data`` directory.
        outdir: Directory for the outputs; created if missing.
        debug_n: Forwarded to ``load_sessions`` to work on a session subset.
        fast: Forwarded to the aggregate builders to truncate their inputs.

    Returns:
        ``(train_path, test_path, cat_features)`` where the paths point to
        the written parquet files and ``cat_features`` lists the string
        columns also saved to ``cat_features.json``.
    """
    import json  # local import: json is not part of this cell's import block

    outdir.mkdir(parents=True, exist_ok=True)

    _print("Loading sessions…")
    train_df = load_sessions(root, train=True, debug_n=debug_n)
    test_df = load_sessions(root, train=False, debug_n=debug_n)
    base_cols = [
        "session_id",
        "ts_hour",
        "search_term_normalized",
        "user_id_hashed",
        "content_id_hashed",
        "hour_of_day",
        "day_of_week",
    ]

    def hour_key_ns(ts: pd.Series) -> pd.Series:
        # int64 view of a datetime column is expressed in that column's own
        # resolution. Force nanoseconds so the session key and every
        # aggregate key are comparable regardless of how each file was read.
        return pd.to_datetime(ts, utc=True).astype("datetime64[ns, UTC]").astype("int64")

    def prepare_asof(table: pd.DataFrame) -> pd.DataFrame:
        # Rebuild the integer key from the table's own timestamp column, drop
        # the timestamp (the session frame already carries one), and order by
        # the key as merge_asof requires.
        if "ts_hour" in table.columns:
            table = table.assign(ts_hour_ns=hour_key_ns(table["ts_hour"])).drop(columns=["ts_hour"])
        return table.sort_values("ts_hour_ns", kind="stable")

    _print(f"Computing time-aware aggregates… (fast={fast})")
    content_static = compute_content_static(root, fast)
    user_meta = compute_user_metadata(root, fast)
    # (group key(s), prepared right table) in the order they are joined.
    # Prepared once here so train and test reuse the same sorted tables.
    asof_sources = [
        ("content_id_hashed", prepare_asof(compute_content_price_timeaware(root, fast))),
        ("content_id_hashed", prepare_asof(compute_content_sitewide_timeaware(root, fast))),
        ("content_id_hashed", prepare_asof(compute_content_search_timeaware(root, fast))),
        (["content_id_hashed", "search_term_normalized"], prepare_asof(compute_term_content_timeaware(root, fast))),
        ("user_id_hashed", prepare_asof(compute_user_sitewide_timeaware(root, fast))),
        ("user_id_hashed", prepare_asof(compute_user_search_timeaware(root, fast))),
        ("search_term_normalized", prepare_asof(compute_term_search_timeaware(root, fast))),
    ]

    def join_all(df: pd.DataFrame) -> pd.DataFrame:
        # Keep only clicked/ordered as targets in train; never include added_to_cart/fav
        keep = base_cols + (["clicked", "ordered"] if "clicked" in df.columns else [])
        X = df[keep].copy()
        # Internal numeric hour key for the as-of merges (not written to output)
        X["ts_hour_ns"] = hour_key_ns(X["ts_hour"])
        # Regular equality joins (static tables)
        X = X.merge(content_static, on="content_id_hashed", how="left")
        X = X.merge(user_meta, on="user_id_hashed", how="left")
        # Backward as-of joins: each row receives the latest aggregate row
        # whose hour is <= the session hour. merge_asof keeps the left order,
        # so one sort up front is enough for the whole chain.
        X = X.sort_values("ts_hour_ns", kind="stable")
        for by_cols, right in asof_sources:
            X = pd.merge_asof(X, right, by=by_cols, on="ts_hour_ns", direction="backward")
        # Fill NA: zeros for numeric columns, "UNKNOWN" for text columns
        num_cols = X.select_dtypes(include=["float64", "float32", "int64", "int32"]).columns.tolist()
        cat_cols = X.select_dtypes(include=["object", "string"]).columns.tolist()
        X[num_cols] = X[num_cols].fillna(0)
        for c in cat_cols:
            X[c] = X[c].astype("string").fillna("UNKNOWN")
        # Drop internal merge key from final feature set
        if "ts_hour_ns" in X.columns:
            X = X.drop(columns=["ts_hour_ns"])
        return X

    _print("Joining train…")
    train_X = join_all(train_df)
    _print(f"Train shape: {train_X.shape}")

    _print("Joining test…")
    test_X = join_all(test_df)
    _print(f"Test shape: {test_X.shape}")

    # Only these string-typed columns are reported as categorical features.
    categorical_names = {"user_id_hashed", "content_id_hashed", "leaf_category_name", "user_gender", "search_term_normalized"}
    cat_features: List[str] = [
        str(c)
        for c, dt in train_X.dtypes.items()
        if str(dt).startswith("string") and c in categorical_names
    ]

    train_out = outdir / "train_data_v2.parquet"
    test_out = outdir / "test_data_v2.parquet"
    _print(f"Writing: {train_out}")
    train_X.to_parquet(train_out, index=False)
    _print(f"Writing: {test_out}")
    test_X.to_parquet(test_out, index=False)

    (outdir / "cat_features.json").write_text(json.dumps(cat_features, ensure_ascii=False, indent=2), encoding="utf-8")
    _print(f"Saved cat_features: {cat_features}")
    return train_out, test_out, cat_features


def parse_args(argv: Optional[List[str]] = None) -> argparse.Namespace:
    """Parse the command-line options of the dataset builder.

    Args:
        argv: Argument list; ``None`` makes argparse read ``sys.argv``.

    Returns:
        Namespace with ``outdir`` (str), ``debug_n`` (int or None) and
        ``fast`` (bool).
    """
    default_outdir = str(Path("CatBoost").resolve())
    parser = argparse.ArgumentParser(description="Build enriched datasets from sessions and auxiliary logs.")
    parser.add_argument(
        "--outdir",
        type=str,
        default=default_outdir,
        help="Output directory for enriched datasets",
    )
    parser.add_argument(
        "--debug-n",
        type=int,
        default=None,
        help="Limit to approximately N rows (keeps whole sessions)",
    )
    parser.add_argument(
        "--fast",
        action="store_true",
        help="Use truncated aggregates for a quick dry run",
    )
    return parser.parse_args(argv)


def main(argv: Optional[List[str]] = None) -> None:
    """Entry point: parse options and build the enriched train/test files.

    Args:
        argv: Argument list forwarded to ``parse_args``; ``None`` reads
            ``sys.argv``.
    """
    args = parse_args(argv)
    root = Path("C:/Projects/trendyol")
    outdir = Path(args.outdir)
    _print(f"Root: {root}")
    _print(f"Outdir: {outdir}")
    # Run the build itself; previously the options were parsed and "Done."
    # was printed without any dataset being produced.
    build_enriched(root, outdir, debug_n=args.debug_n, fast=args.fast)
    _print("Done.")


if __name__ == "__main__":
    main()


**Step 2 : Create Session Stats, Price Normalization, User-Term Affinity**

In [None]:
# Step 2
from pathlib import Path
import numpy as np
import pandas as pd
import polars as pl

from typing import Tuple

def time_split(df_pl: pl.DataFrame, val_frac: float = 0.15) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Split a frame chronologically into (earlier, later) pandas frames.

    ``ts_hour`` is parsed to a datetime when it is not one already, rows are
    sorted by it, and the last ``val_frac`` share of rows forms the second
    frame. The cut is made by row position.
    """
    needs_parse = "ts_hour" in df_pl.columns and df_pl["ts_hour"].dtype != pl.Datetime
    if needs_parse:
        df_pl = df_pl.with_columns(pl.col("ts_hour").str.to_datetime())

    ordered = df_pl.sort("ts_hour")
    cut = int(len(ordered) * (1 - val_frac))
    earlier, later = ordered[:cut], ordered[cut:]
    return earlier.to_pandas(), later.to_pandas()

def _print(msg: str):
    """Write *msg* to stdout behind this step's fixed log tag."""
    tag = "[v2_plus]"
    print(f"{tag} {msg}")


# Minimal, safe enrichments: session stats, price-by-category z, time-decayed proxies
# Mirrors selected blocks from auto_feature_search_v2.py (sanitized)

def fb_session_pos_extras(df: pd.DataFrame) -> pd.DataFrame:
    """Add per-session price statistics and an item count, in place.

    When ``session_id`` and ``selling_price`` exist, adds ``sess_price_mean``,
    ``sess_price_std`` (NaN replaced by 0) and ``sess_price_z`` (0 where the
    std is 0 or the value is not finite). ``sess_n_items`` is added when it
    is not already present. Only label-free session aggregates are used; no
    positional flags are derived from row order.

    Returns:
        The same frame object that was passed in.
    """
    has_session = "session_id" in df

    if has_session and "selling_price" in df:
        price_by_session = df.groupby("session_id")["selling_price"]
        df["sess_price_mean"] = price_by_session.transform("mean")
        df["sess_price_std"] = price_by_session.transform("std").fillna(0.0)

        price = pd.to_numeric(df["selling_price"], errors="coerce").fillna(0.0)
        spread = df["sess_price_std"].replace(0, np.nan)  # zero spread -> NaN, not inf
        df["sess_price_z"] = (price - df["sess_price_mean"]) / spread
        df["sess_price_z"] = df["sess_price_z"].replace([np.inf, -np.inf], 0.0).fillna(0.0)

    # Row count per session, only if an earlier step has not provided it.
    if has_session and "sess_n_items" not in df:
        df["sess_n_items"] = df.groupby("session_id")["content_id_hashed"].transform("size")

    return df


def fb_price_norm_by_category_fit(train_df: pd.DataFrame):
    """Fit per-category price mean and standard deviation.

    Returns:
        ``{"cat_price_stats": frame}`` where the frame is indexed by
        ``leaf_category_name`` with columns ``mu`` and ``sigma``; an empty
        dict when either required column is absent.
    """
    if "leaf_category_name" not in train_df or "selling_price" not in train_df:
        return {}
    price_by_category = train_df.groupby("leaf_category_name")["selling_price"]
    return {"cat_price_stats": price_by_category.agg(mu="mean", sigma="std")}


def fb_price_norm_by_category_transform(df: pd.DataFrame, state):
    """Add ``price_z_cat``: the price z-score within its leaf category.

    Uses the ``cat_price_stats`` table from *state* (columns ``mu`` and
    ``sigma``, indexed by category). Rows with an unseen category, a zero or
    missing sigma, or a non-finite result get 0. The frame is returned
    unchanged when the stats or the required columns are missing.
    """
    usable = "cat_price_stats" in state and "leaf_category_name" in df and "selling_price" in df
    if not usable:
        return df

    merged = df.merge(state["cat_price_stats"], left_on="leaf_category_name", right_index=True, how="left")
    spread = merged["sigma"].replace(0, np.nan)  # avoid dividing by a zero std
    z_score = (merged["selling_price"] - merged["mu"]) / spread
    merged["price_z_cat"] = z_score
    merged["price_z_cat"] = merged["price_z_cat"].replace([np.inf, -np.inf], np.nan).fillna(0.0)
    # The helper columns were only needed to compute the score.
    merged.drop(columns=["mu", "sigma"], inplace=True)
    return merged


def fb_time_decayed_pop(df: pd.DataFrame) -> pd.DataFrame:
    """Add ratio and transform features derived from existing rate columns.

    Each feature is added only when its source columns are present. Sources
    are coerced to numeric with missing values treated as 0, and ratios add
    1e-6 to both sides to avoid division by zero. The frame is modified in
    place and returned.
    """
    eps = 1e-6

    def numeric(name: str) -> pd.Series:
        # Numeric view of a column with unparseable/missing values as 0.
        return pd.to_numeric(df[name], errors="coerce").fillna(0.0)

    def smoothed_ratio(top: str, bottom: str) -> pd.Series:
        return (numeric(top) + eps) / (numeric(bottom) + eps)

    has_content_ctr = "content_search_ctr" in df

    if "content_sitewide_avg_click_rate" in df and "content_sitewide_avg_order_rate" in df:
        df["content_pop_ratio_click_order"] = smoothed_ratio(
            "content_sitewide_avg_click_rate", "content_sitewide_avg_order_rate"
        )

    if has_content_ctr:
        ctr = numeric("content_search_ctr").clip(lower=0)
        df["content_search_ctr_sqrt"] = ctr.pow(0.5)
        df["content_search_ctr_log1p"] = np.log1p(ctr)

    if "term_content_search_ctr" in df and has_content_ctr:
        df["term_over_content_ctr"] = smoothed_ratio("term_content_search_ctr", "content_search_ctr")

    if "user_search_ctr" in df and has_content_ctr:
        df["user_over_content_search_ctr"] = smoothed_ratio("user_search_ctr", "content_search_ctr")

    return df


def add_user_term_affinity(tr: pd.DataFrame, te: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Attach (user, search term) statistics fitted on *tr* to both frames.

    Adds ``user_term_freq`` (row count), ``user_term_unique_content``
    (distinct contents) and, when *tr* has ``selling_price``,
    ``user_term_avg_price``. Pairs unseen in *tr* get 0 for the counts and
    the train median for the average price.

    Note: the key columns of the *input* frames are cast to ``str`` in place.

    Raises:
        KeyError: If a key column is missing from either frame.
    """
    keys = ["user_id_hashed", "search_term_normalized"]

    absent = next((k for k in keys if k not in tr.columns or k not in te.columns), None)
    if absent is not None:
        raise KeyError(f"Missing column '{absent}' in train/test for user-term affinity")

    # Plain-string keys on both sides so the merges below match.
    for key in keys:
        tr[key] = tr[key].astype(str)
        te[key] = te[key].astype(str)

    with_price = "selling_price" in tr.columns
    fit_cols = [*keys, "content_id_hashed"]
    if with_price:
        fit_cols.append("selling_price")

    per_pair = tr[fit_cols].groupby(keys)  # statistics come from train only
    stats = pd.DataFrame(
        {
            "user_term_freq": per_pair.size(),
            "user_term_unique_content": per_pair["content_id_hashed"].nunique(),
        }
    ).reset_index()
    if with_price:
        avg_price = per_pair["selling_price"].mean().reset_index()
        avg_price = avg_price.rename(columns={"selling_price": "user_term_avg_price"})
        stats = stats.merge(avg_price, on=keys, how="left")

    tr2 = tr.merge(stats, on=keys, how="left")
    te2 = te.merge(stats, on=keys, how="left")

    def as_count(values: pd.Series) -> pd.Series:
        # Unmatched pairs count as zero occurrences.
        return pd.to_numeric(values, errors="coerce").fillna(0).astype(np.int32)

    for name in ("user_term_freq", "user_term_unique_content"):
        if name in tr2.columns:
            tr2[name] = as_count(tr2[name])
            te2[name] = as_count(te2[name])

    if "user_term_avg_price" in tr2.columns:
        train_median = pd.to_numeric(tr2["user_term_avg_price"], errors="coerce").median()
        tr2["user_term_avg_price"] = pd.to_numeric(tr2["user_term_avg_price"], errors="coerce").fillna(train_median)
        te2["user_term_avg_price"] = pd.to_numeric(te2["user_term_avg_price"], errors="coerce").fillna(train_median)

    return tr2, te2


def main():
    """Build the v3 train/test files from the v2 files.

    Steps: load v2, fit category price statistics on the earlier part of a
    chronological train split and apply them to every part, add the session
    extras, add user-term affinity fitted on train, and write the results.
    """
    root = Path("C:/Projects/trendyol/data")
    train_p = root / "train_data_v2.parquet"
    test_p = root / "test_data_v2.parquet"

    out_train = root / "train_data_v3.parquet"
    out_test = root / "test_data_v3.parquet"

    _print("Loading v2…")
    tr = pl.read_parquet(str(train_p))
    te = pl.read_parquet(str(test_p)).to_pandas()

    tr_tr, tr_val = time_split(tr)

    _print("Fitting category price stats on train and transforming train/test…")
    state = fb_price_norm_by_category_fit(tr_tr)

    tr_tr = fb_price_norm_by_category_transform(tr_tr, state)
    tr_val = fb_price_norm_by_category_transform(tr_val, state)
    te = fb_price_norm_by_category_transform(te, state)

    _print("Adding session and time-decayed extras…")
    # Re-assemble the full train set first and compute the session statistics
    # on it directly. The split is positional, so a session can straddle the
    # cut; computing per part would give it two different sets of statistics,
    # and copying statistics over from a separately ordered frame by row
    # position is only correct if both frames share the exact same row order,
    # which the sort inside time_split does not promise for equal timestamps.
    tr = pd.concat([tr_tr, tr_val], ignore_index=True)
    tr = fb_session_pos_extras(tr)
    te = fb_session_pos_extras(te)

    _print("Adding user–term affinity (train-fit → train/test-merge)…")
    tr, te = add_user_term_affinity(tr, te)

    _print("Writing v2_plus (no reranker)…")
    tr.to_parquet(out_train, index=False)
    te.to_parquet(out_test, index=False)
    _print("Done.")


if __name__ == "__main__":
    main()


**Step 3: Product Text and Reranker Score using Qwen3**

In [None]:
# Step 3
from pathlib import Path
from typing import Optional, List

import numpy as np
import pandas as pd
import polars as pl
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification


def _print(msg: str):
    """Write *msg* to stdout behind this step's fixed log tag."""
    tag = "[v2_plus_rerank]"
    print(f"{tag} {msg}")


def choose_model(device: torch.device, prefer_qwen: bool = True):
    """Pick the reranker checkpoint name for *device*.

    The Qwen reranker is returned only on CUDA with ``prefer_qwen`` set;
    otherwise the MiniLM cross-encoder is returned.

    NOTE(review): the caller loads the result with
    ``AutoModelForSequenceClassification``; confirm the Qwen checkpoint
    ships weights for a sequence-classification head.
    """
    by_choice = {
        True: "Qwen/Qwen3-Reranker-0.6B",
        False: "cross-encoder/ms-marco-MiniLM-L-6-v2",
    }
    use_qwen = bool(device.type == "cuda" and prefer_qwen)
    return by_choice[use_qwen]


def build_product_text(df: pd.DataFrame) -> pd.Series:
    """Compose one text string per row from the available descriptive columns.

    The columns ``leaf_category_name``, ``cv_tags``, ``content_name`` and
    ``brand_name`` that exist in *df* are joined, in that order, with single
    spaces. Missing values contribute nothing, so they never appear as
    literal "nan"/"None" and leave no stray separators. When none of those
    columns exist the content id is used, or an empty string if that is
    missing too.

    Returns:
        A string Series aligned with ``df.index``.
    """
    def as_text(col: pd.Series) -> pd.Series:
        # Blank out missing values *before* the string cast; casting first
        # would turn them into the words "nan"/"None"/"<NA>".
        return col.astype(object).where(col.notna(), "").astype(str)

    parts: List[pd.Series] = [
        as_text(df[c])
        for c in ["leaf_category_name", "cv_tags", "content_name", "brand_name"]
        if c in df.columns
    ]
    if not parts:
        if "content_id_hashed" in df.columns:
            return as_text(df["content_id_hashed"])
        # Keep the frame's own index so the result can be assigned back to it.
        return pd.Series("", index=df.index, dtype=object)

    out = parts[0]
    for s in parts[1:]:
        # Insert a separator only between two non-empty pieces.
        sep = ((out != "") & (s != "")).map({True: " ", False: ""})
        out = out + sep + s
    return out


def score_batch(tokenizer, model, queries: List[str], docs: List[str], device: torch.device, max_len: int = 512):
    """Score (query, document) pairs and return sigmoid scores as a numpy array.

    The pairs are tokenized together with padding and truncation to
    ``max_len``, moved to *device* and run without gradient tracking. From
    2-D logits the single column is used when there is exactly one,
    otherwise the last column; any other shape is flattened.
    """
    batch = tokenizer(queries, docs, padding=True, truncation=True, max_length=max_len, return_tensors="pt")
    on_device = {name: tensor.to(device) for name, tensor in batch.items()}

    with torch.no_grad():
        logits = model(**on_device).logits
        if logits.ndim != 2:
            raw = logits.view(-1)
        elif logits.shape[1] == 1:
            raw = logits[:, 0]
        else:
            raw = logits[:, -1]
        scores = torch.sigmoid(raw).detach().float().cpu().numpy()
    return scores


def add_reranker_score(df: pd.DataFrame, tokenizer, model, device: torch.device, batch_size: int = 64, max_rows: Optional[int] = None) -> np.ndarray:
    """Score every row's (search term, product text) pair with the reranker.

    Args:
        df: Frame with a ``search_term_normalized`` column; it is not
            modified.
        tokenizer: Tokenizer forwarded to ``score_batch``.
        model: Model forwarded to ``score_batch``.
        device: Device forwarded to ``score_batch``.
        batch_size: Number of pairs scored per model call.
        max_rows: When set, only the first ``max_rows`` rows are scored.

    Returns:
        float32 array with one score per scored row, in row order.

    Raises:
        KeyError: If ``search_term_normalized`` is missing.
    """
    if "search_term_normalized" not in df.columns:
        raise KeyError("search_term_normalized column missing")
    if max_rows is not None:
        df = df.head(max_rows)

    # Build the query strings on a separate Series so the caller's frame is
    # left untouched. Missing terms are blanked before the string cast;
    # casting first would turn them into the literal text "nan"/"None".
    terms = df["search_term_normalized"]
    queries = terms.astype(object).where(terms.notna(), "").astype(str)
    prod_text = build_product_text(df)

    n = len(df)
    scores = np.zeros(n, dtype=np.float32)
    for start in range(0, n, batch_size):
        stop = min(start + batch_size, n)
        q = queries.iloc[start:stop].tolist()
        d = prod_text.iloc[start:stop].tolist()
        scores[start:stop] = score_batch(tokenizer, model, q, d, device)
        # Progress line every 20th batch.
        if (start // batch_size) % 20 == 0:
            _print(f"scored {stop}/{n}")
    return scores


def main():
    """Add ``product_text`` and ``reranker_score`` to the v3 files, writing v4.

    Picks CUDA when available, loads the tokenizer and model chosen by
    ``choose_model``, scores train then test, and writes both frames.
    """
    data_dir = Path("C:/Projects/trendyol/data")
    sources = (data_dir / "train_data_v3.parquet", data_dir / "test_data_v3.parquet")
    targets = (data_dir / "train_data_v4.parquet", data_dir / "test_data_v4.parquet")

    _print("Loading v2_plus…")
    tr, te = (pl.read_parquet(str(path)).to_pandas() for path in sources)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model_name = choose_model(device, prefer_qwen=True)
    _print(f"Device={device.type}, model={model_name}")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name).to(device)
    model.eval()

    # Persist product_text into the datasets before scoring
    for frame in (tr, te):
        frame["product_text"] = build_product_text(frame)

    # Both score arrays are computed before either is attached to its frame.
    _print("Scoring train…")
    train_scores = add_reranker_score(tr, tokenizer, model, device, batch_size=64)
    _print("Scoring test…")
    test_scores = add_reranker_score(te, tokenizer, model, device, batch_size=64)
    tr["reranker_score"] = train_scores
    te["reranker_score"] = test_scores

    _print("Writing back v2_plus with reranker_score…")
    for frame, target in zip((tr, te), targets):
        frame.to_parquet(target, index=False)
    _print("Done.")


if __name__ == "__main__":
    main()


**Final Step : Product and user meta features, user-product interaction feature**

In [None]:
# Final Step

import pandas as pd
import numpy as np
from pathlib import Path
import os
import warnings

warnings.filterwarnings("ignore", category=FutureWarning)

print("Starting all feature engineering steps...")
input_path = Path("C:/Projects/trendyol/data")

# --- 1. Load Raw Data ---
# Every file is resolved against input_path so the script does not depend on
# the current working directory.
print("[1/5] Reading raw data files...")
try:
    df_train_sessions = pd.read_parquet(input_path / 'train_data_v4.parquet')
    df_test_sessions = pd.read_parquet(input_path / 'test_data_v4.parquet')

    df_content_meta = pd.read_parquet(input_path / 'content' / 'metadata.parquet')
    df_price = pd.read_parquet(input_path / 'content' / 'price_rate_review_data.parquet')
    df_user_meta = pd.read_parquet(input_path / 'user' / 'metadata.parquet')

    # Needed interaction log file for new feature
    df_interaction_logs = pd.read_parquet(input_path / 'user' / 'fashion_sitewide_log.parquet')
except FileNotFoundError as e:
    print(f"ERROR: File not found! Please check the file path: {e}")
    # Stop with a non-zero status so callers can tell the run failed.
    raise SystemExit(1)

# --- 2. Build Product & User Meta Features ---
print("[2/5] Creating product age, discount percentage and user age features...")

# Determine the 'present time' across datasets: the latest session hour.
# Parsed as UTC so it can be subtracted from the creation dates below whether
# those are stored tz-aware or naive.
present_time = pd.to_datetime(
    pd.concat([df_train_sessions['ts_hour'], df_test_sessions['ts_hour']]), utc=True
).max()

# Compute product age in days
# NOTE(review): assumes content_creation_date holds datetimes or parseable
# date strings; confirm against the metadata schema.
creation_time = pd.to_datetime(df_content_meta['content_creation_date'], utc=True)
df_content_meta['product_age_days'] = (present_time - creation_time).dt.days

# Compute discount percentage (guard against original_price < selling_price)
df_price['original_price'] = np.where(df_price['original_price'] < df_price['selling_price'], df_price['selling_price'], df_price['original_price'])
df_price['discount_percentage'] = np.where(
    df_price['original_price'] > 0,
    (df_price['original_price'] - df_price['selling_price']) / df_price['original_price'],
    0
) * 100
# Keep the most recent price record per content
# NOTE(review): this is the latest record overall, not the latest one known
# at each session's hour; confirm that is acceptable for training rows.
df_price = df_price.sort_values('update_date', ascending=False).drop_duplicates('content_id_hashed')

# Compute user age
present_year = present_time.year
df_user_meta['user_age'] = present_year - df_user_meta['user_birth_year']

# Prepare feature subsets for merging. Each lookup table is reduced to one
# row per key so the left merges below cannot multiply session rows.
df_product_age = df_content_meta[['content_id_hashed', 'product_age_days']].drop_duplicates('content_id_hashed')
df_discount = df_price[['content_id_hashed', 'discount_percentage']]
df_user_age = df_user_meta[['user_id_hashed', 'user_age']].drop_duplicates('user_id_hashed')

# --- 3. Merge Meta Features Into Main Datasets ---
print("[3/5] Merging meta features into main datasets...")

def enrich_with_metadata(df, df_product_age, df_discount, df_user_age):
    """Left-join product age, discount and user age onto *df*.

    Args:
        df: Session rows with ``content_id_hashed`` and ``user_id_hashed``.
        df_product_age: ``content_id_hashed`` -> ``product_age_days``.
        df_discount: ``content_id_hashed`` -> ``discount_percentage``.
        df_user_age: ``user_id_hashed`` -> ``user_age``.

    Returns:
        A new frame with the three columns added. Unmatched rows get 0 for
        product age and discount, and -1 for user age.
    """
    df_enriched = pd.merge(df, df_product_age, on='content_id_hashed', how='left')
    df_enriched = pd.merge(df_enriched, df_discount, on='content_id_hashed', how='left')
    df_enriched = pd.merge(df_enriched, df_user_age, on='user_id_hashed', how='left')

    # Fill NaNs through the frame itself. An in-place fillna on a selected
    # column does not write back to the frame when pandas copy-on-write is on.
    df_enriched = df_enriched.fillna(
        {'product_age_days': 0, 'discount_percentage': 0, 'user_age': -1}
    )
    return df_enriched

# Attach the same three lookup tables to the train split, then the test split.
train_df_v2, test_df_v2 = (
    enrich_with_metadata(split, df_product_age, df_discount, df_user_age)
    for split in (df_train_sessions, df_test_sessions)
)
print("Meta features added.")


# --- 4. Safely Add Historical User-Content Interaction Features ---
print("[4/5] Adding historical user-content interaction features (merge_asof)...")

def add_user_content_interaction_features(df_sessions, df_interaction_logs):
    """Add ``user_content_past_click_count`` via a backward as-of join.

    For each session row the value is the running number of log rows with
    ``total_click > 0`` for the same (user, content) pair, taken from the
    latest log row whose ``ts_hour`` is not after the session's ``ts_hour``.
    Rows without such a log row get 0.

    Both timestamp columns are converted to UTC at nanosecond resolution so
    the join keys always share one dtype; the returned ``ts_hour`` is
    therefore tz-aware UTC. The result is ordered by ``ts_hour`` and the
    inputs are not modified.

    NOTE(review): a log row in the same hour as the session is included
    (merge_asof matches equal keys by default); confirm that is intended for
    a "past" count.
    """
    def utc_ns(values):
        # One tz and one resolution for both sides of the as-of join.
        return pd.to_datetime(values, utc=True).astype("datetime64[ns, UTC]")

    logs = df_interaction_logs[['user_id_hashed', 'content_id_hashed', 'ts_hour', 'total_click']].copy()
    sessions = df_sessions.copy()

    logs['ts_hour'] = utc_ns(logs['ts_hour'])
    sessions['ts_hour'] = utc_ns(sessions['ts_hour'])

    # Time order is required both for the running count and for merge_asof.
    logs = logs.sort_values('ts_hour')
    # Binary click event: 1 if total_click > 0 else 0
    logs['click_event'] = (logs['total_click'] > 0).astype(int)
    logs['user_content_past_click_count'] = logs.groupby(['user_id_hashed', 'content_id_hashed'])['click_event'].cumsum()

    sessions = sessions.sort_values('ts_hour')

    df_enriched = pd.merge_asof(
        left=sessions,
        right=logs[['user_id_hashed', 'content_id_hashed', 'ts_hour', 'user_content_past_click_count']],
        on='ts_hour',
        by=['user_id_hashed', 'content_id_hashed'],
        direction='backward'
    )

    # Assign the filled column back; an in-place fillna on a selected column
    # does not update the frame when pandas copy-on-write is on.
    df_enriched['user_content_past_click_count'] = df_enriched['user_content_past_click_count'].fillna(0)
    return df_enriched

# Apply interaction feature augmentation
train_df_v3 = add_user_content_interaction_features(train_df_v2, df_interaction_logs)
test_df_v3 = add_user_content_interaction_features(test_df_v2, df_interaction_logs)
print("Historical interaction features added.")


# --- 5. Save Results ---
print("[5/5] Saving final enriched datasets...")

# The v5 files are written next to the v4 inputs.
output_path = input_path
train_output_path = output_path / 'train_data_v5.parquet'
test_output_path = output_path / 'test_data_v5.parquet'

train_df_v3.to_parquet(train_output_path, index=False)
test_df_v3.to_parquet(test_output_path, index=False)

print("\nProcess completed! 🚀")
print(f"Train data saved: {train_output_path}")
print(f"Test data saved: {test_output_path}")

print("\nFinal state of train set (first 5 rows):")
print(train_df_v3.head())
print("\nTrain set info:")
# DataFrame.info() prints its report itself and returns None, so it is called
# directly instead of being wrapped in print().
train_df_v3.info()
