# Next-Purchase Prediction (Ranking) 

**Purpose.** Predict a user’s **next service** and rank the top candidates at each anchor time *t* (“what will they buy next?”).

**This notebook does:**
- Builds a **fold-aware ranking dataset** from purchases (UTC).
- Trains **LightGBM LambdaRank** on time-based splits.
- Evaluates on a hold-out month with **Top-K** metrics (Top-1/Top-2/Hit@3/MRR@3/NDCG@3).
- Exports feature importance, per-segment diagnostics, and hold-out predictions.
> ℹ️ **Pandas warning:** `infer_datetime_format` is deprecated; the notebook already parses timestamps safely without it. You can remove that argument if you still see a local warning.

---

In [None]:
# Config
from pathlib import Path

DATA_DIR = Path('/Users/tree/.Trash/tubitakaiagentprojeleriiinverisetleri')             # folder with Purchase.csv
PURCHASE_FILE = 'Purchase.csv'   # v3 purchase file
TIME_COL = 'ordercreatedtime'    # fallback to 'event_time' if missing
USER_ID_CANDS = ['ownerid','user_id','id']
SERVICETYPE_CANDS = ['serviceType','servicetype','service_type']

# Burst de-dup (lowered as requested)
BURST_WINDOW_MINUTES = 30

# Horizon classification targets
HORIZONS_DAYS = [7, 14, 30]

# Negative sampling for ranking
N_NEG = 10
N_NEG_UNIFORM = 7
N_NEG_POP = 3

# Recency windows for features (days)
RECENCY_WINDOWS = [7, 30, 90]

# Recency half-life for Markov prior (days)
HALFLIFE_DAYS = 90


In [None]:
# === Imports & helpers ===
import pandas as pd, numpy as np
from pathlib import Path
from collections import defaultdict, Counter

def coalesce_id(df, candidates):
    for c in candidates:
        if c in df.columns:
            return df[c]
    return pd.Series([np.nan]*len(df))

def parse_time_utc(s, local_tz='Europe/Istanbul'):
    ts = pd.to_datetime(s, errors='coerce', utc=True, infer_datetime_format=True)
    if getattr(ts.dt, 'tz', None) is None:
        ts = pd.to_datetime(s, errors='coerce')
        ts = ts.dt.tz_localize(local_tz, ambiguous='NaT', nonexistent='NaT').dt.tz_convert('UTC')
    return ts

def norm_service(series: pd.Series) -> pd.Series:
    out = series.astype(str).str.strip().str.lower()
    out = out.replace({'nan': np.nan, 'none': np.nan, 'nat': np.nan, '': np.nan,
                       'walk & care': 'walkandcare', 'walk_and_care': 'walkandcare'})
    return out

def dedup_bursts(df, window_min=30):
    df = df.copy()
    gap = (df['purchase_time'] - df.groupby('user_id')['purchase_time'].shift(1)).dt.total_seconds()/60.0
    new_burst = (gap.isna()) | (gap > window_min)
    burst_id = new_burst.groupby(df['user_id']).cumsum()
    first = df.groupby(['user_id', burst_id], as_index=False).first()
    first = first.rename(columns={'purchase_time':'purchase_time'})
    first['burst_size'] = df.groupby(['user_id', burst_id]).size().values
    for col in ['amount','total','price']:
        if col in df.columns:
            first[col] = df.groupby(['user_id', burst_id])[col].sum().values
    return first.reset_index(drop=True)

def recency_weight(age_days, halflife_days):
    return np.power(0.5, np.clip(age_days, 0, None) / max(halflife_days, 1e-9))


In [4]:
# === Load & normalize purchases ===
p = pd.read_csv(DATA_DIR / PURCHASE_FILE)

p['user_id'] = coalesce_id(p, USER_ID_CANDS).astype(str).replace({'nan': np.nan})
if TIME_COL not in p.columns:
    alt = 'event_time' if 'event_time' in p.columns else None
    if not alt: raise KeyError(f"Time col '{TIME_COL}' not found and no 'event_time' fallback present.")
    p['purchase_time'] = parse_time_utc(p[alt])
else:
    p['purchase_time'] = parse_time_utc(p[TIME_COL])

st_col = None
for c in SERVICETYPE_CANDS:
    if c in p.columns:
        st_col = c; break
p['serviceType'] = norm_service(p[st_col]) if st_col else np.nan

CTX_COLS = [c for c in ['petType','platform','city','discount','promo','amount','total','price'] if c in p.columns]

p = p.dropna(subset=['user_id','purchase_time']).copy()
p['user_id'] = p['user_id'].astype(str)

p = p.sort_values(['user_id','purchase_time']).reset_index(drop=True)
p = dedup_bursts(p, window_min=BURST_WINDOW_MINUTES)

print('Rows after dedup:', len(p), '| Users:', p['user_id'].nunique(),
      '| Time:', p['purchase_time'].min(), '→', p['purchase_time'].max())

Rows after dedup: 12810 | Users: 1614 | Time: 2024-12-24 21:41:02.022000+00:00 → 2025-08-04 09:34:52.801000+00:00


  ts = pd.to_datetime(s, errors='coerce', utc=True, infer_datetime_format=True)


In [5]:
# === Build rolling time folds ===
max_ts = p['purchase_time'].max()
anchor = pd.Timestamp(year=max_ts.year, month=max_ts.month, day=1, tz='UTC') + pd.offsets.MonthBegin(1)
month_starts = [(anchor - pd.offsets.MonthBegin(k)).tz_convert('UTC') for k in range(4,0,-1)]

folds = []
if len(month_starts) >= 3:
    folds.append({'train_end': month_starts[1], 'val_start': month_starts[1], 'val_end': month_starts[2]})
    folds.append({'train_end': month_starts[2], 'val_start': month_starts[2], 'val_end': month_starts[3]})
holdout = {'train_end': month_starts[3], 'test_start': month_starts[3], 'test_end': anchor}

folds, holdout

([{'train_end': Timestamp('2025-06-01 00:00:00+0000', tz='UTC'),
   'val_start': Timestamp('2025-06-01 00:00:00+0000', tz='UTC'),
   'val_end': Timestamp('2025-07-01 00:00:00+0000', tz='UTC')},
  {'train_end': Timestamp('2025-07-01 00:00:00+0000', tz='UTC'),
   'val_start': Timestamp('2025-07-01 00:00:00+0000', tz='UTC'),
   'val_end': Timestamp('2025-08-01 00:00:00+0000', tz='UTC')}],
 {'train_end': Timestamp('2025-08-01 00:00:00+0000', tz='UTC'),
  'test_start': Timestamp('2025-08-01 00:00:00+0000', tz='UTC'),
  'test_end': Timestamp('2025-09-01 00:00:00+0000', tz='UTC')})

In [6]:
# === Utilities ===
def slice_time(df, start=None, end=None):
    m = df
    if start is not None:
        m = m[m['purchase_time'] >= start]
    if end is not None:
        m = m[m['purchase_time'] < end]
    return m.copy()

from collections import defaultdict
def build_markov_prior(train_df, halflife_days=90):
    d = train_df.copy()
    d['next_service'] = d.groupby('user_id')['serviceType'].shift(-1)
    d['this_time'] = d['purchase_time']
    d['next_time'] = d.groupby('user_id')['purchase_time'].shift(-1)
    d = d.dropna(subset=['serviceType','next_service','this_time','next_time'])
    train_max = d['this_time'].max()
    d['age_days'] = (train_max - d['this_time']).dt.total_seconds()/(24*3600)
    d['w'] = recency_weight(d['age_days'], halflife_days)
    rw = d.groupby(['serviceType','next_service'])['w'].sum().rename('w_count').reset_index()
    tot = rw.groupby('serviceType')['w_count'].transform('sum')
    rw['prob'] = np.where(tot>0, rw['w_count']/tot, np.nan)
    prior = defaultdict(dict)
    for _, r in rw.iterrows():
        prior[r['serviceType']][r['next_service']] = r['prob']
    return prior, rw

def service_popularity(train_df):
    return train_df['serviceType'].value_counts(normalize=True).to_dict(), train_df['serviceType'].value_counts().to_dict()

def build_feature_row(user_hist, t, candidate, current, ctx_row=None, windows=[7,30,90]):
    feats = {}
    if not user_hist.empty:
        last_time = user_hist['purchase_time'].max()
        feats['recency_days'] = (t - last_time).total_seconds()/(24*3600)
    else:
        feats['recency_days'] = np.nan
    for w in windows:
        feats[f'cnt_{w}d'] = (user_hist['purchase_time'] >= (t - pd.Timedelta(days=w))).sum()
    uh_cand = user_hist[user_hist['serviceType']==candidate]
    feats['cand_has_hist'] = int(len(uh_cand) > 0)
    feats['cand_recency_days'] = (t - uh_cand['purchase_time'].max()).total_seconds()/(24*3600) if len(uh_cand) else np.nan
    for w in windows:
        feats[f'cand_cnt_{w}d'] = (uh_cand['purchase_time'] >= (t - pd.Timedelta(days=w))).sum()
    feats['last_service'] = user_hist.sort_values('purchase_time').iloc[-1]['serviceType'] if not user_hist.empty else np.nan
    local = t.tz_convert('Europe/Istanbul')
    feats['dow'] = local.dayofweek; feats['hour'] = local.hour
    if ctx_row is not None:
        for c in ctx_row.index:
            feats[f'ctx_{c}'] = ctx_row[c]
    return feats


In [7]:
# === Dataset constructors ===
rng = np.random.default_rng(42)

def sample_negatives(all_services, positive, pop_probs, n_total=10, n_uniform=7, n_pop=3):
    pool = [s for s in all_services if s != positive]
    uni = rng.choice(pool, size=min(n_uniform, len(pool)), replace=False) if len(pool) else []
    if n_pop and len(pool):
        weights = np.array([pop_probs.get(s, 0.0) for s in pool], dtype=float)
        if weights.sum() == 0:
            pop = []
        else:
            weights = weights/weights.sum()
            pop = rng.choice(pool, size=min(n_pop, len(pool)), replace=False, p=weights)
    else:
        pop = []
    cands = list(dict.fromkeys(list(uni) + list(pop)))
    while len(cands) < n_total and len(cands) < len(pool):
        add = rng.choice([s for s in pool if s not in cands])
        cands.append(add)
    return cands

def build_ranking_dataset(train_df, val_df, markov_prior, all_services, pop_probs):
    rows = []
    by_user_train = {u: df.sort_values('purchase_time') for u, df in train_df.groupby('user_id')}
    # Build user sequences from combined train+val to get the "true next" in val period
    by_user_val = {u: df.sort_values('purchase_time') for u, df in val_df.groupby('user_id')}
    for u, vdf in by_user_val.items():
        tdf = by_user_train.get(u, pd.DataFrame(columns=train_df.columns))
        for idx, row in vdf.iterrows():
            t = row['purchase_time']; cur = row['serviceType']
            # user history strictly before t from TRAIN
            hist = tdf[tdf['purchase_time'] < t]
            # next in VAL for this user
            future = vdf[vdf['purchase_time'] > t]
            if future.empty: 
                continue
            y_pos = future.iloc[0]['serviceType']
            ctx_cols = [c for c in ['petType','platform','city','discount','promo'] if c in vdf.columns]
            ctx_vals = row[ctx_cols] if ctx_cols else None
            feats_pos = build_feature_row(hist, t, y_pos, cur, ctx_vals)
            feats_pos['markov_prior'] = markov_prior.get(cur, {}).get(y_pos, np.nan)
            rows.append({'user_id': u, 'time': t, 'current': cur, 'candidate': y_pos, 'label': 1, **feats_pos})
            negs = sample_negatives(all_services, y_pos, pop_probs)
            for neg in negs:
                feats_neg = build_feature_row(hist, t, neg, cur, ctx_vals)
                feats_neg['markov_prior'] = markov_prior.get(cur, {}).get(neg, np.nan)
                rows.append({'user_id': u, 'time': t, 'current': cur, 'candidate': neg, 'label': 0, **feats_neg})
    return pd.DataFrame(rows)

def build_horizon_dataset(train_df, val_df, all_services, horizons=[7,14,30]):
    rows = []
    by_user = {u: df.sort_values('purchase_time') for u, df in pd.concat([train_df, val_df]).groupby('user_id')}
    for u, vdf in val_df.groupby('user_id'):
        user_all = by_user.get(u, pd.DataFrame(columns=train_df.columns))
        vdf = vdf.sort_values('purchase_time')
        for _, row in vdf.iterrows():
            t = row['purchase_time']; cur = row['serviceType']
            hist = user_all[user_all['purchase_time'] < t]
            future = user_all[user_all['purchase_time'] > t][['serviceType', 'purchase_time']]
            future_first = future.groupby('serviceType', as_index=True)['purchase_time'].min()
            ctx_cols = [c for c in ['petType','platform','city','discount','promo'] if c in vdf.columns]
            ctx_vals = row[ctx_cols] if ctx_cols else None
            for cand in all_services:
                feats = build_feature_row(hist, t, cand, cur, ctx_vals)
                ft = future_first.get(cand, pd.NaT)
                for T in horizons:
                    cutoff = t + pd.Timedelta(days=int(T))
                    label = int(pd.notna(ft) and (ft <= cutoff))
                    rows.append({
                        'user_id': u,
                        'time': t,
                        'current': cur,
                        'candidate': cand,
                        'T_days': T,
                        'label': label,
                    })
    return pd.DataFrame(rows)

In [8]:
# === Execute folds and write CSVs ===
from pathlib import Path
OUT_DIR = Path('datasets_v4'); OUT_DIR.mkdir(exist_ok=True)
all_services = sorted(p['serviceType'].dropna().unique().tolist())

for i, f in enumerate(folds, start=1):
    train = slice_time(p, end=f['train_end'])
    val   = slice_time(p, start=f['val_start'], end=f['val_end'])
    markov_prior, rw = build_markov_prior(train, halflife_days=HALFLIFE_DAYS)
    pop_probs, pop_counts = service_popularity(train)
    rank_df = build_ranking_dataset(train, val, markov_prior, all_services, pop_probs)
    horiz_df = build_horizon_dataset(train, val, all_services, horizons=HORIZONS_DAYS)
    rank_df.to_csv(OUT_DIR / f'fold{i}_ranking.csv', index=False)
    horiz_df.to_csv(OUT_DIR / f'fold{i}_horizon.csv', index=False)
    rw.to_csv(OUT_DIR / f'fold{i}_markov_prior.csv', index=False)
    print(f'Fold {i}: rank_rows={len(rank_df):,} | horiz_rows={len(horiz_df):,} | services={len(all_services)}')

# Holdout
train_all = slice_time(p, end=holdout['train_end'])
test      = slice_time(p, start=holdout['test_start'], end=holdout['test_end'])
markov_prior_H, rw_H = build_markov_prior(train_all, halflife_days=HALFLIFE_DAYS)
pop_probs_H, _ = service_popularity(train_all)
rank_test  = build_ranking_dataset(train_all, test, markov_prior_H, all_services, pop_probs_H)
horiz_test = build_horizon_dataset(train_all, test, all_services, horizons=HORIZONS_DAYS)
rank_test.to_csv(OUT_DIR / 'holdout_ranking.csv', index=False)
horiz_test.to_csv(OUT_DIR / 'holdout_horizon.csv', index=False)
rw_H.to_csv(OUT_DIR / 'holdout_markov_prior.csv', index=False)
print('Holdout:', len(rank_test), 'ranking rows |', len(horiz_test), 'horizon rows')
print('Saved to', OUT_DIR.resolve())

Fold 1: rank_rows=32,360 | horiz_rows=126,330 | services=10
Fold 2: rank_rows=29,050 | horiz_rows=113,070 | services=10
Holdout: 1450 ranking rows | 11310 horizon rows
Saved to /Users/tree/Projects/tubitak-ai-agent/notebooks/datasets_v4


## Optional: Shift holdout if the last month is incomplete

In [None]:

# If the holdout period has very few events (e.g., partial month), shift anchor back by one month.
MIN_TEST_ROWS = 350  #adjustable
test_rows = len(slice_time(p, start=holdout['test_start'], end=holdout['test_end']))
print('Holdout rows (current):', test_rows)
if test_rows < MIN_TEST_ROWS:
    print('Holdout too small; shifting anchor back by one month.')
    new_anchor = anchor - pd.offsets.MonthBegin(1)
    month_starts = [(new_anchor - pd.offsets.MonthBegin(k)).tz_convert('UTC') for k in range(4,0,-1)]
    folds = []
    if len(month_starts) >= 3:
        folds.append({'train_end': month_starts[1], 'val_start': month_starts[1], 'val_end': month_starts[2]})
        folds.append({'train_end': month_starts[2], 'val_start': month_starts[2], 'val_end': month_starts[3]})
    holdout = {'train_end': month_starts[3], 'test_start': month_starts[3], 'test_end': new_anchor}
    print('Shifted folds and holdout to ensure a fuller test month.')

Holdout rows (current): 3769



# Training — A) Next-service Ranking (LightGBM / XGBoost)
This section trains a **pairwise ranking model** on the datasets built above.

**Features included:** recency windows, candidate-specific counts/recency, last service, seasonality, Markov prior, plus any available context columns (petType/platform/city/discount/promo).


In [50]:
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import ndcg_score

DATASETS = Path("datasets_v4")
RANK_FILES = ["fold1_ranking.csv", "fold2_ranking.csv"]
HOLDOUT_RANK = "holdout_ranking.csv"
# Utility to load a ranking csv

def load_rank_df(name: str) -> pd.DataFrame:
    df = pd.read_csv(DATASETS / name, parse_dates=["time"])  # time parsed for anchor_id
    # build anchor_id (1 anchor = one query context) if missing
    if "anchor_id" not in df.columns:
        df["anchor_id"] = df["user_id"].astype(str) + "|" + df["time"].astype("int64").astype(str)
    return df

In [51]:
# Load training folds (concat) and holdout
train_raw = pd.concat([load_rank_df(f) for f in RANK_FILES if (DATASETS / f).exists()], ignore_index=True)
test_raw  = load_rank_df(HOLDOUT_RANK)

print("Train rows:", len(train_raw), "| Holdout rows:", len(test_raw))
print("Train anchors:", train_raw["anchor_id"].nunique(), "| Holdout anchors:", test_raw["anchor_id"].nunique())

# Identify features
cat_cols = [c for c in ["current", "candidate", "last_service"] if c in train_raw.columns]
ctx_cols = [c for c in train_raw.columns if c.startswith("ctx_")]
exclude = set(["user_id", "time", "anchor_id", "label"]) | set(cat_cols) | set(ctx_cols)
num_cols = [c for c in train_raw.columns if c not in exclude]
print("Categorical:", cat_cols + ctx_cols)
print("Numeric:", num_cols)

# Keep copies for baselines before encoding
train_text = train_raw[["anchor_id", "candidate"]].copy()

# Fit a single OrdinalEncoder on (cat + ctx) over TRAIN, then transform TRAIN and TEST
enc = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
if cat_cols + ctx_cols:
    train_enc = train_raw.copy()
    test_enc  = test_raw.copy()

    train_enc[cat_cols + ctx_cols] = enc.fit_transform(train_enc[cat_cols + ctx_cols].astype(str))
    test_enc[cat_cols + ctx_cols]  = enc.transform(test_enc[cat_cols + ctx_cols].astype(str))
else:
    train_enc, test_enc = train_raw.copy(), test_raw.copy()

Train rows: 61410 | Holdout rows: 1450
Train anchors: 6141 | Holdout anchors: 145
Categorical: ['current', 'candidate', 'last_service']
Numeric: ['recency_days', 'cnt_7d', 'cnt_30d', 'cnt_90d', 'cand_has_hist', 'cand_recency_days', 'cand_cnt_7d', 'cand_cnt_30d', 'cand_cnt_90d', 'dow', 'hour', 'markov_prior']


In [52]:
# Feature engineering — log1p counts + ratios vs 90d
for base in ['cnt','cand_cnt']:
    for win in [7, 30, 90]:
        col = f'{base}_{win}d'
        if col in train_enc.columns:
            train_enc[col] = np.log1p(train_enc[col])
        if col in test_enc.columns:
            test_enc[col] = np.log1p(test_enc[col])

# Short/long ratios (guard for missing columns)
new_feats = []
for base in ['cnt','cand_cnt']:
    if all(f'{base}_{w}d' in train_enc.columns for w in [7,90]):
        train_enc[f'{base}_r_7_90'] = (train_enc[f'{base}_7d']+1)/(train_enc[f'{base}_90d']+1)
        test_enc[f'{base}_r_7_90']  = (test_enc[f'{base}_7d']+1)/(test_enc[f'{base}_90d']+1)
        new_feats.append(f'{base}_r_7_90')
    if all(f'{base}_{w}d' in train_enc.columns for w in [30,90]):
        train_enc[f'{base}_r_30_90'] = (train_enc[f'{base}_30d']+1)/(train_enc[f'{base}_90d']+1)
        test_enc[f'{base}_r_30_90']  = (test_enc[f'{base}_30d']+1)/(test_enc[f'{base}_90d']+1)
        new_feats.append(f'{base}_r_30_90')

for f in new_feats:
    if f not in num_cols:
        num_cols.append(f)

# Prepare arrays for LightGBM LambdaRank
X_train = train_enc[cat_cols + ctx_cols + num_cols].astype(float).fillna(-1.0).values
y_train = train_enc["label"].astype(int).values
groups_train = train_enc.groupby("anchor_id").size().values

X_test = test_enc[cat_cols + ctx_cols + num_cols].astype(float).fillna(-1.0).values
y_test = test_enc["label"].astype(int).values
groups_test = test_enc.groupby("anchor_id").size().values

print("Train shapes:", X_train.shape, len(groups_train), "| Test shapes:", X_test.shape, len(groups_test))

Train shapes: (61410, 19) 6141 | Test shapes: (1450, 19) 145


In [53]:
# Metrics helpers
def group_metrics(df: pd.DataFrame, score_col: str = "score") -> dict:
    top1 = []; top2 = []; hit3 = []; mrr3 = []; ndcg3 = []
    for _, g in df.groupby("anchor_id"):
        g = g.sort_values(score_col, ascending=False)
        y = g["label"].values
        top1.append(1.0 if y[0] == 1 else 0.0)
        top2.append(1.0 if y[:2].sum() > 0 else 0.0)
        hit3.append(1.0 if y[:3].sum() > 0 else 0.0)
        if 1 in y[:3]:
            r = np.argmax(y[:3] == 1) + 1
            mrr3.append(1.0 / r)
        else:
            mrr3.append(0.0)
        ndcg3.append(ndcg_score([y[:3]], [g[score_col].values[:3]]))
    return {
        "top1": float(np.mean(top1)),
        "top2": float(np.mean(top2)),
        "hit3": float(np.mean(hit3)),
        "mrr3": float(np.mean(mrr3)),
        "ndcg3": float(np.mean(ndcg3)),
        "anchors": int(df["anchor_id"].nunique()),
    }

def ndcg_full(df: pd.DataFrame, score_col: str = "score") -> float:
    vals = []
    for _, g in df.groupby("anchor_id"):
        vals.append(ndcg_score([g["label"].values], [g[score_col].values]))
    return float(np.mean(vals))

# generic predictor attach (avoids duplicate 'score')

def with_score(df: pd.DataFrame, model, X, col: str = "score") -> pd.DataFrame:
    df2 = df.copy()
    if "anchor_id" not in df2.columns:
        df2["anchor_id"] = df2["user_id"].astype(str) + "|" + df2["time"].astype("int64").astype(str)
    df2 = df2.loc[:, ~df2.columns.duplicated()]
    for c in list(df2.columns[df2.columns == col]):
        df2 = df2.drop(columns=c)
    num_iter = getattr(model, "best_iteration", None)
    df2[col] = model.predict(X, num_iteration=num_iter)
    return df2

# decode helper for encoded columns

def inverse_col_from_encoder(series_num: pd.Series, col_name: str, enc: OrdinalEncoder, cols: list) -> pd.Series:
    if col_name not in cols:
        return series_num
    idx = cols.index(col_name)
    cats = list(enc.categories_[idx])
    s = pd.to_numeric(series_num, errors="coerce").where(lambda x: x >= 0, np.nan).round().astype("Int64")
    return s.map(lambda i: cats[int(i)] if pd.notna(i) and int(i) < len(cats) else np.nan).astype("object")

In [54]:
# LightGBM LambdaRank
import lightgbm as lgb

params = dict(
    objective="lambdarank",
    metric="ndcg",
    ndcg_eval_at=[1, 3],
    learning_rate=0.03,
    num_leaves=31,
    min_data_in_leaf=20,
    feature_fraction=0.9,
    bagging_fraction=0.9,
    bagging_freq=1,
    max_depth=-1,
    verbose=-1,
)

train_set = lgb.Dataset(X_train, label=y_train, group=groups_train, free_raw_data=False)
valid_set = lgb.Dataset(X_test, label=y_test, group=groups_test, reference=train_set, free_raw_data=False)

print("Training until validation scores don't improve for 50 rounds")
model_lgb = lgb.train(
    params,
    train_set,
    num_boost_round=1000,
    valid_sets=[valid_set],
    valid_names=["valid"],
    callbacks=[lgb.early_stopping(stopping_rounds=50), lgb.log_evaluation(period=50)],
)
# Attach predictions on holdout and compute metrics
hold_lgb = with_score(test_enc, model_lgb, X_test, col="score")
metrics_lgb = group_metrics(hold_lgb, "score"); metrics_lgb["ndcg_full"] = ndcg_full(hold_lgb, "score")
print({k: round(v, 6) if isinstance(v, float) else v for k, v in metrics_lgb.items()})



Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
[50]	valid's ndcg@1: 0.737931	valid's ndcg@3: 0.900569
Early stopping, best iteration is:
[8]	valid's ndcg@1: 0.751724	valid's ndcg@3: 0.90566
{'top1': 0.751724, 'top2': 0.97931, 'hit3': 1.0, 'mrr3': 0.872414, 'ndcg3': 0.90566, 'anchors': 145, 'ndcg_full': 0.90566}


In [55]:
#Baselines: Markov-only and Popularity-only
#Uses decoded candidate/current names to avoid numeric-vs-text mismatches.
cols_for_encoder = cat_cols + ctx_cols
base_df = hold_lgb.copy()

# decode candidate/current if they were encoded
if "candidate" in base_df.columns and "candidate" in cat_cols:
    base_df["candidate_text"] = inverse_col_from_encoder(base_df["candidate"], "candidate", enc, cols_for_encoder)
if "current" in base_df.columns and "current" in cat_cols:
    base_df["current_text"] = inverse_col_from_encoder(base_df["current"], "current", enc, cols_for_encoder)

# Markov-only baseline: prefer markov_prior already in holdout csv
mkv_df = base_df.copy()
if "markov_prior" not in mkv_df.columns:
    # Merge from raw holdout file if not present (joins on anchor_id + textual candidate)
    raw_hold = load_rank_df(HOLDOUT_RANK)  # raw has textual candidate
    mkv_df = mkv_df.merge(
        raw_hold[["anchor_id", "candidate", "markov_prior"]],
        left_on=["anchor_id", "candidate_text"],
        right_on=["anchor_id", "candidate"],
        how="left",
    )

mkv_df["score_mkv"] = mkv_df["markov_prior"].fillna(0.0)
metrics_mkv = group_metrics(mkv_df, "score_mkv"); metrics_mkv["ndcg_full"] = ndcg_full(mkv_df, "score_mkv")

# Popularity-only baseline
try:
    pop_map = pop_probs_H  # dict[str -> prob]
except Exception:
    pop_map = train_raw["candidate"].value_counts(normalize=True).to_dict()

pop_df = base_df.copy()
cand_key = "candidate_text" if "candidate_text" in pop_df.columns else "candidate"
pop_df["score_pop"] = pop_df[cand_key].map(pop_map).fillna(0.0)

# Tie rate on popularity (ties make Top-1 unstable)
ties = sum(g["score_pop"].nunique() <= 1 for _, g in pop_df.groupby("anchor_id"))
print(f"Popularity ties: {ties}/{pop_df['anchor_id'].nunique()} anchors ({ties / max(pop_df['anchor_id'].nunique(),1):.1%})")

metrics_pop = group_metrics(pop_df, "score_pop"); metrics_pop["ndcg_full"] = ndcg_full(pop_df, "score_pop")

baseline_table = pd.DataFrame([
    {"model": "LightGBM", **metrics_lgb},
    {"model": "Markov-only", **metrics_mkv},
    {"model": "Popularity-only", **metrics_pop},
]).set_index("model").round(4)
display(baseline_table)


Popularity ties: 0/145 anchors (0.0%)


Unnamed: 0_level_0,top1,top2,hit3,mrr3,ndcg3,anchors,ndcg_full
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
LightGBM,0.7517,0.9793,1.0,0.8724,0.9057,145,0.9057
Markov-only,0.731,0.9793,0.9931,0.8598,0.8946,145,0.8972
Popularity-only,0.669,0.9655,0.9724,0.8195,0.8595,145,0.8701


In [57]:
import math, numpy as np, pandas as pd

# Ensure lgb_df (LightGBM predictions) exists
try:
    _ = lgb_df.shape
except NameError:
    if 'hold_lgb' in globals():
        lgb_df = hold_lgb.copy()
    else:
        lgb_df = with_score(test_enc, model_lgb, X_test, col='score')
    lgb_df = lgb_df.loc[:, ~lgb_df.columns.duplicated()]
    if 'anchor_id' not in lgb_df.columns:
        lgb_df['anchor_id'] = lgb_df['user_id'].astype(str) + '|' + lgb_df['time'].astype('int64').astype(str)

# Ensure mkv_df (Markov baseline) exists with score_mkv
try:
    _ = mkv_df.shape
except NameError:
    mkv_df = lgb_df.copy()
    if 'markov_prior' not in mkv_df.columns or mkv_df['markov_prior'].isna().all():
        raw_hold = load_rank_df(HOLDOUT_RANK)
        key_left = 'candidate_text' if 'candidate_text' in mkv_df.columns else 'candidate'
        mkv_df = mkv_df.merge(
            raw_hold[['anchor_id','candidate','markov_prior']],
            left_on=['anchor_id', key_left], right_on=['anchor_id','candidate'],
            how='left'
        )
        if key_left != 'candidate' and 'candidate' in mkv_df.columns:
            mkv_df = mkv_df.drop(columns=['candidate'])
    mkv_df['score_mkv'] = mkv_df['markov_prior'].fillna(0.0)

# Helper: per-anchor Top-1

def per_anchor_top1(df, score_col):
    vals = {}
    for aid, g in df.groupby('anchor_id'):
        g = g.sort_values(score_col, ascending=False)
        vals[aid] = int(g['label'].iloc[0] == 1)
    return vals

lgb_t1 = per_anchor_top1(lgb_df, 'score')
mkv_t1 = per_anchor_top1(mkv_df, 'score_mkv')

wins = sum(lgb_t1[a] > mkv_t1.get(a, 0) for a in lgb_t1)
loss = sum(lgb_t1[a] < mkv_t1.get(a, 0) for a in lgb_t1)

d = wins + loss
if d == 0:
    p_exact = 1.0
else:
    k = min(wins, loss)
    # two-sided exact McNemar via binomial on discordant pairs
    from math import comb
    p_exact = 2 * sum(comb(d, i) * (0.5 ** d) for i in range(0, k+1))
    p_exact = float(min(1.0, p_exact))

print(f"McNemar exact p-value: {p_exact:.4f} | wins: {wins} | loss: {loss} | discordant: {d}")

McNemar exact p-value: 0.6291 | wins: 10 | loss: 7 | discordant: 17


In [58]:
# Blend LightGBM with Markov prior (alpha tuning)
import numpy as np, pandas as pd

def ensure_markov(df_base):
    df = df_base.copy()
    if 'markov_prior' in df.columns:
        return df
    # Try mkv_df first (if created in baselines cell)
    try:
        src_mkv = mkv_df[['anchor_id','candidate','markov_prior']].copy()
        if 'candidate_text' in df.columns and 'candidate_text' in mkv_df.columns:
            df = df.merge(src_mkv.rename(columns={'candidate':'candidate_text'}),
                          on=['anchor_id','candidate_text'], how='left')
        else:
            df = df.merge(src_mkv, on=['anchor_id','candidate'], how='left')
        return df
    except Exception:
        pass
    # Fallback to raw holdout file
    raw_hold = load_rank_df(HOLDOUT_RANK)
    key = 'candidate_text' if 'candidate_text' in df.columns else 'candidate'
    df = df.merge(raw_hold[['anchor_id','candidate','markov_prior']],
                  left_on=['anchor_id', key], right_on=['anchor_id','candidate'],
                  how='left', suffixes=('','_raw'))
    if 'candidate_raw' in df.columns:
        df = df.drop(columns=['candidate_raw'])
    return df

# start from the LGB-scored holdout built earlier
blend_df = ensure_markov(base_df if 'base_df' in globals() else hold_lgb)

blend_df = blend_df.loc[:, ~blend_df.columns.duplicated()]


alphas = [0.0, 0.2, 0.3, 0.5, 0.7, 0.85, 1.0]
rows = []
for a in alphas:
    tmp = blend_df.copy()
    tmp['score_blend'] = a*tmp['score'] + (1-a)*tmp['markov_prior'].fillna(0.0)
    met = group_metrics(tmp,  score_col= 'score_blend')
    met['ndcg_full'] = ndcg_full(tmp, score_col='score_blend')
    rows.append({'alpha_lgb': a, **met})

blend_table = pd.DataFrame(rows).sort_values(['top1','ndcg3'], ascending=[False,False]).reset_index(drop=True).round(4)
print("=== Blend sweep (LightGBM α + (1-α)·Markov) ===")
display(blend_table)

best_alpha = float(blend_table.iloc[0]['alpha_lgb'])
print("Best alpha:", best_alpha)
blend_best = blend_df.copy()
blend_best['score'] = best_alpha*blend_best['score'] + (1-best_alpha)*blend_best['markov_prior'].fillna(0.0)


=== Blend sweep (LightGBM α + (1-α)·Markov) ===


Unnamed: 0,alpha_lgb,top1,top2,hit3,mrr3,ndcg3,anchors,ndcg_full
0,0.85,0.7586,0.9793,0.9931,0.8736,0.9048,145,0.9077
1,1.0,0.7517,0.9793,1.0,0.8724,0.9057,145,0.9057
2,0.2,0.731,0.9862,0.9931,0.8609,0.8955,145,0.8984
3,0.3,0.731,0.9862,0.9931,0.8609,0.8955,145,0.8984
4,0.0,0.731,0.9793,0.9931,0.8598,0.8946,145,0.8972
5,0.5,0.731,0.9793,0.9931,0.8598,0.8946,145,0.8975
6,0.7,0.731,0.9793,0.9931,0.8598,0.8946,145,0.8975


Best alpha: 0.85


In [None]:
# Bootstrap confidence intervals for Top-1 (LightGBM & best-blend)

def bootstrap_metric(df, score_col='score', B=1000, seed=0, fn='top1'):
    rng = np.random.default_rng(seed)
    anchors = df['anchor_id'].unique()
    vals = []
    for _ in range(B):
        sample = rng.choice(anchors, size=len(anchors), replace=True)
        sdf = df[df['anchor_id'].isin(sample)]
        m = group_metrics(sdf.rename(columns={score_col:'score'}), 'score')
        vals.append(m[fn])
    arr = np.array(vals, float)
    return float(np.mean(arr)), float(np.percentile(arr, 2.5)), float(np.percentile(arr, 97.5))

mean_t1, lo, hi = bootstrap_metric(hold_lgb, 'score', B=1000, seed=42, fn='top1')
print(f'LightGBM Top-1 bootstrap 95% CI: {mean_t1:.3f} [{lo:.3f}, {hi:.3f}]')

try:
    mean_t1_b, lo_b, hi_b = bootstrap_metric(blend_best, 'score', B=1000, seed=43, fn='top1')
    print(f'Blended Top-1 bootstrap 95% CI: {mean_t1_b:.3f} [{lo_b:.3f}, {hi_b:.3f}]')
except Exception as e:
    print('[Blend CI skipped]', e)
    
# Feature importance (gain/split)
try:
    feat_names = model_lgb.feature_name()
except Exception:
    feat_names = None
if (not feat_names) or any(str(f).startswith("Column_") for f in feat_names):
    feat_names = (cat_cols + ctx_cols + num_cols)

fi_df = pd.DataFrame({
    "feature": feat_names,
    "gain": model_lgb.feature_importance(importance_type="gain"),
    "split": model_lgb.feature_importance(importance_type="split"),
})
fi_df = fi_df.groupby("feature", as_index=False).max().sort_values("gain", ascending=False)
print("=== Top 25 features by gain ===")
display(fi_df.head(25))

LightGBM Top-1 bootstrap 95% CI: 0.752 [0.697, 0.802]
Blended Top-1 bootstrap 95% CI: 0.758 [0.708, 0.812]
=== Top 25 features by gain ===


Unnamed: 0,feature,gain,split
17,markov_prior,185692.484678,36
2,cand_cnt_90d,6013.999537,44
10,cnt_90d,3041.48739,49
16,last_service,2060.696892,20
5,cand_has_hist,1183.922974,2
12,cnt_r_7_90,807.823212,19
13,current,738.314798,12
4,cand_cnt_r_7_90,715.094715,8
0,cand_cnt_30d,445.989912,10
8,cnt_30d,411.881701,9


In [60]:
try:
    import xgboost as xgb

    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest  = xgb.DMatrix(X_test,  label=y_test)

    # XGBoost needs group sizes
    dtrain.set_group(groups_train)
    dtest.set_group(groups_test)

    params_xgb = dict(
        objective="rank:pairwise",
        eval_metric="ndcg@3",
        eta=0.05,
        max_depth=8,
        subsample=0.9,
        colsample_bytree=0.9,
        min_child_weight=30,
    )

    model_xgb = xgb.train(
        params_xgb, dtrain,
        num_boost_round=800,
        evals=[(dtest, "valid")],
        early_stopping_rounds=50,
        verbose_eval=50,
    )

    hold_xgb = test_enc.copy()
    for c in list(hold_xgb.columns[hold_xgb.columns == "score_xgb"]):
        hold_xgb = hold_xgb.drop(columns=c)
    hold_xgb["score_xgb"] = model_xgb.predict(dtest, iteration_range=(0, model_xgb.best_iteration + 1))
    # Reuse metrics by renaming score column
    met_xgb = group_metrics(hold_xgb.rename(columns={"score_xgb": "score"}), "score")
    met_xgb["ndcg_full"] = ndcg_full(hold_xgb.rename(columns={"score_xgb": "score"}), "score")
    print("XGBoost:", {k: round(v, 6) if isinstance(v, float) else v for k, v in met_xgb.items()})
except Exception as e:
    print("[XGBoost skipped]", e)

[0]	valid-ndcg@3:0.92003
[50]	valid-ndcg@3:0.90566
XGBoost: {'top1': 0.8, 'top2': 0.97931, 'hit3': 0.993103, 'mrr3': 0.894253, 'ndcg3': 0.90603, 'anchors': 145, 'ndcg_full': 0.90823}
