# Repurchase Within Horizon (Classification) 

**Goal**  
Predict whether a user will purchase again within **T ∈ {7, 14, 30, 45}** days.

**Data / Anchors**  
- Window (UTC): 2024-12-24 → 2025-08-04  
- Weekly, calendar-anchored snapshots per user  
- Right-censoring: require ≥T days of future data  
- At-risk gates: {7:0d, 14:0d, 30:7d, 45:14d}

**Label**  
`y(T)=1` if the first purchase strictly after the anchor occurs within T days of it; else 0.

**Features (compact)**  
Recency + rolling counts (7/14/30/60/90d) for key events, last_service, seasonality (month/week + sin/cos), user cadence (inter-purchase mean/median/std).

**Model**  
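Per-horizon **LightGBM (binary)** with early stopping; **isotonic calibration** on validation, falling back to **Platt scaling** when the validation set is small (<200 rows), single-class, or isotonic fitting fails.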

**Operating thresholds**  
- Default: global F1-optimal per T  
- Optional: **precision-target** (e.g., ≥0.90) or **quota-based** (flag top-q%)  
- JSON thresholds shipped; manual overrides supported

In [1]:
import numpy as np, pandas as pd, json, os, warnings
from pathlib import Path

# Input directory holding the raw event CSVs. This is an absolute,
# machine-specific path; edit it when running on another machine.
DATA_DIR = Path('/Users/tree/Projects/tubitak-ai-agent/tubitakaiagentprojeleriiinverisetleri')
# Output directory, relative to the current working directory.
OUT_DIR = Path('outputs_b_section')
# Create the output directory up front; no error if it already exists.
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Echo the resolved (absolute) locations for the run log.
print('DATA_DIR:', DATA_DIR.resolve())
print('OUT_DIR :', OUT_DIR.resolve())

DATA_DIR: /Users/tree/Projects/tubitak-ai-agent/tubitakaiagentprojeleriiinverisetleri
OUT_DIR : /Users/tree/Projects/tubitak-ai-agent/notebooks/outputs_b_section


In [2]:
from sklearn.metrics import (
    average_precision_score, roc_auc_score, f1_score,
    precision_recall_curve, brier_score_loss, confusion_matrix
)
from sklearn.isotonic import IsotonicRegression
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OrdinalEncoder
import lightgbm as lgb

# Silence the UserWarning whose message matches the text below (the message
# describes element-by-element date parsing when no format could be inferred).
warnings.filterwarnings(
    "ignore",
    message="Could not infer format, so each element will be parsed individually",
    category=UserWarning
)

# Column-name candidates for the user identifier, in priority order.
# Matching is case-insensitive (see _lookup); the first candidate found wins.
UID_CANDIDATES = [
    "user_id","userid","user","uid","customer_id","client_id",
    "ownerid","owner_id","userId","ownerId","customerId","clientId","id"
]
# Column-name candidates for the service-type column, in priority order.
SERV_CANDIDATES = [
    "serviceType","servicetype","service_type","service","type",
    "servicename","service_name","category","segment"
]

def _lower_map(cols):
    return {c.lower(): c for c in cols}

def _lookup(cols, candidates):
    cmap = _lower_map(cols)
    for cand in candidates:
        if cand.lower() in cmap:
            return cmap[cand.lower()]
    return None

def _guess_time_col(df):
    cols = list(df.columns)
    exact = ["purchase_time","event_time","signup_time","created_at","createdAt","timestamp","time","date"]
    for e in exact:
        for c in cols:
            if c.lower() == e.lower():
                return c
    pri = ["purchase","event","signup","created","timestamp","time","date"]
    timey = [c for c in cols if any(p in c.lower() for p in pri)]
    if timey:
        prefer = [c for c in timey if any(c.lower().endswith(suf) for suf in ["_time","time","timestamp","_at"])]
        return prefer[0] if prefer else timey[0]
    return None

def normalize_purchase_df(df):
    """Return a copy of the purchase table with canonical column names.

    The detected user-id, time and service columns are renamed to
    ``user_id``, ``purchase_time`` and ``serviceType``. A missing service
    column is filled with the constant ``"unknown"``.

    Raises:
        KeyError: if no user-id column or no time column can be detected.
    """
    out = df.copy()

    id_col = _lookup(out.columns, UID_CANDIDATES)
    if id_col is not None and id_col != "user_id":
        out = out.rename(columns={id_col: "user_id"})

    # The time column is guessed after the id rename, on the updated frame.
    time_col = _guess_time_col(out)
    if time_col is not None and time_col != "purchase_time":
        out = out.rename(columns={time_col: "purchase_time"})

    service_col = _lookup(out.columns, SERV_CANDIDATES)
    if service_col is not None and service_col != "serviceType":
        out = out.rename(columns={service_col: "serviceType"})

    present = out.columns
    if "user_id" not in present:
        raise KeyError(f"Could not detect user_id in Purchase.csv: {list(out.columns)}")
    if "purchase_time" not in present:
        raise KeyError(f"Could not detect purchase_time in Purchase.csv: {list(out.columns)}")

    if "serviceType" not in out.columns:
        out["serviceType"] = "unknown"
    return out

def normalize_event_df(df):
    """Return a copy of an event table with canonical column names.

    The detected user-id, time and (if present) service columns are renamed
    to ``user_id``, ``event_time`` and ``serviceType``. Unlike the purchase
    normalizer, no default service column is added.

    Raises:
        KeyError: if no user-id column or no time column can be detected.
    """
    out = df.copy()

    id_col = _lookup(out.columns, UID_CANDIDATES)
    if id_col is not None and id_col != "user_id":
        out = out.rename(columns={id_col: "user_id"})

    # The time column is guessed after the id rename, on the updated frame.
    time_col = _guess_time_col(out)
    if time_col is not None and time_col != "event_time":
        out = out.rename(columns={time_col: "event_time"})

    service_col = _lookup(out.columns, SERV_CANDIDATES)
    if service_col is not None and service_col != "serviceType":
        out = out.rename(columns={service_col: "serviceType"})

    present = out.columns
    if "user_id" not in present:
        raise KeyError(f"Could not detect user_id in event df: {list(out.columns)}")
    if "event_time" not in present:
        raise KeyError(f"Could not detect event_time in event df: {list(out.columns)}")
    return out

def normalize_signup_df(df):
    """Return a copy of the signup table with ``user_id`` and ``signup_time`` columns.

    The time column is looked up from an explicit candidate list (not the
    generic time-column guesser). If none is found, ``signup_time`` is
    created and filled with ``NaT``.

    Raises:
        KeyError: if no user-id column can be detected.
    """
    out = df.copy()

    id_col = _lookup(out.columns, UID_CANDIDATES)
    if id_col is not None and id_col != "user_id":
        out = out.rename(columns={id_col: "user_id"})

    time_candidates = ["signup_time","signuptime","signed_up_at","created_at","created","time","timestamp","date"]
    time_col = _lookup(out.columns, time_candidates)
    if time_col is not None and time_col != "signup_time":
        out = out.rename(columns={time_col: "signup_time"})

    if "signup_time" not in out.columns:
        out["signup_time"] = pd.NaT

    if "user_id" not in out.columns:
        raise KeyError(f"Could not detect user_id in SignupCompleted.csv: {list(out.columns)}")
    return out

def to_utc(s):
    """Parse ``s`` as tz-aware UTC datetimes; unparseable entries become ``NaT``."""
    parsed = pd.to_datetime(s, utc=True, errors="coerce")
    return parsed


In [3]:
# Load the raw event exports, one CSV per event type.
PUR_raw = pd.read_csv(DATA_DIR/'Purchase.csv')
SIGN_raw= pd.read_csv(DATA_DIR/'SignupCompleted.csv')
BP_raw  = pd.read_csv(DATA_DIR/'BeforePurchaseDetailsScreen.csv')
CHK_raw = pd.read_csv(DATA_DIR/'CheckoutPageOpened.csv')
ADDR_raw= pd.read_csv(DATA_DIR/'AddressAdded.csv')
CARD_raw= pd.read_csv(DATA_DIR/'CreditcardAdded.csv')
DOG_raw = pd.read_csv(DATA_DIR/'DogAdded.csv')
CAT_raw = pd.read_csv(DATA_DIR/'CatAdded.csv')

# Rename id/time/service columns to canonical names; each normalizer raises
# KeyError when a required column cannot be detected.
PUR  = normalize_purchase_df(PUR_raw)
SIGN = normalize_signup_df(SIGN_raw)
BP   = normalize_event_df(BP_raw)
CHK  = normalize_event_df(CHK_raw)
ADDR = normalize_event_df(ADDR_raw)
CARD = normalize_event_df(CARD_raw)
DOG  = normalize_event_df(DOG_raw)
CAT  = normalize_event_df(CAT_raw)

# Parse every time column as tz-aware UTC; unparseable values become NaT.
PUR['purchase_time'] = to_utc(PUR['purchase_time'])
SIGN['signup_time']  = to_utc(SIGN['signup_time'])
for df in (BP, CHK, ADDR, CARD, DOG, CAT):
    df['event_time'] = to_utc(df['event_time'])

def keep_minimal(df):
    """Return a copy of ``df`` reduced to the id/time/service columns that exist.

    Duplicate column names are collapsed first (the first occurrence is
    kept); the result is ordered ``user_id``, ``event_time``, ``serviceType``.
    """
    deduped = df.loc[:, ~df.columns.duplicated()]
    wanted = ('user_id', 'event_time', 'serviceType')
    present = [name for name in wanted if name in deduped.columns]
    return deduped[present].copy()

# Reduce every non-purchase event table to its id/time/service columns.
BP   = keep_minimal(BP)
CHK  = keep_minimal(CHK)
ADDR = keep_minimal(ADDR)
CARD = keep_minimal(CARD)
DOG  = keep_minimal(DOG)
CAT  = keep_minimal(CAT)

# Drop purchases lacking a user or a parseable timestamp, then order them
# chronologically within each user.
PUR = PUR.dropna(subset=['user_id','purchase_time']).sort_values(['user_id','purchase_time'])
print('Purchases:', PUR.shape, '| users:', PUR['user_id'].nunique())
print('Time span:', PUR['purchase_time'].min(), '→', PUR['purchase_time'].max())

Purchases: (28701, 4) | users: 1614
Time span: 2024-12-24 21:41:02.022000+00:00 → 2025-08-04 09:34:52.801000+00:00


In [4]:
def floor_to_week(ts):
    """Snap each timestamp in ``ts`` back to 00:00 on the Monday of its week."""
    days_into_week = pd.to_timedelta(ts.dt.weekday, unit='d')
    same_clock_monday = ts - days_into_week
    return same_clock_monday.dt.floor('D')

# Stack every event type into one (user_id, time) table so that per-user
# activity bounds reflect any kind of activity, not only purchases.
all_events = [
    PUR[['user_id']].assign(time=PUR['purchase_time']),
    BP[['user_id']].assign(time=BP['event_time']),
    CHK[['user_id']].assign(time=CHK['event_time']),
    ADDR[['user_id']].assign(time=ADDR['event_time']),
    CARD[['user_id']].assign(time=CARD['event_time']),
    DOG[['user_id']].assign(time=DOG['event_time']),
    CAT[['user_id']].assign(time=CAT['event_time']),
]
# Rows without a user or a timestamp are dropped before sorting.
AE = pd.concat(all_events, ignore_index=True).dropna(subset=['user_id','time']).sort_values(['user_id','time'])

# Earliest and latest observed activity per user (Series indexed by user_id).
first_seen = AE.groupby('user_id')['time'].min()
last_seen  = AE.groupby('user_id')['time'].max()

print('Activity span:', AE['time'].min(), '→', AE['time'].max())

def user_weeks(u):
    """Build the weekly anchor rows for user ``u``.

    Anchors are the Mondays (00:00 UTC) from the week of the user's first
    observed event through the week of their last observed event, read from
    the module-level ``first_seen`` / ``last_seen`` Series.

    Returns a DataFrame with columns ``user_id`` and ``t``.
    """
    bounds = floor_to_week(pd.Series([first_seen.loc[u]])), floor_to_week(pd.Series([last_seen.loc[u]]))
    week_start = bounds[0].iloc[0]
    week_end = bounds[1].iloc[0]
    mondays = pd.date_range(start=week_start, end=week_end, freq='W-MON', tz='UTC')
    return pd.DataFrame({'user_id': u, 't': mondays})

anchors = pd.concat([user_weeks(u) for u in first_seen.index], ignore_index=True).sort_values(['user_id','t'])

# Index the stacked events by user so each lookup below is a label-based slice.
AE_user = AE.set_index('user_id')


def had_prior(u, t):
    """Return True if user ``u`` has at least one event strictly before ``t``.

    Users absent from ``AE_user`` are treated as having no prior activity.
    """
    if u not in AE_user.index:
        return False
    # A list selector (``[u]``) always yields a frame-like result, even for a
    # single matching row, so no scalar/Series special case is needed here.
    prior_times = AE_user.loc[[u], 'time']
    return bool((prior_times < t).any())

# Keep only anchors where the user already had some activity strictly before
# the anchor time (evaluated row by row via had_prior).
mask_prior = anchors.apply(lambda r: had_prior(r['user_id'], r['t']), axis=1)
anchors = anchors[mask_prior].copy()
print('Weekly anchors (eligibility met):', len(anchors))

Activity span: 2024-12-24 21:41:02.022000+00:00 → 2025-08-04 11:59:43.172000+00:00
Weekly anchors (eligibility met): 10616


In [5]:
import pandas as pd
def ensure_utc(s: pd.Series) -> pd.Series:
    """Return ``s`` as tz-aware UTC datetimes.

    Values are parsed with coercion (bad entries become ``NaT``). Naive
    results are localized to UTC; tz-aware results are converted to UTC.
    """
    parsed = pd.to_datetime(s, errors='coerce')
    is_naive = getattr(parsed.dt, 'tz', None) is None
    return parsed.dt.tz_localize('UTC') if is_naive else parsed.dt.tz_convert('UTC')

def to_ns(ts: pd.Series) -> np.ndarray:
    """Return the int64 integer representation of a datetime Series as a NumPy array."""
    as_int = ts.astype('int64')
    return np.asarray(as_int)

# --- As-of join: attach to every anchor the user's latest purchase at or before t ---
# Both time columns are forced to tz-aware UTC so their integer epochs compare directly.
anchors['t'] = ensure_utc(anchors['t'])
PUR['purchase_time'] = ensure_utc(PUR['purchase_time'])

purch = (
    PUR[['user_id','purchase_time','serviceType']]
      .dropna(subset=['purchase_time'])
      .rename(columns={'serviceType':'last_service'})
      .sort_values(['user_id','purchase_time'])
      .reset_index(drop=True)
)

# Per-user, time-sorted purchase epochs and the matching service labels.
pt_by_user, svc_by_user = {}, {}
for u, g in purch.groupby('user_id', sort=True):
    pt_by_user[u] = to_ns(g['purchase_time'])
    svc_by_user[u] = g['last_service'].to_numpy()

# reset_index gives a 0..n-1 index, so group index labels double as positions.
anch = anchors[['user_id','t']].copy().sort_values(['user_id','t']).reset_index(drop=True)
t_ns = to_ns(anch['t'])

pt_ns_out = np.zeros(len(anch), dtype=np.int64)
svc_out   = np.full(len(anch), '', dtype=object)
# Explicit "found a prior purchase" mask. An integer sentinel such as -1 must
# not be used: pd.to_datetime would turn it into a real timestamp just before
# the epoch, which then looks like a genuine (very old) purchase.
has_prior = np.zeros(len(anch), dtype=bool)

for u, g in anch.groupby('user_id', sort=False):
    pt = pt_by_user.get(u)
    if pt is None or pt.size == 0:
        continue  # user never purchased: rows keep the "no prior" defaults
    idxs = g.index.to_numpy()
    # Index of the last purchase with time <= anchor time (-1 when none).
    j = np.searchsorted(pt, t_ns[idxs], side='right') - 1
    ok = j >= 0
    hit = idxs[ok]
    pt_ns_out[hit] = pt[j[ok]]
    svc_out[hit]   = svc_by_user[u][j[ok]]
    has_prior[hit] = True

fe = anch.copy()
# Anchors without a prior purchase get NaT (not a fabricated timestamp).
fe['pt'] = pd.Series(pd.to_datetime(pt_ns_out, utc=True), index=fe.index).where(has_prior)
fe['last_service'] = svc_out.astype(str)
# Days since the last purchase; NaN when the user had not purchased yet.
# NOTE(review): downstream code that fills NaN with 0 or filters on
# recency_days should decide explicitly how "no prior purchase" is treated.
fe['recency_days'] = (fe['t'] - fe['pt']).dt.total_seconds() / 86400.0

print("Built features via numpy-asof: rows=%d, prior-purchase coverage=%.1f%%" % (
    len(fe), 100.0 * (~fe['pt'].isna()).mean()
))

# Rolling windows
def counts_in_windows(events_df, time_col, windows, anchors_df, anchor_time_col='t', key='user_id'):
    """Count each user's events in the trailing window ``(t - w, t]`` per anchor.

    Args:
        events_df: event table containing ``key`` and ``time_col``.
        time_col: name of the event timestamp column (parsed as UTC; rows
            that fail to parse are ignored).
        windows: iterable of window lengths in days.
        anchors_df: anchor table containing ``key`` and ``anchor_time_col``.
        anchor_time_col: name of the anchor timestamp column.
        key: name of the user column shared by both tables.

    Returns:
        dict ``{w: Series}`` of int32 counts, each Series aligned with
        ``anchors_df.index``. Anchors whose user has no events, or whose
        anchor time is missing/unparseable, get a count of 0.
    """
    windows = list(windows)

    events = events_df.dropna(subset=[key, time_col]).copy()
    events[time_col] = pd.to_datetime(events[time_col], errors='coerce', utc=True)
    events = events.dropna(subset=[time_col]).sort_values([key, time_col])

    # Naive-UTC datetime64[ns] arrays so np.searchsorted can be used directly.
    event_times = events[time_col].dt.tz_convert(None).to_numpy().astype('datetime64[ns]')
    # groupby(...).indices are positions into `events`; since `events` is
    # sorted by (key, time), each user's slice is already time-ordered.
    times_by_user = {u: event_times[pos] for u, pos in events.groupby(key).indices.items()}

    anchor_ts = pd.to_datetime(anchors_df[anchor_time_col], errors='coerce', utc=True)
    anchor_times = anchor_ts.dt.tz_convert(None).to_numpy().astype('datetime64[ns]')
    has_time = ~np.isnat(anchor_times)

    counts = {w: np.zeros(len(anchors_df), dtype=np.int32) for w in windows}
    for u, pos in anchors_df.groupby(key).indices.items():
        user_times = times_by_user.get(u)
        if user_times is None:
            continue
        pos = pos[has_time[pos]]
        if pos.size == 0:
            continue
        t = anchor_times[pos]
        # Number of events with time <= t ...
        upto_t = np.searchsorted(user_times, t, side='right')
        for w in windows:
            # ... minus those with time <= t - w leaves the half-open (t - w, t].
            upto_t0 = np.searchsorted(user_times, t - np.timedelta64(int(w), 'D'), side='right')
            counts[w][pos] = upto_t - upto_t0

    return {w: pd.Series(counts[w], index=anchors_df.index) for w in windows}

# Trailing-window lengths (days) used for every rolling count feature.
WINS = [7,14,30,60,90]
# One dict {window: Series} per event type, aligned with fe's index.
p_counts   = counts_in_windows(PUR.rename(columns={'purchase_time':'time'}), 'time', WINS, fe)
bp_counts  = counts_in_windows(BP,   'event_time', WINS, fe)
chk_counts = counts_in_windows(CHK,  'event_time', WINS, fe)
addr_counts= counts_in_windows(ADDR, 'event_time', WINS, fe)
card_counts= counts_in_windows(CARD, 'event_time', WINS, fe)

for w in WINS:
    fe[f'pur_{w}d']  = p_counts[w]
    fe[f'bp_{w}d']   = bp_counts[w]
    fe[f'chk_{w}d']  = chk_counts[w]
    fe[f'addr_{w}d'] = addr_counts[w]
    fe[f'card_{w}d'] = card_counts[w]

# Short-vs-long purchase ratios; the 1e-3 terms avoid division by zero.
for a,b in [(7,90),(30,90)]:
    fe[f'pur_ratio_{a}on{b}'] = (fe[f'pur_{a}d']+1e-3)/(fe[f'pur_{b}d']+1e-3)

# One signup row per user (first occurrence kept) so the merge cannot duplicate anchors.
SIGN = SIGN.drop_duplicates(subset=['user_id'])
fe = fe.merge(SIGN[['user_id','signup_time']], on='user_id', how='left')
fe['days_since_signup'] = (fe['t'] - fe['signup_time']).dt.total_seconds()/86400.0
# NOTE(review): anchors appear to be Monday-00:00 week starts (see
# floor_to_week / freq='W-MON'), so dow and hour are presumably constant
# columns — confirm whether they are worth keeping.
fe['dow'] = fe['t'].dt.weekday
fe['hour'] = fe['t'].dt.hour

print('Feature table shape:', fe.shape)


Built features via numpy-asof: rows=10616, prior-purchase coverage=100.0%
Feature table shape: (10616, 36)


In [None]:
# rebuild labels while PRESERVING features from `fe`

import numpy as np
import pandas as pd

# Prediction horizons in days; one labelled dataset is built per horizon.
HORIZONS = [7, 14, 30, 45]

# Safety: ensure tz-aware
fe['t'] = pd.to_datetime(fe['t'], errors='coerce', utc=True)
PUR['purchase_time'] = pd.to_datetime(PUR['purchase_time'], errors='coerce', utc=True)

# Precompute per-user sorted purchase times (epoch ns)
purch_sorted = (PUR.dropna(subset=['user_id','purchase_time'])
                  .sort_values(['user_id','purchase_time']))
purch_sorted_ns = purch_sorted.assign(p_ns=purch_sorted['purchase_time'].astype('int64'))
p_ns_by_user = {u: g['p_ns'].to_numpy() for u, g in purch_sorted_ns.groupby('user_id', sort=True)}
# Latest purchase in the whole dataset; used as the end of the observation window.
max_purchase_ns = purch_sorted_ns['p_ns'].max()

# Keep ALL feature columns from fe
fe_sorted = fe.sort_values(['user_id','t']).reset_index(drop=True)
t_ns_all  = fe_sorted['t'].astype('int64').to_numpy()

labels = {}  # rebuilt, with features intact

for T in HORIZONS:
    # Horizon length in the same integer units as the timestamps above (ns).
    T_ns = int(T * 86400 * 1e9)

    # Right-censor: only anchors where we could observe a purchase within T
    keep = t_ns_all <= (max_purchase_ns - T_ns)
    sub  = fe_sorted.loc[keep].copy()   # <-- ALL columns preserved

    # Vectorized next-purchase-within-T labeling
    y = np.zeros(len(sub), dtype=np.int8)
    # group indices by user to avoid row loops
    # (.indices are positions within `sub`, so they index `y` and `t_ns` directly)
    idxs_by_user = sub.groupby('user_id').indices
    t_ns = sub['t'].astype('int64').to_numpy()
    for u, idxs in idxs_by_user.items():
        p_ns = p_ns_by_user.get(u)
        if p_ns is None or p_ns.size == 0:
            continue  # user never purchased: labels stay 0
        j = np.searchsorted(p_ns, t_ns[idxs], side='right')  # first purchase strictly after t
        valid = j < p_ns.size  # False when there is no later purchase at all
        y_u = np.zeros(len(idxs), dtype=np.int8)
        y_u[valid] = ((p_ns[j[valid]] - t_ns[idxs][valid]) <= T_ns).astype(np.int8)
        y[idxs] = y_u

    sub['y'] = y
    labels[T] = sub

    # Diagnostics: confirm features exist
    pref = ['pur_','bp_','chk_','addr_','card_','pur_ratio_','recency','days_since_signup','dow','hour']
    num_cols = [c for c in sub.columns if any(c.startswith(p) for p in pref)]
    print(f"T={T}: n={len(sub):,} | positives={int(y.sum()):,} ({y.mean():.3%}) "
          f"| numeric_feats={len(num_cols)} | has last_service={'last_service' in sub.columns}")

T=7: n=10,577 | positives=4,359 (41.212%) | numeric_feats=31 | has last_service=True
T=14: n=10,065 | positives=5,541 (55.052%) | numeric_feats=31 | has last_service=True
T=30: n=7,609 | positives=5,361 (70.456%) | numeric_feats=31 | has last_service=True
T=45: n=5,702 | positives=4,405 (77.254%) | numeric_feats=31 | has last_service=True


In [7]:
# Column-name prefixes that mark a numeric model feature.
FE_PREFIXES = [
    'pur_', 'bp_', 'chk_', 'addr_', 'card_', 'pur_ratio_', 'recency', 'days_since_signup', 'dow', 'hour'
]
# Identifier, timestamp and target columns that must never be used as features.
NON_FEATURE_COLS = {'user_id','t','pt','signup_time','y'}

def detect_feature_cols(df: pd.DataFrame):
    """Return ``(num_cols, cat_cols)`` for ``df``.

    Numeric features are the columns whose name starts with one of
    ``FE_PREFIXES`` (excluding ``NON_FEATURE_COLS``). If that scan finds
    nothing, every remaining numeric, non-categorical column is used
    instead. The only categorical feature is ``last_service``, when present.
    """
    prefixes = tuple(FE_PREFIXES)
    num_cols = [c for c in df.columns
                if c.startswith(prefixes)
                and c not in NON_FEATURE_COLS]
    if not num_cols:
        # dtype-based fallback; an isinstance check on the dtype replaces the
        # deprecated pd.api.types.is_categorical_dtype helper.
        num_cols = [c for c in df.columns
                    if c not in NON_FEATURE_COLS
                    and pd.api.types.is_numeric_dtype(df[c])
                    and not isinstance(df[c].dtype, pd.CategoricalDtype)]
    cat_cols = ['last_service'] if 'last_service' in df.columns else []
    return num_cols, cat_cols

def month_split(df_sub, time_col='t'):
    mkey = df_sub[time_col].dt.tz_convert('UTC').dt.strftime('%Y-%m')
    months = sorted(mkey.unique())
    if len(months) < 3:
        tmin, tmax = df_sub[time_col].min(), df_sub[time_col].max()
        cut1 = tmin + (tmax - tmin)*0.6
        cut2 = tmin + (tmax - tmin)*0.8
        tr = df_sub[df_sub[time_col] < cut1]
        va = df_sub[(df_sub[time_col] >= cut1) & (df_sub[time_col] < cut2)]
        te = df_sub[df_sub[time_col] >= cut2]
        return tr, va, te, ('percentile', cut1, cut2)
    test_m = months[-1]; val_m = months[-2]; train_m = months[:-2]
    tr = df_sub[mkey.isin(train_m)]
    va = df_sub[mkey == val_m]
    te = df_sub[mkey == test_m]
    return tr, va, te, ('months', train_m, val_m, test_m)

def prepare_xy(df_sub: pd.DataFrame, enc: OrdinalEncoder|None=None):
    """Turn a labelled feature frame into ``(X, y, encoder, feature_names)``.

    Numeric features are cast to float with missing values filled by 0.0.
    Categorical features are cast to str and ordinal-encoded: a new encoder
    is fitted when ``enc`` is None, otherwise the given one is reused.

    Raises:
        ValueError: if the resulting matrix has no feature columns.
    """
    num_cols, cat_cols = detect_feature_cols(df_sub)
    print(f"[prepare_xy] num={len(num_cols)} cat={len(cat_cols)} "
          f"examples={len(df_sub)} | first few num: {num_cols[:5]}")

    if num_cols:
        X_num = df_sub[num_cols].astype(float).fillna(0.0).values
    else:
        X_num = np.empty((len(df_sub), 0))

    X = X_num
    if cat_cols:
        cat_frame = df_sub[cat_cols].astype(str)
        if enc is None:
            # Categories unseen at fit time are later encoded as -1.
            enc = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
            X_cat = enc.fit_transform(cat_frame)
        else:
            X_cat = enc.transform(cat_frame)
        X = np.hstack([X_num, X_cat]) if X_num.size else X_cat

    y = df_sub['y'].astype(int).values
    if X.shape[1] == 0:
        raise ValueError(
            "No features detected. Check your feature columns. "
            f"Available columns: {list(df_sub.columns)}"
        )
    feature_names = num_cols + cat_cols
    return X, y, enc, feature_names

def pick_f1_threshold(y, p):
    """Return the score cutoff that maximizes F1 of ``p`` against labels ``y``.

    When ``y`` contains fewer than two classes, the positive rate clipped to
    [0.01, 0.99] is returned instead. If the precision-recall curve yields no
    thresholds, the plain positive rate is returned.
    """
    if np.unique(y).size < 2:
        return float(np.clip(y.mean(), 0.01, 0.99))

    precision, recall, cutoffs = precision_recall_curve(y, p)
    if not len(cutoffs):
        return float(y.mean())
    # Small epsilon keeps the denominator non-zero.
    f1 = 2*precision*recall/(precision+recall+1e-9)
    best = np.nanargmax(f1)
    return float(cutoffs[best])

def ece_score(y_true, y_prob, bins=10):
    """Expected calibration error over ``bins`` equal-width probability bins.

    Each bin contributes ``|mean(y_prob) - mean(y_true)|`` weighted by the
    share of samples it holds. The last bin is closed on the right, so
    predictions equal to 1.0 are counted rather than dropped. Returns 0.0
    for empty input.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_prob = np.asarray(y_prob, dtype=float)
    edges = np.linspace(0, 1, bins + 1)
    # np.digitize returns bins+1 for values at or above the top edge (e.g.
    # p == 1.0), which would fall outside the loop below; clip such values
    # into the first/last bin instead of silently ignoring them.
    idx = np.clip(np.digitize(y_prob, edges) - 1, 0, bins - 1)
    n_total = len(y_true)
    ece = 0.0
    for b in range(bins):
        in_bin = idx == b
        n_bin = int(in_bin.sum())
        if n_bin == 0:
            continue
        gap = abs(float(y_prob[in_bin].mean()) - float(y_true[in_bin].mean()))
        ece += (n_bin / n_total) * gap
    return float(ece)

def calibrator_from_dev(y_dev, p_dev):
    """Fit a probability calibrator on held-out (dev) labels and raw scores.

    Strategy:
      * dev set has a single class -> identity mapping (nothing can be fit);
      * at least 200 rows and exactly two classes -> isotonic regression;
      * otherwise, or if isotonic fitting fails -> Platt scaling (logistic
        regression on the raw score).

    Returns:
        ``(calibrate, name)`` where ``calibrate`` maps an array of raw scores
        to calibrated probabilities and ``name`` is ``'isotonic'``,
        ``'platt'`` or ``'identity'``.
    """
    y_dev = np.asarray(y_dev)
    p_dev = np.asarray(p_dev, dtype=float)
    n_classes = np.unique(y_dev).size

    if n_classes < 2:
        # Neither calibrator can be fitted on one class; pass scores through
        # so the caller can keep going, as the sibling metric helpers do.
        warnings.warn("calibrator_from_dev: dev labels contain a single class; "
                      "returning raw scores uncalibrated.")
        return (lambda p: np.asarray(p, dtype=float)), 'identity'

    if len(y_dev) >= 200 and n_classes == 2:
        try:
            iso = IsotonicRegression(out_of_bounds='clip')
            iso.fit(p_dev, y_dev)
            return (lambda p: iso.transform(p)), 'isotonic'
        except Exception as exc:
            # Best-effort: fall back to Platt scaling, but make the reason visible.
            warnings.warn(f"calibrator_from_dev: isotonic fit failed ({exc!r}); "
                          "falling back to Platt scaling.")

    lr = LogisticRegression(max_iter=1000)
    lr.fit(p_dev.reshape(-1,1), y_dev)
    return (lambda p: lr.predict_proba(np.asarray(p).reshape(-1,1))[:,1]), 'platt'

def safe_roc(y,p):
    """ROC-AUC of scores ``p`` against labels ``y``; NaN unless ``y`` has exactly two classes."""
    if np.unique(y).size != 2:
        return np.nan
    return float(roc_auc_score(y,p))

In [8]:
# Train, calibrate and evaluate one LightGBM model per horizon.
HORIZONS = [7,14,30,45]
metrics_rows = []
thresholds = {'global':{}}

for T in HORIZONS:
    sub = labels[T].copy()         # labels[T] contains fe columns + 'y'
    tr, va, te, split_info = month_split(sub, 't')
    print(f"\n=== T={T} split:", split_info, "===")
    # The categorical encoder is fitted on train only and reused for val/test.
    X_tr, y_tr, enc, feats = prepare_xy(tr, None)
    X_va, y_va, _, _ = prepare_xy(va, enc)
    X_te, y_te, _, _ = prepare_xy(te, enc)

    # handle imbalance proportionally to val-set (or train); here from train
    pos = y_tr.sum(); neg = len(y_tr) - pos
    spw = float(neg/pos) if pos>0 else 1.0
    print(f"X_tr shape: {X_tr.shape} | X_va: {X_va.shape} | X_te: {X_te.shape}")

    params = dict(
        objective='binary', metric=['auc','average_precision'],
        learning_rate=0.05, num_leaves=63, min_data_in_leaf=50,
        feature_fraction=0.9, bagging_fraction=0.9, bagging_freq=1,
        max_depth=-1, verbose=-1, scale_pos_weight=spw
    )

    # safety: remove forced params
    # (no key of `params` above contains 'forced', so this is currently a no-op)
    for k in list(params.keys()):
        if 'forced' in k:
            params.pop(k, None)

    dtr = lgb.Dataset(X_tr, label=y_tr, free_raw_data=False)
    dva = lgb.Dataset(X_va, label=y_va, free_raw_data=False)
    model = lgb.train(
        params, dtr, num_boost_round=2000,
        valid_sets=[dva], valid_names=['valid'],
        callbacks=[lgb.early_stopping(stopping_rounds=100), lgb.log_evaluation(period=0)]
    )

    # calibration on validation
    # NOTE(review): the calibrator is fitted on the same validation data it is
    # then scored on, so val_brier / val_ece below are optimistic; the test_*
    # metrics are the out-of-sample ones.
    p_va_raw = model.predict(X_va, num_iteration=model.best_iteration)
    calibrate, cal_name = calibrator_from_dev(y_va, p_va_raw)
    p_va = calibrate(p_va_raw)

    # global threshold (F1 on val)
    thr = pick_f1_threshold(y_va, p_va)
    thresholds['global'][str(T)] = float(thr)

    # test
    p_te_raw = model.predict(X_te, num_iteration=model.best_iteration)
    p_te = calibrate(p_te_raw)

    # metrics
    row = dict(
        T=T, split=str(split_info[0]),
        val_pr_auc=float(average_precision_score(y_va, p_va)) if y_va.sum()>0 else 0.0,
        val_roc_auc=safe_roc(y_va, p_va),
        val_brier=float(brier_score_loss(y_va, p_va)),
        val_ece=float(ece_score(y_va, p_va)),
        thr_global=thr,
        test_pr_auc=float(average_precision_score(y_te, p_te)) if y_te.sum()>0 else 0.0,
        test_roc_auc=safe_roc(y_te, p_te),
        test_brier=float(brier_score_loss(y_te, p_te)),
        test_ece=float(ece_score(y_te, p_te)),
        cal=cal_name,
        n_train=len(y_tr), n_val=len(y_va), n_test=len(y_te),
        pos_rate_train=float(y_tr.mean()), pos_rate_val=float(y_va.mean()), pos_rate_test=float(y_te.mean())
    )

    # confusion matrix at global threshold
    # (the threshold was chosen on validation and is applied unchanged to test)
    y_hat = (p_te >= thr).astype(int)
    cm = confusion_matrix(y_te, y_hat, labels=[0,1])
    tn, fp, fn, tp = cm.ravel()
    prec = tp/(tp+fp) if (tp+fp)>0 else 0.0
    rec  = tp/(tp+fn) if (tp+fn)>0 else 0.0
    f1   = 2*prec*rec/(prec+rec+1e-9) if (prec+rec)>0 else 0.0
    row.update(dict(test_precision=prec, test_recall=rec, test_f1=f1, test_tp=int(tp),
                    test_fp=int(fp), test_fn=int(fn), test_tn=int(tn)))

    print(f"T={T}  thr={thr:.3f}  PR-AUC(val)={row['val_pr_auc']:.3f}  "
          f"PR-AUC(test)={row['test_pr_auc']:.3f}  F1(test)={row['test_f1']:.3f}")
    print("Confusion (test) [rows y_true 0/1, cols y_hat 0/1]:\n", cm)

    # save per-T predictions
    pd.DataFrame({'p_val_raw':p_va_raw, 'p_val':p_va, 'y_val':y_va}).to_csv(OUT_DIR/f'p_hat_val_T{T}.csv', index=False)
    pd.DataFrame({'p_test_raw':p_te_raw, 'p_test':p_te, 'y_test':y_te}).to_csv(OUT_DIR/f'p_hat_test_T{T}.csv', index=False)

    metrics_rows.append(row)

# summary table + thresholds.json
metrics_df = pd.DataFrame(metrics_rows).sort_values('T')
metrics_df.to_csv(OUT_DIR/'repurchase_weekly_propensity_metrics.csv', index=False)
with open(OUT_DIR/'thresholds.json','w') as f:
    json.dump({'global': thresholds['global']}, f, indent=2)

# `display` is not defined in this file; presumably the notebook (IPython)
# built-in — TODO confirm before running this as a plain script.
display(metrics_df.round(4))
print('Saved:', OUT_DIR/'repurchase_weekly_propensity_metrics.csv')
print('Saved:', OUT_DIR/'thresholds.json')


=== T=7 split: ('months', ['2024-12', '2025-01', '2025-02', '2025-03', '2025-04', '2025-05'], '2025-06', '2025-07') ===
[prepare_xy] num=31 cat=1 examples=2792 | first few num: ['recency_days', 'pur_7d', 'bp_7d', 'chk_7d', 'addr_7d']
[prepare_xy] num=31 cat=1 examples=4817 | first few num: ['recency_days', 'pur_7d', 'bp_7d', 'chk_7d', 'addr_7d']
[prepare_xy] num=31 cat=1 examples=2968 | first few num: ['recency_days', 'pur_7d', 'bp_7d', 'chk_7d', 'addr_7d']
X_tr shape: (2792, 32) | X_va: (4817, 32) | X_te: (2968, 32)
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[42]	valid's auc: 0.775759	valid's average_precision: 0.699733
T=7  thr=0.326  PR-AUC(val)=0.692  PR-AUC(test)=0.755  F1(test)=0.718
Confusion (test) [rows y_true 0/1, cols y_hat 0/1]:
 [[1027  568]
 [ 287 1086]]

=== T=14 split: ('months', ['2024-12', '2025-01', '2025-02', '2025-03', '2025-04', '2025-05'], '2025-06', '2025-07') ===
[prepare_xy] num=31 cat=1 examples=2792 | fi

Unnamed: 0,T,split,val_pr_auc,val_roc_auc,val_brier,val_ece,thr_global,test_pr_auc,test_roc_auc,test_brier,...,pos_rate_train,pos_rate_val,pos_rate_test,test_precision,test_recall,test_f1,test_tp,test_fp,test_fn,test_tn
0,7,months,0.6919,0.779,0.1782,0.0,0.3261,0.7548,0.7977,0.1949,...,0.4158,0.3789,0.4626,0.6566,0.791,0.7175,1086,568,287,1027
1,14,months,0.7998,0.7933,0.1831,0.0,0.4286,0.8548,0.8292,0.1733,...,0.5534,0.5288,0.59,0.7833,0.8357,0.8087,1211,335,238,672
2,30,months,0.8616,0.7156,0.1543,0.0,0.6025,0.829,0.7422,0.1821,...,0.4836,0.7701,0.6919,0.6919,1.0,0.8179,3333,1484,0,0
3,45,months,0.8441,0.5717,0.1449,0.0,0.7936,0.7762,0.5547,0.1869,...,0.6432,0.8195,0.7533,0.7533,1.0,0.8593,2192,718,0,0


Saved: outputs_b_section/repurchase_weekly_propensity_metrics.csv
Saved: outputs_b_section/thresholds.json


In [None]:
# === Config, at-risk gate, feature enrichment, rebuild labels ===
import numpy as np
import pandas as pd
from pathlib import Path

# Horizons & at-risk gates (days of inactivity required at anchor)
HORIZONS   = [7, 14, 30, 45]
AT_RISK_MIN = {7: 0, 14: 0, 30: 7, 45: 14}   # tweak as needed

# Preconditions / safety checks 
# Fail fast if the earlier feature-building cells have not produced the
# columns this cell relies on.
req_fe_cols  = {'user_id','t','recency_days'}
req_pur_cols = {'user_id','purchase_time','serviceType'}
if not req_fe_cols.issubset(fe.columns):
    raise KeyError(f"`fe` must contain {sorted(req_fe_cols)}; found: {list(fe.columns)}")
if not req_pur_cols.issubset(PUR.columns):
    raise KeyError(f"`PUR` must contain {sorted(req_pur_cols)}; found: {list(PUR.columns)}")

# Normalize dtypes & timezone awareness 
# Work on copies; user_id is cast to str in both tables so that merges and
# dict lookups below use the same key type.
fe = fe.copy()
PUR = PUR.copy()
fe['user_id']  = fe['user_id'].astype(str)
PUR['user_id'] = PUR['user_id'].astype(str)
fe['t'] = pd.to_datetime(fe['t'], utc=True, errors='coerce')
PUR['purchase_time'] = pd.to_datetime(PUR['purchase_time'], utc=True, errors='coerce')

# de-dup per (user, timestamp, serviceType)
PUR = PUR.drop_duplicates(subset=['user_id','purchase_time','serviceType'], keep='first')


# 1) Cadence features per user (robust; no arrays)
# NOTE(review): these aggregates are computed over each user's entire purchase
# history in PUR and later merged on user_id only, so every anchor of a user
# receives the same values — including information from purchases made after
# the anchor time. Confirm whether this look-ahead is acceptable for the
# evaluation, or restrict the aggregation to purchases before each anchor.
p = (PUR.dropna(subset=['user_id','purchase_time'])
        .sort_values(['user_id','purchase_time'], kind='mergesort'))
# Interpurchase interval (days) – NaN for each user's first purchase
p['ip_days'] = p.groupby('user_id')['purchase_time'].diff().dt.total_seconds() / 86400.0

# Aggregate over actual intervals (exclude NaNs)
cad = (p.dropna(subset=['ip_days'])
         .groupby('user_id', sort=True)['ip_days']
         .agg(ip_mean_days='mean',
              ip_std_days='std',
              ip_median_days='median',
              ip_min_days='min',
              ip_max_days='max',
              ip_count='count')
         .reset_index())

# Ensure all users are present (fill users with <2 purchases)
all_users = pd.DataFrame({'user_id': pd.Index(
    sorted(set(fe['user_id']) | set(PUR['user_id']))
)})
# Missing statistics are filled with 0, so "no interval observed" and a
# genuine 0-day interval are indistinguishable except through ip_count.
cad = (all_users.merge(cad, on='user_id', how='left')
               .fillna({'ip_mean_days': 0.0,
                        'ip_std_days': 0.0,
                        'ip_median_days': 0.0,
                        'ip_min_days': 0.0,
                        'ip_max_days': 0.0,
                        'ip_count': 0}))

# 2) Lifetime service diversity: #services + entropy
# (also computed over the whole of PUR per user; same look-ahead caveat as above)
svc_counts = (PUR.groupby(['user_id','serviceType'])
                .size().rename('cnt').reset_index())
if not svc_counts.empty:
    svc_total  = svc_counts.groupby('user_id')['cnt'].sum().rename('tot')
    svc_probs  = svc_counts.merge(svc_total, on='user_id')
    svc_probs['p'] = svc_probs['cnt'] / svc_probs['tot'].replace(0, np.nan)
    # entropy = - Σ p log p
    # (natural log; the 1e-12 guards against log(0))
    svc_entropy = (svc_probs.assign(h=lambda r: -(r['p'] * np.log(r['p'] + 1e-12)))
                            .groupby('user_id', sort=True)['h'].sum()
                            .rename('svc_entropy').reset_index())
    svc_nuniq   = (svc_counts.groupby('user_id', sort=True).size()
                               .rename('n_services_lifetime').reset_index())
else:
    # No purchases at all: every user gets zero entropy and zero services.
    svc_entropy = pd.DataFrame({'user_id': all_users['user_id'], 'svc_entropy': 0.0})
    svc_nuniq   = pd.DataFrame({'user_id': all_users['user_id'], 'n_services_lifetime': 0})

# Combine cadence + diversity
cad_all = (cad.merge(svc_entropy, on='user_id', how='left')
              .merge(svc_nuniq,   on='user_id', how='left')
              .fillna({'svc_entropy': 0.0, 'n_services_lifetime': 0}))

# 3) Seasonality features (at anchor time t)
fe['month']      = fe['t'].dt.month.astype(int)
fe['weekofyear'] = fe['t'].dt.isocalendar().week.astype(int)
# Cyclical encodings: month on a 12-step circle, ISO week on a 52-step circle.
fe['month_sin']  = np.sin(2*np.pi*(fe['month']/12.0))
fe['month_cos']  = np.cos(2*np.pi*(fe['month']/12.0))
fe['woy_sin']    = np.sin(2*np.pi*(fe['weekofyear']/52.0))
fe['woy_cos']    = np.cos(2*np.pi*(fe['weekofyear']/52.0))

# Merge all user-level features into fe
# (left join on user_id; column sets are compared to report what was added)
before_cols = set(fe.columns)
fe = fe.merge(cad_all, on='user_id', how='left')
added_cols = [c for c in fe.columns if c not in before_cols]
print(f"[enrichment] Added columns: {added_cols}")
print(f"[enrichment] fe shape: {fe.shape}")

# 4) Rebuild labels (vectorized) for “purchase within T days”
# Right-censor anchors that don't have T days of future data
# Apply at-risk gate per T: recency_days ≥ AT_RISK_MIN[T]
# Precompute per-user purchase timestamps (ns)
p_ns_by_user = {
    u: g['purchase_time'].astype('int64').to_numpy()
    for u, g in PUR.sort_values(['user_id','purchase_time'], kind='mergesort') \
                   .groupby('user_id', sort=True)
}

fe_sorted = fe.sort_values(['user_id','t'], kind='mergesort').reset_index(drop=True)
t_all_ns  = fe_sorted['t'].astype('int64').to_numpy()
# Latest purchase in the data; treated as the end of the observation window.
max_pt_ns = int(PUR['purchase_time'].astype('int64').max())

labels = {}
for T in HORIZONS:
    T_ns = int(T * 86400 * 1e9)  # days -> ns
    # Right-censor: require enough future horizon
    keep = t_all_ns <= (max_pt_ns - T_ns)
    sub  = fe_sorted.loc[keep].copy()

    # At-risk gate by recency
    gate = int(AT_RISK_MIN.get(T, 0))
    if gate > 0:
        if 'recency_days' not in sub.columns:
            raise KeyError("`recency_days` missing in fe; required for at-risk gate.")
        # Rows with NaN recency fail this comparison and are dropped as well.
        sub = sub[sub['recency_days'] >= gate].copy()

    # Vectorized label construction per user
    y = np.zeros(len(sub), dtype=np.int8)
    t_ns = sub['t'].astype('int64').to_numpy()
    idxs_by_user = sub.groupby('user_id', sort=True).indices # {user: [idxs in sub]}
    for u, idxs in idxs_by_user.items():
        pts = p_ns_by_user.get(u)
        if pts is None or pts.size == 0:
            continue  # user never purchased: labels stay 0
        # index of first purchase strictly after anchor time
        j = np.searchsorted(pts, t_ns[idxs], side='right')
        ok = j < pts.size
        # purchase occurs within T days?
        y_u = np.zeros(len(idxs), dtype=np.int8)
        y_u[ok] = ((pts[j[ok]] - t_ns[idxs][ok]) <= T_ns).astype(np.int8)
        y[idxs] = y_u

    sub['y'] = y
    labels[T] = sub

    print(f"[labels] T={T}: n={len(sub):,} | positives={int(y.sum()):,} "
          f"({y.mean():.3%}) | at_risk≥{AT_RISK_MIN[T]}d | feats={sub.shape[1]-3} (excl user_id,t,y)")

[enrichment] Added columns: ['ip_mean_days', 'ip_std_days', 'ip_median_days', 'ip_min_days', 'ip_max_days', 'ip_count', 'svc_entropy', 'n_services_lifetime']
[enrichment] fe shape: (10616, 50)
[labels] T=7: n=10,577 | positives=4,359 (41.212%) | at_risk≥0d | feats=48 (excl user_id,t,y)
[labels] T=14: n=10,065 | positives=5,541 (55.052%) | at_risk≥0d | feats=48 (excl user_id,t,y)
[labels] T=30: n=4,324 | positives=2,439 (56.406%) | at_risk≥7d | feats=48 (excl user_id,t,y)
[labels] T=45: n=2,170 | positives=1,221 (56.267%) | at_risk≥14d | feats=48 (excl user_id,t,y)


In [None]:
# === Manual thresholds per horizon ===
import json, numpy as np, pandas as pd
from pathlib import Path
from sklearn.metrics import confusion_matrix, average_precision_score, roc_auc_score, brier_score_loss

# Analyst-chosen decision thresholds, keyed by horizon T in days
# (tune based on business needs).
HORIZONS = [7, 14, 30, 45]
MANUAL_THR = {
    7: 0.48,
    14: 0.49,
    30: 0.63,
    45: 0.9,
}

# Directories: PRED_DIR is the first candidate that exists on disk, falling
# back to the last candidate when none does; OUT_DIR is created on demand.
# NOTE(review): both candidates are currently the same path -- confirm whether
# a second location was intended.
DIRS_TRY = [
    Path('outputs_b_section'),
    Path('outputs_b_section'),
]
_existing_dirs = [candidate for candidate in DIRS_TRY if candidate.exists()]
PRED_DIR = _existing_dirs[0] if _existing_dirs else DIRS_TRY[-1]
OUT_DIR = Path('outputs_b_section')
OUT_DIR.mkdir(parents=True, exist_ok=True)

def _load_preds(T: int):
    # Try CatBoost file first, then LightGBM
    cat = PRED_DIR / f'p_hat_test_T{T}.cat.csv'
    lgb = PRED_DIR / f'p_hat_test_T{T}.csv'
    if cat.exists():
        df = pd.read_csv(cat)
        y = df['y_test'].astype(int).to_numpy()
        p = df['p_test'].astype(float).to_numpy()
        src = 'cat'
    elif lgb.exists():
        df = pd.read_csv(lgb)
        y = df['y_test'].astype(int).to_numpy()
        p = df['p_test'].astype(float).to_numpy()
        src = 'lgb'
    else:
        raise FileNotFoundError(f"No predictions found for T={T} in {PRED_DIR}")
    return y, p, df, src

def _month_split(df_sub, time_col='t'):
    mkey = df_sub[time_col].dt.tz_convert('UTC').dt.strftime('%Y-%m')
    months = sorted(mkey.unique())
    if len(months) < 3:
        tmin, tmax = df_sub[time_col].min(), df_sub[time_col].max()
        cut1 = tmin + (tmax - tmin)*0.6
        cut2 = tmin + (tmax - tmin)*0.8
        tr = df_sub[df_sub[time_col] < cut1]
        va = df_sub[(df_sub[time_col] >= cut1) & (df_sub[time_col] < cut2)]
        te = df_sub[df_sub[time_col] >= cut2]
        return tr, va, te
    test_m = months[-1]; val_m = months[-2]; train_m = months[:-2]
    tr = df_sub[mkey.isin(train_m)]
    va = df_sub[mkey == val_m]
    te = df_sub[mkey == test_m]
    return tr, va, te

def _confusion_at(y, p, thr):
    """Compute the confusion matrix and derived metrics at one threshold.

    A row is flagged (predicted positive) when its score is ``>= thr``.

    Args:
        y: true binary labels (0/1).
        p: predicted scores, same length as ``y``.
        thr: decision threshold applied to ``p``.

    Returns:
        Tuple ``(cm, metrics)``. ``cm`` is the 2x2 confusion matrix with rows
        = true 0/1 and columns = predicted 0/1. ``metrics`` is a dict with
        keys ``precision``, ``recall``, ``f1``, ``specificity``, ``tp``,
        ``fp``, ``fn``, ``tn``, ``flagged``, ``n`` and ``flagged_rate``.
        Ratios whose denominator is zero are reported as 0.0. All values are
        built-in ``int``/``float`` so they print and serialize cleanly.
    """
    yhat = (np.asarray(p) >= thr).astype(int)
    cm = confusion_matrix(y, yhat, labels=[0, 1])
    # Unpack as built-in ints so the ratios below are plain Python floats
    # rather than numpy scalars (which repr as ``np.float64(...)``).
    tn, fp, fn, tp = (int(v) for v in cm.ravel())
    prec = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    rec = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    # F1 straight from the counts: identical to 2PR/(P+R), with no epsilon.
    f1 = 2 * tp / (2 * tp + fp + fn) if tp > 0 else 0.0
    spec = tn / (tn + fp) if (tn + fp) > 0 else 0.0
    return cm, dict(precision=prec, recall=rec, f1=f1, specificity=spec,
                    tp=tp, fp=fp, fn=fn, tn=tn,
                    flagged=int(yhat.sum()), n=len(y),
                    flagged_rate=float((yhat == 1).mean()))

# Evaluate every horizon at its manual threshold, print the metrics, and write
# one CSV of flagged test rows per horizon. `rows` collects the summary records.
rows = []
for T in HORIZONS:
    thr = float(MANUAL_THR[T])
    y, p, _pred, src = _load_preds(T)

    # Best-effort: attach ids & times if labels[T] is present in the notebook.
    # Any failure (labels undefined, horizon missing, split error) is reported
    # and the flagged list is written without ids/t instead of aborting.
    te = None
    try:
        _, _, te_candidate = _month_split(labels[T].copy(), 't')
    except Exception as exc:
        print(f"[warn] T={T}: could not rebuild test slice "
              f"({type(exc).__name__}: {exc}); writing without ids/t.")
    else:
        if len(te_candidate) != len(y):
            print(f"[warn] T={T}: test slice length {len(te_candidate)} != preds {len(y)}; writing without ids/t.")
        elif ('y' in te_candidate.columns
              and not np.array_equal(te_candidate['y'].to_numpy(), y)):
            # Equal length alone does not prove the rows line up; differing
            # labels show the slice is not the one the predictions came from.
            print(f"[warn] T={T}: test slice labels differ from y_test; writing without ids/t.")
        else:
            te = te_candidate.reset_index(drop=True)

    cm, m = _confusion_at(y, p, thr)
    base_rate = float(y.mean())
    ap = float(average_precision_score(y, p))
    # ROC AUC is undefined unless both classes are present.
    roc = float(roc_auc_score(y, p)) if np.unique(y).size == 2 else np.nan
    brier = float(brier_score_loss(y, p))

    print(f"\n=== T={T} | model={src} | manual thr={thr:.3f} ===")
    print("Confusion [rows y_true 0/1, cols y_hat 0/1]:\n", cm)
    # float(...) strips numpy scalar types so the dict prints plain numbers.
    shown = {k: (round(float(v), 3) if isinstance(v, float) else v)
             for k, v in m.items()}
    print(shown,
          "| base_rate=", round(base_rate, 3), "| AP=", round(ap, 3),
          "| ROC=", ('nan' if np.isnan(roc) else round(roc, 3)),
          "| Brier=", round(brier, 3))

    # save flagged list
    if te is not None:
        out_df = te[['user_id', 't']].copy()
    else:
        out_df = pd.DataFrame({'user_id': np.nan, 't': np.nan}, index=range(len(y)))
    out_df['p'] = p
    out_df['y'] = y
    out_df['flag'] = (p >= thr).astype(int)
    out_path = OUT_DIR / f'flagged_test_T{T}_manual_thr{thr:.3f}.{src}.csv'
    out_df.to_csv(out_path, index=False)
    print("Saved flagged:", out_path)

    rows.append({'T': T, 'model': src, 'threshold': thr, **m,
                 'base_rate': base_rate, 'ap_test': ap, 'roc_test': roc, 'brier_test': brier})

# Summary table (one row per horizon, ordered by T), then persist the manual
# thresholds under the 'global' key of the operational thresholds file.
summary = pd.DataFrame(rows).sort_values('T')
display(summary.round(4))

_thr_path = PRED_DIR / 'thresholds_operational.json'
_thr_payload = {
    'global': {str(T): float(MANUAL_THR[T]) for T in HORIZONS},
    'note': 'manual thresholds chosen by analyst',
}
# write_text replaces the whole file with this payload.
_thr_path.write_text(json.dumps(_thr_payload, indent=2))
print("Updated:", _thr_path)


=== T=7 | model=lgb | manual thr=0.480 ===
Confusion [rows y_true 0/1, cols y_hat 0/1]:
 [[1465  130]
 [ 809  564]]
{'precision': np.float64(0.813), 'recall': np.float64(0.411), 'f1': np.float64(0.546), 'specificity': np.float64(0.918), 'tp': 564, 'fp': 130, 'fn': 809, 'tn': 1465, 'flagged': 694, 'n': 2968, 'flagged_rate': 0.234} | base_rate= 0.463 | AP= 0.755 | ROC= 0.798 | Brier= 0.195
Saved flagged: outputs_b_section/flagged_test_T7_manual_thr0.480.lgb.csv

=== T=14 | model=lgb | manual thr=0.490 ===
Confusion [rows y_true 0/1, cols y_hat 0/1]:
 [[ 713  294]
 [ 283 1166]]
{'precision': np.float64(0.799), 'recall': np.float64(0.805), 'f1': np.float64(0.802), 'specificity': np.float64(0.708), 'tp': 1166, 'fp': 294, 'fn': 283, 'tn': 713, 'flagged': 1460, 'n': 2456, 'flagged_rate': 0.594} | base_rate= 0.59 | AP= 0.855 | ROC= 0.829 | Brier= 0.173
Saved flagged: outputs_b_section/flagged_test_T14_manual_thr0.490.lgb.csv
[warn] T=30: test slice length 2989 != preds 4817; writing without i

Unnamed: 0,T,model,threshold,precision,recall,f1,specificity,tp,fp,fn,tn,flagged,n,flagged_rate,base_rate,ap_test,roc_test,brier_test
0,7,lgb,0.48,0.8127,0.4108,0.5457,0.9185,564,130,809,1465,694,2968,0.2338,0.4626,0.7548,0.7977,0.1949
1,14,lgb,0.49,0.7986,0.8047,0.8017,0.708,1166,294,283,713,1460,2456,0.5945,0.59,0.8548,0.8292,0.1733
2,30,lgb,0.63,0.8611,0.6994,0.7719,0.7466,2331,376,1002,1108,2707,4817,0.562,0.6919,0.829,0.7422,0.1821
3,45,lgb,0.9,0.8795,0.1898,0.3122,0.9206,416,57,1776,661,473,2910,0.1625,0.7533,0.7762,0.5547,0.1869


Updated: outputs_b_section/thresholds_operational.json
