
# Repurchase & Next-Purchase — v3 (No Quality Checks)

This version **removes all quality checks** (no status/amount filters, no burst-dedup).  
It keeps:

- **Extended taxonomy** (Walk / Sitting, with WalkAndCare mapped to Walk and Grooming mapped to Other; case-insensitive aliases)
- **Repurchase & inter-purchase intervals**
- **Next-purchase predictions** (recency-weighted transitions + personalized shrinkage)
- **Time-based model validation** (Top-1 / Hit@K / MRR@K)


In [1]:
# === Config ===
from pathlib import Path

# Directory holding the input CSV exports (point this at your own copy,
# e.g. Path('/mnt/data')).
DATA_DIR = Path(
    '/Users/tree/Projects/tubitak-ai-agent/tubitakaiagentprojeleriiinverisetleri'
)

# Input file name and the column names the loader looks for.
PURCHASE_FILE = 'Purchase.csv'
TIME_COL = 'ordercreatedtime'
# Candidate column names, tried in the order listed.
USER_ID_CANDIDATES = [
    'ownerid',
    'user_id',
    'id',
]
SERVICETYPE_CANDS = [
    'serviceType',
    'servicetype',
    'service_type',
]

# Service-type taxonomy; every entry is lowercase because service types are
# lowercased before lookup.
WALK_SET = {
    'adhoc',
    'planned',
    'package',
    'customize',
    'walking',
    'walk',
    'walkandcare',
    'walk & care',
    'walk_and_care',
}
SITTING_SET = {
    'boarding',
    'sitting',
    'catboarding',
    'catsitting',
}
# Known service types that belong to neither group above; they end up in the
# 'Other' category.
OTHER_KNOWN = {'grooming'}

# Modeling parameters.
CHURN_THRESHOLD_DAYS = 90
HALFLIFE_DAYS = 90          # half-life (days) of the recency weights on transitions
PERSONALIZATION_M = 5       # strength of shrinkage toward the global prior
TOPK = 3                    # number of ranked predictions returned

# Time-based validation: the final HOLDOUT_DAYS days form the test set.
HOLDOUT_DAYS = 30

In [2]:
# === Helpers ===
import pandas as pd
import numpy as np

def coalesce_id(df, candidates):
    """Return the first column of ``df`` whose name appears in ``candidates``.

    Candidate names are tried in the order given. When none of them is a
    column of ``df``, an all-NaN float Series is returned instead.

    The fallback Series shares ``df``'s index, so assigning it back onto
    ``df`` lines up row-for-row even when ``df`` does not have a default
    0..n-1 index (for example after filtering).
    """
    for name in candidates:
        if name in df.columns:
            return df[name]
    return pd.Series(np.nan, index=df.index, dtype='float64')

def parse_time_utc(s, local_tz='Europe/Istanbul'):
    """Parse a Series of timestamp values into tz-aware UTC datetimes.

    Values that cannot be parsed become NaT (``errors='coerce'``).

    Parameters:
        s: pandas Series of timestamp strings / datetime-like values.
        local_tz: time zone used by the fallback branch below to localize
            naive timestamps before converting them to UTC.

    Returns:
        A Series of datetimes in UTC.
    """
    # The deprecated ``infer_datetime_format`` flag is no longer passed; it
    # only triggered a warning on current pandas.
    ts = pd.to_datetime(s, errors='coerce', utc=True)
    # NOTE(review): with utc=True the parsed result is already UTC-aware, so
    # this branch looks unreachable and naive inputs are effectively read as
    # UTC rather than ``local_tz``. Confirm which interpretation is intended
    # before changing it, since doing so would shift every timestamp.
    if getattr(ts.dt, 'tz', None) is None:
        ts = pd.to_datetime(s, errors='coerce')
        ts = ts.dt.tz_localize(local_tz, ambiguous='NaT', nonexistent='NaT').dt.tz_convert('UTC')
    return ts

def norm_service(series: pd.Series) -> pd.Series:
    """Normalize raw service-type labels.

    Each value is cast to ``str``, stripped of surrounding whitespace and
    lowercased. The textual placeholders 'nan', 'none', 'nat' and the empty
    string are then turned into NaN, and the spelling variants
    'walk & care' / 'walk_and_care' are collapsed to 'walkandcare'.
    """
    missing_tokens = dict.fromkeys(('nan', 'none', 'nat', ''), np.nan)
    alias_map = {
        'walk & care': 'walkandcare',
        'walk_and_care': 'walkandcare',
    }

    cleaned = series.astype(str)
    cleaned = cleaned.str.strip()
    cleaned = cleaned.str.lower()
    # Placeholders first, then aliases, as two separate replacement passes.
    cleaned = cleaned.replace(missing_tokens)
    return cleaned.replace(alias_map)

def map_category(st: str) -> str:
    """Map a service-type label to its category.

    Returns NaN for a missing label, 'Walk' when the lowercased label is in
    WALK_SET, 'Sitting' when it is in SITTING_SET, and 'Other' for anything
    else.
    """
    if pd.isna(st):
        return np.nan
    key = str(st).lower()
    # Checked in this order: Walk first, then Sitting.
    for label, members in (('Walk', WALK_SET), ('Sitting', SITTING_SET)):
        if key in members:
            return label
    return 'Other'

def days_between(t1, t0):
    return (t1 - t0).dt.total_seconds()/(24*3600)

def recency_weight(age_days, halflife_days):
    """Exponential-decay weight: 0.5 ** (age / half-life).

    Negative ages are clipped to zero (weight 1.0), and the half-life is
    floored at 1e-9 so the division never hits zero.
    """
    safe_halflife = max(halflife_days, 1e-9)
    non_negative_age = np.clip(age_days, 0, None)
    halflives_elapsed = non_negative_age / safe_halflife
    return np.power(0.5, halflives_elapsed)

def excel_safe(df, to_tz='Europe/Istanbul'):
    """Return a copy of ``df`` whose tz-aware datetime columns are made naive.

    Each timezone-aware datetime column is converted to ``to_tz`` and then has
    its timezone information removed, leaving wall-clock times in that zone.
    All other columns are copied unchanged, and ``df`` itself is not modified.

    Parameters:
        df: DataFrame to prepare for export.
        to_tz: target time zone applied before the tz information is removed.

    Returns:
        A new DataFrame.
    """
    out = df.copy()
    # Checking the dtype directly replaces the deprecated
    # ``pd.api.types.is_datetime64tz_dtype`` helper.
    for col, dtype in out.dtypes.items():
        if isinstance(dtype, pd.DatetimeTZDtype):
            out[col] = out[col].dt.tz_convert(to_tz).dt.tz_localize(None)
    return out


In [3]:
# === Load purchases (no quality filters) ===
from pathlib import Path

purchase_csv = Path(DATA_DIR) / PURCHASE_FILE
p = pd.read_csv(purchase_csv)

# User id: first matching candidate column, as text, with the literal 'nan'
# restored to a real missing value.
raw_ids = coalesce_id(p, USER_ID_CANDIDATES)
p['user_id'] = raw_ids.astype(str).replace({'nan': np.nan})

# Purchase timestamp: the configured time column is mandatory.
if TIME_COL not in p.columns:
    raise KeyError(f"Time column '{TIME_COL}' not found in {PURCHASE_FILE}")
p['purchase_time'] = parse_time_utc(p[TIME_COL])

# Service type: keep the raw label, a normalized label, and its category.
st_col = next((name for name in SERVICETYPE_CANDS if name in p.columns), None)
if st_col:
    p['serviceType_raw'] = p[st_col]
else:
    p['serviceType_raw'] = np.nan
p['serviceType'] = norm_service(p['serviceType_raw'])
p['category'] = p['serviceType'].apply(map_category)

# Rows lacking a user id or a parsable timestamp are discarded.
has_keys = p['user_id'].notna() & p['purchase_time'].notna()
p = p[has_keys].copy()
p['user_id'] = p['user_id'].astype(str)

# Order each user's purchases chronologically.
p = p.sort_values(by=['user_id', 'purchase_time'], ignore_index=True)

first_seen = p['purchase_time'].min()
last_seen = p['purchase_time'].max()
print('Rows:', len(p), '| Users:', p['user_id'].nunique(), '| Time range:', first_seen, '→', last_seen)
p.head()

Rows: 28701 | Users: 1614 | Time range: 2024-12-24 21:41:02.022000+00:00 → 2025-08-04 09:34:52.801000+00:00


  ts = pd.to_datetime(s, errors='coerce', utc=True, infer_datetime_format=True)


Unnamed: 0,serviceid,ownerid,ordercreatedtime,servicetype,user_id,purchase_time,serviceType_raw,serviceType,category
0,d2ca43d6-737f-4bd6-9482-3149127453da,005fc863-7c9c-42ce-af56-38fed3c545f3,2025-05-14 18:20:44.706000,AdHoc,005fc863-7c9c-42ce-af56-38fed3c545f3,2025-05-14 18:20:44.706000+00:00,AdHoc,adhoc,Walk
1,3a2b4eeb-7366-4ab8-b64f-da683dc22a6e,005fc863-7c9c-42ce-af56-38fed3c545f3,2025-05-15 18:58:14.754000,AdHoc,005fc863-7c9c-42ce-af56-38fed3c545f3,2025-05-15 18:58:14.754000+00:00,AdHoc,adhoc,Walk
2,2a88a44d-7e9b-4c58-9eec-44e7c50d3d0b,00bc3aed-8e44-4a3f-8a87-c24e9c72106f,2025-05-23 18:28:44.253000,AdHoc,00bc3aed-8e44-4a3f-8a87-c24e9c72106f,2025-05-23 18:28:44.253000+00:00,AdHoc,adhoc,Walk
3,77dfb189-b068-4034-a94d-9f62db799930,00bc3aed-8e44-4a3f-8a87-c24e9c72106f,2025-06-11 17:34:34.581000,AdHoc,00bc3aed-8e44-4a3f-8a87-c24e9c72106f,2025-06-11 17:34:34.581000+00:00,AdHoc,adhoc,Walk
4,0e3f614f-dde4-49e9-b297-f295033dc2c4,00bc3aed-8e44-4a3f-8a87-c24e9c72106f,2025-06-24 17:06:10.411000,AdHoc,00bc3aed-8e44-4a3f-8a87-c24e9c72106f,2025-06-24 17:06:10.411000+00:00,AdHoc,adhoc,Walk


In [4]:
# === Repurchase & intervals (raw) ===
# Number of purchase rows per user.
user_counts = p.groupby('user_id').size()
user_counts = user_counts.rename('purchase_count')

buyers = int(user_counts.ge(1).sum())
repeat_buyers = int(user_counts.ge(2).sum())
if buyers:
    repurchase_rate = repeat_buyers / buyers
else:
    repurchase_rate = np.nan
repurchase_summary = pd.DataFrame({
    'buyers': [buyers],
    'repeat_buyers': [repeat_buyers],
    'repurchase_rate': [repurchase_rate],
})

# Gap (in days) between each purchase and the same user's previous one;
# a user's first purchase has no previous row and is left out of `intervals`.
p['prev_time'] = p.groupby('user_id')['purchase_time'].shift(1)
p['delta_days'] = days_between(p['purchase_time'], p['prev_time'])
intervals = p[p['delta_days'].notna()].copy()

def statblock(s):
    """Summarize a numeric Series of day counts.

    Missing values are ignored. Returns a Series with the entries 'count',
    'mean_days', 'median_days', 'p25_days', 'p75_days', 'min_days' and
    'max_days'; every statistic except the count is NaN when nothing remains
    after dropping missing values.
    """
    values = s.dropna()
    n = int(values.size)
    stat_names = ['mean_days', 'median_days', 'p25_days', 'p75_days', 'min_days', 'max_days']
    if n:
        stats = [
            values.mean(),
            values.median(),
            values.quantile(0.25),
            values.quantile(0.75),
            values.min(),
            values.max(),
        ]
        summary = {name: float(stat) for name, stat in zip(stat_names, stats)}
    else:
        summary = dict.fromkeys(stat_names, np.nan)
    return pd.Series({'count': n, **summary})

# Stats over every inter-purchase gap.
overall_interval_stats = statblock(intervals['delta_days']).to_frame().T
overall_interval_stats.insert(0, 'scope', 'all_intervals')

# Stats over each user's most recent gap only (last vs. second-to-last purchase).
last_purchase = p.groupby('user_id').tail(1).copy()
second_last = p.groupby('user_id').nth(-2).reset_index()
prev_times = second_last[['user_id', 'purchase_time']].rename(
    columns={'purchase_time': 'prev_purchase_time'}
)
last_interval = last_purchase.merge(prev_times, on='user_id', how='left')
last_interval['last_delta_days'] = days_between(
    last_interval['purchase_time'], last_interval['prev_purchase_time']
)
# Users with a single purchase have no previous time and drop out here.
last_interval = last_interval.dropna(subset=['last_delta_days'])
last_interval_stats = statblock(last_interval['last_delta_days']).to_frame().T
last_interval_stats.insert(0, 'scope', 'last_interval_per_user')

interval_stats = pd.concat(
    [overall_interval_stats, last_interval_stats],
    ignore_index=True,
)

repurchase_summary, interval_stats.head()

(   buyers  repeat_buyers  repurchase_rate
 0    1614           1279         0.792441,
                     scope    count  mean_days  median_days  p25_days  \
 0           all_intervals  27087.0   2.022260     0.000000  0.000000   
 1  last_interval_per_user   1279.0   6.682605     1.160648  0.000841   
 
    p75_days  min_days    max_days  
 0  1.023671       0.0  137.020692  
 1  7.860352       0.0   84.976695  )

In [5]:
# === Train/Test split & transitions ===
# Time-based split: everything up to `train_end` trains the model, the final
# HOLDOUT_DAYS days are held out for testing.
max_time = p['purchase_time'].max()
train_end = max_time - pd.Timedelta(days=HOLDOUT_DAYS)
train = p.loc[p['purchase_time'].le(train_end)].copy()
test  = p.loc[p['purchase_time'].gt(train_end)].copy()

for label, part in (("Train range:", train), ("Test  range:", test)):
    print(label, part['purchase_time'].min(), "→", part['purchase_time'].max())

def make_transitions(df):
    """Pair every purchase with the same user's following purchase.

    Adds 'next_service', 'this_time' and 'next_time' to a copy of ``df``
    (rows are paired in their existing order within each user) and keeps only
    rows where the current service, next service and both timestamps are all
    present. ``df`` itself is left untouched.
    """
    per_user = df.groupby('user_id')
    paired = df.assign(
        next_service=per_user['serviceType'].shift(-1),
        this_time=df['purchase_time'],
        next_time=per_user['purchase_time'].shift(-1),
    )
    required = ['serviceType', 'next_service', 'this_time', 'next_time']
    return paired.dropna(subset=required).copy()

train_t = make_transitions(train)
test_t  = make_transitions(test)

# Recency-weighted global prior from TRAIN: each transition is weighted by how
# long before the end of the training window it happened, then weights are
# normalized per current service to give P(next | current).
train_t['age_days'] = (train['purchase_time'].max() - train_t['this_time']).dt.total_seconds()/(24*3600)
train_t['w'] = recency_weight(train_t['age_days'], HALFLIFE_DAYS)
rw_counts = (train_t.groupby(['serviceType','next_service'])['w'].sum()
             .rename('w_count').reset_index())
rw_totals = rw_counts.groupby('serviceType')['w_count'].transform('sum')
rw_counts['prob'] = np.where(rw_totals > 0, rw_counts['w_count'] / rw_totals, np.nan)

# User-specific counts from TRAIN, weighted by recency relative to that user's
# own last training purchase.
last_time_by_user = train.groupby('user_id')['purchase_time'].max()
last_time_user = last_time_by_user.to_dict()
d = train_t.copy()
# Vectorized lookup instead of a row-wise apply: same values, no per-row Python
# call, and it also works when there are no training transitions at all.
d['age_days_user'] = (d['user_id'].map(last_time_by_user) - d['this_time']).dt.total_seconds()/(24*3600)
d['uw'] = recency_weight(d['age_days_user'], HALFLIFE_DAYS)

from collections import defaultdict
# (user_id, current service) -> {next service: summed recency weight}.
# One three-key groupby replaces a nested groupby inside a Python loop.
user_train_counts = {}
user_weights = d.groupby(['user_id','serviceType','next_service'])['uw'].sum()
for (uid, cur, nxt), weight in user_weights.items():
    user_train_counts.setdefault((uid, cur), {})[nxt] = float(weight)

# Global fallback list (most common in TRAIN)
global_order = train['serviceType'].value_counts(dropna=True).index.tolist()

# current service -> {next service: probability}
glob_prior = defaultdict(dict)
for cur, nxt, prob in zip(rw_counts['serviceType'], rw_counts['next_service'], rw_counts['prob']):
    glob_prior[cur][nxt] = prob
if not glob_prior:
    # NOTE(review): glob_prior is empty only when rw_counts has no rows, i.e.
    # train_t has no transitions, in which case this unweighted table is empty
    # as well. Kept for parity; confirm whether it can be dropped.
    tmp = train_t.groupby(['serviceType','next_service']).size().rename('count').reset_index()
    tot = tmp.groupby('serviceType')['count'].transform('sum')
    tmp['prob'] = tmp['count'] / tot
    for cur, nxt, prob in zip(tmp['serviceType'], tmp['next_service'], tmp['prob']):
        glob_prior[cur][nxt] = prob

Train range: 2024-12-24 21:41:02.022000+00:00 → 2025-07-05 09:33:34.610000+00:00
Test  range: 2025-07-05 09:52:18.412000+00:00 → 2025-08-04 09:34:52.801000+00:00


In [6]:
# === Prediction & Validation ===
def predict_next(current_service, user_id, m=PERSONALIZATION_M, topk=TOPK):
    """Rank the most likely next services for a user.

    The score of each candidate blends the user's own recency-weighted
    transition weights with the global prior for ``current_service``:
    ``(user_weight + m * prior) / (user_total + m)``.

    Parameters:
        current_service: service type the user purchased last.
        user_id: identifier used to look up the user's own transition weights.
        m: shrinkage strength toward the global prior.
        topk: number of ranked predictions to return.

    Returns:
        Up to ``topk`` ``(service, score)`` tuples, best first. Equal scores
        are ordered by service name so the ranking is reproducible across
        runs (candidates are collected in a set, whose iteration order is not
        stable between processes).
    """
    prior = glob_prior.get(current_service, {})
    u = user_train_counts.get((user_id, current_service), {})
    sum_u = sum(u.values()) if u else 0.0
    # Candidates: anything seen after this service globally or for this user,
    # plus the globally most common services.
    cand = set(prior) | set(u) | set(global_order[:topk])
    denom = sum_u + m if (sum_u + m) > 0 else 1.0
    uniform = 1.0 / max(len(cand), 1)
    scores = []
    for nxt in cand:
        pg = prior.get(nxt, np.nan)
        if np.isnan(pg):
            pg = uniform  # no usable prior for this pair: fall back to uniform
        post = (u.get(nxt, 0.0) + m * pg) / denom
        scores.append((nxt, float(post)))
    scores.sort(key=lambda item: (-item[1], str(item[0])))
    return scores[:topk]

def eval_metrics(test_t, topk=TOPK):
    """Score ``predict_next`` on a table of held-out transitions.

    For every row of ``test_t`` (columns 'serviceType', 'next_service',
    'user_id') the ranked predictions are compared with the actual next
    service.

    Returns:
        A tuple ``(detail, overall, by_current)``:
        - detail: one row per transition with 'current', 'actual_next',
          'top1', 'hit@K' and 'mrr@K';
        - overall: mean of the three metrics ('metric', 'value' columns);
        - by_current: mean of the three metrics per current service.

        An empty ``test_t`` yields empty detail/by_current frames and NaN
        overall values instead of raising a KeyError.
    """
    hit_col, mrr_col = f'hit@{topk}', f'mrr@{topk}'
    metric_cols = ['top1', hit_col, mrr_col]
    rows = []
    for cur, nxt, uid in zip(test_t['serviceType'], test_t['next_service'], test_t['user_id']):
        pred_list = [svc for svc, _ in predict_next(cur, uid, topk=topk)]
        # 1-based rank of the true next service; 0 means it was not predicted.
        rank = pred_list.index(nxt) + 1 if nxt in pred_list else 0
        rows.append({
            'current': cur,
            'actual_next': nxt,
            'top1': 1.0 if rank == 1 else 0.0,
            hit_col: 1.0 if rank else 0.0,
            mrr_col: 1.0 / rank if rank else 0.0,
        })
    # Explicit columns keep the frame well-formed when there are no rows.
    df = pd.DataFrame(rows, columns=['current', 'actual_next'] + metric_cols)
    df[metric_cols] = df[metric_cols].astype(float)
    overall = df[metric_cols].mean().to_frame('value').reset_index().rename(columns={'index':'metric'})
    by_cur = df.groupby('current')[metric_cols].mean().reset_index()
    return df, overall, by_cur

# Evaluate the model on the held-out transitions: per-row detail, overall
# metric means, and metric means per current service.
test_preds, overall_metrics, by_current = eval_metrics(test_t, topk=TOPK)

# Simple baseline (global top-k from TRAIN)
def baseline_eval(test_t, topk=TOPK):
    """Score a non-personalized baseline on held-out transitions.

    The baseline always predicts the first ``topk`` entries of
    ``global_order``, regardless of user or current service. Returns a frame
    with 'metric' and 'value' columns holding the mean top1, hit@K and mrr@K.
    """
    top_services = global_order[:topk]
    hit_key, mrr_key = f'hit@{topk}', f'mrr@{topk}'
    records = []
    for actual in test_t['next_service']:
        if actual in top_services:
            position = top_services.index(actual)
            record = {
                'top1': 1.0 if position == 0 else 0.0,
                hit_key: 1.0,
                mrr_key: 1.0 / (position + 1),
            }
        else:
            record = {'top1': 0.0, hit_key: 0.0, mrr_key: 0.0}
        records.append(record)
    scored = pd.DataFrame(records)
    means = scored.mean().to_frame('value')
    return means.reset_index().rename(columns={'index':'metric'})

# Score the global top-k baseline on the same held-out transitions.
baseline_metrics = baseline_eval(test_t, topk=TOPK)

# Notebook display: model metrics next to the baseline metrics.
overall_metrics, baseline_metrics.head()

(  metric     value
 0   top1  0.879590
 1  hit@3  0.992801
 2  mrr@3  0.933855,
   metric     value
 0   top1  0.438625
 1  hit@3  0.940425
 2  mrr@3  0.656857)

In [7]:
# === Save outputs ===
OUT_DIR = Path(DATA_DIR) / 'repurchase_outputs_v3'
OUT_DIR.mkdir(exist_ok=True)

# Use the first Excel writer backend that can be imported; None when neither
# is installed.
excel_engine = None
for _engine_name in ('xlsxwriter', 'openpyxl'):
    try:
        __import__(_engine_name)
    except ImportError:
        continue
    excel_engine = _engine_name
    break

# CSVs: one file per result table, tz-aware datetimes made naive first.
_interval_cols = ['user_id','prev_time','purchase_time','delta_days','serviceType','category']
_csv_exports = [
    ('repurchase_summary.csv', repurchase_summary),
    ('interpurchase_intervals.csv', intervals[_interval_cols]),
    ('interpurchase_interval_stats.csv',
     pd.concat([overall_interval_stats, last_interval_stats], ignore_index=True)),
    ('train_recency_weighted_transitions.csv', rw_counts),
    ('validation_predictions_detail.csv', test_preds),
    ('validation_metrics_overall.csv', overall_metrics),
    ('validation_metrics_by_current.csv', by_current),
]
for _fname, _frame in _csv_exports:
    excel_safe(_frame).to_csv(OUT_DIR / _fname, index=False)

# Excel pack (optional): the same tables as sheets in a single workbook, with
# the prediction detail limited to its first 5000 rows.
if excel_engine:
    xls = OUT_DIR / 'repurchase_validation_report.xlsx'
    _sheets = [
        ('01_repurchase', repurchase_summary),
        ('02_intervals',
         pd.concat([overall_interval_stats, last_interval_stats], ignore_index=True)),
        ('03_transitions_train', rw_counts),
        ('04_valid_overall', overall_metrics),
        ('05_valid_by_current', by_current),
        ('06_prediction_samples', test_preds.head(5000)),
    ]
    with pd.ExcelWriter(xls, engine=excel_engine) as writer:
        for _sheet, _frame in _sheets:
            excel_safe(_frame).to_excel(writer, sheet_name=_sheet, index=False)
    print("Excel report:", xls)
else:
    print("No Excel engine detected; wrote CSVs only.")

  if pd.api.types.is_datetime64tz_dtype(out[col]):
  if pd.api.types.is_datetime64tz_dtype(out[col]):


Excel report: /Users/tree/Projects/tubitak-ai-agent/tubitakaiagentprojeleriiinverisetleri/repurchase_outputs_v3/repurchase_validation_report.xlsx
