In [1]:
# =============================================================================
# HUMOB / SIGSPATIAL Cup 2025
# Corrected Pipeline: HDBSCAN (Scalable DBSCAN) + City-A Memory Fix
# - Fixes NameError in final print statement
# =============================================================================

# ----------------------------
# Requirements
# ----------------------------
%pip install -q git+https://github.com/yahoojapan/geobleu.git tqdm scikit-learn matplotlib pandas hdbscan

# ----------------------------
# Imports
# ----------------------------
import os, gc, json, time, random
from collections import defaultdict, Counter
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
# NEW: Import HDBSCAN
from hdbscan import HDBSCAN

# geobleu
try:
    from geobleu import calc_geobleu_single, calc_geobleu_bulk
except Exception:
    from geobleu import calc_geobleu_single
    calc_geobleu_bulk = None

# ----------------------------
# Global configs
# ----------------------------
# Directory holding the per-city CSV files (Kaggle dataset mount point).
DATA_DIR = "/kaggle/input/humob-data/15313913"   # Kaggle path
# Cities processed sequentially by the main driver. City "A" is configured
# below but not listed here.
# NOTE(review): an earlier comment said "run one city at a time" although
# three cities are listed -- presumably a memory precaution; confirm.
CITIES = ["B", "C", "D"]
# Columns read from each CSV and the compact dtypes they are loaded with.
COLUMNS = ["uid","d","t","x","y"]
DTYPES = {"uid":"int32","d":"int16","t":"int16","x":"int16","y":"int16"}

# Rows with d <= TRAIN_DAY_MAX are used to build all artifacts; rows with
# TEST_DAY_MIN <= d <= TEST_DAY_MAX are the evaluation / prediction window.
TRAIN_DAY_MAX = 60
TEST_DAY_MIN = 61
TEST_DAY_MAX = 75
# Value of `x` marking a masked (to-be-predicted) row.
MASK_VALUE = 999
DELTA = 30   # minutes per segment (kept as in original)
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED); random.seed(RANDOM_SEED)

# Inclusive uid ranges of the target users per city; their masked test-window
# rows are the ones written to the submission file.
TARGET_RANGES = {"A":(147001,150000),"B":(27001,30000),"C":(22001,25000),"D":(17001,20000)}

# Grid candidates per city (HDBSCAN parameters plus vector length `topL`).
# User vectors have at most CITY_CONFIG[city]["TOP_L_MAX"] entries and are
# sliced with [:topL], so any topL above TOP_L_MAX behaves like TOP_L_MAX.
# For City A (TOP_L_MAX=150) both topL candidates are therefore equivalent.
GRID_CANDIDATES = {
    "A":{"topL":[200,300], "min_cluster_size": [50, 100], "min_samples": [10, 25]},
    "B":{"topL":[150,200], "min_cluster_size": [25, 50], "min_samples": [5, 10]},
    "C":{"topL":[150,200,250], "min_cluster_size": [25, 50], "min_samples": [5, 10]},
    "D":{"topL":[100,200,300], "min_cluster_size": [20, 40], "min_samples": [5, 10]},
}

# ================= CRITICAL MEMORY FIX =================
# Per-city tuning. The smaller values for City A are intended to prevent an
# out-of-memory crash in 'build_base_artifacts'.
#   TOP_L_MAX            - number of most-visited cells kept as vector dimensions
#   SAMPLE_FRAC_FOR_GRID - fraction of eligible users held out for the grid search
#   MIN_TRANSITION_COUNT - minimum count for a transition to be kept
#   CLUSTER_BATCH        - not referenced by the code visible in this file
#   INCIDENTAL_THRESHOLD - minimum global visit count for a cell to be a top cell
#   P_MIX                - probability of using the hourly prediction directly
CITY_CONFIG = {
    "A": {"TOP_L_MAX":150, "SAMPLE_FRAC_FOR_GRID":0.01, "MIN_TRANSITION_COUNT":3, "CLUSTER_BATCH":1024, "INCIDENTAL_THRESHOLD":5, "P_MIX":0.03},
    "B": {"TOP_L_MAX":250, "SAMPLE_FRAC_FOR_GRID":0.03, "MIN_TRANSITION_COUNT":1, "CLUSTER_BATCH":2048, "INCIDENTAL_THRESHOLD":2, "P_MIX":0.04},
    "C": {"TOP_L_MAX":250, "SAMPLE_FRAC_FOR_GRID":0.03, "MIN_TRANSITION_COUNT":1, "CLUSTER_BATCH":2048, "INCIDENTAL_THRESHOLD":2, "P_MIX":0.05},
    "D": {"TOP_L_MAX":300, "SAMPLE_FRAC_FOR_GRID":0.04, "MIN_TRANSITION_COUNT":1, "CLUSTER_BATCH":2048, "INCIDENTAL_THRESHOLD":1, "P_MIX":0.07},
}
# ========================================================

# Upper bound on the number of held-out users scored per grid point.
MAX_USERS_FOR_GRID = 1500
# Cluster transition tables are discarded when the total number of observed
# cluster transitions is below this value.
PRUNE_CLUSTERTT_IF_SMALL = 500
# A cluster's hourly top cell is preferred over the user's own hourly cell
# when it accounts for at least this share of the cluster's visits that hour.
CLUSTER_PREF_RATIO = 0.6
N_JOBS = 1  # keep 1 on Kaggle free to avoid fork/pickle overhead
# Output directory for grid progress, final info and submission files;
# created at import time.
OUT_DIR = "./results"
os.makedirs(OUT_DIR, exist_ok=True)

# Write a submission CSV after the final training step.
MAKE_SUBMISSION = True
# Run the final training step after the grid search.
RUN_FULL_FINAL = True
# Use calc_geobleu_bulk when available. The name is misspelled ("GEOLEU") but
# is referenced under this exact name by evaluate_with_cluster.
USE_GEOLEU_BULK = True

# ----------------------------
# Helpers
# ----------------------------
def load_city_df(city):
    """Read one city's challenge CSV from DATA_DIR into a DataFrame.

    Only the columns in COLUMNS are loaded, using the dtypes in DTYPES.

    Raises:
        FileNotFoundError: if the expected CSV file does not exist.
    """
    csv_path = os.path.join(DATA_DIR, f"city_{city}_challengedata.csv")
    if os.path.exists(csv_path):
        return pd.read_csv(csv_path, usecols=COLUMNS, dtype=DTYPES)
    raise FileNotFoundError(f"Missing data: {csv_path}")

def to_flat_segment(d,t,delta=DELTA):
    """Map a (day, time) pair to a single integer segment index.

    The day contributes `d * segments_per_day` where a day has
    `(24*60)//delta` segments; `t` is multiplied by 60 minutes before being
    divided by `delta`, i.e. it is treated as an hour index.

    NOTE(review): if `t` is really a 30-minute slot index (0-47) rather than
    an hour, the within-day offset can exceed one day's worth of segments and
    adjacent days overlap -- confirm against the dataset specification.
    """
    segments_per_day = (24 * 60) // delta
    offset_in_day = (t * 60) // delta
    return int(d * segments_per_day + offset_in_day)

def chebyshev(a,b):
    """Return the Chebyshev (L-infinity) distance between two 2-D points."""
    dx = abs(a[0] - b[0])
    dy = abs(a[1] - b[1])
    return dx if dx >= dy else dy

def prune_counter_dict(counter_obj, min_count=1):
    """Return a plain dict of the entries whose integer count is >= min_count.

    Counts are converted with int() both for the comparison and in the result.
    """
    kept = {}
    for key, raw_count in counter_obj.items():
        count = int(raw_count)
        if count >= min_count:
            kept[key] = count
    return kept

# ----------------------------
# Held-user sampling (user-level)
# ----------------------------
def sample_heldout_users(df, frac, seed=RANDOM_SEED):
    """Pick a reproducible random subset of users to hold out for evaluation.

    Eligible users are those with at least one unmasked row inside the test
    day window. The eligible list is shuffled with a private RNG seeded by
    `seed`, and the first `max(1, int(len * frac))` uids are returned as
    Python ints. Returns an empty list when no user is eligible.
    """
    in_test_window = df["d"].between(TEST_DAY_MIN, TEST_DAY_MAX)
    is_observed = df["x"] != MASK_VALUE
    candidates = df.loc[in_test_window & is_observed, "uid"].unique().tolist()
    if len(candidates) == 0:
        return []
    random.Random(seed).shuffle(candidates)
    n_keep = max(1, int(len(candidates) * frac))
    held = []
    for uid in candidates[:n_keep]:
        held.append(int(uid))
    return held

# ----------------------------
# Build artifacts (user-level TT, hourly profiles, topK)
# ----------------------------
def build_base_artifacts(df, top_l_max, held_uids=None, min_trans=1, mask_targets=False, city=None, min_global_cell_count=1):
    """
    Build the user-level and city-level lookup tables from allowed unmasked rows.

    Args:
        df: full city DataFrame with columns uid, d, t, x, y.
        top_l_max: maximum number of globally most-visited cells to keep.
        held_uids: uids whose test-window rows are excluded from the allowed rows.
        min_trans: minimum count for a user transition to be kept.
        mask_targets: when True (and `city` is given), rows of target users after
            TRAIN_DAY_MAX are excluded from the allowed rows.
        city: city key into TARGET_RANGES, only used with `mask_targets`.
        min_global_cell_count: minimum global visit count for a cell to qualify
            as a top cell.

    Returns:
        dict with keys:
            "top_cells_full": list of (x, y) cells, most visited first
            "city_profile":   {t: most visited (x, y) at that t, city-wide}
            "userTT":         {uid: {(segment, (x, y)): {next (x, y): count}}}
            "user_hour":      {uid: {t: most visited (x, y) at that t}}
            "user_topK":      {uid: up to 3 most visited (x, y)}
            "last_known":     {uid: (x, y) of the latest training row}
            "user_test_rows": {uid: [(row index, d, t, x, y), ...]} for the
                              test window, ordered by (d, t)
    """
    if held_uids is None: held_uids = []

    allowed_mask = (df["x"] != MASK_VALUE)
    if held_uids:
        held_mask = df["uid"].isin(held_uids) & df["d"].between(TEST_DAY_MIN, TEST_DAY_MAX)
        allowed_mask = allowed_mask & ~held_mask
    if mask_targets and city is not None:
        lo,hi = TARGET_RANGES[city]
        target_mask = df["uid"].between(lo,hi) & (df["d"] > TRAIN_DAY_MAX)
        allowed_mask = allowed_mask & ~target_mask

    # Boolean-mask selection already yields a new frame, so no extra .copy().
    allowed_train = df.loc[allowed_mask & (df["d"] <= TRAIN_DAY_MAX)]

    # Globally most visited cells (ties keep first-seen order: stable sort).
    cell_counts = Counter(zip(allowed_train["x"], allowed_train["y"]))
    filtered_cells = [(loc,cnt) for loc,cnt in cell_counts.items() if cnt >= min_global_cell_count]
    filtered_cells.sort(key=lambda kv:kv[1], reverse=True)
    top_cells_full = [loc for loc,_ in filtered_cells[:top_l_max]]

    # City-wide most visited cell per time-of-day value.
    hour_counts = defaultdict(Counter)
    for t,x,y in zip(allowed_train["t"], allowed_train["x"], allowed_train["y"]):
        hour_counts[int(t)][(int(x),int(y))] += 1
    city_profile = {h: max(c.items(), key=lambda kv:kv[1])[0] for h,c in hour_counts.items() if c}

    userTT = {}
    user_hour = {}
    user_topK = {}
    last_known = {}

    for uid, g in tqdm(allowed_train.groupby("uid", sort=False), desc="build user artifacts"):
        uid0 = int(uid)

        # One pass over the user's rows collects the segment sequence, the
        # per-hour counts and the latest (d, t) location.  Every row here is
        # already a training row, so no further day filter is needed.
        seq = []
        hc = defaultdict(Counter)
        latest_key = None
        latest_loc = None
        for d,t,x,y in zip(g["d"], g["t"], g["x"], g["y"]):
            d0 = int(d); t0 = int(t)
            loc = (int(x), int(y))
            seq.append((to_flat_segment(d0, t0), loc))
            hc[t0][loc] += 1
            # ">=" keeps the later row when two rows share the same (d, t).
            if latest_key is None or (d0, t0) >= latest_key:
                latest_key = (d0, t0)
                latest_loc = loc

        # Transitions between observations at most 3 segments apart.
        seq.sort()
        tt = defaultdict(Counter)
        for (s1,l1),(s2,l2) in zip(seq, seq[1:]):
            if 0 < s2 - s1 <= 3:
                tt[(s1,l1)][l2] += 1
        tt_pruned = {}
        for k,v in tt.items():
            dct = prune_counter_dict(v, min_trans)
            if dct:
                tt_pruned[k] = dct
        if tt_pruned:
            userTT[uid0] = tt_pruned

        if hc:
            user_hour[uid0] = {h:max(c.items(), key=lambda kv:kv[1])[0] for h,c in hc.items()}

        locs = Counter(zip(g["x"], g["y"]))
        user_topK[uid0] = [loc for loc,_ in locs.most_common(3)]

        last_known[uid0] = latest_loc

    # Test-window rows per user (masked rows included), ordered by (d, t).
    # Columns are zipped directly instead of using the much slower iterrows().
    df_test = df.loc[df["d"].between(TEST_DAY_MIN, TEST_DAY_MAX)].sort_values(["uid","d","t"])
    user_test_rows = {}
    for uid, g in df_test.groupby("uid", sort=False):
        user_test_rows[int(uid)] = [
            (int(idx), int(d), int(t), int(x), int(y))
            for idx, d, t, x, y in zip(g.index, g["d"], g["t"], g["x"], g["y"])
        ]

    artifacts = {
        "top_cells_full": top_cells_full,
        "city_profile": city_profile,
        "userTT": userTT,
        "user_hour": user_hour,
        "user_topK": user_topK,
        "last_known": last_known,
        "user_test_rows": user_test_rows
    }
    return artifacts

# ----------------------------
# Precompute per-user vectors for clustering
# ----------------------------
def precompute_user_vectors(df, artifacts, top_l_max):
    """Build one L2-normalised visit-count vector per user over the top cells.

    Each vector has one entry per cell in artifacts["top_cells_full"]; entry i
    counts the user's unmasked training-day visits to cell i. Vectors with a
    positive sum are divided by their Euclidean norm (plus 1e-9); all-zero
    vectors are returned unchanged. `top_l_max` is accepted but not used here.

    Returns:
        {uid: float32 numpy vector}
    """
    cells = artifacts["top_cells_full"]
    n_cells = len(cells)
    slot_of = dict(zip(cells, range(n_cells)))
    usable = df.loc[(df["d"] <= TRAIN_DAY_MAX) & (df["x"] != MASK_VALUE)]

    vectors = {}
    for uid, rows in tqdm(usable.groupby("uid", sort=False), desc="precompute vecs"):
        hist = np.zeros(n_cells, dtype=np.float32)
        for x, y in zip(rows["x"], rows["y"]):
            slot = slot_of.get((int(x), int(y)))
            if slot is not None:
                hist[slot] += 1.0
        if hist.sum() > 0:
            hist /= (np.linalg.norm(hist) + 1e-9)
        vectors[int(uid)] = hist
    return vectors

# ----------------------------
# Clustering -> uid -> cluster label
# ----------------------------
# NEW: Rewritten for HDBSCAN
def cluster_for_params(user_vecs, topL, min_cluster_size=50, min_samples=10, batch_size=None):
    """
    Cluster users with HDBSCAN on the first `topL` entries of their vectors.

    Args:
        user_vecs: {uid: vector}; vectors are sliced with [:topL].
        topL: number of leading vector entries used as features.
        min_cluster_size, min_samples: passed through to HDBSCAN.
        batch_size: accepted for interface compatibility; not used.

    Returns:
        {uid: cluster label as returned by fit_predict}. An empty input gives
        an empty dict. Callers in this file treat label -1 as "noise".
    """
    if len(user_vecs) == 0:
        return {}

    ordered_uids = []
    feature_rows = []
    for uid, vec in user_vecs.items():
        ordered_uids.append(uid)
        feature_rows.append(vec[:topL])
    features = np.vstack(feature_rows)

    # Euclidean metric on the vectors; parallelism follows the global N_JOBS.
    clusterer = HDBSCAN(
        min_cluster_size=min_cluster_size,
        min_samples=min_samples,
        metric='euclidean',
        core_dist_n_jobs=N_JOBS,
    )
    assignments = clusterer.fit_predict(features)
    return {uid: label for uid, label in zip(ordered_uids, assignments)}

# ----------------------------
# Build cluster-level auxiliaries (hour, TT)
# ----------------------------
# NEW: Modified to handle noise label -1
def build_cluster_aux(df_subset, uid_to_cluster, min_trans=1):
    """
    Aggregate hourly visit statistics and transition tables per cluster.

    df_subset: dataframe used for cluster stats (should be allowed training rows).
    Only unmasked rows with d <= TRAIN_DAY_MAX are used. Users without a
    cluster (missing from `uid_to_cluster`) or labelled -1 (noise) are skipped.

    Returns a 5-tuple:
        cluster_hour            {cluster: {t: most visited (x, y)}}
        cluster_hour_top_counts {cluster: {t: visit count of that cell}}
        cluster_hour_total      {cluster: {t: total visits at that t}}
        clusterTT               {cluster: {(segment, (x, y)): {next (x, y): count}}}
                                with transitions below `min_trans` removed
        total_trans             total number of transitions counted (before pruning)
    """
    usable = df_subset.loc[(df_subset["d"] <= TRAIN_DAY_MAX) & (df_subset["x"] != MASK_VALUE)]
    hourly = defaultdict(lambda: defaultdict(Counter))
    transitions = defaultdict(lambda: defaultdict(Counter))

    for uid, rows in usable.groupby("uid", sort=False):
        label = uid_to_cluster.get(int(uid))
        # Noise (-1) and unclustered users contribute nothing to cluster stats.
        if label is None or label == -1:
            continue

        visits = []
        for d, t, x, y in zip(rows["d"], rows["t"], rows["x"], rows["y"]):
            loc = (int(x), int(y))
            hourly[label][int(t)][loc] += 1
            visits.append((to_flat_segment(int(d), int(t)), loc))

        # Count moves between observations at most 3 segments apart.
        visits.sort()
        for (seg_a, loc_a), (seg_b, loc_b) in zip(visits, visits[1:]):
            if 0 < seg_b - seg_a <= 3:
                transitions[label][(seg_a, loc_a)][loc_b] += 1

    cluster_hour = {}
    cluster_hour_top_counts = {}
    cluster_hour_total = {}
    for label, per_hour in hourly.items():
        best_loc = {}
        best_count = {}
        totals = {}
        for h, counter in per_hour.items():
            ranked = counter.most_common(1)
            if not ranked:
                continue
            loc, cnt = ranked[0]
            best_loc[h] = loc
            best_count[h] = int(cnt)
            totals[h] = int(sum(counter.values()))
        cluster_hour[label] = best_loc
        cluster_hour_top_counts[label] = best_count
        cluster_hour_total[label] = totals

    clusterTT = {}
    total_trans = 0
    for label, table in transitions.items():
        kept = {}
        for key, next_counts in table.items():
            total_trans += sum(next_counts.values())
            survivors = prune_counter_dict(next_counts, min_trans)
            if survivors:
                kept[key] = survivors
        if kept:
            clusterTT[label] = kept

    return cluster_hour, cluster_hour_top_counts, cluster_hour_total, clusterTT, total_trans

# ----------------------------
# Predict user sequentially (used in eval and final)
# ----------------------------
# NEW: Modified to handle noise label -1
def predict_user_seq(uid, rows, artifacts, uid_to_cluster, cluster_hour, cluster_hour_top, cluster_hour_total, clusterTT, held_uids_set, use_cluster_tt=True, cluster_pref_ratio=CLUSTER_PREF_RATIO, p_mix=0.05):
    """
    Predict one user's test-window locations sequentially.

    Args:
        uid: user id (int).
        rows: iterable of (row index, d, t, x, y) for the user's test window.
        artifacts: dict produced by build_base_artifacts.
        uid_to_cluster: {uid: cluster label}; -1 and missing mean "no cluster".
        cluster_hour, cluster_hour_top, cluster_hour_total, clusterTT:
            cluster-level tables as returned by build_cluster_aux.
        held_uids_set: set of held-out uids; predictions are produced only for
            users in this set.
        use_cluster_tt: whether cluster transitions may add candidates.
        cluster_pref_ratio: share of a cluster's hourly visits its top cell must
            reach to be preferred over the user's own hourly cell.
        p_mix: per-row probability of skipping the transition tables and using
            the hourly fallback chain directly.

    Returns:
        (preds, gts): two parallel lists of (d, t, x, y) tuples. Both are empty
        for users not in `held_uids_set`.

    Each prediction becomes the "current" location for the next row. The global
    `random` module is consulted once per predicted row.

    NOTE(review): transition tables are keyed by the flat segment of training
    rows, while lookups here use the flat segment of test rows, so a lookup can
    only hit where those segment numbers coincide -- confirm this is intended.
    """
    preds = []; gts = []
    # Start from the user's last training location; (0,0) if the user is unknown.
    curr = artifacts.get("last_known", {}).get(uid, (0,0))
    # Rotating index into the user's top-K cells, advanced on each top-K fallback.
    stay_idx = 0
    for idx, d, t, x, y in rows:
        seg = to_flat_segment(d,t); hour = int(t)
        if uid in held_uids_set:
            # mixing: sometimes pick hourly-based prediction directly to reduce drift
            if random.random() < p_mix:
                # pick best hourly from user or cluster or city
                cl = uid_to_cluster.get(uid, None) # cl may be -1 (noise); guarded below
                picked = None
                # cluster strong preference
                if cl is not None and cl != -1 and cl in cluster_hour_top and hour in cluster_hour_top[cl]:
                    top_count = cluster_hour_top[cl][hour]
                    total = cluster_hour_total[cl].get(hour,1)
                    if total>0 and (top_count/float(total) >= cluster_pref_ratio):
                        picked = cluster_hour[cl].get(hour, None)
                # then: user's own hourly cell, cluster hourly cell, user's top-K, city profile
                if picked is None and uid in artifacts["user_hour"] and hour in artifacts["user_hour"][uid]:
                    picked = artifacts["user_hour"][uid][hour]
                if picked is None and cl is not None and cl != -1 and cl in cluster_hour and hour in cluster_hour[cl]:
                    picked = cluster_hour[cl][hour]
                if picked is None and uid in artifacts["user_topK"] and artifacts["user_topK"][uid]:
                    picked = artifacts["user_topK"][uid][stay_idx % len(artifacts["user_topK"][uid])]
                    stay_idx += 1
                if picked is None:
                    picked = artifacts["city_profile"].get(hour, curr)
                preds.append((int(d), int(t), int(picked[0]), int(picked[1])))
                gts.append((int(d), int(t), int(x), int(y)))
                curr = picked
                continue

            # Candidate next locations from the transition tables.
            cand = Counter()
            # userTT
            if uid in artifacts["userTT"]:
                cand.update(artifacts["userTT"][uid].get((seg,curr), {}))
            # clusterTT: counts are added on top of the user's own counts
            cl = uid_to_cluster.get(uid, None) # cl can be -1
            if use_cluster_tt and cl is not None and cl != -1: # Don't use clusterTT if user is noise
                cand.update(clusterTT.get(cl, {}).get((seg,curr), {}))
            if cand:
                # pick the most frequent (ties: closest to curr), but if top == curr, try second-best
                sorted_cands = sorted(cand.items(), key=lambda kv: (-kv[1], chebyshev(kv[0], curr)))
                top_loc, top_count = sorted_cands[0]
                pred = top_loc
                # if the top predicted location equals current location, try second entry
                if top_loc == curr and len(sorted_cands) > 1:
                    second_loc, second_count = sorted_cands[1]
                    # move only if the runner-up has at least half the top's support
                    if (second_count >= 1) and (second_count >= 0.5 * top_count or top_count == 1):
                        pred = second_loc
                    else:
                        pred = top_loc
                preds.append((int(d), int(t), int(pred[0]), int(pred[1])))
            else:
                # no transition candidates: same hourly fallback chain as the p_mix branch
                cl_prefed = False
                if cl is not None and cl != -1 and cl in cluster_hour_top and hour in cluster_hour_top[cl]:
                    top_count = cluster_hour_top[cl][hour]
                    total = cluster_hour_total[cl].get(hour,1)
                    if total>0 and (top_count/float(total) >= cluster_pref_ratio):
                        cl_prefed = True
                if cl_prefed and cl is not None and cl != -1 and cl in cluster_hour and hour in cluster_hour[cl]:
                    pred = cluster_hour[cl][hour]
                elif uid in artifacts["user_hour"] and hour in artifacts["user_hour"][uid]:
                    pred = artifacts["user_hour"][uid][hour]
                elif cl is not None and cl != -1 and cl in cluster_hour and hour in cluster_hour[cl]:
                    pred = cluster_hour[cl][hour]
                elif uid in artifacts["user_topK"] and artifacts["user_topK"][uid]:
                    pred = artifacts["user_topK"][uid][stay_idx % len(artifacts["user_topK"][uid])]
                    stay_idx += 1
                else:
                    pred = artifacts["city_profile"].get(hour, curr)
                preds.append((int(d), int(t), int(pred[0]), int(pred[1])))
            gts.append((int(d), int(t), int(x), int(y)))
            curr = preds[-1][2:]  # update current to last predicted
        else:
            # Not a held-out user: no prediction, just track the observed location.
            if int(x) != MASK_VALUE:
                curr = (int(x), int(y))
    return preds, gts

# ----------------------------
# Evaluate (sequential)
# ----------------------------
def evaluate_with_cluster(df, artifacts, uid_to_cluster, cluster_hour, cluster_hour_top, cluster_hour_total, clusterTT, held_uids, users_to_eval, use_cluster_tt=True, p_mix=0.05):
    """
    Predict the test window for `users_to_eval` and score it with GeoBLEU.

    Args:
        df: accepted for interface compatibility; not used here.
        artifacts: dict produced by build_base_artifacts.
        uid_to_cluster, cluster_hour, cluster_hour_top, cluster_hour_total,
            clusterTT: cluster tables forwarded to predict_user_seq.
        held_uids: uids treated as held out (only these get predictions).
        users_to_eval: uids to run predictions for.
        use_cluster_tt: forwarded to predict_user_seq.
        p_mix: forwarded to predict_user_seq.

    Returns:
        (score, info, preds_map) where `score` is a float, `info` holds
        "num_users" and "num_preds", and `preds_map` maps uid to its list of
        (d, t, x, y) predictions.

    Scoring uses calc_geobleu_bulk when it is available and enabled. If that
    call fails, or bulk scoring is unavailable, the score is the mean of
    per-user calc_geobleu_single values (0.0 for a user whose scoring fails).
    Failures are reported on stdout rather than silently ignored.
    """
    held_set = set(held_uids)
    preds_map = {}; gts_map = {}
    for uid in tqdm(users_to_eval, desc="predict users (seq)"):
        rows = artifacts["user_test_rows"].get(uid, [])
        if not rows:
            continue
        preds, gts = predict_user_seq(uid, rows, artifacts, uid_to_cluster, cluster_hour, cluster_hour_top, cluster_hour_total, clusterTT, held_set, use_cluster_tt, CLUSTER_PREF_RATIO, p_mix=p_mix)
        if preds:
            preds_map[uid] = preds
            gts_map[uid] = gts

    info = {"num_users": len(preds_map), "num_preds": sum(len(v) for v in preds_map.values())}

    if calc_geobleu_bulk is not None and USE_GEOLEU_BULK:
        # Bulk scoring takes flat lists of (uid, d, t, x, y).
        gen_bulk = [(int(uid), int(d), int(t), int(x), int(y))
                    for uid, seq in preds_map.items() for d, t, x, y in seq]
        ref_bulk = [(int(uid), int(d), int(t), int(x), int(y))
                    for uid, seq in gts_map.items() for d, t, x, y in seq]
        try:
            score = float(calc_geobleu_bulk(gen_bulk, ref_bulk, processes=1))
            return score, info, preds_map
        except Exception as exc:
            # Best-effort: fall back to per-user scoring, but say so.
            print(f"[eval] calc_geobleu_bulk failed ({exc!r}); falling back to per-user scoring")

    scores = []
    n_failed = 0
    first_error = None
    for uid, preds in preds_map.items():
        try:
            # calc_geobleu_single expects full (d, t, x, y) tuples, so the
            # day/time fields must not be stripped.
            scores.append(calc_geobleu_single(preds, gts_map[uid]))
        except Exception as exc:
            n_failed += 1
            if first_error is None:
                first_error = exc
            scores.append(0.0)
    if n_failed:
        print(f"[eval] calc_geobleu_single failed for {n_failed} user(s), scored as 0.0; first error: {first_error!r}")
    mean_score = float(np.mean(scores)) if scores else 0.0
    return mean_score, info, preds_map

# ----------------------------
# Sampled grid search
# - CHANGED: Loops over 'min_cluster_size' and 'min_samples' for HDBSCAN
# ----------------------------
def sampled_grid_search(df, city, grid_candidates, cfg):
    """
    Grid-search HDBSCAN parameters on a sample of held-out users.

    For every combination of `topL`, `min_cluster_size` and `min_samples` in
    `grid_candidates`, users are clustered, cluster tables are built, and the
    held-out users (at most MAX_USERS_FOR_GRID) are scored. Progress is written
    to "<OUT_DIR>/<city>_grid_progress.json" after each combination.

    Args:
        df: full city DataFrame.
        city: city key, used for the progress file name.
        grid_candidates: dict with "topL" and optionally "min_cluster_size"
            (default [50]) and "min_samples" (default [10]).
        cfg: the city's entry from CITY_CONFIG.

    Returns:
        (best, artifacts, held_uids, grid_results) where `best` is the result
        dict with the highest score, or None if nothing was evaluated.
    """
    print("[grid] sampling held users ...")
    held_uids = sample_heldout_users(df, frac=cfg["SAMPLE_FRAC_FOR_GRID"])
    print(f"[grid] sampled held users: {len(held_uids)}")
    min_trans = cfg["MIN_TRANSITION_COUNT"]
    p_mix = cfg.get("P_MIX", 0.05)
    artifacts = build_base_artifacts(df, top_l_max=cfg["TOP_L_MAX"], held_uids=held_uids, min_trans=min_trans, min_global_cell_count=cfg.get("INCIDENTAL_THRESHOLD",1))
    user_vecs = precompute_user_vectors(df, artifacts, cfg["TOP_L_MAX"])
    users_for_grid = held_uids[:MAX_USERS_FOR_GRID]
    grid_results = []
    progress_path = os.path.join(OUT_DIR, f"{city}_grid_progress.json")

    # The training subset does not depend on the grid parameters, so it is
    # built once here instead of once per combination. Boolean-mask selection
    # already returns a new frame, so no extra .copy() is needed.
    allowed_mask = (df["x"] != MASK_VALUE)
    if held_uids:
        held_mask = df["uid"].isin(held_uids) & df["d"].between(TEST_DAY_MIN, TEST_DAY_MAX)
        allowed_mask = allowed_mask & ~held_mask
    df_subset = df.loc[allowed_mask & (df["d"] <= TRAIN_DAY_MAX)]

    for topL in grid_candidates["topL"]:
        for mcs in grid_candidates.get("min_cluster_size", [50]):
            for ms in grid_candidates.get("min_samples", [10]):
                print(f"[grid] try topL={topL}, min_cluster_size={mcs}, min_samples={ms}")
                uid_to_cluster = cluster_for_params(user_vecs, topL, min_cluster_size=mcs, min_samples=ms)

                # build_cluster_aux skips users labelled -1 (noise).
                cluster_hour, cluster_hour_top, cluster_hour_total, clusterTT, total_trans = build_cluster_aux(df_subset, uid_to_cluster, min_trans=min_trans)
                # Too few cluster transitions: do not use cluster transition tables.
                use_cluster_tt = total_trans >= PRUNE_CLUSTERTT_IF_SMALL
                if not use_cluster_tt:
                    clusterTT = {}
                score, info, _ = evaluate_with_cluster(df, artifacts, uid_to_cluster, cluster_hour, cluster_hour_top, cluster_hour_total, clusterTT, held_uids, users_for_grid, use_cluster_tt, p_mix=p_mix)
                print(f" -> score={score:.5f}, users_eval={info['num_users']}, preds={info['num_preds']}")

                grid_results.append({
                    "topL": topL,
                    "min_cluster_size": mcs,
                    "min_samples": ms,
                    "score": score,
                    "use_cluster_tt": use_cluster_tt
                })

                # Rewrite the progress file so partial results survive a crash.
                with open(progress_path, "w") as f:
                    json.dump(grid_results, f, indent=2)

    best = max(grid_results, key=lambda r:r["score"]) if grid_results else None
    print("[grid] best:", best)
    return best, artifacts, held_uids, grid_results

# ----------------------------
# Final training & submission
# - CHANGED: Accepts HDBSCAN params and adds gc.collect()
# ----------------------------
def _hourly_fallback(uid0, cl, hour, curr, stay_idx, artifacts, cluster_hour, cluster_hour_top, cluster_hour_total):
    """Pick a location for `hour` without transition tables.

    Order of preference: the cluster's hourly top cell when it holds at least
    CLUSTER_PREF_RATIO of the cluster's visits at that hour; the user's own
    hourly cell; the cluster's hourly top cell; the user's top-K cells in
    rotation; the city profile (falling back to `curr`).

    Returns (location, stay_idx); `stay_idx` is advanced only when a top-K
    cell was used.
    """
    in_cluster = cl is not None and cl != -1

    if in_cluster and cl in cluster_hour_top and hour in cluster_hour_top[cl]:
        top_count = cluster_hour_top[cl][hour]
        total = cluster_hour_total[cl].get(hour, 1)
        if total > 0 and (top_count / float(total) >= CLUSTER_PREF_RATIO):
            strong = cluster_hour.get(cl, {}).get(hour)
            if strong is not None:
                return strong, stay_idx

    user_hours = artifacts["user_hour"].get(uid0)
    if user_hours is not None and hour in user_hours:
        return user_hours[hour], stay_idx

    if in_cluster and cl in cluster_hour and hour in cluster_hour[cl]:
        return cluster_hour[cl][hour], stay_idx

    top_k = artifacts["user_topK"].get(uid0)
    if top_k:
        return top_k[stay_idx % len(top_k)], stay_idx + 1

    return artifacts["city_profile"].get(hour, curr), stay_idx


def final_train_and_generate_submission(df, city, artifacts_from_grid, held_uids, topL, min_cluster_size, min_samples, cfg, make_submission=MAKE_SUBMISSION):
    """
    Rebuild all artifacts with the chosen parameters, score locally, and
    optionally write the submission CSV.

    Args:
        df: full city DataFrame.
        city: city key (used for TARGET_RANGES and output file names).
        artifacts_from_grid: accepted for interface compatibility; not used.
        held_uids: held-out uids used for the local evaluation.
        topL, min_cluster_size, min_samples: clustering parameters.
        cfg: the city's entry from CITY_CONFIG.
        make_submission: when True, predict the masked test-window rows of the
            city's target users and write "<OUT_DIR>/<city>_submission.csv".

    Returns:
        (score_local, info_local, submission_path); `submission_path` is None
        when no submission was written.

    Side effects: writes "<OUT_DIR>/<city>_final_info.json"; draws from the
    global `random` module once per submission row.
    """
    min_trans = cfg["MIN_TRANSITION_COUNT"]
    p_mix = cfg.get("P_MIX", 0.05)

    print("[final] building final artifacts using allowed unmasked rows (may take time)...")
    artifacts_final = build_base_artifacts(df, top_l_max=cfg["TOP_L_MAX"], held_uids=None, min_trans=min_trans, mask_targets=True, city=city, min_global_cell_count=cfg.get("INCIDENTAL_THRESHOLD",1))

    print("[final] collecting garbage...")
    gc.collect()

    print("[final] precomputing user vectors...")
    user_vecs_final = precompute_user_vectors(df, artifacts_final, cfg["TOP_L_MAX"])

    print("[final] collecting garbage...")
    gc.collect()

    print("[final] clustering users...")
    uid_to_cluster_final = cluster_for_params(user_vecs_final, topL, min_cluster_size=min_cluster_size, min_samples=min_samples)
    # The vectors are only needed for clustering; release them early.
    del user_vecs_final

    print("[final] building cluster auxiliaries...")
    lo,hi = TARGET_RANGES[city]
    is_target = df["uid"].between(lo,hi)
    allowed_mask = (df["x"] != MASK_VALUE) & ~(is_target & (df["d"] > TRAIN_DAY_MAX))
    # Boolean-mask selection already returns a new frame; no extra .copy().
    df_allowed = df.loc[allowed_mask & (df["d"] <= TRAIN_DAY_MAX)]
    cluster_hour_final, cluster_hour_top_final, cluster_hour_total_final, clusterTT_final, total_trans_final = build_cluster_aux(df_allowed, uid_to_cluster_final, min_trans=min_trans)
    del df_allowed
    # Too few cluster transitions: do not use cluster transition tables.
    use_cluster_tt_final = total_trans_final >= PRUNE_CLUSTERTT_IF_SMALL
    if not use_cluster_tt_final:
        clusterTT_final = {}

    print("[final] local evaluation on held users...")
    score_local, info_local, preds_local = evaluate_with_cluster(df, artifacts_final, uid_to_cluster_final, cluster_hour_final, cluster_hour_top_final, cluster_hour_total_final, clusterTT_final, held_uids, held_uids, use_cluster_tt_final, p_mix=p_mix)
    print(f"[final] local GeoBLEU (final-trained): {score_local:.5f} info: {info_local}")

    with open(os.path.join(OUT_DIR, f"{city}_final_info.json"), "w") as f:
        json.dump({
            "score_local":score_local,
            "info":info_local,
            "topL":topL,
            "min_cluster_size": min_cluster_size,
            "min_samples": min_samples
        }, f, indent=2)

    submission_path = None
    if make_submission:
        print("[final] generating official predictions for masked cells...")
        df_masked = df.loc[is_target & df["d"].between(TEST_DAY_MIN, TEST_DAY_MAX) & (df["x"] == MASK_VALUE)].sort_values(["uid","d","t"])
        rows_out = []
        for uid, g in tqdm(df_masked.groupby("uid", sort=False), desc="predict submission users"):
            uid0 = int(uid)
            cl = uid_to_cluster_final.get(uid0, None)  # may be -1 (noise)
            in_cluster = cl is not None and cl != -1

            # Starting location: last training location, else the user's top
            # cell, else the city profile at the first masked time.
            curr = artifacts_final["last_known"].get(uid0, None)
            if curr is None:
                top_k = artifacts_final["user_topK"].get(uid0)
                if top_k:
                    curr = top_k[0]
                else:
                    curr = artifacts_final["city_profile"].get(int(g["t"].iloc[0]), (0,0))

            # Per-user transition tables, looked up once instead of per row.
            user_tt = artifacts_final["userTT"].get(uid0)
            cluster_tt = clusterTT_final.get(cl, {}) if (use_cluster_tt_final and in_cluster) else None

            stay_idx = 0
            for d_raw, t_raw in zip(g["d"], g["t"]):
                d = int(d_raw); t = int(t_raw)
                seg = to_flat_segment(d,t); hour = t

                # Exactly one RNG draw per row: with probability p_mix the
                # hourly fallback is used directly.
                if random.random() < p_mix:
                    pred, stay_idx = _hourly_fallback(uid0, cl, hour, curr, stay_idx, artifacts_final, cluster_hour_final, cluster_hour_top_final, cluster_hour_total_final)
                else:
                    cand = Counter()
                    if user_tt is not None:
                        cand.update(user_tt.get((seg,curr), {}))
                    if cluster_tt is not None:
                        cand.update(cluster_tt.get((seg,curr), {}))
                    if cand:
                        # Most frequent candidate; ties go to the one closest to curr.
                        sorted_cands = sorted(cand.items(), key=lambda kv: (-kv[1], chebyshev(kv[0], curr)))
                        top_loc, top_count = sorted_cands[0]
                        pred = top_loc
                        # If the best candidate is "stay", move to the runner-up
                        # when it has at least half the support.
                        if top_loc == curr and len(sorted_cands) > 1:
                            second_loc, second_count = sorted_cands[1]
                            if (second_count >= 1) and (second_count >= 0.5 * top_count or top_count == 1):
                                pred = second_loc
                    else:
                        pred, stay_idx = _hourly_fallback(uid0, cl, hour, curr, stay_idx, artifacts_final, cluster_hour_final, cluster_hour_top_final, cluster_hour_total_final)

                rows_out.append({"uid": uid0, "d": d, "t": t, "x": int(pred[0]), "y": int(pred[1])})
                curr = pred

        # Explicit columns so that an empty result still gets a header row.
        out_df = pd.DataFrame(rows_out, columns=["uid","d","t","x","y"])
        submission_path = os.path.join(OUT_DIR, f"{city}_submission.csv")
        out_df.to_csv(submission_path, index=False)
        print("[final] wrote submission to", submission_path)

    return score_local, info_local, submission_path

# ----------------------------
# Smoke test (Commented out)
# ----------------------------
# city = "D"
# df = load_city_df(city)
# df_small = df.sample(frac=0.01, random_state=RANDOM_SEED)  # 1% for smoke
# cfg = CITY_CONFIG[city]
# best, artifacts, held_uids, grid_results = sampled_grid_search(df_small, city, GRID_CANDIDATES[city], cfg)
# print("SMOKE best:", best)
# # NEW: Must pass HDBSCAN params
# final_train_and_generate_submission(df_small, city, artifacts, held_uids, best["topL"], best["min_cluster_size"], best["min_samples"], cfg, make_submission=False)
# raise SystemExit("Smoke test done")

# ----------------------------
# Main driver (per-city)
# ----------------------------
if __name__ == "__main__":
    # Wall-clock start for the whole multi-city run.
    total_start = time.time()
    for city in CITIES:
        print("\n\n====================")
        print("RUNNING CITY:", city)
        print("====================\n")
        cfg = CITY_CONFIG[city]  # per-city tuning (memory-related for City A)
        df = load_city_df(city)

        # Step 1: sampled grid search over the city's candidate parameters.
        best, artifacts, held_uids, grid_results = sampled_grid_search(
            df, city, GRID_CANDIDATES[city], cfg
        )
        print("Grid best:", best)

        # The grid artifacts are rebuilt in the final step; free them first.
        del artifacts
        gc.collect()

        # Step 2: final training and (optionally) the submission file.
        if not RUN_FULL_FINAL or best is None:
            print(f"City {city} - grid only (no final).")
        else:
            score_local, info_local, submission_path = final_train_and_generate_submission(
                df,
                city,
                None,
                held_uids,
                best["topL"],
                best["min_cluster_size"],
                best["min_samples"],
                cfg,
                make_submission=MAKE_SUBMISSION,
            )
            print(f"City {city} final local GeoBLEU: {score_local:.5f}, info: {info_local}, submission: {submission_path}")

        # Release per-city data before loading the next city.
        del df, held_uids, grid_results
        gc.collect()

    print("Total elapsed (s):", int(time.time() - total_start))

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for geobleu (setup.py) ... [?25l[?25hdone
Note: you may need to restart the kernel to use updated packages.


RUNNING CITY: B

[grid] sampling held users ...
[grid] sampled held users: 810


build user artifacts:   0%|          | 0/30000 [00:00<?, ?it/s]

precompute vecs:   0%|          | 0/30000 [00:00<?, ?it/s]

[grid] try topL=150, min_cluster_size=25, min_samples=5


predict users (seq):   0%|          | 0/810 [00:00<?, ?it/s]

 -> score=0.10724, users_eval=810, preds=111577
[grid] try topL=150, min_cluster_size=25, min_samples=10


predict users (seq):   0%|          | 0/810 [00:00<?, ?it/s]

 -> score=0.10713, users_eval=810, preds=111577
[grid] try topL=150, min_cluster_size=50, min_samples=5


predict users (seq):   0%|          | 0/810 [00:00<?, ?it/s]

 -> score=0.10722, users_eval=810, preds=111577
[grid] try topL=150, min_cluster_size=50, min_samples=10


predict users (seq):   0%|          | 0/810 [00:00<?, ?it/s]

 -> score=0.10718, users_eval=810, preds=111577
[grid] try topL=200, min_cluster_size=25, min_samples=5


predict users (seq):   0%|          | 0/810 [00:00<?, ?it/s]

 -> score=0.10711, users_eval=810, preds=111577
[grid] try topL=200, min_cluster_size=25, min_samples=10


predict users (seq):   0%|          | 0/810 [00:00<?, ?it/s]

 -> score=0.10704, users_eval=810, preds=111577
[grid] try topL=200, min_cluster_size=50, min_samples=5


predict users (seq):   0%|          | 0/810 [00:00<?, ?it/s]

 -> score=0.10697, users_eval=810, preds=111577
[grid] try topL=200, min_cluster_size=50, min_samples=10


predict users (seq):   0%|          | 0/810 [00:00<?, ?it/s]

 -> score=0.10706, users_eval=810, preds=111577
[grid] best: {'topL': 150, 'min_cluster_size': 25, 'min_samples': 5, 'score': 0.10723528957341069, 'use_cluster_tt': True}
Grid best: {'topL': 150, 'min_cluster_size': 25, 'min_samples': 5, 'score': 0.10723528957341069, 'use_cluster_tt': True}
[final] building final artifacts using allowed unmasked rows (may take time)...


build user artifacts:   0%|          | 0/30000 [00:00<?, ?it/s]

[final] collecting garbage...
[final] precomputing user vectors...


precompute vecs:   0%|          | 0/30000 [00:00<?, ?it/s]

[final] collecting garbage...
[final] clustering users...
[final] building cluster auxiliaries...
[final] local evaluation on held users...


predict users (seq):   0%|          | 0/810 [00:00<?, ?it/s]

[final] local GeoBLEU (final-trained): 0.10724 info: {'num_users': 810, 'num_preds': 111577}
[final] generating official predictions for masked cells...


predict submission users:   0%|          | 0/3000 [00:00<?, ?it/s]

[final] wrote submission to ./results/B_submission.csv
City B final local GeoBLEU: 0.10724, info: {'num_users': 810, 'num_preds': 111577}, submission: ./results/B_submission.csv


RUNNING CITY: C

[grid] sampling held users ...
[grid] sampled held users: 660


build user artifacts:   0%|          | 0/25000 [00:00<?, ?it/s]

precompute vecs:   0%|          | 0/25000 [00:00<?, ?it/s]

[grid] try topL=150, min_cluster_size=25, min_samples=5


predict users (seq):   0%|          | 0/660 [00:00<?, ?it/s]

 -> score=0.09694, users_eval=660, preds=88238
[grid] try topL=150, min_cluster_size=25, min_samples=10


predict users (seq):   0%|          | 0/660 [00:00<?, ?it/s]

 -> score=0.09689, users_eval=660, preds=88238
[grid] try topL=150, min_cluster_size=50, min_samples=5


predict users (seq):   0%|          | 0/660 [00:00<?, ?it/s]

 -> score=0.09708, users_eval=660, preds=88238
[grid] try topL=150, min_cluster_size=50, min_samples=10


predict users (seq):   0%|          | 0/660 [00:00<?, ?it/s]

 -> score=0.09698, users_eval=660, preds=88238
[grid] try topL=200, min_cluster_size=25, min_samples=5


predict users (seq):   0%|          | 0/660 [00:00<?, ?it/s]

 -> score=0.09687, users_eval=660, preds=88238
[grid] try topL=200, min_cluster_size=25, min_samples=10


predict users (seq):   0%|          | 0/660 [00:00<?, ?it/s]

 -> score=0.09695, users_eval=660, preds=88238
[grid] try topL=200, min_cluster_size=50, min_samples=5


predict users (seq):   0%|          | 0/660 [00:00<?, ?it/s]

 -> score=0.09702, users_eval=660, preds=88238
[grid] try topL=200, min_cluster_size=50, min_samples=10


predict users (seq):   0%|          | 0/660 [00:00<?, ?it/s]

 -> score=0.09707, users_eval=660, preds=88238
[grid] try topL=250, min_cluster_size=25, min_samples=5


predict users (seq):   0%|          | 0/660 [00:00<?, ?it/s]

 -> score=0.09669, users_eval=660, preds=88238
[grid] try topL=250, min_cluster_size=25, min_samples=10


predict users (seq):   0%|          | 0/660 [00:00<?, ?it/s]

 -> score=0.09641, users_eval=660, preds=88238
[grid] try topL=250, min_cluster_size=50, min_samples=5


predict users (seq):   0%|          | 0/660 [00:00<?, ?it/s]

 -> score=0.09642, users_eval=660, preds=88238
[grid] try topL=250, min_cluster_size=50, min_samples=10


predict users (seq):   0%|          | 0/660 [00:00<?, ?it/s]

 -> score=0.09635, users_eval=660, preds=88238
[grid] best: {'topL': 150, 'min_cluster_size': 50, 'min_samples': 5, 'score': 0.09707624699454155, 'use_cluster_tt': True}
Grid best: {'topL': 150, 'min_cluster_size': 50, 'min_samples': 5, 'score': 0.09707624699454155, 'use_cluster_tt': True}
[final] building final artifacts using allowed unmasked rows (may take time)...


build user artifacts:   0%|          | 0/25000 [00:00<?, ?it/s]

[final] collecting garbage...
[final] precomputing user vectors...


precompute vecs:   0%|          | 0/25000 [00:00<?, ?it/s]

[final] collecting garbage...
[final] clustering users...
[final] building cluster auxiliaries...
[final] local evaluation on held users...


predict users (seq):   0%|          | 0/660 [00:00<?, ?it/s]

[final] local GeoBLEU (final-trained): 0.09708 info: {'num_users': 660, 'num_preds': 88238}
[final] generating official predictions for masked cells...


predict submission users:   0%|          | 0/3000 [00:00<?, ?it/s]

[final] wrote submission to ./results/C_submission.csv
City C final local GeoBLEU: 0.09708, info: {'num_users': 660, 'num_preds': 88238}, submission: ./results/C_submission.csv


RUNNING CITY: D

[grid] sampling held users ...
[grid] sampled held users: 680


build user artifacts:   0%|          | 0/20000 [00:00<?, ?it/s]

precompute vecs:   0%|          | 0/20000 [00:00<?, ?it/s]

[grid] try topL=100, min_cluster_size=20, min_samples=5


predict users (seq):   0%|          | 0/680 [00:00<?, ?it/s]

 -> score=0.10740, users_eval=680, preds=93852
[grid] try topL=100, min_cluster_size=20, min_samples=10


predict users (seq):   0%|          | 0/680 [00:00<?, ?it/s]

 -> score=0.10738, users_eval=680, preds=93852
[grid] try topL=100, min_cluster_size=40, min_samples=5


predict users (seq):   0%|          | 0/680 [00:00<?, ?it/s]

 -> score=0.10743, users_eval=680, preds=93852
[grid] try topL=100, min_cluster_size=40, min_samples=10


predict users (seq):   0%|          | 0/680 [00:00<?, ?it/s]

 -> score=0.10749, users_eval=680, preds=93852
[grid] try topL=200, min_cluster_size=20, min_samples=5


predict users (seq):   0%|          | 0/680 [00:00<?, ?it/s]

 -> score=0.10727, users_eval=680, preds=93852
[grid] try topL=200, min_cluster_size=20, min_samples=10


predict users (seq):   0%|          | 0/680 [00:00<?, ?it/s]

 -> score=0.10735, users_eval=680, preds=93852
[grid] try topL=200, min_cluster_size=40, min_samples=5


predict users (seq):   0%|          | 0/680 [00:00<?, ?it/s]

 -> score=0.10740, users_eval=680, preds=93852
[grid] try topL=200, min_cluster_size=40, min_samples=10


predict users (seq):   0%|          | 0/680 [00:00<?, ?it/s]

 -> score=0.10747, users_eval=680, preds=93852
[grid] try topL=300, min_cluster_size=20, min_samples=5


predict users (seq):   0%|          | 0/680 [00:00<?, ?it/s]

 -> score=0.10704, users_eval=680, preds=93852
[grid] try topL=300, min_cluster_size=20, min_samples=10


predict users (seq):   0%|          | 0/680 [00:00<?, ?it/s]

 -> score=0.10715, users_eval=680, preds=93852
[grid] try topL=300, min_cluster_size=40, min_samples=5


predict users (seq):   0%|          | 0/680 [00:00<?, ?it/s]

 -> score=0.10475, users_eval=680, preds=93852
[grid] try topL=300, min_cluster_size=40, min_samples=10


predict users (seq):   0%|          | 0/680 [00:00<?, ?it/s]

 -> score=0.10477, users_eval=680, preds=93852
[grid] best: {'topL': 100, 'min_cluster_size': 40, 'min_samples': 10, 'score': 0.10748673796941166, 'use_cluster_tt': True}
Grid best: {'topL': 100, 'min_cluster_size': 40, 'min_samples': 10, 'score': 0.10748673796941166, 'use_cluster_tt': True}
[final] building final artifacts using allowed unmasked rows (may take time)...


build user artifacts:   0%|          | 0/20000 [00:00<?, ?it/s]

[final] collecting garbage...
[final] precomputing user vectors...


precompute vecs:   0%|          | 0/20000 [00:00<?, ?it/s]

[final] collecting garbage...
[final] clustering users...
[final] building cluster auxiliaries...
[final] local evaluation on held users...


predict users (seq):   0%|          | 0/680 [00:00<?, ?it/s]

[final] local GeoBLEU (final-trained): 0.10748 info: {'num_users': 680, 'num_preds': 93852}
[final] generating official predictions for masked cells...


predict submission users:   0%|          | 0/3000 [00:00<?, ?it/s]

[final] wrote submission to ./results/D_submission.csv
City D final local GeoBLEU: 0.10748, info: {'num_users': 680, 'num_preds': 93852}, submission: ./results/D_submission.csv
Total elapsed (s): 14781
