In [1]:
# Install required packages
%pip install -q git+https://github.com/yahoojapan/geobleu.git tqdm scikit-learn

import pandas as pd
import numpy as np
import os, json
from datetime import datetime
from collections import defaultdict, Counter
from tqdm.auto import tqdm
import multiprocessing as mp
from geobleu import calc_geobleu_single
import warnings
warnings.filterwarnings('ignore')
pd.set_option("display.float_format", "{:.5f}".format)


  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for geobleu (setup.py) ... [?25l[?25hdone
Note: you may need to restart the kernel to use updated packages.


In [2]:
# TT-KNN experiment values — paper defaults, change for ablation
TTKNN_VALUES = {
    "TAU_LIST": [5],      # Minimum visits to keep a location
    "DELTA_LIST": [30],   # Minutes per time segment
    "M_LIST": [2],        # How many future segments to consider
    "K_LIST": [2]         # How many nearest neighbors
}

# Dataset setup
DATA_DIR = "/kaggle/input/humob-data/15313913"
CITIES = ["D"]  # change to ["A", "B", "C", "D"] for full evaluation
COLUMNS = ["uid", "d", "t", "x", "y"]
DTYPES = {"uid": "int32", "d": "int8", "t": "int8", "x": "int16", "y": "int16"}
TRAIN_DAY_MAX = 60
TEST_DAY_MIN = 61
MASK_VALUE = 999
CHUNK_SIZE = 500_000
CHECKPOINT_DIR = "/kaggle/working/checkpoints"
os.makedirs(CHECKPOINT_DIR, exist_ok=True)


In [3]:
def to_segment(t, delta):
    return (t * 60) // delta

def euclidean_distance(loc1, loc2):
    return np.sqrt((loc1[0]-loc2[0])**2 + (loc1[1]-loc2[1])**2)

def build_TT_index(trajectory, tau, delta):
    location_counts = Counter((x, y) for _, _, x, y in trajectory)
    traj_filtered = [(d, t, x, y) for (d, t, x, y) in trajectory if location_counts[(x, y)] >= tau]
    seg_traj = [(d, to_segment(t, delta), (x, y)) for d, t, x, y in traj_filtered]
    seg_traj.sort()

    TT_index = defaultdict(lambda: defaultdict(list))
    segments_per_day = (24 * 60) // delta

    for i in range(len(seg_traj) - 1):
        d1, s1, l1 = seg_traj[i]
        d2, s2, l2 = seg_traj[i+1]
        if (d1 == d2 and s2 == (s1 + 1) % segments_per_day) or \
           (d2 == d1 + 1 and s1 == segments_per_day - 1 and s2 == 0):
            TT_index[s2][l1].append(l2)
    return TT_index

def predict_next_location_ttknn(TT_index, current_time, current_loc, M, K, delta):
    curr_seg = to_segment(current_time, delta)
    segments_per_day = (24 * 60) // delta
    candidates = []

    for i in range(1, M+1):
        future_seg = (curr_seg + i) % segments_per_day
        candidates.extend(TT_index[future_seg].get(current_loc, []))

    if not candidates:
        return current_loc

    unique = list(set(candidates))
    dists = [(euclidean_distance(current_loc, loc), loc) for loc in unique]
    dists.sort()

    for _, loc in dists[:K]:
        if loc != current_loc:
            return loc
    return current_loc


In [4]:
class TTKNNModel:
    def __init__(self, tau, delta, M, K):
        self.tau = tau
        self.delta = delta
        self.M = M
        self.K = K
        self.indices = {}

    def fit(self, user_trajs):
        self.indices = {}
        for uid, entries in tqdm(user_trajs.items(), desc="Building TT-KNN indices"):
            cleaned = [(d, t, x, y) for (x, y), (d, t) in entries]
            self.indices[uid] = build_TT_index(cleaned, self.tau, self.delta)

    def predict(self, uid, t, current_loc):
        if uid not in self.indices:
            return current_loc
        return predict_next_location_ttknn(
            self.indices[uid], t, current_loc, self.M, self.K, self.delta
        )


In [5]:
def load_training_data(city):
    path = f"{DATA_DIR}/city_{city}_challengedata.csv"
    user_trajs = defaultdict(list)
    for chunk in pd.read_csv(path, usecols=COLUMNS, dtype=DTYPES, chunksize=CHUNK_SIZE):
        train = chunk[chunk['d'] <= TRAIN_DAY_MAX]
        for uid, grp in train.groupby("uid"):
            locs = list(zip(grp['x'], grp['y']))
            times = list(zip(grp['d'], grp['t']))
            user_trajs[uid].extend(zip(locs, times))
    return dict(user_trajs)

def load_test_data(city):
    path = f"{DATA_DIR}/city_{city}_challengedata.csv"
    parts = []
    for chunk in pd.read_csv(path, usecols=COLUMNS, dtype=DTYPES, chunksize=CHUNK_SIZE):
        mask = (chunk["d"] >= TEST_DAY_MIN) & (chunk["x"] != MASK_VALUE)
        parts.append(chunk[mask])
    return pd.concat(parts) if parts else pd.DataFrame(columns=COLUMNS)


In [6]:
def run_ttknn_prediction(city, tau, delta, M, K):
    print(f"\n▶ City={city} | τ={tau} | Δ={delta} | M={M} | K={K}")
    train = load_training_data(city)
    test = load_test_data(city)
    test = test.sort_values(["uid", "d", "t"])
    gt = test.rename(columns={"x": "x_gt", "y": "y_gt"})[["uid", "d", "t", "x_gt", "y_gt"]]

    model = TTKNNModel(tau, delta, M, K)
    model.fit(train)

    pred_df = test[["uid", "d", "t"]].copy()
    pred_df["x_pred"] = 0
    pred_df["y_pred"] = 0

    for uid, group in tqdm(test.groupby("uid"), desc="Predicting"):
        last_known = train[uid][-1][0] if uid in train else (100, 100)
        preds = []
        curr_loc = last_known
        for _, row in group.iterrows():
            pred = model.predict(uid, row["t"], curr_loc)
            preds.append(pred)
            curr_loc = pred
        idxs = group.index
        pred_df.loc[idxs, "x_pred"] = [p[0] for p in preds]
        pred_df.loc[idxs, "y_pred"] = [p[1] for p in preds]

    return pred_df, gt


In [7]:
def _user_geobleu(group):
    p_seq = list(zip(group['d'], group['t'], group['x_pred'], group['y_pred']))
    g_seq = list(zip(group['d'], group['t'], group['x_gt'], group['y_gt']))
    return calc_geobleu_single(p_seq, g_seq)

def evaluate_geobleu(pred_df, gt_df):
    df = pd.merge(pred_df, gt_df, on=['uid', 'd', 't'])
    users = [g for _, g in df.groupby("uid")]
    with mp.Pool(mp.cpu_count() - 1) as pool:
        scores = pool.map(_user_geobleu, users)
    return round(float(np.mean(scores)), 5)


In [8]:
def main():
    for city in CITIES:
        for tau in TTKNN_VALUES["TAU_LIST"]:
            for delta in TTKNN_VALUES["DELTA_LIST"]:
                for M in TTKNN_VALUES["M_LIST"]:
                    for K in TTKNN_VALUES["K_LIST"]:
                        pred_df, gt_df = run_ttknn_prediction(city, tau, delta, M, K)
                        score = evaluate_geobleu(pred_df, gt_df)
                        print(f"✅ GEO-BLEU = {score:.5f} | τ={tau}, Δ={delta}, M={M}, K={K}")

main()



▶ City=D | τ=5 | Δ=30 | M=2 | K=2


Building TT-KNN indices:   0%|          | 0/20000 [00:00<?, ?it/s]

Predicting:   0%|          | 0/17000 [00:00<?, ?it/s]

✅ GEO-BLEU = 0.06170 | τ=5, Δ=30, M=2, K=2
