In [1]:
# Install necessary package
%pip install -q git+https://github.com/yahoojapan/geobleu.git tqdm

import pandas as pd
import numpy as np
import os
from collections import defaultdict, Counter
from tqdm.auto import tqdm
import multiprocessing as mp
from geobleu import calc_geobleu_single
import warnings
warnings.filterwarnings("ignore")


  Preparing metadata (setup.py) ... done
  Building wheel for geobleu (setup.py) ... done
Note: you may need to restart the kernel to use updated packages.


In [2]:
# Hyper-parameters handed to FlatTTKNNModel (plus the evaluation sampling rate)
TTKNN_VALUES = {
    "TAU": 5,             # minimum visit count for a location to be kept in the index
    "DELTA": 30,          # segment length: 30 minutes
    "M": 2,               # number of future segments inspected per prediction
    "K": 2,               # number of nearest candidates considered
    "SAMPLE_FRAC": 0.1,   # fraction of unmasked test data to simulate prediction
}

# Where the data lives and which cities to process
DATA_DIR = "/kaggle/input/humob-data/15313913"
CITIES = ["B", "C", "D"]  # Change to ["A", "B", "C", "D"] for all

# CSV schema: the column list is derived from the dtype map so the two cannot drift
DTYPES = {"uid": "int32", "d": "int8", "t": "int8", "x": "int16", "y": "int16"}
COLUMNS = list(DTYPES)

# Days <= TRAIN_DAY_MAX are training data; the test period starts the day after
TRAIN_DAY_MAX = 60
TEST_DAY_MIN = TRAIN_DAY_MAX + 1

MASK_VALUE = 999       # coordinate value marking a masked (to-be-predicted) row
CHUNK_SIZE = 500_000   # rows per pd.read_csv chunk


In [3]:
def euclidean_distance(loc1, loc2):
    """Return the straight-line distance between two (x, y) locations.

    Args:
        loc1: Indexable pair whose first two items are the x and y coordinates.
        loc2: Second location, same layout as ``loc1``.

    Returns:
        The Euclidean distance as a NumPy float.

    Coordinates are widened to Python floats before subtracting and squaring.
    Narrow NumPy integer scalars (this notebook loads x/y as int16) keep their
    width in scalar arithmetic on recent NumPy versions, so squaring a large
    difference could wrap around; the overflow warning is silenced globally in
    this notebook, which would hide the problem.
    """
    dx = float(loc1[0]) - float(loc2[0])
    dy = float(loc1[1]) - float(loc2[1])
    return np.sqrt(dx * dx + dy * dy)

def to_flat_segment(d, t, delta=30):
    """Map a (day, time) pair onto a single absolute segment number.

    Args:
        d: Day index.
        t: Time within the day; it is multiplied by 60 before being divided by
            ``delta``, i.e. it is treated as a count of hours.
        delta: Segment length in minutes.

    Returns:
        ``d * segments_per_day + (t * 60) // delta`` where
        ``segments_per_day = (24 * 60) // delta``.

    NOTE(review): if ``t`` is already a slot index rather than an hour (e.g.
    30-minute slots 0..47), ``(t * 60) // delta`` exceeds one day's worth of
    segments and spills into the following days - confirm the unit of ``t``.
    """
    # NumPy integer scalars (d/t are loaded as int8 here) keep their width in
    # scalar arithmetic on recent NumPy versions, so `d * 48` or `t * 60`
    # would wrap around silently. Widen them to Python ints first; floats and
    # Python ints are passed through untouched.
    if isinstance(d, np.integer):
        d = int(d)
    if isinstance(t, np.integer):
        t = int(t)

    segments_per_day = (24 * 60) // delta
    return d * segments_per_day + (t * 60) // delta


In [4]:
def build_flat_TT_index(trajectory, tau=5, delta=30):
    """Build a segment -> location -> [next locations] transition index.

    Args:
        trajectory: Sequence of ``(d, t, x, y)`` records. It is iterated twice,
            so it must be re-iterable (a list, not a one-shot generator).
        tau: Minimum number of occurrences of an ``(x, y)`` location in
            ``trajectory`` for its records to be kept.
        delta: Segment length in minutes, forwarded to ``to_flat_segment``.

    Returns:
        A nested ``defaultdict``: ``index[segment][location]`` is the list of
        locations that followed ``location`` when the next kept record was
        1 to 3 segments later.
    """
    visits_per_loc = Counter((x, y) for _, _, x, y in trajectory)

    # Keep only frequently visited locations, convert to flat segments and
    # order chronologically (ties broken by the location tuple).
    timeline = sorted(
        (to_flat_segment(d, t, delta), (x, y))
        for d, t, x, y in trajectory
        if visits_per_loc[(x, y)] >= tau
    )

    TT_index = defaultdict(lambda: defaultdict(list))
    for (seg_from, loc_from), (seg_to, loc_to) in zip(timeline, timeline[1:]):
        gap = seg_to - seg_from
        # allow up to 3-segment jumps (1.5 hours for Δ=30min); a gap of 0 means
        # both records fall in the same segment and is not a transition
        if 0 < gap <= 3:
            TT_index[seg_from][loc_from].append(loc_to)

    return TT_index


In [5]:
def predict_next_location_flat(TT_index, d, t, current_loc, M=2, K=2, delta=30):
    """Pick the next location for ``current_loc`` from a flat TT index.

    Args:
        TT_index: Mapping ``segment -> location -> [next locations]``.
        d, t: Day and time of the row being predicted (see ``to_flat_segment``).
        current_loc: ``(x, y)`` the user is assumed to be at.
        M: Number of segments after the current one to look up.
        K: Only the ``K`` nearest distinct candidates are considered.
        delta: Segment length in minutes.

    Returns:
        The nearest of the top-``K`` candidates that differs from
        ``current_loc``; ``current_loc`` itself when there is no candidate or
        none of the top-``K`` differs.

    NOTE(review): lookup keys are absolute segments (the day is part of the
    key), so a query for a day that is not present in ``TT_index`` finds no
    candidates and falls back to ``current_loc`` - confirm this is intended.
    """
    now_seg = to_flat_segment(d, t, delta)

    # Gather every recorded successor of current_loc in the next M segments.
    pool = [
        nxt
        for step in range(1, M + 1)
        for nxt in TT_index.get(now_seg + step, {}).get(current_loc, [])
    ]
    if not pool:
        return current_loc

    # Rank distinct candidates by distance (ties broken by the location tuple).
    ranked = sorted((euclidean_distance(current_loc, loc), loc) for loc in set(pool))
    return next((loc for _, loc in ranked[:K] if loc != current_loc), current_loc)


In [6]:
class FlatTTKNNModel:
    """Holds one flat TT index per user and answers next-location queries."""

    def __init__(self, tau=5, delta=30, M=2, K=2):
        """Store the hyper-parameters and start with an empty per-user index."""
        self.tau, self.delta = tau, delta
        self.M, self.K = M, K
        self.index = {}

    def fit(self, user_trajectories):
        """Build (or overwrite) the TT index of every user in the mapping.

        ``user_trajectories`` maps ``uid`` to a sequence of
        ``((x, y), (d, t))`` pairs.
        """
        progress = tqdm(user_trajectories.items(), desc="Building TT indices")
        for uid, traj in progress:
            # build_flat_TT_index expects flat (d, t, x, y) records
            records = [(day, slot, x, y) for (x, y), (day, slot) in traj]
            self.index[uid] = build_flat_TT_index(records, self.tau, self.delta)

    def predict(self, uid, d, t, current_loc):
        """Return the predicted location; unknown users stay at ``current_loc``."""
        try:
            user_index = self.index[uid]
        except KeyError:
            return current_loc
        return predict_next_location_flat(
            user_index, d, t, current_loc, self.M, self.K, self.delta
        )


In [7]:
def load_training_data(city):
    """Read one city's CSV in chunks and collect per-user training trajectories.

    Only rows with ``d <= TRAIN_DAY_MAX`` are kept. Rows are appended in file
    order, so the result is chronological only if the file is.

    Returns:
        A plain dict ``uid -> [((x, y), (d, t)), ...]``.
    """
    csv_path = f"{DATA_DIR}/city_{city}_challengedata.csv"
    reader = pd.read_csv(csv_path, usecols=COLUMNS, dtype=DTYPES, chunksize=CHUNK_SIZE)

    trajectories = defaultdict(list)
    for chunk in reader:
        train_rows = chunk.loc[chunk["d"] <= TRAIN_DAY_MAX]
        for uid, rows in train_rows.groupby("uid"):
            trajectories[uid].extend(
                ((x, y), (d, t))
                for x, y, d, t in zip(rows["x"], rows["y"], rows["d"], rows["t"])
            )
    return dict(trajectories)

def load_test_sample(city, sample_frac=0.1, seed=42):
    """Load a city's unmasked test rows and hide a random sample of them.

    Rows with ``d >= TEST_DAY_MIN`` whose ``x`` is not ``MASK_VALUE`` are read
    from the city's CSV. A fraction ``sample_frac`` of them is drawn; for those
    rows the true coordinates are copied to ``x_orig``/``y_orig`` and ``x``/``y``
    are overwritten with ``MASK_VALUE``.

    Args:
        city: City letter used in the file name.
        sample_frac: Fraction of the unmasked test rows to mask.
        seed: Seed for the random draw.

    Returns:
        ``(full_test, ground_truth)`` where ``full_test`` holds all test rows
        (sampled ones masked) sorted by ``uid, d, t`` and ``ground_truth`` holds
        ``uid, d, t, x_orig, y_orig`` for the sampled rows.

    NOTE(review): ``full_test`` still carries the ``x_orig``/``y_orig`` columns
    (NaN for unsampled rows) because the sampled frame is concatenated with
    them - consider dropping them before handing the frame to a model.
    """
    path = f"{DATA_DIR}/city_{city}_challengedata.csv"
    test_parts = []
    for chunk in pd.read_csv(path, usecols=COLUMNS, dtype=DTYPES, chunksize=CHUNK_SIZE):
        keep = (chunk["d"] >= TEST_DAY_MIN) & (chunk["x"] != MASK_VALUE)
        test_parts.append(chunk[keep])
    test_df = pd.concat(test_parts).reset_index(drop=True)
    print(f"Unmasked test data: {len(test_df)} rows")

    # Seed the draw through `random_state` rather than np.random.seed(): the
    # sample stays reproducible without resetting NumPy's global generator as
    # a hidden side effect for everything that runs afterwards.
    sampled = test_df.sample(frac=sample_frac, random_state=seed).copy()
    sampled["x_orig"] = sampled["x"]
    sampled["y_orig"] = sampled["y"]
    sampled["x"] = MASK_VALUE
    sampled["y"] = MASK_VALUE

    # Swap the sampled rows for their masked versions, then restore order.
    full_test = pd.concat([test_df.drop(sampled.index), sampled], ignore_index=True)
    full_test = full_test.sort_values(["uid", "d", "t"]).reset_index(drop=True)
    return full_test, sampled[["uid", "d", "t", "x_orig", "y_orig"]]


In [8]:
def run_prediction(model, test_df, user_trajs):
    """Fill every masked row of ``test_df`` with a model prediction.

    Rows are processed per user in the order they appear in ``test_df``. An
    unmasked row contributes its own coordinates; a masked row (``x`` equal to
    ``MASK_VALUE``) is predicted from the running "current location", and each
    output - observed or predicted - becomes the current location for the
    next row of that user.

    The starting location of a user is, in order of preference: the last
    entry of ``user_trajs[uid]``, the user's first unmasked test row, or
    ``(0, 0)``.

    Args:
        model: Object with ``predict(uid, d, t, current_loc) -> (x, y)``.
        test_df: Frame with at least ``uid, d, t, x, y`` columns.
        user_trajs: Mapping ``uid -> [((x, y), (d, t)), ...]``.

    Returns:
        A frame with ``uid, d, t, x_pred, y_pred`` aligned with ``test_df``.
    """
    n_rows = len(test_df)
    x_pred = [0] * n_rows
    y_pred = [0] * n_rows

    # Pull the columns out once as plain Python lists. This replaces a
    # row-by-row iterrows() scan, which builds a Series per row and upcasts
    # every value to a common dtype, and keeps narrow NumPy scalars away
    # from the model's arithmetic.
    days = test_df["d"].tolist()
    times = test_df["t"].tolist()
    xs = test_df["x"].tolist()
    ys = test_df["y"].tolist()

    # uid -> positional row numbers (ascending, i.e. in frame order)
    rows_by_user = test_df.groupby("uid").indices

    for uid, positions in tqdm(rows_by_user.items(), desc="Predicting"):
        positions = positions.tolist()

        if uid in user_trajs and user_trajs[uid]:
            last_known = user_trajs[uid][-1][0]
        else:
            first_seen = next((p for p in positions if xs[p] != MASK_VALUE), None)
            if first_seen is not None:
                last_known = (xs[first_seen], ys[first_seen])
            else:
                last_known = (0, 0)

        current_loc = last_known
        for p in positions:
            if xs[p] == MASK_VALUE:
                pred = model.predict(uid, days[p], times[p], current_loc)
            else:
                pred = (xs[p], ys[p])
            x_pred[p], y_pred[p] = pred[0], pred[1]
            current_loc = pred

    # Single positional assignment instead of one .loc write per user.
    pred_df = test_df[["uid", "d", "t"]].copy()
    pred_df["x_pred"] = x_pred
    pred_df["y_pred"] = y_pred
    return pred_df


In [9]:
def _geo_bleu_group(group):
    """Score one user's merged prediction / ground-truth rows.

    ``group`` has columns ['d','t','x_pred','y_pred','x_gt','y_gt'].
    A group with exactly one row is scored by exact match (1.0 or 0.0);
    any other group is scored with ``calc_geobleu_single``.
    """
    if len(group) != 1:
        # Standard GEO-BLEU on (d, t, x, y) sequences
        pred_cols = ('d', 't', 'x_pred', 'y_pred')
        true_cols = ('d', 't', 'x_gt', 'y_gt')
        predicted = list(zip(*(group[col] for col in pred_cols)))
        reference = list(zip(*(group[col] for col in true_cols)))
        return calc_geobleu_single(predicted, reference)

    # Only one time-slot in this group: exact-match scoring
    only = group.iloc[0]
    x_hit = only['x_pred'] == only['x_gt']
    y_hit = only['y_pred'] == only['y_gt']
    if x_hit and y_hit:
        return 1.0
    return 0.0


def evaluate_geobleu(pred_df, sampled_gt_df):
    """Compute the mean per-user GEO-BLEU over the sampled (masked) rows.

    Args:
        pred_df: Frame with ``uid, d, t, x_pred, y_pred``.
        sampled_gt_df: Frame with ``uid, d, t, x_orig, y_orig`` for the rows
            whose true location was hidden.

    Returns:
        The mean of the per-user scores rounded to 5 decimals, or ``0.0`` when
        no prediction row matches a ground-truth row.
    """
    gt = sampled_gt_df.rename(columns={"x_orig": "x_gt", "y_orig": "y_gt"})
    merged = pd.merge(pred_df, gt, on=["uid", "d", "t"])
    if merged.empty:
        print("⚠️ No matching rows in evaluation.")
        return 0.0

    # Group by uid only (per-user sequence)
    users = [g for _, g in merged.groupby("uid")]
    print(f"Merged rows: {len(merged)}, users: {len(users)}")

    # Leave one core free, but never request zero workers: on a single-core
    # machine cpu_count() - 1 is 0 and mp.Pool(0) raises ValueError.
    n_workers = max(1, mp.cpu_count() - 1)
    with mp.Pool(n_workers) as pool:
        scores = pool.map(_geo_bleu_group, users)

    return round(float(np.mean(scores)), 5)




In [10]:
# End-to-end run, one city at a time: load data, fit, predict, score.
# The loop variables stay in the notebook namespace after the cell finishes,
# holding the values of the last city processed.
for city in CITIES:
    print(f"City {city}")
    # Training trajectories (days <= TRAIN_DAY_MAX) and a test frame in which
    # a SAMPLE_FRAC share of the known rows has been masked; gt_sample keeps
    # the true coordinates of those masked rows.
    train_data = load_training_data(city)
    test_df, gt_sample = load_test_sample(city, sample_frac=TTKNN_VALUES["SAMPLE_FRAC"])

    # A fresh model per city, so indices from a previous city are not reused.
    model = FlatTTKNNModel(
        tau=TTKNN_VALUES["TAU"],
        delta=TTKNN_VALUES["DELTA"],
        M=TTKNN_VALUES["M"],
        K=TTKNN_VALUES["K"]
    )
    model.fit(train_data)

    # train_data is passed again so each user's walk can start from the last
    # training location.
    pred_df = run_prediction(model, test_df, train_data)
    score = evaluate_geobleu(pred_df, gt_sample)

    print(f"✅ GEO-BLEU (sampled test) = {score:.5f}")


City B
Unmasked test data: 3627062 rows


Building TT indices:   0%|          | 0/30000 [00:00<?, ?it/s]

Predicting:   0%|          | 0/27000 [00:00<?, ?it/s]

Merged rows: 362706, users: 26994
✅ GEO-BLEU (sampled test) = 0.22456
City C
Unmasked test data: 2953708 rows


Building TT indices:   0%|          | 0/25000 [00:00<?, ?it/s]

Predicting:   0%|          | 0/22000 [00:00<?, ?it/s]

Merged rows: 295371, users: 21996
✅ GEO-BLEU (sampled test) = 0.23485
City D
Unmasked test data: 2361882 rows


Building TT indices:   0%|          | 0/20000 [00:00<?, ?it/s]

Predicting:   0%|          | 0/17000 [00:00<?, ?it/s]

Merged rows: 236188, users: 16997
✅ GEO-BLEU (sampled test) = 0.23771
