In [1]:
# Install required packages
%pip install -q git+https://github.com/yahoojapan/geobleu.git tqdm scikit-learn

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for geobleu (setup.py) ... [?25l[?25hdone
Note: you may need to restart the kernel to use updated packages.


In [2]:
# Import libraries
import pandas as pd
import numpy as np
import multiprocessing as mp
from geobleu import calc_geobleu_single
from collections import Counter, defaultdict
from tqdm.auto import tqdm
import os
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Configuration
pd.set_option("display.float_format", "{:.5f}".format)
np.random.seed(42)

# Data settings
DATA_DIR = "/kaggle/input/humob-data/15313913"
CITIES = ["D"]
COLUMNS = ["uid", "d", "t", "x", "y"]
DTYPES = {"uid": "int32", "d": "int8", "t": "int8", "x": "int16", "y": "int16"}
TRAIN_DAY_MAX = 60
TEST_DAY_MIN = 61
MASK_VALUE = 999
CHUNK_SIZE = 500_000

# TT-Index parameters
DELTA = 30  # minutes per segment
TAU = 5     # minimum visits to keep a location
M = 2       # future segments to consider
K = 2       # nearest neighbors

In [4]:
def load_training_data(city_code):
    """Load and prepare training data for a city."""
    print(f"Loading training data for City {city_code}...")
    path = os.path.join(DATA_DIR, f"city_{city_code}_challengedata.csv")
    user_trajectories = defaultdict(list)
    for chunk in tqdm(pd.read_csv(path, usecols=COLUMNS, dtype=DTYPES, chunksize=CHUNK_SIZE),
                      desc="Loading training data"):
        train_chunk = chunk[chunk["d"] <= TRAIN_DAY_MAX].copy()
        if train_chunk.empty:
            continue
        train_chunk = train_chunk.sort_values(['uid', 'd', 't'])
        for uid, user_data in train_chunk.groupby('uid'):
            trajectory = []
            for _, row in user_data.iterrows():
                timestamp = int(row['d']) * 24 * 60 + int(row['t']) * 30
                trajectory.append((timestamp, int(row['x']), int(row['y'])))
            user_trajectories[uid] = trajectory
    print(f"Loaded {len(user_trajectories)} users")
    return dict(user_trajectories)

def load_test_data(city_code):
    """Load test data for a city."""
    print(f"Loading test data for City {city_code}...")
    path = os.path.join(DATA_DIR, f"city_{city_code}_challengedata.csv")
    test_parts = []
    for chunk in tqdm(pd.read_csv(path, usecols=COLUMNS, dtype=DTYPES, chunksize=CHUNK_SIZE),
                      desc="Loading test data"):
        mask = ((chunk["d"] >= TEST_DAY_MIN) & 
                (chunk["x"] != MASK_VALUE) & 
                (chunk["y"] != MASK_VALUE))
        test_chunk = chunk.loc[mask, COLUMNS]
        if not test_chunk.empty:
            test_parts.append(test_chunk.copy())
    test_df = pd.concat(test_parts, ignore_index=True) if test_parts else pd.DataFrame(columns=COLUMNS)
    print(f"Test data shape: {test_df.shape}")
    return test_df

In [5]:
def build_tt_index(trajectory):
    """Build TT-Index for a single user trajectory."""
    if len(trajectory) < 2:
        return defaultdict(lambda: defaultdict(list))
    location_counts = Counter((x, y) for _, x, y in trajectory)
    filtered_traj = [(t, x, y) for t, x, y in trajectory if location_counts[(x, y)] >= TAU]
    if len(filtered_traj) < 2:
        return defaultdict(lambda: defaultdict(list))
    seg_traj = [(t // DELTA, (x, y)) for t, x, y in filtered_traj]
    tt_index = defaultdict(lambda: defaultdict(list))
    segments_per_day = (24 * 60) // DELTA
    for i in range(len(seg_traj) - 1):
        seg, loc = seg_traj[i]
        next_seg, next_loc = seg_traj[i + 1]
        if next_seg == (seg + 1) % segments_per_day or next_seg == seg + 1:
            tt_index[seg][loc].append(next_loc)
    return tt_index

class TTIndexPredictor:
    """TT-Index predictor for mobility prediction."""
    def __init__(self):
        self.user_indices = {}
        self.segments_per_day = (24 * 60) // DELTA
    def fit(self, user_trajectories):
        print(f"Building TT-Index for {len(user_trajectories)} users...")
        for uid, trajectory in tqdm(user_trajectories.items(), desc="Building indices"):
            self.user_indices[uid] = build_tt_index(trajectory)
    def predict(self, uid, current_time, current_loc):
        tt_index = self.user_indices.get(uid, defaultdict(lambda: defaultdict(list)))
        curr_seg = current_time // DELTA
        candidates = []
        for s in range(1, M + 1):
            seg_idx = (curr_seg + s) % self.segments_per_day
            if current_loc in tt_index[seg_idx]:
                candidates.extend(tt_index[seg_idx][current_loc])
        if not candidates:
            return current_loc
        unique_candidates = list(dict.fromkeys(candidates))
        if len(unique_candidates) == 1:
            return unique_candidates[0]
        distances = [(np.sqrt((current_loc[0] - c[0])**2 + (current_loc[1] - c[1])**2), c) 
                    for c in unique_candidates]
        distances.sort()
        best = distances[0][1]
        if best == current_loc and len(distances) > 1:
            best = distances[1][1]
        return best

In [6]:
def _compute_geobleu(group):
    pred_seq = list(zip(group['d'], group['t'], group['x_pred'], group['y_pred']))
    true_seq = list(zip(group['d'], group['t'], group['x'], group['y']))
    return calc_geobleu_single(pred_seq, true_seq)

def evaluate_predictions(pred_df, test_df):
    merged = pd.merge(pred_df, test_df, on=['uid', 'd', 't'], how='inner')
    if merged.empty:
        return 0.0
    grouped = [group for _, group in merged.groupby('uid')]
    print(f"Evaluating {len(grouped)} users...")
    with mp.Pool(processes=max(1, mp.cpu_count() - 1)) as pool:
        scores = list(tqdm(pool.imap(_compute_geobleu, grouped), 
                          total=len(grouped), desc="Computing GEO-BLEU"))
    return float(np.mean(scores)) if scores else 0.0

def generate_predictions(test_df, user_trajectories):
    print("Generating predictions...")
    predictor = TTIndexPredictor()
    predictor.fit(user_trajectories)
    pred_df = test_df[["uid", "d", "t"]].copy()
    pred_df["x_pred"] = 0
    pred_df["y_pred"] = 0
    for uid, group in tqdm(test_df.groupby("uid"), desc="Predicting"):
        user_test = group.sort_values(['d', 't']).copy()
        for idx, row in user_test.iterrows():
            current_time = int(row['d']) * 24 * 60 + int(row['t']) * 30
            current_loc = (int(row['x']), int(row['y']))
            pred_loc = predictor.predict(uid, current_time, current_loc)
            pred_df.loc[idx, "x_pred"] = pred_loc[0]
            pred_df.loc[idx, "y_pred"] = pred_loc[1]
    return pred_df.astype({"x_pred": "int16", "y_pred": "int16"})

In [7]:
def evaluate_city(city_code):
    print(f"\n{'='*50}")
    print(f"Evaluating City {city_code}")
    print(f"{'='*50}")
    user_trajectories = load_training_data(city_code)
    test_df = load_test_data(city_code)
    pred_df = generate_predictions(test_df, user_trajectories)
    score = evaluate_predictions(pred_df, test_df)
    print(f"GEO-BLEU Score: {round(score, 5)}")
    return round(score, 5)

def run_evaluation():
    print("TT-INDEX MOBILITY PREDICTION EVALUATION")
    print(f"Parameters: DELTA={DELTA}, TAU={TAU}, M={M}, K={K}")
    all_results = {}
    for city in CITIES:
        score = evaluate_city(city)
        all_results[city] = score
    print(f"\n{'='*50}")
    print("FINAL RESULTS")
    print(f"{'='*50}")
    for city, score in all_results.items():
        print(f"City {city}: {score}")
    return all_results

In [8]:
# Run the evaluation
results = run_evaluation()

TT-INDEX MOBILITY PREDICTION EVALUATION
Parameters: DELTA=30, TAU=5, M=2, K=2

Evaluating City D
Loading training data for City D...


Loading training data: 0it [00:00, ?it/s]

Loaded 20000 users
Loading test data for City D...


Loading test data: 0it [00:00, ?it/s]

Test data shape: (2361882, 5)
Generating predictions...
Building TT-Index for 20000 users...


Building indices:   0%|          | 0/20000 [00:00<?, ?it/s]

Predicting:   0%|          | 0/17000 [00:00<?, ?it/s]

Evaluating 17000 users...


Computing GEO-BLEU:   0%|          | 0/17000 [00:00<?, ?it/s]

GEO-BLEU Score: 1.0

FINAL RESULTS
City D: 1.0
