ALGORITHM: Flat TT-KNN for Location Prediction

INPUT: Historical trajectories, test sequences with masked locations
OUTPUT: Predicted locations for masked entries

PHASE 1: Training
FOR each user:
    1. Filter locations by frequency (≥ τ visits)
    2. Convert (day, time) → flat time segments
    3. Build transition table: segment → location → [next_locations]
    4. Store only transitions within M future segments

PHASE 2: Prediction
FOR each masked location:
    1. Get current location and time segment
    2. Look up possible next locations in future segments (1 to M)
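    3. Rank the candidates by distance to the current location (Euclidean by default), weighted by transition frequency, and keep the top K
    4. Return the best-ranked candidate that differs from the current location, or the current location if none is found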

PHASE 3: Evaluation
1. Sample fraction of unmasked test data
2. Mask their locations and predict
3. Calculate GEO-BLEU score against ground truth

In [None]:
# Install necessary package
%pip install -q git+https://github.com/yahoojapan/geobleu.git tqdm

import pandas as pd
import numpy as np
import os
from collections import defaultdict, Counter
from tqdm.auto import tqdm
import multiprocessing as mp
from geobleu import calc_geobleu_single
import warnings
warnings.filterwarnings("ignore")


  Preparing metadata (setup.py) ... done
  Building wheel for geobleu (setup.py) ... done
Note: you may need to restart the kernel to use updated packages.


In [None]:
# Flat TT-KNN configuration
# TTKNN_VALUES = {
#     "TAU": 5,
#     "DELTA": 30,   # 30 minutes
#     "M": 2,        # future segments
#     "K": 2,        # nearest neighbors
#     "SAMPLE_FRAC": 0.1  # fraction of unmasked test data to simulate prediction
# }

# Dataset setup
# Where the per-city challenge CSVs live and which cities to process.
DATA_DIR = "/kaggle/input/humob-data/15313913"
CITIES = ["A"]  # Change to ["A", "B", "C", "D"] for all

# Columns read from each CSV and the compact dtypes used to hold them.
COLUMNS = ["uid", "d", "t", "x", "y"]
DTYPES = {
    "uid": "int32",
    "d": "int8",
    "t": "int8",
    "x": "int16",
    "y": "int16",
}

# Day boundaries: days up to TRAIN_DAY_MAX are the training period,
# days TEST_DAY_MIN..TEST_DAY_MAX are the test period.
TRAIN_DAY_MAX = 60
TEST_DAY_MIN = 61
TEST_DAY_MAX = 75

# Rows whose x equals MASK_VALUE are treated as masked (to be predicted).
MASK_VALUE = 999

# Number of CSV rows read per chunk.
CHUNK_SIZE = 500_000


In [None]:
def euclidean_distance(loc1, loc2):
    """Return the straight-line (L2) distance between two (x, y) locations.

    Args:
        loc1: indexable pair (x, y).
        loc2: indexable pair (x, y).

    Returns:
        The distance as a NumPy float.
    """
    # Convert to float before subtracting/squaring: with narrow NumPy integer
    # scalars (e.g. int16) the squared difference can overflow and silently
    # produce a wrong or NaN distance.
    dx = float(loc1[0]) - float(loc2[0])
    dy = float(loc1[1]) - float(loc2[1])
    return np.sqrt(dx * dx + dy * dy)

def manhattan_distance(loc1, loc2):
    """Return the L1 (city-block) distance between two (x, y) locations."""
    horizontal = abs(loc1[0] - loc2[0])
    vertical = abs(loc1[1] - loc2[1])
    return horizontal + vertical

def chebyshev_distance(loc1, loc2):
    """Return the L-infinity distance: the larger of the per-axis gaps."""
    horizontal = abs(loc1[0] - loc2[0])
    vertical = abs(loc1[1] - loc2[1])
    return vertical if vertical > horizontal else horizontal

def calculate_distance(loc1, loc2, distance_type='euclidean'):
    """Compute the distance between two (x, y) locations with the named metric.

    Args:
        loc1: first (x, y) location.
        loc2: second (x, y) location.
        distance_type: 'euclidean', 'manhattan' or 'chebyshev'.

    Raises:
        ValueError: if distance_type is not one of the supported names.
    """
    if distance_type == 'euclidean':
        return euclidean_distance(loc1, loc2)
    if distance_type == 'manhattan':
        return manhattan_distance(loc1, loc2)
    if distance_type == 'chebyshev':
        return chebyshev_distance(loc1, loc2)
    raise ValueError(f"Unknown distance type: {distance_type}")

def to_flat_segment(d, t, delta=30):
    """Map a (day, time) pair to a single flat segment index.

    Args:
        d: day number.
        t: time-of-day value; the formula multiplies it by 60, i.e. it is
            treated as hours.
        delta: segment length in minutes.

    Returns:
        d * segments_per_day + (t * 60) // delta.

    NOTE(review): if t is already a slot index rather than an hour (for
    example 30-minute slots numbered 0-47), the time term can exceed
    segments_per_day and overlap the next day's range — confirm against the
    dataset definition.
    """
    # Work with Python ints: with narrow NumPy scalars (e.g. int8 day/time
    # columns) the products below would overflow and wrap silently.
    d = int(d)
    t = int(t)
    segments_per_day = (24 * 60) // delta
    return d * segments_per_day + (t * 60) // delta


In [None]:
def build_flat_TT_index(trajectory, tau=5, delta=30, max_gap=3):
    """Build one user's flat transition tables from their trajectory.

    Args:
        trajectory: iterable of (d, t, x, y) records.
        tau: minimum number of visits a location needs to be kept.
        delta: segment length in minutes, forwarded to to_flat_segment.
        max_gap: largest gap, in segments, between two consecutive records
            that still counts as a transition. The default of 3 matches the
            previously hard-coded limit (1.5 hours for delta=30).

    Returns:
        (TT_index, TT_freq) where
        TT_index[segment][location] is the list of next locations (one entry
        per observed transition) and
        TT_freq[segment][location] is a Counter of next location -> count.
    """
    # The records are traversed twice (count, then filter), so materialise
    # them in case the caller passed a one-shot iterator.
    trajectory = list(trajectory)

    location_counts = Counter((x, y) for _, _, x, y in trajectory)

    # Keep frequent locations only, expressed as (flat segment, location),
    # ordered by segment.
    seg_traj = sorted(
        (to_flat_segment(d, t, delta), (x, y))
        for d, t, x, y in trajectory
        if location_counts[(x, y)] >= tau
    )

    TT_index = defaultdict(lambda: defaultdict(list))
    # Also track frequency of transitions
    TT_freq = defaultdict(lambda: defaultdict(Counter))

    # Walk consecutive record pairs; a pair is a transition only when the
    # second record is strictly later and at most max_gap segments away.
    for (seg1, loc1), (seg2, loc2) in zip(seg_traj, seg_traj[1:]):
        if 0 < seg2 - seg1 <= max_gap:
            TT_index[seg1][loc1].append(loc2)
            TT_freq[seg1][loc1][loc2] += 1

    return TT_index, TT_freq


In [None]:
# Improved prediction function with better frequency weighting
def predict_next_location_flat(TT_index, TT_freq, d, t, current_loc, M=2, K=2, delta=30, distance_type='euclidean', freq_weight=0.3):
    """Predict the next location from a user's flat transition tables.

    Candidates are the destinations recorded for ``current_loc`` in the M
    segments following the segment of (d, t). Each candidate is scored by its
    min-max normalised distance to ``current_loc`` divided by a frequency
    bonus (lower is better); the best of the top K candidates that differs
    from ``current_loc`` is returned.

    Args:
        TT_index: segment -> location -> list of next locations.
        TT_freq: segment -> location -> mapping of next location -> count.
        d, t: day and time of the entry to predict.
        current_loc: (x, y) location the prediction starts from.
        M: number of future segments to inspect.
        K: number of top-ranked candidates to consider.
        delta: segment length in minutes.
        distance_type: metric name understood by calculate_distance.
        freq_weight: exponent applied to the normalised frequency.

    Returns:
        The predicted (x, y) location, or ``current_loc`` when there is no
        candidate or every top-K candidate equals ``current_loc``.
    """
    curr_seg = to_flat_segment(d, t, delta)

    # Aggregate, per distinct destination, how often it was observed across
    # the inspected segments. TT_index lists a destination once per observed
    # transition, so it is de-duplicated before adding the TT_freq count;
    # otherwise a transition seen n times would be weighted n * n.
    candidate_freqs = {}
    for offset in range(1, M + 1):
        future_seg = curr_seg + offset
        if future_seg in TT_index and current_loc in TT_index[future_seg]:
            observed = TT_freq[future_seg][current_loc]
            for loc in dict.fromkeys(TT_index[future_seg][current_loc]):
                candidate_freqs[loc] = candidate_freqs.get(loc, 0) + observed[loc]

    if not candidate_freqs:
        return current_loc

    # Distance of every candidate from the current location, plus the range
    # used for min-max normalisation.
    distances = {
        loc: calculate_distance(current_loc, loc, distance_type)
        for loc in candidate_freqs
    }
    min_distance = min(distances.values())
    max_distance = max(distances.values())
    # Fall back to 1 when all candidates are equally far (avoids dividing by zero).
    distance_range = max_distance - min_distance if max_distance > min_distance else 1

    max_freq = max(candidate_freqs.values())

    scored_candidates = []
    for loc, freq in candidate_freqs.items():
        norm_distance = (distances[loc] - min_distance) / distance_range
        norm_freq = freq / max_freq
        # Frequent destinations get a larger divisor, i.e. a lower (better) score.
        freq_bonus = norm_freq ** freq_weight
        scored_candidates.append((norm_distance / freq_bonus, loc))

    # Lowest score first; ties are broken by the location tuple itself.
    scored_candidates.sort()

    for _, loc in scored_candidates[:K]:
        if loc != current_loc:
            return loc
    return current_loc

In [None]:
class FlatTTKNNModel:
    """Holds one transition index per user and predicts with it."""

    def __init__(self, tau=5, delta=30, M=2, K=2, distance_type='euclidean', freq_weight=0.3):
        """Store the hyper-parameters and start with empty per-user indices."""
        self.tau, self.delta = tau, delta
        self.M, self.K = M, K
        self.distance_type = distance_type
        self.freq_weight = freq_weight
        # uid -> tables returned by build_flat_TT_index, filled by fit().
        self.index = {}
        self.freq_index = {}

    def fit(self, user_trajectories):
        """Build the transition tables for every user.

        Args:
            user_trajectories: mapping uid -> iterable of ((x, y), (d, t)).
        """
        progress = tqdm(user_trajectories.items(), desc="Building TT indices")
        for uid, traj in progress:
            # build_flat_TT_index expects flat (d, t, x, y) records.
            records = []
            for (x, y), (d, t) in traj:
                records.append((d, t, x, y))
            tt_index, tt_freq = build_flat_TT_index(records, self.tau, self.delta)
            self.index[uid] = tt_index
            self.freq_index[uid] = tt_freq

    def predict(self, uid, d, t, current_loc):
        """Predict the next location for a user; unknown users stay at current_loc."""
        if uid in self.index:
            return predict_next_location_flat(
                self.index[uid],
                self.freq_index[uid],
                d,
                t,
                current_loc,
                M=self.M,
                K=self.K,
                delta=self.delta,
                distance_type=self.distance_type,
                freq_weight=self.freq_weight,
            )
        return current_loc

In [None]:
def load_training_data(city, validation_indices=None):
    """Load per-user training trajectories for one city.

    Training data is every row from days up to TRAIN_DAY_MAX plus the
    unmasked rows from days TEST_DAY_MIN..TEST_DAY_MAX that are not held out
    for validation.

    Args:
        city: city code used to build the CSV file name.
        validation_indices: optional iterable of row identifiers to exclude,
            in the same form as the 'global_idx' values produced by
            load_validation_data (row label + chunk number * CHUNK_SIZE).

    Returns:
        dict mapping uid -> list of ((x, y), (d, t)) tuples.
    """
    path = f"{DATA_DIR}/city_{city}_challengedata.csv"
    user_trajs = defaultdict(list)

    # Row identifiers held out for validation (empty when nothing is excluded).
    validation_set = set(validation_indices) if validation_indices is not None else set()

    reader = pd.read_csv(path, usecols=COLUMNS, dtype=DTYPES, chunksize=CHUNK_SIZE)
    for chunk_counter, chunk in enumerate(reader):
        # Training data from days 1-60
        train_early = chunk[chunk["d"] <= TRAIN_DAY_MAX]

        # Additional training data from days 61-75 (unmasked, not in validation)
        in_test_period = (chunk["d"] >= TEST_DAY_MIN) & (chunk["d"] <= TEST_DAY_MAX)
        test_period = chunk[in_test_period & (chunk["x"] != MASK_VALUE)]

        if validation_set and not test_period.empty:
            # Filter by label with a boolean mask. Converting the identifiers
            # back to chunk-local positions is wrong for chunks after the
            # first, because chunked reads keep a running row index; that
            # used to drop all of their test-period rows.
            row_ids = test_period.index + chunk_counter * CHUNK_SIZE
            train_additional = test_period[~row_ids.isin(validation_set)]
        else:
            train_additional = test_period

        # Combine training data
        combined_train = pd.concat([train_early, train_additional], ignore_index=True)

        # Append this chunk's points to each user's trajectory.
        for uid, group in combined_train.groupby("uid"):
            locs = list(zip(group["x"], group["y"]))
            times = list(zip(group["d"], group["t"]))
            user_trajs[uid].extend(zip(locs, times))

    print(f"Training data loaded: {sum(len(traj) for traj in user_trajs.values())} total points")
    return dict(user_trajs)

def load_validation_data(city, sample_frac=0.1, seed=42):
    """Sample validation rows from the unmasked part of the test period.

    Args:
        city: city code used to build the CSV file name.
        sample_frac: fraction of the unmasked test-period rows to hold out.
        seed: seed for the NumPy random generator used for sampling.

    Returns:
        A tuple (full_test_df, validation_gt, validation_global_indices):
        full_test_df holds the unmasked test-period rows with the sampled
        ones masked (x = y = MASK_VALUE), sorted by uid, d, t;
        validation_gt holds uid, d, t, x_orig, y_orig for the sampled rows;
        validation_global_indices lists their 'global_idx' identifiers.
        When no unmasked test data exists, two empty frames and an empty
        list are returned.
    """
    path = f"{DATA_DIR}/city_{city}_challengedata.csv"
    test_parts = []

    reader = pd.read_csv(path, usecols=COLUMNS, dtype=DTYPES, chunksize=CHUNK_SIZE)
    for chunk_counter, chunk in enumerate(reader):
        mask = (chunk["d"] >= TEST_DAY_MIN) & (chunk["d"] <= TEST_DAY_MAX) & (chunk["x"] != MASK_VALUE)
        valid_chunk = chunk[mask].copy()
        if valid_chunk.empty:
            continue

        # Row identifier shared with load_training_data's exclusion logic.
        valid_chunk['global_idx'] = valid_chunk.index + chunk_counter * CHUNK_SIZE
        test_parts.append(valid_chunk)

    if not test_parts:
        print("No unmasked test data found!")
        return pd.DataFrame(), pd.DataFrame(), []

    test_df = pd.concat(test_parts, ignore_index=True)
    print(f"Unmasked test data (days {TEST_DAY_MIN}-{TEST_DAY_MAX}): {len(test_df)} rows")

    # Sample for validation
    np.random.seed(seed)
    sampled_indices = np.random.choice(len(test_df), size=int(len(test_df) * sample_frac), replace=False)

    validation_df = test_df.iloc[sampled_indices].copy()
    validation_global_indices = validation_df['global_idx'].tolist()

    # Ground truth must be taken BEFORE masking: select the real coordinate
    # columns first, then rename them (selecting "x_orig"/"y_orig" before the
    # rename raised a KeyError).
    validation_gt = validation_df[["uid", "d", "t", "x", "y"]].rename(
        columns={"x": "x_orig", "y": "y_orig"}
    )

    # Mask the validation samples
    validation_df["x"] = MASK_VALUE
    validation_df["y"] = MASK_VALUE

    # Unmasked test-period rows that were not sampled, in file order.
    is_sampled = np.zeros(len(test_df), dtype=bool)
    is_sampled[sampled_indices] = True
    remaining_df = test_df[~is_sampled].copy()

    # Known rows and masked validation rows together form the sequence to predict on.
    full_test_df = pd.concat([remaining_df, validation_df], ignore_index=True)
    full_test_df = full_test_df.sort_values(["uid", "d", "t"]).reset_index(drop=True)

    print(f"Validation samples: {len(validation_df)} rows")
    print(f"Additional training from test period: {len(remaining_df)} rows")

    return full_test_df, validation_gt[["uid", "d", "t", "x_orig", "y_orig"]], validation_global_indices

In [None]:
def run_prediction(model, test_df, user_trajs):
    """Fill in predictions for every row of test_df, user by user.

    Rows whose x equals MASK_VALUE are predicted with ``model.predict``;
    other rows pass their own (x, y) through. Either way the row's result
    becomes the current location for the user's next row.

    Args:
        model: object exposing predict(uid, d, t, current_loc).
        test_df: frame with uid, d, t, x, y columns.
        user_trajs: mapping uid -> list of ((x, y), (d, t)) training points.

    Returns:
        A frame with uid, d, t, x_pred, y_pred aligned to test_df's index.
    """
    pred_df = test_df[["uid", "d", "t"]].copy()
    pred_df["x_pred"] = 0
    pred_df["y_pred"] = 0

    for uid, group in tqdm(test_df.groupby("uid"), desc="Predicting"):
        # Starting point: last stored training location, else the user's
        # first unmasked test row, else (0, 0).
        if uid in user_trajs and user_trajs[uid]:
            current_loc = user_trajs[uid][-1][0]
        else:
            observed = group[group["x"] != MASK_VALUE]
            if observed.empty:
                current_loc = (0, 0)
            else:
                current_loc = tuple(observed[["x", "y"]].iloc[0])

        xs, ys = [], []
        for _, row in group.iterrows():
            if row["x"] != MASK_VALUE:
                current_loc = (row["x"], row["y"])
            else:
                current_loc = model.predict(uid, row["d"], row["t"], current_loc)
            xs.append(current_loc[0])
            ys.append(current_loc[1])

        # Write this user's results back at the rows they came from.
        pred_df.loc[group.index, "x_pred"] = xs
        pred_df.loc[group.index, "y_pred"] = ys

    return pred_df


In [None]:
# Updated configuration with improved parameters
# Distance metric and frequency exponent passed to the predictor.
DISTANCE_TYPE = 'euclidean'
FREQ_WEIGHT = 0.5

# Flat TT-KNN hyper-parameters used for the run below.
TTKNN_VALUES = {
    # Minimum visit count for a location to be kept; 0 keeps every location.
    "TAU": 0,
    # Segment length in minutes.
    "DELTA": 30,
    # Number of future segments inspected when predicting.
    "M": 3,
    # Number of top-ranked candidates considered.
    "K": 2,
}

# Main run: for every configured city, read the challenge CSV once, split the
# rows into unmasked (training) and masked (to predict) per user, build each
# user's transition tables, predict the masked rows sequentially, and write
# the predictions to a per-city CSV.
for city in CITIES:
    print(f"\n{'='*50}")
    print(f"CITY {city} EVALUATION")
    print(f"{'='*50}")

    # Path to the data file
    path = f"{DATA_DIR}/city_{city}_challengedata.csv"
    
    # Process by reading the entire dataset once and organizing by user
    print("Loading and organizing data by user...")
    
    # uid -> list of ((x, y), (d, t)) for unmasked rows,
    # uid -> list of per-chunk DataFrames for masked rows.
    user_train_data = defaultdict(list)
    user_test_data = defaultdict(list)
    
    # Read data in chunks to manage memory
    chunk_count = 0
    for chunk in pd.read_csv(path, usecols=COLUMNS, dtype=DTYPES, chunksize=CHUNK_SIZE):
        chunk_count += 1
        print(f"Processing chunk {chunk_count}...")
        
        # Split based ONLY on masked status (regardless of day)
        # Training data: All unmasked data (x != MASK_VALUE) from any day
        # Test data: All masked data (x == MASK_VALUE) from any day
        train_chunk = chunk[chunk["x"] != MASK_VALUE]
        test_chunk = chunk[chunk["x"] == MASK_VALUE]
        
        # Process training data and collect by user
        for uid, group in train_chunk.groupby("uid"):
            locs = list(zip(group["x"], group["y"]))
            times = list(zip(group["d"], group["t"]))
            user_train_data[uid].extend(zip(locs, times))
        
        # Collect test data by user (ONLY masked data); a user whose rows span
        # several chunks gets one frame per chunk, concatenated later.
        for uid, group in test_chunk.groupby("uid"):
            user_test_data[uid].append(group)
    
    print(f"Training data loaded: {sum(len(traj) for traj in user_train_data.values())} points from all days")
    print(f"Test data collected for {len(user_test_data)} users from all days")
    
    # One dict per predicted row: uid, d, t, x, y.
    all_predictions = []
    
    # Models are built one user at a time and discarded after use, so only a
    # single user's transition tables are held at once.
    print("Building models and making predictions user-by-user...")
    
    # Counters used only for the periodic progress message.
    user_count = 0
    total_users = len(user_test_data)
    
    for uid in tqdm(user_test_data.keys(), desc="Processing users"):
        user_count += 1
        
        # Skip users with no test data (shouldn't happen since we're iterating over test data keys)
        if len(user_test_data[uid]) == 0:
            continue
        
        # Combine all test data chunks for this user
        test_df = pd.concat(user_test_data[uid], ignore_index=True)
        
        # EXPLICIT CHECK: Make sure we're only working with masked data
        test_df = test_df[test_df["x"] == MASK_VALUE]
        
        # Skip if there's no masked data for this user
        if test_df.empty:
            continue
        
        # Build model for this user if they have training data
        if uid in user_train_data and user_train_data[uid]:
            # build_flat_TT_index expects (d, t, x, y) records, so reorder the
            # stored ((x, y), (d, t)) pairs.
            formatted = [(d, t, x, y) for (x, y), (d, t) in user_train_data[uid]]
            user_index, user_freq_index = build_flat_TT_index(
                formatted,
                tau=TTKNN_VALUES["TAU"],
                delta=TTKNN_VALUES["DELTA"]
            )
            
            # Starting location: the last training point collected for this
            # user (last in read order).
            # NOTE(review): this is the chronologically last point only if the
            # file is ordered by time within each user — confirm.
            last_known = user_train_data[uid][-1][0]
        else:
            # No training data, initialize empty indices and default location
            user_index = defaultdict(lambda: defaultdict(list))
            user_freq_index = defaultdict(lambda: defaultdict(Counter))
            last_known = (0, 0)
        
        # Make predictions for this user
        current_loc = last_known
        
        # Sort test data by day and time to ensure sequential prediction
        test_rows_sorted = test_df.sort_values(["d", "t"])
        
        # Predict each masked location; each prediction becomes the starting
        # location for the next masked row.
        for _, row in test_rows_sorted.iterrows():
            d, t = row["d"], row["t"]
            
            # Predict using this user's model
            pred = predict_next_location_flat(
                user_index, 
                user_freq_index,
                d, t, current_loc,
                M=TTKNN_VALUES["M"],
                K=TTKNN_VALUES["K"],
                delta=TTKNN_VALUES["DELTA"],
                distance_type=DISTANCE_TYPE,
                freq_weight=FREQ_WEIGHT
            )
            
            # Add prediction to results (saving ONLY masked data predictions)
            all_predictions.append({
                "uid": uid,
                "d": d,
                "t": t,
                "x": pred[0],
                "y": pred[1]
            })
            
            # Update current location for next prediction
            current_loc = pred
        
        # Report progress periodically
        if user_count % 100 == 0 or user_count == total_users:
            print(f"Processed {user_count}/{total_users} users, {len(all_predictions)} predictions so far")
        
        # Clear memory for this user
        del test_df, user_index, user_freq_index
    
    # Convert all predictions to dataframe
    if all_predictions:
        pred_df = pd.DataFrame(all_predictions)
        print(f"Total predictions: {len(pred_df)}")
        
        # Write uid, d, t, x, y columns to a per-city CSV in the working directory.
        output_file = f"city_{city}_ttknn_predictions.csv"
        pred_df.to_csv(output_file, index=False)
        print(f"Predictions saved to {output_file}")
    else:
        print("No predictions made!")

# All metric calculation code is commented out