ALGORITHM: Flat TT-KNN for Location Prediction

INPUT: Historical trajectories, test sequences with masked locations
OUTPUT: Predicted locations for masked entries

PHASE 1: Training
FOR each user:
    1. Filter locations by frequency (≥ τ visits)
    2. Convert (day, time) → flat time segments
    3. Build transition table: segment → location → [next_locations]
    4. Store only transitions within M future segments

PHASE 2: Prediction
FOR each masked location:
    1. Get current location and time segment
    2. Look up possible next locations in future segments (1 to M)
    3. Find K nearest neighbors by Euclidean distance
    4. Return closest different location, or current if none found

PHASE 3: Evaluation
1. Sample fraction of unmasked test data
2. Mask their locations and predict
3. Calculate GEO-BLEU score against ground truth

In [1]:
# Install necessary package
%pip install -q git+https://github.com/yahoojapan/geobleu.git tqdm

import pandas as pd
import numpy as np
import os
from collections import defaultdict, Counter
from tqdm.auto import tqdm
import multiprocessing as mp
from geobleu import calc_geobleu_single
import warnings
warnings.filterwarnings("ignore")


  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for geobleu (setup.py) ... [?25l[?25hdone
Note: you may need to restart the kernel to use updated packages.


In [2]:
# Flat TT-KNN configuration
# TTKNN_VALUES = {
#     "TAU": 5,
#     "DELTA": 30,   # 30 minutes
#     "M": 2,        # future segments
#     "K": 2,        # nearest neighbors
#     "SAMPLE_FRAC": 0.1  # fraction of unmasked test data to simulate prediction
# }

# Dataset setup
DATA_DIR = "/kaggle/input/humob-data/15313913"
CITIES = ["B", "C", "D"]  # Change to ["A", "B", "C", "D"] for all
COLUMNS = ["uid", "d", "t", "x", "y"]
DTYPES = {"uid": "int32", "d": "int8", "t": "int8", "x": "int16", "y": "int16"}
TRAIN_DAY_MAX = 60
TEST_DAY_MIN = 61
TEST_DAY_MAX = 75  # Added upper bound for test period
MASK_VALUE = 999
CHUNK_SIZE = 500_000


In [3]:
def euclidean_distance(loc1, loc2):
    return np.sqrt((loc1[0] - loc2[0])**2 + (loc1[1] - loc2[1])**2)

def manhattan_distance(loc1, loc2):
    return abs(loc1[0] - loc2[0]) + abs(loc1[1] - loc2[1])

def chebyshev_distance(loc1, loc2):
    return max(abs(loc1[0] - loc2[0]), abs(loc1[1] - loc2[1]))

def calculate_distance(loc1, loc2, distance_type='euclidean'):
    if distance_type == 'euclidean':
        return euclidean_distance(loc1, loc2)
    elif distance_type == 'manhattan':
        return manhattan_distance(loc1, loc2)
    elif distance_type == 'chebyshev':
        return chebyshev_distance(loc1, loc2)
    else:
        raise ValueError(f"Unknown distance type: {distance_type}")

def to_flat_segment(d, t, delta=30):
    segments_per_day = (24 * 60) // delta
    return d * segments_per_day + (t * 60) // delta


In [4]:
def build_flat_TT_index(trajectory, tau=5, delta=30):
    location_counts = Counter((x, y) for _, _, x, y in trajectory)
    traj_filtered = [(d, t, x, y) for (d, t, x, y) in trajectory if location_counts[(x, y)] >= tau]
    
    seg_traj = [(to_flat_segment(d, t, delta), (x, y)) for d, t, x, y in traj_filtered]
    seg_traj.sort()
    
    TT_index = defaultdict(lambda: defaultdict(list))
    # Also track frequency of transitions
    TT_freq = defaultdict(lambda: defaultdict(Counter))
    
    for i in range(len(seg_traj) - 1):
        seg1, loc1 = seg_traj[i]
        seg2, loc2 = seg_traj[i + 1]
        if 0 < seg2 - seg1 <= 3:  # allow up to 3-segment jumps (1.5 hours for Δ=30min)
            TT_index[seg1][loc1].append(loc2)
            TT_freq[seg1][loc1][loc2] += 1

    return TT_index, TT_freq


In [5]:
# Improved prediction function with better frequency weighting
def predict_next_location_flat(TT_index, TT_freq, d, t, current_loc, M=2, K=2, delta=30, distance_type='euclidean', freq_weight=0.3):
    curr_seg = to_flat_segment(d, t, delta)
    candidates = []
    candidate_freqs = []

    # Look ahead in future segments
    for i in range(1, M + 1):
        future_seg = curr_seg + i
        if future_seg in TT_index and current_loc in TT_index[future_seg]:
            locs = TT_index[future_seg][current_loc]
            freqs = [TT_freq[future_seg][current_loc][loc] for loc in locs]
            candidates.extend(locs)
            candidate_freqs.extend(freqs)

    if not candidates:
        return current_loc

    # Create unique candidates with aggregated frequencies
    unique_candidates = {}
    for loc, freq in zip(candidates, candidate_freqs):
        if loc in unique_candidates:
            unique_candidates[loc] += freq
        else:
            unique_candidates[loc] = freq

    # Calculate weighted scores
    scored_candidates = []
    max_freq = max(unique_candidates.values()) if unique_candidates else 1
    min_distance = float('inf')
    max_distance = 0
    
    # First pass: calculate distance range for normalization
    distances = {}
    for loc in unique_candidates:
        dist = calculate_distance(current_loc, loc, distance_type)
        distances[loc] = dist
        min_distance = min(min_distance, dist)
        max_distance = max(max_distance, dist)
    
    # Avoid division by zero
    distance_range = max_distance - min_distance if max_distance > min_distance else 1
    
    for loc, freq in unique_candidates.items():
        distance = distances[loc]
        
        # Normalize both distance and frequency to [0, 1]
        norm_distance = (distance - min_distance) / distance_range if distance_range > 0 else 0
        norm_freq = freq / max_freq
        
        # Combined score: balance between distance (lower is better) and frequency (higher is better)
        # Use exponential weighting for frequency to give more preference to frequent locations
        freq_bonus = norm_freq ** freq_weight
        weighted_score = norm_distance / freq_bonus  # Lower score is better
        
        scored_candidates.append((weighted_score, loc))

    # Sort by weighted score and return best different location
    scored_candidates.sort()
    
    for _, loc in scored_candidates[:K]:
        if loc != current_loc:
            return loc
    return current_loc

In [6]:
class FlatTTKNNModel:
    def __init__(self, tau=5, delta=30, M=2, K=2, distance_type='euclidean', freq_weight=0.3):
        self.tau = tau
        self.delta = delta
        self.M = M
        self.K = K
        self.distance_type = distance_type
        self.freq_weight = freq_weight
        self.index = {}
        self.freq_index = {}

    def fit(self, user_trajectories):
        for uid, traj in tqdm(user_trajectories.items(), desc="Building TT indices"):
            formatted = [(d, t, x, y) for (x, y), (d, t) in traj]
            self.index[uid], self.freq_index[uid] = build_flat_TT_index(formatted, self.tau, self.delta)

    def predict(self, uid, d, t, current_loc):
        if uid not in self.index:
            return current_loc
        return predict_next_location_flat(
            self.index[uid], 
            self.freq_index[uid], 
            d, t, current_loc, 
            self.M, self.K, self.delta, 
            self.distance_type, 
            self.freq_weight
        )

In [7]:
def load_training_data(city, validation_indices=None):
    """Load training data from days 1-60 and unmasked data from days 61-75 (excluding validation samples)"""
    path = f"{DATA_DIR}/city_{city}_challengedata.csv"
    user_trajs = defaultdict(list)
    
    # Track validation indices to exclude from training
    validation_set = set()
    if validation_indices is not None:
        validation_set = set(validation_indices)
    
    chunk_counter = 0
    for chunk in pd.read_csv(path, usecols=COLUMNS, dtype=DTYPES, chunksize=CHUNK_SIZE):
        # Training data from days 1-60
        train_early = chunk[chunk["d"] <= TRAIN_DAY_MAX]
        
        # Additional training data from days 61-75 (unmasked, not in validation)
        test_period = chunk[(chunk["d"] >= TEST_DAY_MIN) & (chunk["d"] <= TEST_DAY_MAX) & (chunk["x"] != MASK_VALUE)]
        
        # Filter out validation samples from test period data
        if not test_period.empty and validation_indices is not None:
            # Calculate global indices for this chunk
            chunk_start_idx = chunk_counter * CHUNK_SIZE
            chunk_indices = set(range(chunk_start_idx, chunk_start_idx + len(chunk)))
            test_period_global_indices = set(test_period.index + chunk_start_idx)
            
            # Keep only test period data that's not in validation
            valid_test_indices = test_period_global_indices - validation_set
            if valid_test_indices:
                # Convert back to local chunk indices
                local_valid_indices = [idx - chunk_start_idx for idx in valid_test_indices if idx - chunk_start_idx < len(chunk)]
                if local_valid_indices:
                    train_additional = chunk.iloc[local_valid_indices]
                else:
                    train_additional = pd.DataFrame()
            else:
                train_additional = pd.DataFrame()
        else:
            train_additional = test_period
        
        # Combine training data
        combined_train = pd.concat([train_early, train_additional], ignore_index=True)
        
        # Process trajectories
        for uid, group in combined_train.groupby("uid"):
            locs = list(zip(group["x"], group["y"]))
            times = list(zip(group["d"], group["t"]))
            user_trajs[uid].extend(zip(locs, times))
        
        chunk_counter += 1
    
    print(f"Training data loaded: {sum(len(traj) for traj in user_trajs.values())} total points")
    return dict(user_trajs)

def load_validation_data(city, sample_frac=0.1, seed=42):
    """Load validation data: sample from days 61-75 unmasked data"""
    path = f"{DATA_DIR}/city_{city}_challengedata.csv"
    test_parts = []
    all_indices = []
    
    chunk_counter = 0
    for chunk in pd.read_csv(path, usecols=COLUMNS, dtype=DTYPES, chunksize=CHUNK_SIZE):
        mask = (chunk["d"] >= TEST_DAY_MIN) & (chunk["d"] <= TEST_DAY_MAX) & (chunk["x"] != MASK_VALUE)
        valid_chunk = chunk[mask].copy()
        
        if not valid_chunk.empty:
            # Store global indices
            global_indices = valid_chunk.index + chunk_counter * CHUNK_SIZE
            valid_chunk['global_idx'] = global_indices
            test_parts.append(valid_chunk)
            all_indices.extend(global_indices)
        
        chunk_counter += 1
    
    if not test_parts:
        print("No unmasked test data found!")
        return pd.DataFrame(), pd.DataFrame(), []
    
    test_df = pd.concat(test_parts, ignore_index=True)
    print(f"Unmasked test data (days {TEST_DAY_MIN}-{TEST_DAY_MAX}): {len(test_df)} rows")

    # Sample for validation
    np.random.seed(seed)
    sampled_indices = np.random.choice(len(test_df), size=int(len(test_df) * sample_frac), replace=False)
    
    validation_df = test_df.iloc[sampled_indices].copy()
    validation_global_indices = validation_df['global_idx'].tolist()
    
    # Create ground truth for validation
    validation_gt = validation_df[["uid", "d", "t", "x", "y"]].copy()
    validation_gt = validation_gt.rename(columns={"x": "x_orig", "y": "y_orig"})
    
    # Mask the validation samples
    validation_df["x"] = MASK_VALUE
    validation_df["y"] = MASK_VALUE
    
    # Create remaining training data (unmasked test data not used for validation)
    remaining_indices = set(range(len(test_df))) - set(sampled_indices)
    remaining_df = test_df.iloc[list(remaining_indices)].copy()
    
    # Combine remaining data as additional test data for prediction
    full_test_df = pd.concat([remaining_df, validation_df], ignore_index=True)
    full_test_df = full_test_df.sort_values(["uid", "d", "t"]).reset_index(drop=True)
    
    print(f"Validation samples: {len(validation_df)} rows")
    print(f"Additional training from test period: {len(remaining_df)} rows")
    
    return full_test_df, validation_gt[["uid", "d", "t", "x_orig", "y_orig"]], validation_global_indices

In [8]:
def run_prediction(model, test_df, user_trajs):
    pred_df = test_df[["uid", "d", "t"]].copy()
    pred_df["x_pred"] = 0
    pred_df["y_pred"] = 0

    for uid, group in tqdm(test_df.groupby("uid"), desc="Predicting"):
        if uid in user_trajs and user_trajs[uid]:
            last_known = user_trajs[uid][-1][0]
        else:
            user_known = group[group["x"] != MASK_VALUE]
            if not user_known.empty:
                last_known = tuple(user_known[["x", "y"]].iloc[0])
            else:
                last_known = (0, 0)

        current_loc = last_known
        preds = []

        for _, row in group.iterrows():
            if row["x"] == MASK_VALUE:
                pred = model.predict(uid, row["d"], row["t"], current_loc)
            else:
                pred = (row["x"], row["y"])
            preds.append(pred)
            current_loc = pred

        idxs = group.index
        pred_df.loc[idxs, "x_pred"] = [p[0] for p in preds]
        pred_df.loc[idxs, "y_pred"] = [p[1] for p in preds]

    return pred_df


In [9]:
# def _geo_bleu_group(group):
#     """Calculate GEO-BLEU for a single user's trajectory"""
#     if len(group) <= 1:
#         if len(group) == 1:
#             row = group.iloc[0]
#             return 1.0 if (row['x_pred'] == row['x_gt'] and row['y_pred'] == row['y_gt']) else 0.0
#         return 0.0
    
#     # Sort by day and time to ensure correct sequence order
#     group_sorted = group.sort_values(['d', 't']).reset_index(drop=True)
    
#     # Format sequences as required by geobleu: (day, time, x, y)
#     pred_seq = [(row['d'], row['t'], row['x_pred'], row['y_pred']) for _, row in group_sorted.iterrows()]
#     gt_seq = [(row['d'], row['t'], row['x_gt'], row['y_gt']) for _, row in group_sorted.iterrows()]
    
#     try:
#         return calc_geobleu_single(pred_seq, gt_seq)
#     except Exception as e:
#         print(f"Warning: GEO-BLEU calculation failed: {e}")
#         return 0.0

def _geo_bleu_group(group):
    """Calculate GEO-BLEU for a single user's trajectory"""
    if len(group) <= 1:
        if len(group) == 1:
            row = group.iloc[0]
            return 1.0 if (row['x_pred'] == row['x_gt'] and row['y_pred'] == row['y_gt']) else 0.0
        return 0.0
    
    # Sort by day and time to ensure correct sequence order
    group_sorted = group.sort_values(['d', 't']).reset_index(drop=True)
    
    # Format sequences as required by geobleu: (day, time, x, y)
    pred_seq = [(row['d'], row['t'], row['x_pred'], row['y_pred']) for _, row in group_sorted.iterrows()]
    gt_seq = [(row['d'], row['t'], row['x_gt'], row['y_gt']) for _, row in group_sorted.iterrows()]
    
    try:
        return calc_geobleu_single(pred_seq, gt_seq)
    except Exception as e:
        print(f"Warning: GEO-BLEU calculation failed: {e}")
        return 0.0


def evaluate_geobleu(pred_df, sampled_gt_df):
    """Evaluate GEO-BLEU score with improved error handling"""
    gt = sampled_gt_df.rename(columns={"x_orig":"x_gt","y_orig":"y_gt"})
    merged = pd.merge(pred_df, gt, on=["uid","d","t"], how="inner")
    
    if merged.empty:
        print("⚠️ No matching rows in evaluation.")
        return 0.0

    print(f"Evaluation data: {len(merged)} rows, {merged['uid'].nunique()} users")
    
    # Group by user and calculate GEO-BLEU for each user
    user_groups = [group for _, group in merged.groupby("uid") if len(group) > 0]
    
    if not user_groups:
        print("⚠️ No valid user groups for evaluation.")
        return 0.0
    
    # Use multiprocessing for faster computation
    try:
        with mp.Pool(min(mp.cpu_count() - 1, len(user_groups))) as pool:
            scores = pool.map(_geo_bleu_group, user_groups)
    except Exception as e:
        print(f"Multiprocessing failed, using sequential processing: {e}")
        scores = [_geo_bleu_group(group) for group in tqdm(user_groups, desc="Computing GEO-BLEU")]
    
    # Filter out invalid scores
    valid_scores = [s for s in scores if not np.isnan(s) and s >= 0]
    
    if not valid_scores:
        print("⚠️ No valid GEO-BLEU scores computed.")
        return 0.0
    
    final_score = np.mean(valid_scores)
    print(f"Valid scores: {len(valid_scores)}/{len(scores)}")
    
    return round(float(final_score), 5)

In [10]:
# Updated configuration with improved parameters
TTKNN_VALUES = {
    "TAU": 0,      # Reduced threshold for more locations
    "DELTA": 30,   # 30 minutes
    "M": 3,        # Look ahead 3 segments (1.5 hours)
    "K": 2,        # Consider more candidates
    "SAMPLE_FRAC": 0.1,  # Slightly larger validation set
}

# Test different configurations with improved ranges
DISTANCE_TYPES = ['euclidean', 'manhattan', 'chebyshev']
FREQ_WEIGHTS = [0.1, 0.5, 1]  # Better range for frequency weighting

for city in CITIES:
    print(f"\n{'='*50}")
    print(f"CITY {city} EVALUATION")
    print(f"{'='*50}")
    
    # Load validation data first to get indices
    test_df, validation_gt, validation_indices = load_validation_data(city, sample_frac=TTKNN_VALUES["SAMPLE_FRAC"])
    
    # Load training data (excluding validation samples)
    train_data = load_training_data(city, validation_indices)
    
    best_score = 0
    best_config = {}
    
    for distance_type in DISTANCE_TYPES:
        for freq_weight in FREQ_WEIGHTS:
            print(f"\nTesting: {distance_type} distance, freq_weight={freq_weight}")
            
            model = FlatTTKNNModel(
                tau=TTKNN_VALUES["TAU"],
                delta=TTKNN_VALUES["DELTA"],
                M=TTKNN_VALUES["M"],
                K=TTKNN_VALUES["K"],
                distance_type=distance_type,
                freq_weight=freq_weight
            )
            
            model.fit(train_data)
            pred_df = run_prediction(model, test_df, train_data)
            score = evaluate_geobleu(pred_df, validation_gt)
            
            print(f"GEO-BLEU = {score:.5f}")
            
            if score > best_score:
                best_score = score
                best_config = {
                    'distance_type': distance_type,
                    'freq_weight': freq_weight,
                    'score': score
                }
    
    print(f"\n🏆 BEST CONFIGURATION FOR CITY {city}:")
    print(f"   Distance: {best_config.get('distance_type', 'N/A')}")
    print(f"   Frequency Weight: {best_config.get('freq_weight', 'N/A')}")
    print(f"   GEO-BLEU Score: {best_config.get('score', 0):.5f}")
    
    # Final evaluation with best configuration
    if best_config:
        print(f"\n📊 DETAILED RESULTS:")
        print(f"   Training data: Days 1-{TRAIN_DAY_MAX} + unmasked days {TEST_DAY_MIN}-{TEST_DAY_MAX} (excluding validation)")
        print(f"   Validation data: {len(validation_gt)} samples from days {TEST_DAY_MIN}-{TEST_DAY_MAX}")
        print(f"   Total users in training: {len(train_data)}")


CITY B EVALUATION
Unmasked test data (days 61-75): 3627062 rows
Validation samples: 362706 rows
Additional training from test period: 3264356 rows
Training data loaded: 14290250 total points

Testing: euclidean distance, freq_weight=0.1


Building TT indices:   0%|          | 0/30000 [00:00<?, ?it/s]

Predicting:   0%|          | 0/27000 [00:00<?, ?it/s]

Evaluation data: 362706 rows, 26994 users
Valid scores: 26994/26994
GEO-BLEU = 0.22445

Testing: euclidean distance, freq_weight=0.5


Building TT indices:   0%|          | 0/30000 [00:00<?, ?it/s]

Predicting:   0%|          | 0/27000 [00:00<?, ?it/s]

Evaluation data: 362706 rows, 26994 users
Valid scores: 26994/26994
GEO-BLEU = 0.22445

Testing: euclidean distance, freq_weight=1


Building TT indices:   0%|          | 0/30000 [00:00<?, ?it/s]

Predicting:   0%|          | 0/27000 [00:00<?, ?it/s]

Evaluation data: 362706 rows, 26994 users
Valid scores: 26994/26994
GEO-BLEU = 0.22445

Testing: manhattan distance, freq_weight=0.1


Building TT indices:   0%|          | 0/30000 [00:00<?, ?it/s]

Predicting:   0%|          | 0/27000 [00:00<?, ?it/s]

Evaluation data: 362706 rows, 26994 users
Valid scores: 26994/26994
GEO-BLEU = 0.22445

Testing: manhattan distance, freq_weight=0.5


Building TT indices:   0%|          | 0/30000 [00:00<?, ?it/s]

Predicting:   0%|          | 0/27000 [00:00<?, ?it/s]

Evaluation data: 362706 rows, 26994 users
Valid scores: 26994/26994
GEO-BLEU = 0.22445

Testing: manhattan distance, freq_weight=1


Building TT indices:   0%|          | 0/30000 [00:00<?, ?it/s]

Predicting:   0%|          | 0/27000 [00:00<?, ?it/s]

Evaluation data: 362706 rows, 26994 users
Valid scores: 26994/26994
GEO-BLEU = 0.22445

Testing: chebyshev distance, freq_weight=0.1


Building TT indices:   0%|          | 0/30000 [00:00<?, ?it/s]

Predicting:   0%|          | 0/27000 [00:00<?, ?it/s]

Evaluation data: 362706 rows, 26994 users
Valid scores: 26994/26994
GEO-BLEU = 0.22445

Testing: chebyshev distance, freq_weight=0.5


Building TT indices:   0%|          | 0/30000 [00:00<?, ?it/s]

Predicting:   0%|          | 0/27000 [00:00<?, ?it/s]

Evaluation data: 362706 rows, 26994 users
Valid scores: 26994/26994
GEO-BLEU = 0.22445

Testing: chebyshev distance, freq_weight=1


Building TT indices:   0%|          | 0/30000 [00:00<?, ?it/s]

Predicting:   0%|          | 0/27000 [00:00<?, ?it/s]

Evaluation data: 362706 rows, 26994 users
Valid scores: 26994/26994
GEO-BLEU = 0.22445

🏆 BEST CONFIGURATION FOR CITY B:
   Distance: euclidean
   Frequency Weight: 0.1
   GEO-BLEU Score: 0.22445

📊 DETAILED RESULTS:
   Training data: Days 1-60 + unmasked days 61-75 (excluding validation)
   Validation data: 362706 samples from days 61-75
   Total users in training: 30000

CITY C EVALUATION
Unmasked test data (days 61-75): 2953708 rows
Validation samples: 295370 rows
Additional training from test period: 2658338 rows
Training data loaded: 11333200 total points

Testing: euclidean distance, freq_weight=0.1


Building TT indices:   0%|          | 0/25000 [00:00<?, ?it/s]

Predicting:   0%|          | 0/22000 [00:00<?, ?it/s]

Evaluation data: 295370 rows, 21996 users
Valid scores: 21996/21996
GEO-BLEU = 0.23475

Testing: euclidean distance, freq_weight=0.5


Building TT indices:   0%|          | 0/25000 [00:00<?, ?it/s]

Predicting:   0%|          | 0/22000 [00:00<?, ?it/s]

Evaluation data: 295370 rows, 21996 users
Valid scores: 21996/21996
GEO-BLEU = 0.23475

Testing: euclidean distance, freq_weight=1


Building TT indices:   0%|          | 0/25000 [00:00<?, ?it/s]

Predicting:   0%|          | 0/22000 [00:00<?, ?it/s]

Evaluation data: 295370 rows, 21996 users
Valid scores: 21996/21996
GEO-BLEU = 0.23475

Testing: manhattan distance, freq_weight=0.1


Building TT indices:   0%|          | 0/25000 [00:00<?, ?it/s]

Predicting:   0%|          | 0/22000 [00:00<?, ?it/s]

Evaluation data: 295370 rows, 21996 users
Valid scores: 21996/21996
GEO-BLEU = 0.23475

Testing: manhattan distance, freq_weight=0.5


Building TT indices:   0%|          | 0/25000 [00:00<?, ?it/s]

Predicting:   0%|          | 0/22000 [00:00<?, ?it/s]

Evaluation data: 295370 rows, 21996 users
Valid scores: 21996/21996
GEO-BLEU = 0.23475

Testing: manhattan distance, freq_weight=1


Building TT indices:   0%|          | 0/25000 [00:00<?, ?it/s]

Predicting:   0%|          | 0/22000 [00:00<?, ?it/s]

Evaluation data: 295370 rows, 21996 users
Valid scores: 21996/21996
GEO-BLEU = 0.23475

Testing: chebyshev distance, freq_weight=0.1


Building TT indices:   0%|          | 0/25000 [00:00<?, ?it/s]

Predicting:   0%|          | 0/22000 [00:00<?, ?it/s]

Evaluation data: 295370 rows, 21996 users
Valid scores: 21996/21996
GEO-BLEU = 0.23475

Testing: chebyshev distance, freq_weight=0.5


Building TT indices:   0%|          | 0/25000 [00:00<?, ?it/s]

Predicting:   0%|          | 0/22000 [00:00<?, ?it/s]

Evaluation data: 295370 rows, 21996 users
Valid scores: 21996/21996
GEO-BLEU = 0.23475

Testing: chebyshev distance, freq_weight=1


Building TT indices:   0%|          | 0/25000 [00:00<?, ?it/s]

Predicting:   0%|          | 0/22000 [00:00<?, ?it/s]

Evaluation data: 295370 rows, 21996 users
Valid scores: 21996/21996
GEO-BLEU = 0.23475

🏆 BEST CONFIGURATION FOR CITY C:
   Distance: euclidean
   Frequency Weight: 0.1
   GEO-BLEU Score: 0.23475

📊 DETAILED RESULTS:
   Training data: Days 1-60 + unmasked days 61-75 (excluding validation)
   Validation data: 295370 samples from days 61-75
   Total users in training: 25000

CITY D EVALUATION
Unmasked test data (days 61-75): 2361882 rows
Validation samples: 236188 rows
Additional training from test period: 2125694 rows
Training data loaded: 9462336 total points

Testing: euclidean distance, freq_weight=0.1


Building TT indices:   0%|          | 0/20000 [00:00<?, ?it/s]

Predicting:   0%|          | 0/17000 [00:00<?, ?it/s]

Evaluation data: 236188 rows, 16997 users
Valid scores: 16997/16997
GEO-BLEU = 0.23754

Testing: euclidean distance, freq_weight=0.5


Building TT indices:   0%|          | 0/20000 [00:00<?, ?it/s]

Predicting:   0%|          | 0/17000 [00:00<?, ?it/s]

Evaluation data: 236188 rows, 16997 users
Valid scores: 16997/16997
GEO-BLEU = 0.23754

Testing: euclidean distance, freq_weight=1


Building TT indices:   0%|          | 0/20000 [00:00<?, ?it/s]

Predicting:   0%|          | 0/17000 [00:00<?, ?it/s]

Evaluation data: 236188 rows, 16997 users
Valid scores: 16997/16997
GEO-BLEU = 0.23754

Testing: manhattan distance, freq_weight=0.1


Building TT indices:   0%|          | 0/20000 [00:00<?, ?it/s]

Predicting:   0%|          | 0/17000 [00:00<?, ?it/s]

Evaluation data: 236188 rows, 16997 users
Valid scores: 16997/16997
GEO-BLEU = 0.23754

Testing: manhattan distance, freq_weight=0.5


Building TT indices:   0%|          | 0/20000 [00:00<?, ?it/s]

Predicting:   0%|          | 0/17000 [00:00<?, ?it/s]

Evaluation data: 236188 rows, 16997 users
Valid scores: 16997/16997
GEO-BLEU = 0.23754

Testing: manhattan distance, freq_weight=1


Building TT indices:   0%|          | 0/20000 [00:00<?, ?it/s]

Predicting:   0%|          | 0/17000 [00:00<?, ?it/s]

Evaluation data: 236188 rows, 16997 users
Valid scores: 16997/16997
GEO-BLEU = 0.23754

Testing: chebyshev distance, freq_weight=0.1


Building TT indices:   0%|          | 0/20000 [00:00<?, ?it/s]

Predicting:   0%|          | 0/17000 [00:00<?, ?it/s]

Evaluation data: 236188 rows, 16997 users
Valid scores: 16997/16997
GEO-BLEU = 0.23754

Testing: chebyshev distance, freq_weight=0.5


Building TT indices:   0%|          | 0/20000 [00:00<?, ?it/s]

Predicting:   0%|          | 0/17000 [00:00<?, ?it/s]

Evaluation data: 236188 rows, 16997 users
Valid scores: 16997/16997
GEO-BLEU = 0.23754

Testing: chebyshev distance, freq_weight=1


Building TT indices:   0%|          | 0/20000 [00:00<?, ?it/s]

Predicting:   0%|          | 0/17000 [00:00<?, ?it/s]

Evaluation data: 236188 rows, 16997 users
Valid scores: 16997/16997
GEO-BLEU = 0.23754

🏆 BEST CONFIGURATION FOR CITY D:
   Distance: euclidean
   Frequency Weight: 0.1
   GEO-BLEU Score: 0.23754

📊 DETAILED RESULTS:
   Training data: Days 1-60 + unmasked days 61-75 (excluding validation)
   Validation data: 236188 samples from days 61-75
   Total users in training: 20000


In [11]:
# Additional analysis function
def analyze_data_distribution(city):
    """Analyze the distribution of data across different periods"""
    path = f"{DATA_DIR}/city_{city}_challengedata.csv"
    
    period_stats = {
        'training_period': {'days': f'1-{TRAIN_DAY_MAX}', 'unmasked': 0, 'masked': 0},
        'test_period': {'days': f'{TEST_DAY_MIN}-{TEST_DAY_MAX}', 'unmasked': 0, 'masked': 0}
    }
    
    for chunk in pd.read_csv(path, usecols=COLUMNS, dtype=DTYPES, chunksize=CHUNK_SIZE):
        # Training period
        train_data = chunk[chunk["d"] <= TRAIN_DAY_MAX]
        period_stats['training_period']['unmasked'] += len(train_data[train_data["x"] != MASK_VALUE])
        period_stats['training_period']['masked'] += len(train_data[train_data["x"] == MASK_VALUE])
        
        # Test period  
        test_data = chunk[(chunk["d"] >= TEST_DAY_MIN) & (chunk["d"] <= TEST_DAY_MAX)]
        period_stats['test_period']['unmasked'] += len(test_data[test_data["x"] != MASK_VALUE])
        period_stats['test_period']['masked'] += len(test_data[test_data["x"] == MASK_VALUE])
    
    print(f"\n📈 DATA DISTRIBUTION FOR CITY {city}:")
    for period, stats in period_stats.items():
        total = stats['unmasked'] + stats['masked']
        unmasked_pct = (stats['unmasked'] / total * 100) if total > 0 else 0
        print(f"   {period.replace('_', ' ').title()} (days {stats['days']}):")
        print(f"     Unmasked: {stats['unmasked']:,} ({unmasked_pct:.1f}%)")
        print(f"     Masked: {stats['masked']:,}")
        print(f"     Total: {total:,}")

# Run analysis before main evaluation
for city in CITIES:
    analyze_data_distribution(city)


📈 DATA DISTRIBUTION FOR CITY B:
   Training Period (days 1-60):
     Unmasked: 14,194,433 (100.0%)
     Masked: 0
     Total: 14,194,433
   Test Period (days 61-75):
     Unmasked: 3,627,062 (90.6%)
     Masked: 375,498
     Total: 4,002,560

📈 DATA DISTRIBUTION FOR CITY C:
   Training Period (days 1-60):
     Unmasked: 11,226,812 (100.0%)
     Masked: 0
     Total: 11,226,812
   Test Period (days 61-75):
     Unmasked: 2,953,708 (90.9%)
     Masked: 294,627
     Total: 3,248,335

📈 DATA DISTRIBUTION FOR CITY D:
   Training Period (days 1-60):
     Unmasked: 9,358,783 (100.0%)
     Masked: 0
     Total: 9,358,783
   Test Period (days 61-75):
     Unmasked: 2,361,882 (88.4%)
     Masked: 309,413
     Total: 2,671,295


In [12]:
# for city in CITIES:
#     print(f"City {city}")
#     train_data = load_training_data(city)
#     test_df, gt_sample = load_test_sample(city, sample_frac=TTKNN_VALUES["SAMPLE_FRAC"])

#     model = FlatTTKNNModel(
#         tau=TTKNN_VALUES["TAU"],
#         delta=TTKNN_VALUES["DELTA"],
#         M=TTKNN_VALUES["M"],
#         K=TTKNN_VALUES["K"]
#     )
#     model.fit(train_data)

#     pred_df = run_prediction(model, test_df, train_data)
#     score = evaluate_geobleu(pred_df, gt_sample)

#     print(f"✅ GEO-BLEU (sampled test) = {score:.5f}")


In [13]:
# for city in CITIES:
#     print(f"City {city}")
#     train_data = load_training_data(city)
#     test_df, gt_sample = load_test_sample(city, sample_frac=TTKNN_VALUES["SAMPLE_FRAC"])

#     model = FlatTTKNNModel(
#         tau=TTKNN_VALUES["TAU"],
#         delta=TTKNN_VALUES["DELTA"],
#         M=TTKNN_VALUES["M"],
#         K=TTKNN_VALUES["K"]
#     )
#     model.fit(train_data)

#     pred_df = run_prediction(model, test_df, train_data)
#     score = evaluate_geobleu(pred_df, gt_sample)

#     print(f"✅ GEO-BLEU (sampled test) = {score:.5f}")
