In [1]:
%pip install -q git+https://github.com/yahoojapan/geobleu.git tqdm


  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for geobleu (setup.py) ... [?25l[?25hdone
Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np
import multiprocessing as mp
from geobleu import calc_geobleu_single
from collections import Counter, defaultdict
from tqdm.auto import tqdm
import os
import json
import pickle
from datetime import datetime

# Ensure floats print with 5 decimals
pd.set_option("display.float_format", "{:.5f}".format)

# Constants
DATA_DIR = "/kaggle/input/humob-data/15313913"
# CITIES = ["A", "B", "C", "D"]
CITIES = ["B", "C", "D"]
COLUMNS = ["uid", "d", "t", "x", "y"]
DTYPES = {
    "uid": "int32",
    "d": "int8",
    "t": "int8",
    "x": "int16",
    "y": "int16",
}
TRAIN_DAY_MAX = 60
TEST_DAY_MIN = 61
MASK_VALUE = 999
CHUNK_SIZE = 500_000  # adjust as needed for memory/time

# Checkpoint directory
CHECKPOINT_DIR = "/kaggle/working/checkpoints"
os.makedirs(CHECKPOINT_DIR, exist_ok=True)

# Set random seed for reproducible sampling
np.random.seed(42)


In [3]:
def save_checkpoint(data, filename, checkpoint_type="results"):
    """
    Save checkpoint data to disk.
    
    Args:
        data: Data to save (dict, DataFrame, etc.)
        filename: Name of the checkpoint file
        checkpoint_type: Type of checkpoint ('results', 'aggregates', 'coverage')
    """
    checkpoint_path = os.path.join(CHECKPOINT_DIR, f"{checkpoint_type}_{filename}")
    
    try:
        if isinstance(data, pd.DataFrame):
            data.to_pickle(f"{checkpoint_path}.pkl")
        else:
            with open(f"{checkpoint_path}.json", 'w') as f:
                json.dump(data, f, indent=2, default=str)
        
        # Also save metadata
        metadata = {
            'timestamp': datetime.now().isoformat(),
            'type': checkpoint_type,
            'filename': filename,
            'data_type': type(data).__name__
        }
        with open(f"{checkpoint_path}_meta.json", 'w') as f:
            json.dump(metadata, f, indent=2)
            
        print(f"✓ Checkpoint saved: {checkpoint_path}")
        
    except Exception as e:
        print(f"✗ Failed to save checkpoint {checkpoint_path}: {e}")

def load_checkpoint(filename, checkpoint_type="results"):
    """
    Load checkpoint data from disk.
    
    Args:
        filename: Name of the checkpoint file
        checkpoint_type: Type of checkpoint ('results', 'aggregates', 'coverage')
    
    Returns:
        Loaded data or None if not found
    """
    checkpoint_path = os.path.join(CHECKPOINT_DIR, f"{checkpoint_type}_{filename}")
    
    # Try loading pickle first (for DataFrames)
    if os.path.exists(f"{checkpoint_path}.pkl"):
        try:
            data = pd.read_pickle(f"{checkpoint_path}.pkl")
            print(f"✓ Checkpoint loaded: {checkpoint_path}.pkl")
            return data
        except Exception as e:
            print(f"✗ Failed to load pickle checkpoint: {e}")
    
    # Try loading JSON
    if os.path.exists(f"{checkpoint_path}.json"):
        try:
            with open(f"{checkpoint_path}.json", 'r') as f:
                data = json.load(f)
            print(f"✓ Checkpoint loaded: {checkpoint_path}.json")
            return data
        except Exception as e:
            print(f"✗ Failed to load JSON checkpoint: {e}")
    
    return None

def list_checkpoints():
    """List all available checkpoints."""
    if not os.path.exists(CHECKPOINT_DIR):
        print("No checkpoint directory found.")
        return
    
    checkpoints = {}
    for file in os.listdir(CHECKPOINT_DIR):
        if file.endswith('_meta.json'):
            try:
                with open(os.path.join(CHECKPOINT_DIR, file), 'r') as f:
                    metadata = json.load(f)
                checkpoints[file.replace('_meta.json', '')] = metadata
            except:
                continue
    
    if checkpoints:
        print("Available checkpoints:")
        for name, meta in sorted(checkpoints.items()):
            print(f"  {name}: {meta['type']} ({meta['timestamp']})")
    else:
        print("No checkpoints found.")

def save_train_aggregates(city_code, aggregates):
    """Save training aggregates for a city."""
    data = {
        'global_mean': aggregates[0],
        'global_mode': aggregates[1],
        'per_user_mean_df': aggregates[2],
        'per_user_mode_df': aggregates[3],
        'per_user_unigram_dict': aggregates[4],
        'per_user_bigram_dict': aggregates[5]
    }
    
    # Save DataFrames separately
    save_checkpoint(data['per_user_mean_df'], f"city_{city_code}_mean_df", "aggregates")
    save_checkpoint(data['per_user_mode_df'], f"city_{city_code}_mode_df", "aggregates")
    
    # Save other data as JSON (convert complex objects to serializable format)
    json_data = {
        'global_mean': data['global_mean'],
        'global_mode': data['global_mode'],
        'per_user_unigram_dict': {str(k): dict(v) for k, v in data['per_user_unigram_dict'].items()},
        'per_user_bigram_dict': {str(k): {str(bigram): count for bigram, count in v.items()} 
                                for k, v in data['per_user_bigram_dict'].items()}
    }
    
    save_checkpoint(json_data, f"city_{city_code}_aggregates", "aggregates")

def load_train_aggregates(city_code):
    """Load training aggregates for a city."""
    # Load JSON data
    json_data = load_checkpoint(f"city_{city_code}_aggregates", "aggregates")
    if json_data is None:
        return None
    
    # Load DataFrames
    per_user_mean_df = load_checkpoint(f"city_{city_code}_mean_df", "aggregates")
    per_user_mode_df = load_checkpoint(f"city_{city_code}_mode_df", "aggregates")
    
    if per_user_mean_df is None or per_user_mode_df is None:
        return None
    
    # Reconstruct complex objects
    per_user_unigram_dict = {
        int(k): Counter({eval(coord): count for coord, count in v.items()}) 
        for k, v in json_data['per_user_unigram_dict'].items()
    }
    
    per_user_bigram_dict = {
        int(k): Counter({eval(bigram): count for bigram, count in v.items()}) 
        for k, v in json_data['per_user_bigram_dict'].items()
    }
    
    return (
        tuple(json_data['global_mean']),
        tuple(json_data['global_mode']),
        per_user_mean_df,
        per_user_mode_df,
        per_user_unigram_dict,
        per_user_bigram_dict
    )


In [4]:
def _geobleu_for_group(group):
    """
    Given a DataFrame for one uid (with columns 'd','t','x_pred','y_pred','x_gt','y_gt'),
    compute and return its GEO-BLEU score.
    """
    pred_seq = list(zip(group['d'], group['t'], group['x_pred'], group['y_pred']))
    true_seq = list(zip(group['d'], group['t'], group['x_gt'], group['y_gt']))
    return calc_geobleu_single(pred_seq, true_seq)

def evaluate_geobleu_parallel(pred_df: pd.DataFrame, gt_df: pd.DataFrame) -> float:
    """
    - pred_df:  DataFrame with columns ['uid','d','t','x_pred','y_pred']
    - gt_df:    DataFrame with columns ['uid','d','t','x_gt','y_gt']

    Merges on ['uid','d','t'], then uses multiprocessing + tqdm to compute
    GEO-BLEU per user in parallel. Returns the average GEO-BLEU over all users.
    """
    merged = pd.merge(pred_df, gt_df, on=['uid', 'd', 't'], how='inner')
    if merged.empty:
        return 0.0

    # Rename ground-truth x,y for readability
    merged = merged.rename(columns={'x': 'x_gt', 'y': 'y_gt'})

    # Split into list of DataFrames by uid
    grouped = [grp for _, grp in merged.groupby('uid')]
    num_users = len(grouped)
    if num_users == 0:
        return 0.0

    print(f"    ▶ Evaluating GEO-BLEU on {num_users} users...")

    # Use imap_unordered + tqdm for a progress bar
    with mp.Pool(processes=max(1, mp.cpu_count() - 1)) as pool:
        results = []
        for score in tqdm(pool.imap_unordered(_geobleu_for_group, grouped),
                          total=num_users,
                          desc="      GEO-BLEU"):
            results.append(score)
    return float(np.mean(results)) if results else 0.0


In [5]:
def compute_train_aggregates(city_code: str):
    """
    Reads city_{city_code}_challengedata.csv in chunks (days 1–60) and computes:
      - global mean (gm_x, gm_y)
      - global mode (gmod_x, gmod_y)
      - per_user_mean_df: DataFrame indexed by uid, columns ['x','y']
      - per_user_mode_df: DataFrame indexed by uid, columns ['x','y']
      - per_user_unigram_dict: Dictionary uid → Counter((x,y) → frequency)
      - per_user_bigram_dict: Dictionary uid → Counter(((x1,y1), (x2,y2)) → frequency)
    """
    print(f">>> Computing train aggregates for City {city_code} ...")

    # Accumulators for global mean
    total_x = 0
    total_y = 0
    total_count = 0

    # 200×200 array for global mode counts
    global_mode_counts = np.zeros((200, 200), dtype=np.int64)

    # Per-user accumulators
    per_user_sums = defaultdict(lambda: [0, 0, 0])   # uid → [sum_x, sum_y, count]
    per_user_modes = defaultdict(Counter)           # uid → Counter((x,y) → freq)
    per_user_unigrams = defaultdict(Counter)        # uid → Counter((x,y) → freq) for unigram model
    per_user_bigrams = defaultdict(Counter)         # uid → Counter(((x1,y1), (x2,y2)) → freq)

    path = os.path.join(DATA_DIR, f"city_{city_code}_challengedata.csv")

    # Read the file in chunks
    for chunk in tqdm(pd.read_csv(path, usecols=COLUMNS, dtype=DTYPES, chunksize=CHUNK_SIZE),
                      desc=f"Loading chunks (City {city_code})"):
        # Filter training portion (days 1–60)
        train_chunk = chunk[chunk["d"] <= TRAIN_DAY_MAX]
        if train_chunk.empty:
            continue

        xs = train_chunk["x"].to_numpy(dtype=np.int64)
        ys = train_chunk["y"].to_numpy(dtype=np.int64)

        # Update global mean accumulators
        total_x += xs.sum()
        total_y += ys.sum()
        total_count += len(train_chunk)

        # Update global mode counts (zero-based indexing)
        xi = xs - 1
        yi = ys - 1
        np.add.at(global_mode_counts, (xi, yi), 1)

        # Update per-user sums, modes, unigrams, and bigrams
        for uid, sub in train_chunk.groupby("uid"):
            arr_x = sub["x"].to_numpy(dtype=np.int64)
            arr_y = sub["y"].to_numpy(dtype=np.int64)
            per_user_sums[uid][0] += arr_x.sum()
            per_user_sums[uid][1] += arr_y.sum()
            per_user_sums[uid][2] += len(sub)

            coords = list(zip(sub["x"], sub["y"]))
            per_user_modes[uid].update(coords)
            per_user_unigrams[uid].update(coords)
            
            # Build bigrams within each user's trajectory (ordered by d, t)
            user_sub = sub.sort_values(['d', 't'])
            user_coords = list(zip(user_sub["x"], user_sub["y"]))
            if len(user_coords) > 1:
                bigrams = [(user_coords[i], user_coords[i+1]) for i in range(len(user_coords)-1)]
                per_user_bigrams[uid].update(bigrams)

        del train_chunk  # free memory

    # Compute global mean (rounded)
    gm_x = int(round(total_x / total_count))
    gm_y = int(round(total_y / total_count))

    # Compute global mode from the 200×200 matrix
    flat_idx = np.argmax(global_mode_counts)
    gmod_x = (flat_idx // 200) + 1
    gmod_y = (flat_idx % 200) + 1

    # Build per-user mean DataFrame
    user_mean_records = []
    for uid, (sx, sy, cnt) in per_user_sums.items():
        user_mean_records.append((uid, int(round(sx / cnt)), int(round(sy / cnt))))
    per_user_mean_df = (
        pd.DataFrame(user_mean_records, columns=["uid", "x", "y"])
          .set_index("uid")
          .astype("int16")
    )

    # Build per-user mode DataFrame
    user_mode_records = []
    for uid, counter in per_user_modes.items():
        (mx, my), _ = counter.most_common(1)[0]
        user_mode_records.append((uid, int(mx), int(my)))
    per_user_mode_df = (
        pd.DataFrame(user_mode_records, columns=["uid", "x", "y"])
          .set_index("uid")
          .astype("int16")
    )

    print(f"Train aggregates done: GM=({gm_x},{gm_y}), GMODE=({gmod_x},{gmod_y}), "
          f"{len(per_user_mean_df)} users' means, {len(per_user_mode_df)} users' modes, "
          f"{len(per_user_unigrams)} users' unigrams, {len(per_user_bigrams)} users' bigrams.")
    return (gm_x, gm_y), (gmod_x, gmod_y), per_user_mean_df, per_user_mode_df, dict(per_user_unigrams), dict(per_user_bigrams)


In [6]:
def build_test_dataframe(city_code: str) -> pd.DataFrame:
    """
    Reads city_{city_code}_challengedata.csv in chunks and collects only the rows
    where d ≥ 61 and x,y != 999. Returns a DataFrame [uid,d,t,x,y].
    """
    print(f">>> Building test DataFrame for City {city_code} ...")
    path = os.path.join(DATA_DIR, f"city_{city_code}_challengedata.csv")
    test_parts = []

    for chunk in tqdm(pd.read_csv(path, usecols=COLUMNS, dtype=DTYPES, chunksize=CHUNK_SIZE),
                      desc=f"Loading test chunks (City {city_code})"):
        mask = (chunk["d"] >= TEST_DAY_MIN) & (chunk["x"] != MASK_VALUE) & (chunk["y"] != MASK_VALUE)
        sub = chunk.loc[mask, ["uid", "d", "t", "x", "y"]]
        if not sub.empty:
            test_parts.append(sub.copy())
        del chunk

    if test_parts:
        test_df = pd.concat(test_parts, ignore_index=True)
    else:
        test_df = pd.DataFrame(columns=["uid", "d", "t", "x", "y"]).astype(DTYPES)

    print(f"Test DataFrame built: shape = {test_df.shape}")
    return test_df


In [7]:
def generate_unigram_predictions(test_df: pd.DataFrame, per_user_unigram_dict: dict, 
                                gm_x: int, gm_y: int) -> pd.DataFrame:
    """
    Generate predictions using unigram model for each user.
    For each test point, sample from the user's location probability distribution.
    Fallback to global mean for unseen users.
    """
    print(f"Generating Unigram predictions ...")
    
    pred_unigram = test_df[["uid", "d", "t"]].copy()
    pred_unigram["x_pred"] = 0
    pred_unigram["y_pred"] = 0
    
    # Group by user for efficient processing
    for uid, group in tqdm(test_df.groupby("uid"), desc="Unigram sampling"):
        if uid in per_user_unigram_dict:
            # Get user's location distribution
            location_counter = per_user_unigram_dict[uid]
            locations = list(location_counter.keys())
            frequencies = list(location_counter.values())
            
            # Convert frequencies to probabilities
            total_freq = sum(frequencies)
            probabilities = [f / total_freq for f in frequencies]
            
            # Sample locations for all test points of this user
            num_samples = len(group)
            sampled_indices = np.random.choice(len(locations), size=num_samples, p=probabilities)
            sampled_locations = [locations[i] for i in sampled_indices]
            
            # Update predictions for this user
            mask = pred_unigram["uid"] == uid
            pred_unigram.loc[mask, "x_pred"] = [loc[0] for loc in sampled_locations]
            pred_unigram.loc[mask, "y_pred"] = [loc[1] for loc in sampled_locations]
        else:
            # Fallback to global mean for unseen users
            mask = pred_unigram["uid"] == uid
            pred_unigram.loc[mask, "x_pred"] = gm_x
            pred_unigram.loc[mask, "y_pred"] = gm_y
    
    return pred_unigram.astype({"x_pred": "int16", "y_pred": "int16"})


In [8]:
def top_p_sampling(probabilities, top_p=0.7):
    """
    Apply top-p (nucleus) sampling to probability distribution.
    Returns indices and renormalized probabilities.
    """
    # Sort probabilities in descending order
    sorted_indices = np.argsort(probabilities)[::-1]
    sorted_probs = np.array(probabilities)[sorted_indices]
    
    # Calculate cumulative probabilities
    cumulative_probs = np.cumsum(sorted_probs)
    
    # Find cutoff point where cumulative probability exceeds top_p
    cutoff_idx = np.searchsorted(cumulative_probs, top_p) + 1
    cutoff_idx = min(cutoff_idx, len(sorted_probs))
    
    # Select top-p subset
    selected_indices = sorted_indices[:cutoff_idx]
    selected_probs = sorted_probs[:cutoff_idx]
    
    # Renormalize probabilities
    selected_probs = selected_probs / selected_probs.sum()
    
    return selected_indices, selected_probs

def generate_bigram_predictions(test_df: pd.DataFrame, per_user_bigram_dict: dict,
                               per_user_unigram_dict: dict, gm_x: int, gm_y: int,
                               top_p=None) -> pd.DataFrame:
    """
    Generate predictions using bigram model for each user.
    For each test point, use the previous location to predict the next location.
    Apply top-p sampling if specified.
    Fallback to unigram model if no bigram history, then to global mean.
    """
    model_name = f"Bigram Model (top_p={top_p})" if top_p else "Bigram Model"
    print(f"Generating {model_name} predictions ...")
    
    pred_bigram = test_df[["uid", "d", "t"]].copy()
    pred_bigram["x_pred"] = 0
    pred_bigram["y_pred"] = 0
    
    # Process each user separately to maintain sequence order
    for uid, group in tqdm(test_df.groupby("uid"), desc=f"{model_name} sampling"):
        # Sort test points by day and time to maintain sequence
        user_test = group.sort_values(['d', 't']).copy()
        
        if uid in per_user_bigram_dict and per_user_bigram_dict[uid]:
            bigram_counter = per_user_bigram_dict[uid]
            
            # Get the last location from training data as starting context
            # Use the most frequent location as initial context
            if uid in per_user_unigram_dict:
                unigram_counter = per_user_unigram_dict[uid]
                prev_location = unigram_counter.most_common(1)[0][0]
            else:
                prev_location = (gm_x, gm_y)
            
            predictions = []
            
            for idx, row in user_test.iterrows():
                # Find all bigrams that start with prev_location
                next_locations = {}
                for (loc1, loc2), freq in bigram_counter.items():
                    if loc1 == prev_location:
                        next_locations[loc2] = freq
                
                if next_locations:
                    # Sample from next locations
                    locations = list(next_locations.keys())
                    frequencies = list(next_locations.values())
                    total_freq = sum(frequencies)
                    probabilities = [f / total_freq for f in frequencies]
                    
                    if top_p is not None:
                        # Apply top-p sampling
                        selected_indices, selected_probs = top_p_sampling(probabilities, top_p)
                        selected_locations = [locations[i] for i in selected_indices]
                        sampled_location = np.random.choice(len(selected_locations), p=selected_probs)
                        next_location = selected_locations[sampled_location]
                    else:
                        # Regular sampling
                        sampled_idx = np.random.choice(len(locations), p=probabilities)
                        next_location = locations[sampled_idx]
                    
                    predictions.append(next_location)
                    prev_location = next_location
                else:
                    # Fallback to unigram model
                    if uid in per_user_unigram_dict:
                        unigram_counter = per_user_unigram_dict[uid]
                        locations = list(unigram_counter.keys())
                        frequencies = list(unigram_counter.values())
                        total_freq = sum(frequencies)
                        probabilities = [f / total_freq for f in frequencies]
                        sampled_idx = np.random.choice(len(locations), p=probabilities)
                        next_location = locations[sampled_idx]
                    else:
                        next_location = (gm_x, gm_y)
                    
                    predictions.append(next_location)
                    prev_location = next_location
            
            # Update predictions for this user
            user_indices = user_test.index
            pred_bigram.loc[user_indices, "x_pred"] = [pred[0] for pred in predictions]
            pred_bigram.loc[user_indices, "y_pred"] = [pred[1] for pred in predictions]
            
        else:
            # Fallback to unigram model for users without bigram data
            if uid in per_user_unigram_dict:
                location_counter = per_user_unigram_dict[uid]
                locations = list(location_counter.keys())
                frequencies = list(location_counter.values())
                total_freq = sum(frequencies)
                probabilities = [f / total_freq for f in frequencies]
                
                num_samples = len(user_test)
                sampled_indices = np.random.choice(len(locations), size=num_samples, p=probabilities)
                sampled_locations = [locations[i] for i in sampled_indices]
                
                user_indices = user_test.index
                pred_bigram.loc[user_indices, "x_pred"] = [loc[0] for loc in sampled_locations]
                pred_bigram.loc[user_indices, "y_pred"] = [loc[1] for loc in sampled_locations]
            else:
                # Final fallback to global mean
                mask = pred_bigram["uid"] == uid
                pred_bigram.loc[mask, "x_pred"] = gm_x
                pred_bigram.loc[mask, "y_pred"] = gm_y
    
    return pred_bigram.astype({"x_pred": "int16", "y_pred": "int16"})


In [9]:
def process_city(city_code: str) -> dict:
    """
    1. Compute train aggregates (with checkpointing)
    2. Build test_df
    3. Prepare gt_df
    4. Build each baseline's pred_df (showing progress)
    5. Evaluate GEO-BLEU (with tqdm inside evaluate_geobleu_parallel)
    Returns dict: {baseline_name: GEO-BLEU score}.
    """
    print(f"\n>>> Starting City {city_code}")

    # Check if results already exist
    existing_results = load_checkpoint(f"city_{city_code}_results", "results")
    if existing_results is not None:
        print(f"Found existing results for City {city_code}, skipping...")
        return existing_results

    # 1) Train aggregates - check checkpoint first
    aggregates = load_train_aggregates(city_code)
    if aggregates is not None:
        print(f"Loaded cached train aggregates for City {city_code}")
        (gm_x, gm_y), (gmod_x, gmod_y), per_user_mean_df, per_user_mode_df, per_user_unigram_dict, per_user_bigram_dict = aggregates
    else:
        print(f"Computing train aggregates for City {city_code}")
        aggregates = compute_train_aggregates(city_code)
        (gm_x, gm_y), (gmod_x, gmod_y), per_user_mean_df, per_user_mode_df, per_user_unigram_dict, per_user_bigram_dict = aggregates
        # Save aggregates checkpoint
        save_train_aggregates(city_code, aggregates)

    # 2) Test DataFrame
    test_df = build_test_dataframe(city_code)

    # 3) Ground-truth DataFrame
    gt_df = test_df.rename(columns={"x": "x_gt", "y": "y_gt"})[["uid", "d", "t", "x_gt", "y_gt"]]

    results = {}

    # 4a) Global Mean Prediction
    print(f"City {city_code} -> Global Mean prediction ...")
    pred_gm = test_df[["uid", "d", "t"]].copy()
    pred_gm["x_pred"] = gm_x
    pred_gm["y_pred"] = gm_y
    score_gm = evaluate_geobleu_parallel(pred_gm, gt_df)
    results["Global Mean"] = round(score_gm, 5)
    print(f"Global Mean GEO-BLEU = {results['Global Mean']}")

    # 4b) Global Mode Prediction
    print(f"City {city_code} -> Global Mode prediction ...")
    pred_gmod = test_df[["uid", "d", "t"]].copy()
    pred_gmod["x_pred"] = gmod_x
    pred_gmod["y_pred"] = gmod_y
    score_gmod = evaluate_geobleu_parallel(pred_gmod, gt_df)
    results["Global Mode"] = round(score_gmod, 5)
    print(f"Global Mode GEO-BLEU = {results['Global Mode']}")

    # 4c) Per-User Mean Prediction
    print(f"City {city_code} -> Per-User Mean prediction ...")
    pred_pum = test_df[["uid", "d", "t"]].copy()
    pred_pum = pred_pum.join(per_user_mean_df, on="uid", how="left", rsuffix="_tmp")
    pred_pum = pred_pum.rename(columns={"x": "x_pred", "y": "y_pred"})
    # Fallback for unseen users
    pred_pum["x_pred"] = pred_pum["x_pred"].fillna(gm_x).astype("int16")
    pred_pum["y_pred"] = pred_pum["y_pred"].fillna(gm_y).astype("int16")
    score_pum = evaluate_geobleu_parallel(pred_pum, gt_df)
    results["Per-User Mean"] = round(score_pum, 5)
    print(f"Per-User Mean GEO-BLEU = {results['Per-User Mean']}")

    # 4d) Per-User Mode Prediction
    print(f"City {city_code} -> Per-User Mode prediction ...")
    pred_pumod = test_df[["uid", "d", "t"]].copy()
    pred_pumod = pred_pumod.join(per_user_mode_df, on="uid", how="left", rsuffix="_tmp")
    pred_pumod = pred_pumod.rename(columns={"x": "x_pred", "y": "y_pred"})
    # Fallback for unseen users
    pred_pumod["x_pred"] = pred_pumod["x_pred"].fillna(gmod_x).astype("int16")
    pred_pumod["y_pred"] = pred_pumod["y_pred"].fillna(gmod_y).astype("int16")
    score_pumod = evaluate_geobleu_parallel(pred_pumod, gt_df)
    results["Per-User Mode"] = round(score_pumod, 5)
    print(f"Per-User Mode GEO-BLEU = {results['Per-User Mode']}")

    # 4e) Unigram Model Prediction
    print(f"City {city_code} -> Unigram Model prediction ...")
    pred_unigram = generate_unigram_predictions(test_df, per_user_unigram_dict, gm_x, gm_y)
    score_unigram = evaluate_geobleu_parallel(pred_unigram, gt_df)
    results["Unigram Model"] = round(score_unigram, 5)
    print(f"Unigram Model GEO-BLEU = {results['Unigram Model']}")

    # 4f) Bigram Model Prediction
    print(f"City {city_code} -> Bigram Model prediction ...")
    pred_bigram = generate_bigram_predictions(test_df, per_user_bigram_dict, per_user_unigram_dict, gm_x, gm_y)
    score_bigram = evaluate_geobleu_parallel(pred_bigram, gt_df)
    results["Bigram Model"] = round(score_bigram, 5)
    print(f"Bigram Model GEO-BLEU = {results['Bigram Model']}")

    # 4g) Bigram Model with top_p=0.7 Prediction
    print(f"City {city_code} -> Bigram Model (top_p=0.7) prediction ...")
    pred_bigram_top_p = generate_bigram_predictions(test_df, per_user_bigram_dict, per_user_unigram_dict, gm_x, gm_y, top_p=0.7)
    score_bigram_top_p = evaluate_geobleu_parallel(pred_bigram_top_p, gt_df)
    results["Bigram Model (top_p=0.7)"] = round(score_bigram_top_p, 5)
    print(f"Bigram Model (top_p=0.7) GEO-BLEU = {results['Bigram Model (top_p=0.7)']}")

    # Save city results checkpoint
    save_checkpoint(results, f"city_{city_code}_results", "results")

    print(f"<<< Finished City {city_code} with results: {results}\n")
    return results


In [10]:
# def analyze_data_coverage(city_code: str) -> dict:
#     """
#     Analyze masked vs unmasked entries in the dataset for a given city.
#     Returns statistics about data coverage across training and test periods.
#     """
#     # Check if coverage analysis already exists
#     existing_coverage = load_checkpoint(f"city_{city_code}_coverage", "coverage")
#     if existing_coverage is not None:
#         print(f"Found existing coverage analysis for City {city_code}")
#         return existing_coverage
    
#     print(f">>> Analyzing data coverage for City {city_code} ...")
    
#     path = os.path.join(DATA_DIR, f"city_{city_code}_challengedata.csv")
    
#     # Initialize counters
#     stats = {
#         'train_total': 0,
#         'train_masked': 0,
#         'train_unmasked': 0,
#         'test_total': 0,
#         'test_masked': 0,
#         'test_unmasked': 0,
#         'unique_users': set(),
#         'train_users': set(),
#         'test_users': set()
#     }
    
#     # Process data in chunks
#     for chunk in tqdm(pd.read_csv(path, usecols=COLUMNS, dtype=DTYPES, chunksize=CHUNK_SIZE),
#                       desc=f"Analyzing coverage (City {city_code})"):
        
#         # Split into train and test
#         train_chunk = chunk[chunk["d"] <= TRAIN_DAY_MAX]
#         test_chunk = chunk[chunk["d"] >= TEST_DAY_MIN]
        
#         # Update unique users
#         stats['unique_users'].update(chunk['uid'].unique())
        
#         # Training data analysis
#         if not train_chunk.empty:
#             stats['train_total'] += len(train_chunk)
#             masked_train = (train_chunk["x"] == MASK_VALUE) | (train_chunk["y"] == MASK_VALUE)
#             stats['train_masked'] += masked_train.sum()
#             stats['train_unmasked'] += (~masked_train).sum()
#             stats['train_users'].update(train_chunk['uid'].unique())
        
#         # Test data analysis
#         if not test_chunk.empty:
#             stats['test_total'] += len(test_chunk)
#             masked_test = (test_chunk["x"] == MASK_VALUE) | (test_chunk["y"] == MASK_VALUE)
#             stats['test_masked'] += masked_test.sum()
#             stats['test_unmasked'] += (~masked_test).sum()
#             stats['test_users'].update(test_chunk['uid'].unique())
        
#         del chunk, train_chunk, test_chunk
    
#     # Convert sets to counts for JSON serialization
#     coverage_stats = {
#         'train_total': stats['train_total'],
#         'train_masked': stats['train_masked'],
#         'train_unmasked': stats['train_unmasked'],
#         'test_total': stats['test_total'],
#         'test_masked': stats['test_masked'],
#         'test_unmasked': stats['test_unmasked'],
#         'unique_users': len(stats['unique_users']),
#         'train_users': len(stats['train_users']),
#         'test_users': len(stats['test_users'])
#     }
    
#     # Calculate percentages
#     if coverage_stats['train_total'] > 0:
#         coverage_stats['train_masked_pct'] = (coverage_stats['train_masked'] / coverage_stats['train_total']) * 100
#         coverage_stats['train_unmasked_pct'] = (coverage_stats['train_unmasked'] / coverage_stats['train_total']) * 100
#     else:
#         coverage_stats['train_masked_pct'] = coverage_stats['train_unmasked_pct'] = 0
    
#     if coverage_stats['test_total'] > 0:
#         coverage_stats['test_masked_pct'] = (coverage_stats['test_masked'] / coverage_stats['test_total']) * 100
#         coverage_stats['test_unmasked_pct'] = (coverage_stats['test_unmasked'] / coverage_stats['test_total']) * 100
#     else:
#         coverage_stats['test_masked_pct'] = coverage_stats['test_unmasked_pct'] = 0
    
#     # Save coverage checkpoint
#     save_checkpoint(coverage_stats, f"city_{city_code}_coverage", "coverage")
    
#     return coverage_stats

# def print_coverage_summary(stats: dict, city_code: str):
#     """Print a formatted summary of data coverage statistics."""
#     print(f"\n=== Data Coverage Summary for City {city_code} ===")
#     print(f"Overall Statistics:")
#     print(f"   Total unique users: {stats['unique_users']:,}")
#     print(f"   Users in training: {stats['train_users']:,}")
#     print(f"   Users in testing: {stats['test_users']:,}")
    
#     print(f"\nTraining Period (Days 1-{TRAIN_DAY_MAX}):")
#     print(f"   Total entries: {stats['train_total']:,}")
#     print(f"   Unmasked entries: {stats['train_unmasked']:,} ({stats['train_unmasked_pct']:.2f}%)")
#     print(f"   Masked entries: {stats['train_masked']:,} ({stats['train_masked_pct']:.2f}%)")
    
#     print(f"\nTest Period (Days {TEST_DAY_MIN}+):")
#     print(f"   Total entries: {stats['test_total']:,}")
#     print(f"   Unmasked entries: {stats['test_unmasked']:,} ({stats['test_unmasked_pct']:.2f}%)")
#     print(f"   Masked entries: {stats['test_masked']:,} ({stats['test_masked_pct']:.2f}%)")
    
#     if stats['test_unmasked'] > 0:
#         print(f"\nEvaluation will be performed on {stats['test_unmasked']:,} test entries")
#     else:
#         print(f"\nNo unmasked test entries found!")

# # Run coverage analysis for all cities
# print("=" * 60)
# print("DATA COVERAGE ANALYSIS")
# print("=" * 60)

# # List existing checkpoints
# list_checkpoints()
# print()

# all_coverage_stats = {}
# for city in ["A", "B", "C", "D"]:
#     coverage_stats = analyze_data_coverage(city)
#     all_coverage_stats[city] = coverage_stats
#     print_coverage_summary(coverage_stats, city)
#     print()

# # Save all coverage stats
# save_checkpoint(all_coverage_stats, "all_cities_coverage", "coverage")


In [11]:
methods = ["Global Mean", "Global Mode", "Per-User Mean", "Per-User Mode", "Unigram Model", "Bigram Model", "Bigram Model (top_p=0.7)"]

# Check if final results already exist
existing_final_results = load_checkpoint("final_results_df", "results")
if existing_final_results is not None:
    print("Found existing final results, loading from checkpoint...")
    df_results = existing_final_results
else:
    print("Computing results for all cities...")
    all_scores = {method: [] for method in methods}

    for city in CITIES:
        city_scores = process_city(city)
        for method in methods:
            all_scores[method].append(city_scores[method])

    df_results = pd.DataFrame(
        all_scores,
        index=[f"City {c}" for c in CITIES]
    ).T
    df_results["Average"] = df_results.mean(axis=1)
    
    # Save final results checkpoint
    save_checkpoint(df_results, "final_results_df", "results")
    
    # Also save as JSON for easy reading
    results_dict = df_results.to_dict()
    save_checkpoint(results_dict, "final_results_dict", "results")

print("\n=== Final GEO-BLEU Scores ===")
display(df_results)

# Save a summary report
summary_report = {
    'timestamp': datetime.now().isoformat(),
    'cities_evaluated': CITIES,
    'methods_evaluated': methods,
    'best_method': df_results['Average'].idxmax(),
    'best_score': float(df_results['Average'].max()),
    'results_summary': df_results.to_dict()
}

save_checkpoint(summary_report, "evaluation_summary", "results")

print(f"\nBest performing method: {summary_report['best_method']} (Score: {summary_report['best_score']:.5f})")
print(f"All results saved to: {CHECKPOINT_DIR}")


Computing results for all cities...

>>> Starting City B
Computing train aggregates for City B
>>> Computing train aggregates for City B ...


Loading chunks (City B): 0it [00:00, ?it/s]

Train aggregates done: GM=(104,64), GMODE=(90,49), 30000 users' means, 30000 users' modes, 30000 users' unigrams, 30000 users' bigrams.
✓ Checkpoint saved: /kaggle/working/checkpoints/aggregates_city_B_mean_df
✓ Checkpoint saved: /kaggle/working/checkpoints/aggregates_city_B_mode_df
✗ Failed to save checkpoint /kaggle/working/checkpoints/aggregates_city_B_aggregates: keys must be str, int, float, bool or None, not tuple
>>> Building test DataFrame for City B ...


Loading test chunks (City B): 0it [00:00, ?it/s]

Test DataFrame built: shape = (3627062, 5)
City B -> Global Mean prediction ...
    ▶ Evaluating GEO-BLEU on 27000 users...


      GEO-BLEU:   0%|          | 0/27000 [00:00<?, ?it/s]

Global Mean GEO-BLEU = 0.00025
City B -> Global Mode prediction ...
    ▶ Evaluating GEO-BLEU on 27000 users...


      GEO-BLEU:   0%|          | 0/27000 [00:00<?, ?it/s]

Global Mode GEO-BLEU = 0.00419
City B -> Per-User Mean prediction ...
    ▶ Evaluating GEO-BLEU on 27000 users...


      GEO-BLEU:   0%|          | 0/27000 [00:00<?, ?it/s]

Per-User Mean GEO-BLEU = 0.01778
City B -> Per-User Mode prediction ...
    ▶ Evaluating GEO-BLEU on 27000 users...


      GEO-BLEU:   0%|          | 0/27000 [00:00<?, ?it/s]

Per-User Mode GEO-BLEU = 0.0842
City B -> Unigram Model prediction ...
Generating Unigram predictions ...


Unigram sampling:   0%|          | 0/27000 [00:00<?, ?it/s]

    ▶ Evaluating GEO-BLEU on 27000 users...


      GEO-BLEU:   0%|          | 0/27000 [00:00<?, ?it/s]

Unigram Model GEO-BLEU = 0.04105
City B -> Bigram Model prediction ...
Generating Bigram Model predictions ...


Bigram Model sampling:   0%|          | 0/27000 [00:00<?, ?it/s]

    ▶ Evaluating GEO-BLEU on 27000 users...


      GEO-BLEU:   0%|          | 0/27000 [00:00<?, ?it/s]

Bigram Model GEO-BLEU = 0.05974
City B -> Bigram Model (top_p=0.7) prediction ...
Generating Bigram Model (top_p=0.7) predictions ...


Bigram Model (top_p=0.7) sampling:   0%|          | 0/27000 [00:00<?, ?it/s]

    ▶ Evaluating GEO-BLEU on 27000 users...


      GEO-BLEU:   0%|          | 0/27000 [00:00<?, ?it/s]

Bigram Model (top_p=0.7) GEO-BLEU = 0.08984
✓ Checkpoint saved: /kaggle/working/checkpoints/results_city_B_results
<<< Finished City B with results: {'Global Mean': 0.00025, 'Global Mode': 0.00419, 'Per-User Mean': 0.01778, 'Per-User Mode': 0.0842, 'Unigram Model': 0.04105, 'Bigram Model': 0.05974, 'Bigram Model (top_p=0.7)': 0.08984}


>>> Starting City C
Computing train aggregates for City C
>>> Computing train aggregates for City C ...


Loading chunks (City C): 0it [00:00, ?it/s]

Train aggregates done: GM=(96,139), GMODE=(103,133), 25000 users' means, 25000 users' modes, 25000 users' unigrams, 25000 users' bigrams.
✓ Checkpoint saved: /kaggle/working/checkpoints/aggregates_city_C_mean_df
✓ Checkpoint saved: /kaggle/working/checkpoints/aggregates_city_C_mode_df
✗ Failed to save checkpoint /kaggle/working/checkpoints/aggregates_city_C_aggregates: keys must be str, int, float, bool or None, not tuple
>>> Building test DataFrame for City C ...


Loading test chunks (City C): 0it [00:00, ?it/s]

Test DataFrame built: shape = (2953708, 5)
City C -> Global Mean prediction ...
    ▶ Evaluating GEO-BLEU on 22000 users...


      GEO-BLEU:   0%|          | 0/22000 [00:00<?, ?it/s]

Global Mean GEO-BLEU = 0.00107
City C -> Global Mode prediction ...
    ▶ Evaluating GEO-BLEU on 22000 users...


      GEO-BLEU:   0%|          | 0/22000 [00:00<?, ?it/s]

Global Mode GEO-BLEU = 0.00493
City C -> Per-User Mean prediction ...
    ▶ Evaluating GEO-BLEU on 22000 users...


      GEO-BLEU:   0%|          | 0/22000 [00:00<?, ?it/s]

Per-User Mean GEO-BLEU = 0.01588
City C -> Per-User Mode prediction ...
    ▶ Evaluating GEO-BLEU on 22000 users...


      GEO-BLEU:   0%|          | 0/22000 [00:00<?, ?it/s]

Per-User Mode GEO-BLEU = 0.08338
City C -> Unigram Model prediction ...
Generating Unigram predictions ...


Unigram sampling:   0%|          | 0/22000 [00:00<?, ?it/s]

    ▶ Evaluating GEO-BLEU on 22000 users...


      GEO-BLEU:   0%|          | 0/22000 [00:00<?, ?it/s]

Unigram Model GEO-BLEU = 0.03726
City C -> Bigram Model prediction ...
Generating Bigram Model predictions ...


Bigram Model sampling:   0%|          | 0/22000 [00:00<?, ?it/s]

    ▶ Evaluating GEO-BLEU on 22000 users...


      GEO-BLEU:   0%|          | 0/22000 [00:00<?, ?it/s]

Bigram Model GEO-BLEU = 0.0538
City C -> Bigram Model (top_p=0.7) prediction ...
Generating Bigram Model (top_p=0.7) predictions ...


Bigram Model (top_p=0.7) sampling:   0%|          | 0/22000 [00:00<?, ?it/s]

    ▶ Evaluating GEO-BLEU on 22000 users...


      GEO-BLEU:   0%|          | 0/22000 [00:00<?, ?it/s]

Bigram Model (top_p=0.7) GEO-BLEU = 0.08002
✓ Checkpoint saved: /kaggle/working/checkpoints/results_city_C_results
<<< Finished City C with results: {'Global Mean': 0.00107, 'Global Mode': 0.00493, 'Per-User Mean': 0.01588, 'Per-User Mode': 0.08338, 'Unigram Model': 0.03726, 'Bigram Model': 0.0538, 'Bigram Model (top_p=0.7)': 0.08002}


>>> Starting City D
Computing train aggregates for City D
>>> Computing train aggregates for City D ...


Loading chunks (City D): 0it [00:00, ?it/s]

Train aggregates done: GM=(110,91), GMODE=(142,107), 20000 users' means, 20000 users' modes, 20000 users' unigrams, 20000 users' bigrams.
✓ Checkpoint saved: /kaggle/working/checkpoints/aggregates_city_D_mean_df
✓ Checkpoint saved: /kaggle/working/checkpoints/aggregates_city_D_mode_df
✗ Failed to save checkpoint /kaggle/working/checkpoints/aggregates_city_D_aggregates: keys must be str, int, float, bool or None, not tuple
>>> Building test DataFrame for City D ...


Loading test chunks (City D): 0it [00:00, ?it/s]

Test DataFrame built: shape = (2361882, 5)
City D -> Global Mean prediction ...
    ▶ Evaluating GEO-BLEU on 17000 users...


      GEO-BLEU:   0%|          | 0/17000 [00:00<?, ?it/s]

Global Mean GEO-BLEU = 1e-05
City D -> Global Mode prediction ...
    ▶ Evaluating GEO-BLEU on 17000 users...


      GEO-BLEU:   0%|          | 0/17000 [00:00<?, ?it/s]

Global Mode GEO-BLEU = 0.00191
City D -> Per-User Mean prediction ...
    ▶ Evaluating GEO-BLEU on 17000 users...


      GEO-BLEU:   0%|          | 0/17000 [00:00<?, ?it/s]

Per-User Mean GEO-BLEU = 0.02026
City D -> Per-User Mode prediction ...
    ▶ Evaluating GEO-BLEU on 17000 users...


      GEO-BLEU:   0%|          | 0/17000 [00:00<?, ?it/s]

Per-User Mode GEO-BLEU = 0.08866
City D -> Unigram Model prediction ...
Generating Unigram predictions ...


Unigram sampling:   0%|          | 0/17000 [00:00<?, ?it/s]

    ▶ Evaluating GEO-BLEU on 17000 users...


      GEO-BLEU:   0%|          | 0/17000 [00:00<?, ?it/s]

Unigram Model GEO-BLEU = 0.04309
City D -> Bigram Model prediction ...
Generating Bigram Model predictions ...


Bigram Model sampling:   0%|          | 0/17000 [00:00<?, ?it/s]

    ▶ Evaluating GEO-BLEU on 17000 users...


      GEO-BLEU:   0%|          | 0/17000 [00:00<?, ?it/s]

Bigram Model GEO-BLEU = 0.06066
City D -> Bigram Model (top_p=0.7) prediction ...
Generating Bigram Model (top_p=0.7) predictions ...


Bigram Model (top_p=0.7) sampling:   0%|          | 0/17000 [00:00<?, ?it/s]

    ▶ Evaluating GEO-BLEU on 17000 users...


      GEO-BLEU:   0%|          | 0/17000 [00:00<?, ?it/s]

Bigram Model (top_p=0.7) GEO-BLEU = 0.09044
✓ Checkpoint saved: /kaggle/working/checkpoints/results_city_D_results
<<< Finished City D with results: {'Global Mean': 1e-05, 'Global Mode': 0.00191, 'Per-User Mean': 0.02026, 'Per-User Mode': 0.08866, 'Unigram Model': 0.04309, 'Bigram Model': 0.06066, 'Bigram Model (top_p=0.7)': 0.09044}

✓ Checkpoint saved: /kaggle/working/checkpoints/results_final_results_df
✓ Checkpoint saved: /kaggle/working/checkpoints/results_final_results_dict

=== Final GEO-BLEU Scores ===


Unnamed: 0,City B,City C,City D,Average
Global Mean,0.00025,0.00107,1e-05,0.00044
Global Mode,0.00419,0.00493,0.00191,0.00368
Per-User Mean,0.01778,0.01588,0.02026,0.01797
Per-User Mode,0.0842,0.08338,0.08866,0.08541
Unigram Model,0.04105,0.03726,0.04309,0.04047
Bigram Model,0.05974,0.0538,0.06066,0.05807
Bigram Model (top_p=0.7),0.08984,0.08002,0.09044,0.08677


✓ Checkpoint saved: /kaggle/working/checkpoints/results_evaluation_summary

Best performing method: Bigram Model (top_p=0.7) (Score: 0.08677)
All results saved to: /kaggle/working/checkpoints


In [12]:
# Utility cell for checkpoint management
def clean_checkpoints(keep_final=True):
    """Clean up checkpoint files, optionally keeping final results."""
    import shutil
    
    if os.path.exists(CHECKPOINT_DIR):
        if keep_final and os.path.exists(os.path.join(CHECKPOINT_DIR, "results_final_results_df.pkl")):
            # Keep only final results
            final_files = []
            for file in os.listdir(CHECKPOINT_DIR):
                if "final_results" in file or "evaluation_summary" in file:
                    final_files.append(file)
            
            # Create temp directory and move final files
            temp_dir = f"{CHECKPOINT_DIR}_temp"
            os.makedirs(temp_dir, exist_ok=True)
            for file in final_files:
                shutil.move(os.path.join(CHECKPOINT_DIR, file), os.path.join(temp_dir, file))
            
            # Remove original directory and rename temp
            shutil.rmtree(CHECKPOINT_DIR)
            shutil.move(temp_dir, CHECKPOINT_DIR)
            
            print(f"Cleaned checkpoints, kept final results in {CHECKPOINT_DIR}")
        else:
            shutil.rmtree(CHECKPOINT_DIR)
            print(f"Removed all checkpoints from {CHECKPOINT_DIR}")
    else:
        print("No checkpoint directory found.")

def get_checkpoint_summary():
    """Get a summary of all checkpoints."""
    if not os.path.exists(CHECKPOINT_DIR):
        print("No checkpoint directory found.")
        return
    
    files = os.listdir(CHECKPOINT_DIR)
    checkpoints_by_type = defaultdict(list)
    
    for file in files:
        if not file.endswith('_meta.json'):
            continue
        
        try:
            with open(os.path.join(CHECKPOINT_DIR, file), 'r') as f:
                metadata = json.load(f)
            checkpoints_by_type[metadata['type']].append({
                'name': file.replace('_meta.json', ''),
                'timestamp': metadata['timestamp'],
                'data_type': metadata.get('data_type', 'unknown')
            })
        except:
            continue
    
    print("Checkpoint Summary:")
    for checkpoint_type, checkpoints in checkpoints_by_type.items():
        print(f"\n{checkpoint_type.upper()} ({len(checkpoints)} files):")
        for cp in sorted(checkpoints, key=lambda x: x['timestamp']):
            print(f"  • {cp['name']} ({cp['data_type']}) - {cp['timestamp']}")

# Uncomment to run checkpoint utilities
# get_checkpoint_summary()
# clean_checkpoints(keep_final=True)
