In [1]:
%pip install -q git+https://github.com/yahoojapan/geobleu.git tqdm


  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for geobleu (setup.py) ... [?25l[?25hdone
Note: you may need to restart the kernel to use updated packages.


In [None]:
import pandas as pd
import numpy as np
import multiprocessing as mp
from geobleu import calc_geobleu_single
from collections import Counter, defaultdict
from tqdm.auto import tqdm
import os

# Ensure floats print with 5 decimals
pd.set_option("display.float_format", "{:.5f}".format)

# Constants
# Directory that holds the per-city files named city_<code>_challengedata.csv.
DATA_DIR = "/kaggle/input/humob-data/15313913"
# Cities scored by the final evaluation loop; the full set is kept below for reference.
# CITIES = ["A", "B", "C", "D"]
CITIES = ["D"]
# Columns requested from every CSV read (passed as `usecols`).
COLUMNS = ["uid", "d", "t", "x", "y"]
# Compact dtypes passed to `pd.read_csv`; note that MASK_VALUE must fit in the x/y dtype.
DTYPES = {
    "uid": "int32",
    "d": "int8",
    "t": "int8",
    "x": "int16",
    "y": "int16",
}
# Rows with d <= TRAIN_DAY_MAX are treated as training data.
TRAIN_DAY_MAX = 60
# Rows with d >= TEST_DAY_MIN are treated as test data.
TEST_DAY_MIN = 61
# Sentinel in x/y marking a masked entry; such rows are excluded from the test frame.
MASK_VALUE = 999
# Number of CSV rows per chunk when streaming the files.
CHUNK_SIZE = 500_000  # adjust as needed for memory/time

# Set random seed for reproducible sampling
# (seeds NumPy's global RNG, which the np.random.choice calls in the samplers draw from)
np.random.seed(42)


In [3]:
def _geobleu_for_group(group):
    """
    Score one user's predictions against that user's ground truth.

    `group` is the slice of the merged frame belonging to a single uid and
    must provide the columns 'd', 't', 'x_pred', 'y_pred', 'x_gt', 'y_gt'.
    Two parallel sequences of (d, t, x, y) tuples are built in row order and
    passed to `calc_geobleu_single`; its result is returned unchanged.
    """
    days = group['d']
    slots = group['t']
    predicted = [*zip(days, slots, group['x_pred'], group['y_pred'])]
    actual = [*zip(days, slots, group['x_gt'], group['y_gt'])]
    return calc_geobleu_single(predicted, actual)

def evaluate_geobleu_parallel(pred_df: pd.DataFrame, gt_df: pd.DataFrame) -> float:
    """
    Average per-user GEO-BLEU of `pred_df` against `gt_df`.

    - pred_df: columns ['uid','d','t','x_pred','y_pred']
    - gt_df:   columns ['uid','d','t','x_gt','y_gt'] (plain 'x'/'y' are
               renamed to 'x_gt'/'y_gt' after the merge)

    The frames are inner-joined on ['uid','d','t'], split by uid, and each
    user's slice is scored by `_geobleu_for_group` in a worker pool while a
    tqdm bar tracks progress. Returns the mean score, or 0.0 when the join
    produces no rows.
    """
    joined = pred_df.merge(gt_df, on=['uid', 'd', 't'], how='inner')
    if joined.empty:
        return 0.0

    # Accept ground truth that still carries raw 'x'/'y' column names.
    joined = joined.rename(columns={'x': 'x_gt', 'y': 'y_gt'})

    # One DataFrame per user; these are the work items shipped to the pool.
    per_user_frames = [frame for _, frame in joined.groupby('uid')]
    num_users = len(per_user_frames)
    if num_users == 0:
        return 0.0

    print(f"    ▶ Evaluating GEO-BLEU on {num_users} users...")

    # Leave one core free, but always run at least one worker.
    worker_count = max(1, mp.cpu_count() - 1)
    with mp.Pool(processes=worker_count) as pool:
        score_stream = pool.imap_unordered(_geobleu_for_group, per_user_frames)
        results = list(tqdm(score_stream,
                            total=num_users,
                            desc="      ⏳ GEO-BLEU"))

    if not results:
        return 0.0
    return float(np.mean(results))


In [None]:
def compute_train_aggregates(city_code: str, grid_size: int = 200):
    """
    Stream city_{city_code}_challengedata.csv in chunks and aggregate the
    training period (rows with d <= TRAIN_DAY_MAX).

    Rows whose x or y equals MASK_VALUE are ignored: they carry no location
    and would otherwise index outside the mode-count grid.

    Args:
        city_code: city identifier used to build the CSV file name.
        grid_size: side length of the square coordinate grid. Training
            coordinates must lie in 1..grid_size (default 200).

    Returns:
        A 6-tuple
          - (gm_x, gm_y): global mean location, rounded to integers
          - (gmod_x, gmod_y): globally most frequent grid cell
          - per_user_mean_df: DataFrame indexed by uid, columns ['x','y']
          - per_user_mode_df: DataFrame indexed by uid, columns ['x','y']
          - per_user_unigram_dict: uid → Counter((x,y) → frequency)
          - per_user_bigram_dict: uid → Counter(((x1,y1),(x2,y2)) → frequency)

    Raises:
        ValueError: if the file has no usable training rows, or a training
            coordinate lies outside 1..grid_size.
    """
    print(f">>> Computing train aggregates for City {city_code} ...")

    # Accumulators for global mean
    total_x = 0
    total_y = 0
    total_count = 0

    # grid_size × grid_size array for global mode counts
    global_mode_counts = np.zeros((grid_size, grid_size), dtype=np.int64)

    # Per-user accumulators
    per_user_sums = defaultdict(lambda: [0, 0, 0])   # uid → [sum_x, sum_y, count]
    per_user_unigrams = defaultdict(Counter)        # uid → Counter((x,y) → freq)
    per_user_bigrams = defaultdict(Counter)         # uid → Counter(((x1,y1), (x2,y2)) → freq)
    # uid → ((d, t), (x, y)) of the latest training row seen so far. Used to
    # recover the transition that would otherwise be lost when a user's
    # trajectory is split across two chunks.
    last_seen = {}

    path = os.path.join(DATA_DIR, f"city_{city_code}_challengedata.csv")

    # Read the file in chunks
    for chunk in tqdm(pd.read_csv(path, usecols=COLUMNS, dtype=DTYPES, chunksize=CHUNK_SIZE),
                      desc=f"Loading chunks (City {city_code})"):
        # Keep the training portion and drop masked coordinates
        is_train = chunk["d"] <= TRAIN_DAY_MAX
        is_observed = (chunk["x"] != MASK_VALUE) & (chunk["y"] != MASK_VALUE)
        train_chunk = chunk[is_train & is_observed]
        if train_chunk.empty:
            continue

        xs = train_chunk["x"].to_numpy(dtype=np.int64)
        ys = train_chunk["y"].to_numpy(dtype=np.int64)

        # A 0 would silently wrap to the last row/column of the grid and a
        # value above grid_size would raise an opaque IndexError; fail clearly.
        if xs.min() < 1 or ys.min() < 1 or xs.max() > grid_size or ys.max() > grid_size:
            raise ValueError(
                f"City {city_code}: training coordinates outside 1..{grid_size}"
            )

        # Update global mean accumulators
        total_x += xs.sum()
        total_y += ys.sum()
        total_count += len(train_chunk)

        # Update global mode counts (zero-based indexing)
        np.add.at(global_mode_counts, (xs - 1, ys - 1), 1)

        # Update per-user sums, unigrams, and bigrams
        for uid, sub in train_chunk.groupby("uid"):
            sums = per_user_sums[uid]
            sums[0] += sub["x"].to_numpy(dtype=np.int64).sum()
            sums[1] += sub["y"].to_numpy(dtype=np.int64).sum()
            sums[2] += len(sub)

            # Counted in file order so Counter tie-breaking stays stable.
            per_user_unigrams[uid].update(zip(sub["x"], sub["y"]))

            # Build bigrams within each user's trajectory (ordered by d, t)
            user_sub = sub.sort_values(['d', 't'])
            user_coords = list(zip(user_sub["x"], user_sub["y"]))
            first_stamp = (int(user_sub["d"].iloc[0]), int(user_sub["t"].iloc[0]))
            last_stamp = (int(user_sub["d"].iloc[-1]), int(user_sub["t"].iloc[-1]))

            # Stitch in the user's final location from earlier chunks, but only
            # when it really precedes this chunk's first row in time.
            carried = last_seen.get(uid)
            if carried is not None and carried[0] < first_stamp:
                user_coords.insert(0, carried[1])

            if len(user_coords) > 1:
                per_user_bigrams[uid].update(zip(user_coords, user_coords[1:]))

            if carried is None or carried[0] <= last_stamp:
                last_seen[uid] = (last_stamp, user_coords[-1])

    if total_count == 0:
        raise ValueError(f"City {city_code}: no training rows found in {path}")

    # Compute global mean (rounded)
    gm_x = int(round(total_x / total_count))
    gm_y = int(round(total_y / total_count))

    # Compute global mode from the count matrix (row index ↔ x, column ↔ y)
    mode_row, mode_col = divmod(int(np.argmax(global_mode_counts)), grid_size)
    gmod_x = mode_row + 1
    gmod_y = mode_col + 1

    # Build per-user mean DataFrame
    user_mean_records = [
        (uid, int(round(sx / cnt)), int(round(sy / cnt)))
        for uid, (sx, sy, cnt) in per_user_sums.items()
    ]
    per_user_mean_df = (
        pd.DataFrame(user_mean_records, columns=["uid", "x", "y"])
          .set_index("uid")
          .astype("int16")
    )

    # Build per-user mode DataFrame; the mode is the top entry of the unigram
    # counter, so no separate mode counter is needed.
    user_mode_records = []
    for uid, counter in per_user_unigrams.items():
        (mx, my), _ = counter.most_common(1)[0]
        user_mode_records.append((uid, int(mx), int(my)))
    per_user_mode_df = (
        pd.DataFrame(user_mode_records, columns=["uid", "x", "y"])
          .set_index("uid")
          .astype("int16")
    )

    print(f"Train aggregates done: GM=({gm_x},{gm_y}), GMODE=({gmod_x},{gmod_y}), "
          f"{len(per_user_mean_df)} users' means, {len(per_user_mode_df)} users' modes, "
          f"{len(per_user_unigrams)} users' unigrams, {len(per_user_bigrams)} users' bigrams.")
    return (gm_x, gm_y), (gmod_x, gmod_y), per_user_mean_df, per_user_mode_df, dict(per_user_unigrams), dict(per_user_bigrams)


In [None]:
def build_test_dataframe(city_code: str) -> pd.DataFrame:
    """
    Collect the scorable test rows of city_{city_code}_challengedata.csv.

    The file is streamed in chunks; a row is kept when d >= TEST_DAY_MIN and
    neither x nor y equals MASK_VALUE. The result has columns
    [uid, d, t, x, y] and a fresh 0..n-1 index. When nothing qualifies, an
    empty frame with the DTYPES column types is returned.
    """
    print(f">>> Building test DataFrame for City {city_code} ...")
    path = os.path.join(DATA_DIR, f"city_{city_code}_challengedata.csv")
    wanted_columns = ["uid", "d", "t", "x", "y"]

    reader = pd.read_csv(path, usecols=COLUMNS, dtype=DTYPES, chunksize=CHUNK_SIZE)
    test_parts = []
    for chunk in tqdm(reader, desc=f"Loading test chunks (City {city_code})"):
        in_test_period = chunk["d"] >= TEST_DAY_MIN
        has_location = (chunk["x"] != MASK_VALUE) & (chunk["y"] != MASK_VALUE)
        sub = chunk.loc[in_test_period & has_location, wanted_columns]
        if sub.empty:
            continue
        # Copy so the kept rows do not pin the whole chunk in memory.
        test_parts.append(sub.copy())

    if not test_parts:
        test_df = pd.DataFrame(columns=wanted_columns).astype(DTYPES)
    else:
        test_df = pd.concat(test_parts, ignore_index=True)

    print(f"Test DataFrame built: shape = {test_df.shape}")
    return test_df


In [None]:
def generate_unigram_predictions(test_df: pd.DataFrame, per_user_unigram_dict: dict, 
                                gm_x: int, gm_y: int) -> pd.DataFrame:
    """
    Generate predictions using a per-user unigram model.

    For every test row of a user, a location is drawn independently from that
    user's empirical training distribution (Counter frequencies normalised to
    probabilities) using NumPy's global RNG.

    Args:
        test_df: test rows; must contain the columns 'uid', 'd', 't'.
        per_user_unigram_dict: uid → Counter((x, y) → frequency).
        gm_x, gm_y: global-mean location, used for users that are missing
            from `per_user_unigram_dict` or whose Counter is empty.

    Returns:
        DataFrame with columns ['uid','d','t','x_pred','y_pred'] sharing
        `test_df`'s index; the prediction columns are int16.
    """
    print(f"Generating Unigram predictions ...")

    pred_unigram = test_df[["uid", "d", "t"]].copy()

    # Every row starts at the global-mean fallback and is overwritten below
    # for users that have a training distribution.
    n_rows = len(pred_unigram)
    x_out = np.full(n_rows, gm_x, dtype=np.int64)
    y_out = np.full(n_rows, gm_y, dtype=np.int64)

    # uid → positional row numbers, computed in one pass. This replaces a
    # full-frame boolean mask per user, which cost O(rows × users).
    row_positions = test_df.groupby("uid").indices

    # Users are visited in ascending uid order so the sequence of RNG draws
    # is reproducible for a fixed seed.
    for uid in tqdm(sorted(row_positions), desc="Unigram sampling"):
        location_counter = per_user_unigram_dict.get(uid)
        if not location_counter:
            continue  # unseen user (or empty history): keep the global mean

        positions = row_positions[uid]
        locations = list(location_counter.keys())
        frequencies = list(location_counter.values())

        # Convert frequencies to probabilities
        total_freq = sum(frequencies)
        probabilities = [f / total_freq for f in frequencies]

        # Sample one location index per test row of this user
        sampled_indices = np.random.choice(len(locations), size=len(positions), p=probabilities)

        # (n_locations, 2) lookup table → vectorised gather of x and y
        coords = np.asarray(locations, dtype=np.int64).reshape(-1, 2)
        x_out[positions] = coords[sampled_indices, 0]
        y_out[positions] = coords[sampled_indices, 1]

    pred_unigram["x_pred"] = x_out.astype("int16")
    pred_unigram["y_pred"] = y_out.astype("int16")
    return pred_unigram


In [None]:
def top_p_sampling(probabilities, top_p=0.7):
    """
    Restrict a probability distribution to its top-p nucleus.

    Entries are ranked from most to least probable and the shortest prefix
    whose cumulative probability reaches `top_p` is kept (always at least
    one entry, never more than all of them).

    Returns a pair (indices, probs): `indices` are positions into the input
    for the kept entries, in descending-probability order, and `probs` are
    their probabilities rescaled to sum to 1.
    """
    ranking = np.flip(np.argsort(probabilities))
    ranked_probs = np.array(probabilities)[ranking]

    # Position of the first prefix whose running total reaches top_p,
    # converted to a prefix length and clamped to the number of entries.
    running_total = ranked_probs.cumsum()
    keep = min(np.searchsorted(running_total, top_p) + 1, len(ranked_probs))

    nucleus_indices = ranking[:keep]
    nucleus_probs = ranked_probs[:keep]

    return nucleus_indices, nucleus_probs / nucleus_probs.sum()

def generate_bigram_predictions(test_df: pd.DataFrame, per_user_bigram_dict: dict,
                               per_user_unigram_dict: dict, gm_x: int, gm_y: int,
                               top_p=None) -> pd.DataFrame:
    """
    Generate predictions using a per-user bigram (first-order Markov) model.

    Each user's test rows are walked in (d, t) order. The next location is
    sampled from the user's training transitions out of the previously
    *predicted* location; the walk starts from the user's most frequent
    training location. All sampling uses NumPy's global RNG.

    Fallback chain:
      1. no transition recorded from the current location → one draw from the
         user's unigram distribution;
      2. user has no bigrams at all → every row drawn from the unigram
         distribution;
      3. user has no unigram history either → global mean (gm_x, gm_y).

    Args:
        test_df: test rows; must contain the columns 'uid', 'd', 't'.
        per_user_bigram_dict: uid → Counter(((x1,y1),(x2,y2)) → frequency).
        per_user_unigram_dict: uid → Counter((x,y) → frequency).
        gm_x, gm_y: global-mean fallback location.
        top_p: when not None, transition distributions are truncated with
            `top_p_sampling` before sampling (unigram fallbacks are not).

    Returns:
        DataFrame with columns ['uid','d','t','x_pred','y_pred'] sharing
        `test_df`'s index; the prediction columns are int16.
    """
    # Use the same `is not None` test as the sampling branch below so that
    # top_p=0.0 is labelled consistently with how it is applied.
    model_name = f"Bigram Model (top_p={top_p})" if top_p is not None else "Bigram Model"
    print(f"Generating {model_name} predictions ...")

    pred_bigram = test_df[["uid", "d", "t"]].copy()

    # Work on a positionally indexed copy so results can be written straight
    # into NumPy buffers, independent of test_df's index labels.
    keys = pred_bigram.reset_index(drop=True)
    x_out = np.full(len(keys), gm_x, dtype=np.int64)
    y_out = np.full(len(keys), gm_y, dtype=np.int64)

    # Process each user separately to maintain sequence order
    for uid, group in tqdm(keys.groupby("uid"), desc=f"{model_name} sampling"):
        # Row positions of this user's test points, ordered by day and time
        positions = group.sort_values(['d', 't']).index.to_numpy()

        # Unigram distribution, built once per user instead of once per row
        unigram_counter = per_user_unigram_dict.get(uid)
        if unigram_counter:
            uni_locations = list(unigram_counter.keys())
            uni_total = sum(unigram_counter.values())
            uni_probs = [f / uni_total for f in unigram_counter.values()]
        else:
            uni_locations = None
            uni_probs = None

        bigram_counter = per_user_bigram_dict.get(uid)

        if bigram_counter:
            # Index transitions by source location once; the Counter's
            # iteration order is preserved within each source.
            transitions = {}
            for (src, dst), freq in bigram_counter.items():
                transitions.setdefault(src, {})[dst] = freq

            # Starting context: the user's most frequent training location
            if unigram_counter:
                prev_location = unigram_counter.most_common(1)[0][0]
            else:
                prev_location = (gm_x, gm_y)

            # source location → (candidate locations, probabilities), or None
            # when the only option is the global mean
            choice_cache = {}
            predictions = []

            for _ in range(len(positions)):
                if prev_location not in choice_cache:
                    successors = transitions.get(prev_location)
                    if successors:
                        locations = list(successors.keys())
                        total_freq = sum(successors.values())
                        probabilities = [f / total_freq for f in successors.values()]
                        if top_p is not None:
                            # Apply top-p sampling
                            selected_indices, probabilities = top_p_sampling(probabilities, top_p)
                            locations = [locations[i] for i in selected_indices]
                        choice_cache[prev_location] = (locations, probabilities)
                    elif uni_locations is not None:
                        # Fallback to unigram model
                        choice_cache[prev_location] = (uni_locations, uni_probs)
                    else:
                        choice_cache[prev_location] = None

                cached = choice_cache[prev_location]
                if cached is None:
                    next_location = (gm_x, gm_y)
                else:
                    locations, probabilities = cached
                    sampled_idx = np.random.choice(len(locations), p=probabilities)
                    next_location = locations[sampled_idx]

                predictions.append(next_location)
                prev_location = next_location

        elif uni_locations is not None:
            # Fallback to unigram model for users without bigram data
            sampled_indices = np.random.choice(len(uni_locations), size=len(positions), p=uni_probs)
            predictions = [uni_locations[i] for i in sampled_indices]

        else:
            # Final fallback: rows keep the global mean they were initialised with
            continue

        coords = np.asarray(predictions, dtype=np.int64).reshape(-1, 2)
        x_out[positions] = coords[:, 0]
        y_out[positions] = coords[:, 1]

    pred_bigram["x_pred"] = x_out.astype("int16")
    pred_bigram["y_pred"] = y_out.astype("int16")
    return pred_bigram


In [None]:
def process_city(city_code: str) -> dict:
    """
    Run every baseline for one city and score it with GEO-BLEU.

    Steps:
      1. compute the training aggregates,
      2. build the test frame and derive the ground-truth frame from it,
      3. for each baseline in turn, build its prediction frame and evaluate
         it with `evaluate_geobleu_parallel`.

    Returns a dict mapping baseline name → GEO-BLEU score rounded to 5
    decimals, in the order the baselines were run.
    """
    print(f"\n>>> Starting City {city_code}")

    # 1) Train aggregates
    (
        (gm_x, gm_y),
        (gmod_x, gmod_y),
        per_user_mean_df,
        per_user_mode_df,
        per_user_unigram_dict,
        per_user_bigram_dict,
    ) = compute_train_aggregates(city_code)

    # 2) Test DataFrame
    test_df = build_test_dataframe(city_code)

    # 3) Ground-truth DataFrame
    gt_df = test_df.rename(columns={"x": "x_gt", "y": "y_gt"})[["uid", "d", "t", "x_gt", "y_gt"]]

    results = {}

    def announce(label: str) -> None:
        # Progress line printed before a baseline's predictions are built.
        print(f"City {city_code} -> {label} prediction ...")

    def score(label: str, pred_df: pd.DataFrame) -> None:
        # Evaluate one prediction frame, store the rounded score, report it.
        results[label] = round(evaluate_geobleu_parallel(pred_df, gt_df), 5)
        print(f"{label} GEO-BLEU = {results[label]}")

    def constant_prediction(x_value, y_value) -> pd.DataFrame:
        # Same location for every test row.
        pred = test_df[["uid", "d", "t"]].copy()
        pred["x_pred"] = x_value
        pred["y_pred"] = y_value
        return pred

    def per_user_prediction(lookup_df: pd.DataFrame, fallback_x, fallback_y) -> pd.DataFrame:
        # One fixed location per user, looked up by uid; users absent from
        # the lookup table receive the fallback location.
        pred = (
            test_df[["uid", "d", "t"]]
            .copy()
            .join(lookup_df, on="uid", how="left", rsuffix="_tmp")
            .rename(columns={"x": "x_pred", "y": "y_pred"})
        )
        pred["x_pred"] = pred["x_pred"].fillna(fallback_x).astype("int16")
        pred["y_pred"] = pred["y_pred"].fillna(fallback_y).astype("int16")
        return pred

    # 4a) Global Mean Prediction
    announce("Global Mean")
    score("Global Mean", constant_prediction(gm_x, gm_y))

    # 4b) Global Mode Prediction
    announce("Global Mode")
    score("Global Mode", constant_prediction(gmod_x, gmod_y))

    # 4c) Per-User Mean Prediction
    announce("Per-User Mean")
    score("Per-User Mean", per_user_prediction(per_user_mean_df, gm_x, gm_y))

    # 4d) Per-User Mode Prediction
    announce("Per-User Mode")
    score("Per-User Mode", per_user_prediction(per_user_mode_df, gmod_x, gmod_y))

    # 4e) Unigram Model Prediction
    announce("Unigram Model")
    score(
        "Unigram Model",
        generate_unigram_predictions(test_df, per_user_unigram_dict, gm_x, gm_y),
    )

    # 4f) Bigram Model Prediction
    announce("Bigram Model")
    score(
        "Bigram Model",
        generate_bigram_predictions(test_df, per_user_bigram_dict, per_user_unigram_dict, gm_x, gm_y),
    )

    # 4g) Bigram Model with top_p=0.7 Prediction
    announce("Bigram Model (top_p=0.7)")
    score(
        "Bigram Model (top_p=0.7)",
        generate_bigram_predictions(
            test_df, per_user_bigram_dict, per_user_unigram_dict, gm_x, gm_y, top_p=0.7
        ),
    )

    print(f"<<< Finished City {city_code} with results: {results}\n")
    return results


In [None]:
def analyze_data_coverage(city_code: str) -> dict:
    """
    Count masked and unmasked rows for one city, split by period.

    The city's CSV is streamed in chunks. Rows with d <= TRAIN_DAY_MAX count
    as training and rows with d >= TEST_DAY_MIN as test; a row is "masked"
    when x or y equals MASK_VALUE.

    Returns a dict with, for each of 'train' and 'test': '<p>_total',
    '<p>_masked', '<p>_unmasked', '<p>_users' (distinct uid count),
    '<p>_masked_pct' and '<p>_unmasked_pct' (0 when the period has no rows),
    plus 'unique_users' across the whole file.
    """
    print(f">>> Analyzing data coverage for City {city_code} ...")

    path = os.path.join(DATA_DIR, f"city_{city_code}_challengedata.csv")
    periods = ("train", "test")

    # Running counters; the *_users entries are sets until the file is read.
    stats = {
        'train_total': 0,
        'train_masked': 0,
        'train_unmasked': 0,
        'test_total': 0,
        'test_masked': 0,
        'test_unmasked': 0,
        'unique_users': set(),
        'train_users': set(),
        'test_users': set()
    }

    reader = pd.read_csv(path, usecols=COLUMNS, dtype=DTYPES, chunksize=CHUNK_SIZE)
    for chunk in tqdm(reader, desc=f"Analyzing coverage (City {city_code})"):
        slices = {
            "train": chunk[chunk["d"] <= TRAIN_DAY_MAX],
            "test": chunk[chunk["d"] >= TEST_DAY_MIN],
        }

        stats['unique_users'].update(chunk['uid'].unique())

        for period in periods:
            part = slices[period]
            if part.empty:
                continue
            is_masked = (part["x"] == MASK_VALUE) | (part["y"] == MASK_VALUE)
            stats[f'{period}_total'] += len(part)
            stats[f'{period}_masked'] += is_masked.sum()
            stats[f'{period}_unmasked'] += (~is_masked).sum()
            stats[f'{period}_users'].update(part['uid'].unique())

    # Collapse the uid sets into plain counts
    for key in ('unique_users', 'train_users', 'test_users'):
        stats[key] = len(stats[key])

    # Percentages per period; a period with no rows reports 0 for both
    for period in periods:
        total = stats[f'{period}_total']
        if total > 0:
            stats[f'{period}_masked_pct'] = (stats[f'{period}_masked'] / total) * 100
            stats[f'{period}_unmasked_pct'] = (stats[f'{period}_unmasked'] / total) * 100
        else:
            stats[f'{period}_masked_pct'] = 0
            stats[f'{period}_unmasked_pct'] = 0

    return stats

def print_coverage_summary(stats: dict, city_code: str):
    """
    Print the coverage statistics for one city as a formatted report.

    `stats` is the dict returned by `analyze_data_coverage`; `city_code` is
    only used in the report header.
    """
    report = [
        f"\n=== Data Coverage Summary for City {city_code} ===",
        f"Overall Statistics:",
        f"   Total unique users: {stats['unique_users']:,}",
        f"   Users in training: {stats['train_users']:,}",
        f"   Users in testing: {stats['test_users']:,}",
        f"\nTraining Period (Days 1-{TRAIN_DAY_MAX}):",
        f"   Total entries: {stats['train_total']:,}",
        f"   Unmasked entries: {stats['train_unmasked']:,} ({stats['train_unmasked_pct']:.2f}%)",
        f"   Masked entries: {stats['train_masked']:,} ({stats['train_masked_pct']:.2f}%)",
        f"\nTest Period (Days {TEST_DAY_MIN}+):",
        f"   Total entries: {stats['test_total']:,}",
        f"   Unmasked entries: {stats['test_unmasked']:,} ({stats['test_unmasked_pct']:.2f}%)",
        f"   Masked entries: {stats['test_masked']:,} ({stats['test_masked_pct']:.2f}%)",
    ]

    # Closing line: how many rows the evaluation will actually score.
    scorable = stats['test_unmasked']
    if scorable > 0:
        report.append(f"\nEvaluation will be performed on {scorable:,} test entries")
    else:
        report.append(f"\nNo unmasked test entries found!")

    for line in report:
        print(line)

# Coverage report for every city (independent of CITIES, which only controls
# which cities are scored).
print("=" * 60, "DATA COVERAGE ANALYSIS", "=" * 60, sep="\n")

all_coverage_stats = {}
for city in ("A", "B", "C", "D"):
    coverage_stats = all_coverage_stats[city] = analyze_data_coverage(city)
    print_coverage_summary(coverage_stats, city)
    print()


DATA COVERAGE ANALYSIS
>>> Analyzing data coverage for City A ...


  📊 Analyzing coverage (City A): 0it [00:00, ?it/s]


=== Data Coverage Summary for City A ===
📊 Overall Statistics:
   • Total unique users: 150,000
   • Users in training: 150,000
   • Users in testing: 150,000

📅 Training Period (Days 1-60):
   • Total entries: 67,862,502
   • Unmasked entries: 67,862,502 (100.00%)
   • Masked entries: 0 (0.00%)

🧪 Test Period (Days 61+):
   • Total entries: 19,179,916
   • Unmasked entries: 18,859,525 (98.33%)
   • Masked entries: 320,391 (1.67%)

✅ Evaluation will be performed on 18,859,525 test entries

>>> Analyzing data coverage for City B ...


  📊 Analyzing coverage (City B): 0it [00:00, ?it/s]


=== Data Coverage Summary for City B ===
📊 Overall Statistics:
   • Total unique users: 30,000
   • Users in training: 30,000
   • Users in testing: 30,000

📅 Training Period (Days 1-60):
   • Total entries: 14,194,433
   • Unmasked entries: 14,194,433 (100.00%)
   • Masked entries: 0 (0.00%)

🧪 Test Period (Days 61+):
   • Total entries: 4,002,560
   • Unmasked entries: 3,627,062 (90.62%)
   • Masked entries: 375,498 (9.38%)

✅ Evaluation will be performed on 3,627,062 test entries

>>> Analyzing data coverage for City C ...


  📊 Analyzing coverage (City C): 0it [00:00, ?it/s]


=== Data Coverage Summary for City C ===
📊 Overall Statistics:
   • Total unique users: 25,000
   • Users in training: 25,000
   • Users in testing: 25,000

📅 Training Period (Days 1-60):
   • Total entries: 11,226,812
   • Unmasked entries: 11,226,812 (100.00%)
   • Masked entries: 0 (0.00%)

🧪 Test Period (Days 61+):
   • Total entries: 3,248,335
   • Unmasked entries: 2,953,708 (90.93%)
   • Masked entries: 294,627 (9.07%)

✅ Evaluation will be performed on 2,953,708 test entries

>>> Analyzing data coverage for City D ...


  📊 Analyzing coverage (City D): 0it [00:00, ?it/s]


=== Data Coverage Summary for City D ===
📊 Overall Statistics:
   • Total unique users: 20,000
   • Users in training: 20,000
   • Users in testing: 20,000

📅 Training Period (Days 1-60):
   • Total entries: 9,358,783
   • Unmasked entries: 9,358,783 (100.00%)
   • Masked entries: 0 (0.00%)

🧪 Test Period (Days 61+):
   • Total entries: 2,671,295
   • Unmasked entries: 2,361,882 (88.42%)
   • Masked entries: 309,413 (11.58%)

✅ Evaluation will be performed on 2,361,882 test entries



In [None]:
# Baselines in report order; each name must be a key of process_city's result.
methods = [
    "Global Mean",
    "Global Mode",
    "Per-User Mean",
    "Per-User Mode",
    "Unigram Model",
    "Bigram Model",
    "Bigram Model (top_p=0.7)",
]
all_scores = {method: [] for method in methods}

# Score each configured city and collect one value per method per city.
for city in CITIES:
    city_scores = process_city(city)
    for method, scores in all_scores.items():
        scores.append(city_scores[method])

# Rows = methods, columns = cities, plus the across-city mean.
city_labels = [f"City {c}" for c in CITIES]
df_results = pd.DataFrame(all_scores, index=city_labels).T
df_results["Average"] = df_results.mean(axis=1)

print("\n=== Final GEO-BLEU Scores ===")
display(df_results)



>>> Starting City D
>>> Computing train aggregates for City D …


  📥 Loading chunks (City D): 0it [00:00, ?it/s]

    ✔ Train aggregates done: GM=(110,91), GMODE=(142,107), 20000 users' means, 20000 users' modes, 20000 users' unigrams.
>>> Building test DataFrame for City D …


  📥 Loading test chunks (City D): 0it [00:00, ?it/s]

    ✔ Test DataFrame built: shape = (2361882, 5)
  • City D → Global Mean prediction ...
    ▶ Evaluating GEO-BLEU on 17000 users...


      ⏳ GEO-BLEU:   0%|          | 0/17000 [00:00<?, ?it/s]

    ✔ Global Mean GEO-BLEU = 1e-05
  • City D → Global Mode prediction ...
    ▶ Evaluating GEO-BLEU on 17000 users...


      ⏳ GEO-BLEU:   0%|          | 0/17000 [00:00<?, ?it/s]

    ✔ Global Mode GEO-BLEU = 0.00191
  • City D → Per-User Mean prediction ...
    ▶ Evaluating GEO-BLEU on 17000 users...


      ⏳ GEO-BLEU:   0%|          | 0/17000 [00:00<?, ?it/s]

    ✔ Per-User Mean GEO-BLEU = 0.02026
  • City D → Per-User Mode prediction ...
    ▶ Evaluating GEO-BLEU on 17000 users...


      ⏳ GEO-BLEU:   0%|          | 0/17000 [00:00<?, ?it/s]

    ✔ Per-User Mode GEO-BLEU = 0.08866
  • City D → Unigram Model prediction ...
  • Generating Unigram predictions ...


    ⏳ Unigram sampling:   0%|          | 0/17000 [00:00<?, ?it/s]

    ▶ Evaluating GEO-BLEU on 17000 users...


      ⏳ GEO-BLEU:   0%|          | 0/17000 [00:00<?, ?it/s]

    ✔ Unigram Model GEO-BLEU = 0.04331
<<< Finished City D with results: {'Global Mean': 1e-05, 'Global Mode': 0.00191, 'Per-User Mean': 0.02026, 'Per-User Mode': 0.08866, 'Unigram Model': 0.04331}


=== Final GEO-BLEU Scores ===


Unnamed: 0,City D,Average
Global Mean,1e-05,1e-05
Global Mode,0.00191,0.00191
Per-User Mean,0.02026,0.02026
Per-User Mode,0.08866,0.08866
Unigram Model,0.04331,0.04331
