# Human Mobility Prediction with a Sequential LSTM Model

This notebook implements a complete workflow for the SIGSPatial GIS Cup challenge. The goal is to predict a user's location (`x`, `y`) based on their historical movement patterns.

### Workflow
1.  **Setup**: Install libraries and define the configuration.
2.  **Preprocessing (CPU Task)**: Run the data processing on a CPU instance to generate and save sequence files (`.npy`). All outputs will be saved in a city-specific folder (e.g., `/kaggle/working/city_c/`).
3.  **Training (GPU Task)**: Switch to a GPU instance, load the processed files from the city's folder, and train the LSTM model.
4.  **Smoke Test**: After training, run a quick test to ensure the saved model loads and can make predictions.

In [1]:
# !rm -rf /kaggle/working/*

In [2]:
# ===================================================================
# 1. SETUP & INSTALLATIONS
# ===================================================================
print("Installing GeoBLEU library...")
!pip install -q git+https://github.com/yahoojapan/geobleu.git > /dev/null

import os
import json
import time
import shutil
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from collections import defaultdict
from tqdm.notebook import tqdm

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, TensorDataset

import geobleu
print("Setup complete.")

Installing GeoBLEU library...
Setup complete.


In [3]:
# ===================================================================
# 2. CONFIGURATION
# ===================================================================
FILE_PATHS = {
    'city_a': '/kaggle/input/humob-data/15313913/city_A_challengedata.csv',
    'city_b': '/kaggle/input/humob-data/15313913/city_B_challengedata.csv',
    'city_c': '/kaggle/input/humob-data/15313913/city_C_challengedata.csv',
    'city_d': '/kaggle/input/humob-data/15313913/city_D_challengedata.csv',
    'city_f': '/kaggle/input/dummy-data/city_F_challengedata.csv',
}
CITY_TO_PROCESS = 'city_d' # <-- SELECT THE CITY TO RUN

BASE_OUTPUT_DIR = '/kaggle/working/'
OUTPUT_DIR = os.path.join(BASE_OUTPUT_DIR, CITY_TO_PROCESS)

# --- Model & Training Hyperparameters ---
SEQ_LEN = 8
VAL_USER_FRACTION = 0.1
BATCH_SIZE = 1024
LEARNING_RATE = 0.001
NUM_EPOCHS = 5

LOCATION_EMBEDDING_DIM = 64
LSTM_HIDDEN_DIM = 128

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {DEVICE}")
print(f"All outputs will be saved to: {OUTPUT_DIR}")

os.makedirs(OUTPUT_DIR, exist_ok=True)

Using device: cuda
All outputs will be saved to: /kaggle/working/city_d


## 3. Data Preprocessing (CPU Task) ⚙️

This step is best run on a **CPU instance**. It processes the raw CSV and saves the results to the city-specific output directory.

In [4]:
def create_and_save_sequences(city_path, output_dir):
    metadata_path = os.path.join(output_dir, 'metadata.json')
    if os.path.exists(metadata_path):
        print("Preprocessing already complete. Skipping.")
        with open(metadata_path, 'r') as f:
            return json.load(f)

    print(f"Starting preprocessing for {city_path}...")
    start_time = time.time()

    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)
    os.makedirs(output_dir, exist_ok=True)
    print(f"Cleaned and prepared output directory: {output_dir}")

    print("Loading data into memory...")
    df = pd.read_csv(city_path)
    df_train_full = df[df['x'] != 999].copy()
    del df

    max_x, max_y = df_train_full['x'].max(), df_train_full['y'].max()
    num_locations = (max_x + 1) * (max_y + 1)
    df_train_full['x_norm'] = df_train_full['x'] / max_x
    df_train_full['y_norm'] = df_train_full['y'] / max_y

    all_uids = df_train_full['uid'].unique()
    train_uids, val_uids = train_test_split(all_uids, test_size=VAL_USER_FRACTION, random_state=42)
    train_uids_set, val_uids_set = set(train_uids), set(val_uids)
    print(f"Grid size: {max_x+1}x{max_y+1}. Found {len(train_uids_set)} train users, {len(val_uids_set)} val users.")

    print("Sorting data and grouping by user to build sequences...")
    df_train_full = df_train_full.sort_values(['uid', 'd', 't'])
    grouped_users = df_train_full.groupby('uid')
    
    USER_BATCH_SIZE = 5000
    train_file_paths, val_file_paths = {"seq": [], "tar": []}, {"seq": [], "tar": [], "meta": []}
    
    user_count = 0
    train_seq_batch, train_tar_batch = [], []
    val_seq_batch, val_tar_batch, val_meta_batch = [], [], []

    for uid, user_df in tqdm(grouped_users, desc="Processing users"):
        user_df['location_id'] = user_df['x'] * (max_y + 1) + user_df['y']
        user_df['time_sin'] = np.sin(2 * np.pi * user_df['t'] / 48.0)
        user_df['time_cos'] = np.cos(2 * np.pi * user_df['t'] / 48.0)
        user_df['dow_sin'] = np.sin(2 * np.pi * (user_df['d'] % 7) / 7.0)
        user_df['dow_cos'] = np.cos(2 * np.pi * (user_df['d'] % 7) / 7.0)
        
        feature_cols = ['x_norm', 'y_norm', 'time_sin', 'time_cos', 'dow_sin', 'dow_cos']
        target_col = 'location_id'
        meta_cols = ['uid', 'd', 't']
        
        features = user_df[feature_cols].values.astype(np.float32)
        targets_raw = user_df[target_col].values
        meta = user_df[meta_cols].values
        
        if len(features) <= SEQ_LEN: continue
            
        sequences = np.array([features[i:i + SEQ_LEN] for i in range(len(features) - SEQ_LEN)])
        targets = np.array([targets_raw[i + SEQ_LEN] for i in range(len(features) - SEQ_LEN)], dtype=np.int64)
        target_meta = np.array([meta[i + SEQ_LEN] for i in range(len(features) - SEQ_LEN)])

        if uid in train_uids_set:
            train_seq_batch.append(sequences)
            train_tar_batch.append(targets)
        elif uid in val_uids_set:
            val_seq_batch.append(sequences)
            val_tar_batch.append(targets)
            val_meta_batch.append(target_meta)
            
        user_count += 1
        if user_count % USER_BATCH_SIZE == 0:
            if train_seq_batch:
                seq_path = os.path.join(output_dir, f"train_seq_part_{len(train_file_paths['seq'])}.npy")
                tar_path = os.path.join(output_dir, f"train_tar_part_{len(train_file_paths['tar'])}.npy")
                np.save(seq_path, np.concatenate(train_seq_batch))
                np.save(tar_path, np.concatenate(train_tar_batch))
                train_file_paths['seq'].append(seq_path)
                train_file_paths['tar'].append(tar_path)
                train_seq_batch, train_tar_batch = [], []
            if val_seq_batch:
                seq_path = os.path.join(output_dir, f"val_seq_part_{len(val_file_paths['seq'])}.npy")
                tar_path = os.path.join(output_dir, f"val_tar_part_{len(val_file_paths['tar'])}.npy")
                meta_path = os.path.join(output_dir, f"val_meta_part_{len(val_file_paths['meta'])}.npy")
                np.save(seq_path, np.concatenate(val_seq_batch))
                np.save(tar_path, np.concatenate(val_tar_batch))
                np.save(meta_path, np.concatenate(val_meta_batch))
                val_file_paths['seq'].append(seq_path)
                val_file_paths['tar'].append(tar_path)
                val_file_paths['meta'].append(meta_path)
                val_seq_batch, val_tar_batch, val_meta_batch = [], [], []

    # Save any remaining data
    if train_seq_batch:
        seq_path = os.path.join(output_dir, f"train_seq_part_{len(train_file_paths['seq'])}.npy")
        tar_path = os.path.join(output_dir, f"train_tar_part_{len(train_file_paths['tar'])}.npy")
        np.save(seq_path, np.concatenate(train_seq_batch))
        np.save(tar_path, np.concatenate(train_tar_batch))
        train_file_paths['seq'].append(seq_path)
        train_file_paths['tar'].append(tar_path)
    if val_seq_batch:
        seq_path = os.path.join(output_dir, f"val_seq_part_{len(val_file_paths['seq'])}.npy")
        tar_path = os.path.join(output_dir, f"val_tar_part_{len(val_file_paths['tar'])}.npy")
        meta_path = os.path.join(output_dir, f"val_meta_part_{len(val_file_paths['meta'])}.npy")
        np.save(seq_path, np.concatenate(val_seq_batch))
        np.save(tar_path, np.concatenate(val_tar_batch))
        np.save(meta_path, np.concatenate(val_meta_batch))
        val_file_paths['seq'].append(seq_path)
        val_file_paths['tar'].append(tar_path)
        val_file_paths['meta'].append(meta_path)
        
    metadata = {
        'num_locations': int(num_locations),
        'max_y': int(max_y),
        'train_files': train_file_paths,
        'val_files': val_file_paths
    }
    with open(metadata_path, 'w') as f:
        json.dump(metadata, f)

    print(f"Preprocessing finished in {(time.time() - start_time):.2f} seconds.")
    return metadata

metadata = create_and_save_sequences(FILE_PATHS[CITY_TO_PROCESS], OUTPUT_DIR)

Starting preprocessing for /kaggle/input/humob-data/15313913/city_D_challengedata.csv...
Cleaned and prepared output directory: /kaggle/working/city_d
Loading data into memory...
Grid size: 201x201. Found 18000 train users, 2000 val users.
Sorting data and grouping by user to build sequences...


Processing users:   0%|          | 0/20000 [00:00<?, ?it/s]

Preprocessing finished in 88.97 seconds.


In [5]:
# # Insert this cell AFTER preprocessing and BEFORE training
# import numpy as np

# try:
#     # Load the training targets file we just created
#     targets_check = np.load('/kaggle/working/processed_data/train_tar.npy')
#     print(f"Data type of saved targets: {targets_check.dtype}")
    
#     if targets_check.dtype == np.int64:
#         print("✅ Verification PASSED: Targets are correctly saved as integers.")
#     else:
#         print("❌ Verification FAILED: Targets are NOT integers. Please re-check the preprocessing script.")
# except FileNotFoundError:
#     print("❌ Verification FAILED: train_tar.npy not found. Did preprocessing complete successfully?")

## 4. Model Training & Evaluation (GPU Task) 🚀

With the data processed, switch to a **GPU instance**. The following cells will load the `.npy` files and run the training, using the corrected `geobleu.calc_geobleu_bulk` function for evaluation.

In [6]:
# ===================================================================
# CUSTOM DATASET & MODEL DEFINITION
# ===================================================================

class MultiFileSequenceDataset(Dataset):
    """A PyTorch Dataset that reads from multiple .npy files."""
    def __init__(self, file_paths):
        self.paths = file_paths
        self.seq_files = [np.load(p, mmap_mode='r') for p in self.paths['seq']]
        self.tar_files = [np.load(p, mmap_mode='r') for p in self.paths['tar']]
        
        self.meta_files = None
        if 'meta' in self.paths:
            self.meta_files = [np.load(p, mmap_mode='r') for p in self.paths['meta']]
        
        self.file_lengths = [len(f) for f in self.seq_files]
        self.cumulative_lengths = np.cumsum(self.file_lengths)

    def __len__(self):
        return self.cumulative_lengths[-1]

    def __getitem__(self, idx):
        file_idx = np.searchsorted(self.cumulative_lengths, idx, side='right')
        local_idx = idx - (self.cumulative_lengths[file_idx - 1] if file_idx > 0 else 0)
        
        seq = torch.from_numpy(self.seq_files[file_idx][local_idx].copy())
        # The target is a single number, so it's less critical, but good practice
        tar = torch.tensor(self.tar_files[file_idx][local_idx].copy(), dtype=torch.long)
        
        if self.meta_files:
            meta = torch.from_numpy(self.meta_files[file_idx][local_idx])
            return seq, tar, meta
        else:
            return seq, tar

class MobilityLSTM(nn.Module):
    def __init__(self, num_locations):
        super().__init__()
        lstm_input_size = 6 
        self.lstm = nn.LSTM(input_size=lstm_input_size, hidden_size=LSTM_HIDDEN_DIM, batch_first=True)
        self.fc = nn.Linear(LSTM_HIDDEN_DIM, num_locations)
        
    def forward(self, x):
        lstm_out, _ = self.lstm(x)
        return self.fc(lstm_out[:, -1, :])

In [7]:
# ===================================================================
# TRAINING & EVALUATION SCRIPT
# ===================================================================
def run_training(output_dir, city_name, metadata):
    print(f"Loading pre-built sequences from: {output_dir}")
    
    train_dataset = MultiFileSequenceDataset(metadata['train_files'])
    val_dataset = MultiFileSequenceDataset(metadata['val_files'])
    
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
    
    model = MobilityLSTM(num_locations=metadata['num_locations']).to(DEVICE)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
    
    print("Starting model training...")
    for epoch in range(NUM_EPOCHS):
        model.train()
        pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{NUM_EPOCHS} | Loss: ?")
        for seq_batch, tar_batch in pbar:
            seq_batch, tar_batch = seq_batch.to(DEVICE), tar_batch.to(DEVICE)
            optimizer.zero_grad()
            outputs = model(seq_batch)
            loss = criterion(outputs, tar_batch)
            loss.backward()
            optimizer.step()
            pbar.set_description(f"Epoch {epoch+1}/{NUM_EPOCHS} | Loss: {loss.item():.4f}")
    
    print("\nStarting final evaluation on validation set...")
    model.eval()
    generated_points, reference_points = [], []
    max_y = metadata['max_y']

    with torch.no_grad():
        for seq_batch, tar_batch, meta_batch in tqdm(val_loader, desc="Evaluating"):
            seq_batch = seq_batch.to(DEVICE)
            outputs = model(seq_batch)
            _, predicted_ids = torch.max(outputs, 1)
            
            predicted_ids = predicted_ids.cpu().numpy()
            tar_batch = tar_batch.cpu().numpy()
            meta_batch = meta_batch.cpu().numpy()

            for i in range(len(predicted_ids)):
                uid, d, t = meta_batch[i]
                pred_loc, true_loc = predicted_ids[i], tar_batch[i]
                pred_x, pred_y = pred_loc // (max_y + 1), pred_loc % (max_y + 1)
                true_x, true_y = true_loc // (max_y + 1), true_loc % (max_y + 1)
                
                generated_points.append((int(uid), int(d), int(t), int(pred_x), int(pred_y)))
                reference_points.append((int(uid), int(d), int(t), int(true_x), int(true_y)))
                
    print("Calculating GEO-BLEU score using calc_geobleu_bulk...")
    final_score = geobleu.calc_geobleu_bulk(generated_points, reference_points)
    
    print("\n" + "="*50)
    print(f"Final Validation GeoBLEU Score for {city_name}: {final_score:.6f}")
    print("="*50)

    model_path = os.path.join(output_dir, f'{city_name}_model.pth')
    torch.save(model.state_dict(), model_path)
    print(f"Model saved to {model_path}")

run_training(OUTPUT_DIR, CITY_TO_PROCESS, metadata)

Loading pre-built sequences from: /kaggle/working/city_d
Starting model training...


Epoch 1/5 | Loss: ?:   0%|          | 0/10157 [00:00<?, ?it/s]

Epoch 2/5 | Loss: ?:   0%|          | 0/10157 [00:00<?, ?it/s]

Epoch 3/5 | Loss: ?:   0%|          | 0/10157 [00:00<?, ?it/s]

Epoch 4/5 | Loss: ?:   0%|          | 0/10157 [00:00<?, ?it/s]

Epoch 5/5 | Loss: ?:   0%|          | 0/10157 [00:00<?, ?it/s]


Starting final evaluation on validation set...


Evaluating:   0%|          | 0/1134 [00:00<?, ?it/s]

  meta = torch.from_numpy(self.meta_files[file_idx][local_idx])


Calculating GEO-BLEU score using calc_geobleu_bulk...

Final Validation GeoBLEU Score for city_d: 0.045444
Model saved to /kaggle/working/city_d/city_d_model.pth


## 5. Smoke Test ✅

This final step performs a quick sanity check. It loads the model we just saved and makes a prediction on a single data point from the validation set to ensure the entire inference pipeline is working correctly.

In [8]:
# ===================================================================
# SMOKE TEST
# ===================================================================
print("Performing smoke test...")
model_path = os.path.join(OUTPUT_DIR, f'{CITY_TO_PROCESS}_model.pth')
val_seq_path = metadata['val_files']['seq'][0] # Load first validation chunk
val_tar_path = metadata['val_files']['tar'][0]

if not os.path.exists(model_path):
    print("Smoke Test FAILED: Model file not found.")
else:
    try:
        sample_input_np = np.load(val_seq_path)[0]
        sample_target = np.load(val_tar_path)[0]

        model_instance = MobilityLSTM(num_locations=metadata['num_locations'])
        model_instance.load_state_dict(torch.load(model_path, map_location=DEVICE))
        model_instance.to(DEVICE)
        model_instance.eval()

        sample_input_tensor = torch.from_numpy(sample_input_np).unsqueeze(0).to(DEVICE)

        with torch.no_grad():
            prediction_logits = model_instance(sample_input_tensor)
            predicted_location_id = torch.argmax(prediction_logits, dim=1).item()
        
        print("\n--- Smoke Test Results ---")
        print(f"Predicted Location ID: {predicted_location_id}")
        print(f"Actual Location ID:    {sample_target}")
        
        if predicted_location_id == sample_target:
            print("Metric: Correct Prediction! ✅")
        else:
            print("Metric: Incorrect Prediction. ❌")
        
        print("\nSmoke Test PASSED: Model loaded and inference completed without errors.")

    except Exception as e:
        print(f"Smoke Test FAILED with an error: {e}")

Performing smoke test...

--- Smoke Test Results ---
Predicted Location ID: 31484
Actual Location ID:    30280
Metric: Incorrect Prediction. ❌

Smoke Test PASSED: Model loaded and inference completed without errors.
