# Geolife Dataset: Complete Preprocessing Pipeline

## 📋 Overview

This **self-contained** notebook walks through the entire Geolife preprocessing pipeline from raw GPS data to final `.pk` training files.

### Pipeline: 10 Steps

**PART 1**: Raw Data Processing (Steps 1-9)
1. Load GPS position fixes
2. Generate staypoints (>30min, <200m)
3. Flag activities (≥25min)
4. Filter users (>50 days)
5. Keep only activity staypoints
6. Cluster locations (DBSCAN, ε=20m)
7. Merge staypoints (gap <1min)
8. Add time features
9. Validate sequences, filter users and split dataset (60/20/20)

**PART 2**: Transformer Data Generation (Step 10)
10. Generate .pk files

### Expected Results
- 45 users
- 7,424 train sequences
- 1,186 locations

### Note
Run cells sequentially. Adjust `raw_data_path` in config to your Geolife data location.

## Imports

In [None]:
import os, sys, json, pickle, datetime
from pathlib import Path
from tqdm import tqdm

import pandas as pd
import numpy as np
import geopandas as gpd
from shapely import wkt

from sklearn.preprocessing import OrdinalEncoder
from joblib import Parallel, delayed

import trackintel as ti
from trackintel.io.dataset_reader import read_geolife
from trackintel.preprocessing.triplegs import generate_trips
from trackintel.analysis.tracking_quality import temporal_tracking_quality, _split_overlaps

import warnings
# NOTE(review): this installs a blanket "ignore" filter, so every warning raised
# after this cell (including library deprecation notices) is hidden.
warnings.filterwarnings('ignore')
# Visual confirmation that the import cell ran to completion.
print('✓ Imports complete')

## Configuration

In [None]:
# Central configuration for the whole pipeline. Every tunable value used by the
# later cells is read from this dictionary.
CONFIG = {
    'dataset': {
        'name': 'geolife',
        'raw_data_path': './data/geolife/Data',  # ← ADJUST THIS PATH
        'output_dir': './data/geolife'
    },
    # Passed as keyword arguments to trackintel's generate_staypoints.
    'staypoints': {
        'dist_threshold': 200,
        'time_threshold': 30,
        'gap_threshold': 1440,
        'include_last': True,
        'print_progress': True,
        'n_jobs': -1
    },
    # Passed as keyword arguments to trackintel's create_activity_flag.
    'activity_flag': {'method': 'time_threshold', 'time_threshold': 25},
    # min_thres / mean_thres are only forwarded to the quality filter when not None.
    'user_quality': {'day_filter': 50, 'window_size': 10, 'min_thres': None, 'mean_thres': None},
    # Passed as keyword arguments to trackintel's generate_locations.
    'locations': {
        'epsilon': 20,
        'num_samples': 2,
        'distance_metric': 'haversine',
        'agg_level': 'dataset',
        'n_jobs': -1
    },
    'staypoint_merging': {'max_time_gap': '1min'},
    # History window lengths (in days) used when building sequences.
    'sequence_generation': {'previous_days': [7]},
    'seed': 42
}

# Seed NumPy's global RNG so repeated runs are reproducible.
np.random.seed(CONFIG['seed'])

# Make sure the output folder and its "quality" sub-folder exist before any cell writes to them.
os.makedirs(CONFIG['dataset']['output_dir'], exist_ok=True)
os.makedirs(os.path.join(CONFIG['dataset']['output_dir'], 'quality'), exist_ok=True)

print(f"✓ Config: ε={CONFIG['locations']['epsilon']}m, history={CONFIG['sequence_generation']['previous_days'][0]}days")

## Utility Functions

Functions from `utils.py` - used throughout the pipeline.

In [None]:
# The names used below (pd, gpd, wkt, tqdm, datetime, temporal_tracking_quality,
# _split_overlaps) are imported once in the "Imports" cell at the top of the notebook.


def preprocess_to_ti(df):
    """Change dataframe to trackintel compatible format.

    Renames the raw columns (userid, startt, endt, dur_s), parses the start/end
    timestamps and localizes them to UTC, recomputes ``duration`` in seconds,
    drops rows with a negative duration, indexes by ``id`` and parses the WKT
    strings in ``geom``.

    Note: ``df`` is modified in place.

    Returns:
        geopandas.GeoDataFrame using ``geom`` as geometry with CRS EPSG:4326.
    """
    df.rename(
        columns={"userid": "user_id", "startt": "started_at", "endt": "finished_at", "dur_s": "duration"},
        inplace=True,
    )

    # read the time info
    df["started_at"] = pd.to_datetime(df["started_at"])
    df["finished_at"] = pd.to_datetime(df["finished_at"])
    df["started_at"] = df["started_at"].dt.tz_localize(tz="utc")
    df["finished_at"] = df["finished_at"].dt.tz_localize(tz="utc")

    df["duration"] = (df["finished_at"] - df["started_at"]).dt.total_seconds()
    # drop invalid
    df.drop(index=df[df["duration"] < 0].index, inplace=True)

    df.set_index("id", inplace=True)
    tqdm.pandas(desc="Load geometry")
    df["geom"] = df["geom"].progress_apply(wkt.loads)

    return gpd.GeoDataFrame(df, crs="EPSG:4326", geometry="geom")


def filter_duplicates(sp, tpls):
    """Remove temporal overlaps between staypoints and triplegs of each user.

    Both frames are tagged with a ``type`` column (this mutates the inputs),
    merged, de-overlapped per user by ``_alter_diff`` and split apart again.

    Returns:
        Tuple ``(sp, tpls)``, each reduced to a fixed column set and indexed by ``id``.
    """
    # merge trips and staypoints
    sp["type"] = "sp"
    tpls["type"] = "tpl"
    df_all = pd.merge(sp, tpls, how="outer")

    df_all = df_all.groupby("user_id", as_index=False).apply(_alter_diff)
    sp = df_all.loc[df_all["type"] == "sp"].drop(columns=["type"])
    tpls = df_all.loc[df_all["type"] == "tpl"].drop(columns=["type"])

    sp = sp[["id", "user_id", "started_at", "finished_at", "geom", "duration", "is_activity"]]
    tpls = tpls[["id", "user_id", "started_at", "finished_at", "geom", "length_m", "duration", "mode"]]

    return sp.set_index("id"), tpls.set_index("id")


def _alter_diff(df):
    """Clip records of one user so that consecutive records do not overlap in time.

    Records are ordered by ``started_at``. When a record finishes after the next
    one starts, its ``finished_at`` is set to the start of the next record.
    ``duration`` is recomputed in seconds and rows with a duration <= 0 are dropped.

    Returns:
        A new, sorted DataFrame; the input frame is not modified.
    """
    df = df.sort_values(by="started_at")

    # Start time of the following record (NaT for the last record of the user).
    next_start = df["started_at"].shift(-1)
    gap_seconds = (next_start - df["finished_at"]).dt.total_seconds()

    # Negative gap: this record runs into the next one, so cut it at the next start.
    # (Whole-column assignment instead of chained ``df[col].iloc[...] = ...``, which
    # does not write back to ``df`` when pandas Copy-on-Write is active.)
    df["finished_at"] = df["finished_at"].mask(gap_seconds < 0, next_start)

    df["started_at"], df["finished_at"] = pd.to_datetime(df["started_at"]), pd.to_datetime(df["finished_at"])
    df["duration"] = (df["finished_at"] - df["started_at"]).dt.total_seconds()

    return df.loc[~(df["duration"] <= 0)]


def enrich_time_info(sp):
    """Replace absolute timestamps by per-user relative time features.

    Applies ``_get_time`` per user, drops ``started_at``/``finished_at``, sorts by
    user, start day and start minute, casts ``location_id`` and ``user_id`` to int
    and assigns fresh consecutive ids in a new ``id`` column.
    """
    sp = sp.groupby("user_id", group_keys=False).apply(_get_time)
    sp.drop(columns=["finished_at", "started_at"], inplace=True)
    sp.sort_values(by=["user_id", "start_day", "start_min"], inplace=True)
    sp = sp.reset_index(drop=True)

    sp["location_id"] = sp["location_id"].astype(int)
    sp["user_id"] = sp["user_id"].astype(int)

    # final cleaning, reassign ids
    sp.index.name = "id"
    sp.reset_index(inplace=True)
    return sp


def _get_time(df):
    """Add day / minute-of-day / weekday columns for the records of one user.

    Days are counted from the date of the user's earliest ``started_at``.
    An ``end_min`` of 0 (record ends exactly at midnight) is stored as 1440.
    """
    min_day = pd.to_datetime(df["started_at"].min().date())
    # Strip the timezone so the timestamps can be compared with the naive min_day.
    df["started_at"] = df["started_at"].dt.tz_localize(tz=None)
    df["finished_at"] = df["finished_at"].dt.tz_localize(tz=None)

    df["start_day"] = (df["started_at"] - min_day).dt.days
    df["end_day"] = (df["finished_at"] - min_day).dt.days

    df["start_min"] = df["started_at"].dt.hour * 60 + df["started_at"].dt.minute
    df["end_min"] = df["finished_at"].dt.hour * 60 + df["finished_at"].dt.minute
    df.loc[df["end_min"] == 0, "end_min"] = 24 * 60

    df["weekday"] = df["started_at"].dt.weekday
    return df


def calculate_user_quality(sp, trips, file_path, quality_filter):
    """Select users by tracking length and sliding-window tracking quality.

    Args:
        sp: staypoints with ``user_id``, ``started_at`` and ``finished_at`` (modified in place).
        trips: trips with the same columns (modified in place).
        file_path: CSV path the per-user mean quality table is written to.
        quality_filter: dict with ``day_filter`` and ``window_size``; when it also
            contains ``min_thres`` the additional ``min_thres``/``mean_thres``
            quality filter is applied.

    Returns:
        numpy array with the ids of the selected users.
    """
    trips["started_at"] = pd.to_datetime(trips["started_at"]).dt.tz_localize(None)
    trips["finished_at"] = pd.to_datetime(trips["finished_at"]).dt.tz_localize(None)
    sp["started_at"] = pd.to_datetime(sp["started_at"]).dt.tz_localize(None)
    sp["finished_at"] = pd.to_datetime(sp["finished_at"]).dt.tz_localize(None)

    # merge trips and staypoints
    print("starting merge", sp.shape, trips.shape)
    sp["type"] = "sp"
    trips["type"] = "tpl"
    df_all = pd.concat([sp, trips])
    df_all = _split_overlaps(df_all, granularity="day")
    df_all["duration"] = (df_all["finished_at"] - df_all["started_at"]).dt.total_seconds()
    print("finished merge", df_all.shape)
    print("*" * 50)

    if "min_thres" in quality_filter:
        # NOTE(review): hard-coded cut-off date; its origin is not visible here - confirm.
        end_period = datetime.datetime(2017, 12, 26)
        df_all = df_all.loc[df_all["finished_at"] < end_period]

    print(len(df_all["user_id"].unique()))

    # get quality
    total_quality = temporal_tracking_quality(df_all, granularity="all")
    # get tracking days
    # NOTE(review): assigned positionally, so this assumes total_quality has one row
    # per user in the same order as groupby("user_id") - confirm.
    total_quality["days"] = (
        df_all.groupby("user_id").apply(lambda x: (x["finished_at"].max() - x["started_at"].min()).days).values
    )
    # filter based on days
    user_filter_day = (
        total_quality.loc[(total_quality["days"] > quality_filter["day_filter"])]
        .reset_index(drop=True)["user_id"]
        .unique()
    )

    sliding_quality = (
        df_all.groupby("user_id")
        .apply(_get_tracking_quality, window_size=quality_filter["window_size"])
        .reset_index(drop=True)
    )

    filter_after_day = sliding_quality.loc[sliding_quality["user_id"].isin(user_filter_day)]

    if "min_thres" in quality_filter:
        # filter based on quality
        filter_after_day = (
            filter_after_day.groupby("user_id")
            .apply(_filter_user, min_thres=quality_filter["min_thres"], mean_thres=quality_filter["mean_thres"])
            .reset_index(drop=True)
            .dropna()
        )

    filter_after_user_quality = filter_after_day.groupby("user_id", as_index=False)["quality"].mean()

    print("final selected user", filter_after_user_quality.shape[0])
    filter_after_user_quality.to_csv(file_path, index=False)
    return filter_after_user_quality["user_id"].values


def _filter_user(df, min_thres, mean_thres):
    """Return ``df`` if its non-zero quality values pass both thresholds, else None."""
    consider = df.loc[df["quality"] != 0]
    if (consider["quality"].min() > min_thres) and (consider["quality"].mean() > mean_thres):
        return df
    return None


def _get_tracking_quality(df, window_size):
    """Compute the tracking quality of one user over sliding windows.

    Windows are ``window_size`` weeks long and advance by one week. For each
    window the summed ``duration`` of the records lying entirely inside it is
    divided by the window length in seconds. Windows without records are skipped.

    Returns:
        DataFrame with columns ``timestep``, ``quality`` and ``user_id``.
    """
    weeks = (df["finished_at"].max() - df["started_at"].min()).days // 7
    start_date = df["started_at"].min().date()

    quality_list = []
    # construct the sliding week gdf
    for i in range(0, weeks - window_size + 1):
        curr_start = datetime.datetime.combine(start_date + datetime.timedelta(weeks=i), datetime.time())
        curr_end = datetime.datetime.combine(curr_start + datetime.timedelta(weeks=window_size), datetime.time())

        # the total df for this time window
        cAll_gdf = df.loc[(df["started_at"] >= curr_start) & (df["finished_at"] < curr_end)]
        if cAll_gdf.shape[0] == 0:
            continue
        total_sec = (curr_end - curr_start).total_seconds()

        quality_list.append([i, cAll_gdf["duration"].sum() / total_sec])

    ret = pd.DataFrame(quality_list, columns=["timestep", "quality"])
    ret["user_id"] = df["user_id"].unique()[0]
    return ret


def split_dataset(totalData):
    """Split dataset into train, vali and test.

    The split is done per user on ``start_day`` (see ``_get_split_days_user``).

    Returns:
        Tuple ``(train_data, vali_data, test_data)`` of independent copies
        without the helper ``Dataset`` column.
    """
    totalData = totalData.groupby("user_id", group_keys=False).apply(_get_split_days_user)

    train_data = totalData.loc[totalData["Dataset"] == "train"].copy()
    vali_data = totalData.loc[totalData["Dataset"] == "vali"].copy()
    test_data = totalData.loc[totalData["Dataset"] == "test"].copy()

    # final cleaning
    train_data.drop(columns=["Dataset"], inplace=True)
    vali_data.drop(columns=["Dataset"], inplace=True)
    test_data.drop(columns=["Dataset"], inplace=True)

    return train_data, vali_data, test_data


def _get_split_days_user(df):
    """Split the dataset according to the tracked day of each user.

    Records before 60% of the user's maximum ``start_day`` are labelled "train",
    records from 60% up to (but excluding) 80% "vali", and the rest "test".
    """
    maxDay = df["start_day"].max()
    train_split = maxDay * 0.6
    validation_split = maxDay * 0.8

    df["Dataset"] = "test"
    df.loc[df["start_day"] < train_split, "Dataset"] = "train"
    df.loc[(df["start_day"] >= train_split) & (df["start_day"] < validation_split), "Dataset"] = "vali"

    return df


def get_valid_sequence(input_df, previous_day=14):
    """Return the ids of records that have enough history to be prediction targets.

    A record qualifies when it lies at least ``previous_day`` days after the
    user's first record in ``input_df`` and at least 3 earlier records of the
    same user start within the preceding ``previous_day`` days.

    Returns:
        List with the ``id`` values of the qualifying records.
    """
    valid_id = []
    for user in input_df["user_id"].unique():
        df = input_df.loc[input_df["user_id"] == user].copy().reset_index(drop=True)

        min_days = df["start_day"].min()
        df["diff_day"] = df["start_day"] - min_days

        for index, row in df.iterrows():
            # exclude the first records
            if row["diff_day"] < previous_day:
                continue

            # index is positional after reset_index, so this is "all earlier records"
            hist = df.iloc[:index]
            hist = hist.loc[(hist["start_day"] >= (row["start_day"] - previous_day))]
            if len(hist) < 3:
                continue

            valid_id.append(row["id"])

    return valid_id

# PART 1: Raw Data Preprocessing

---

## Step 1: Load Geolife Data

In [None]:
# Section banner (rule / title / rule, one per line).
print('=' * 80, 'LOADING GEOLIFE DATA', '=' * 80, sep='\n')

# read_geolife returns a pair; only the first element (position fixes) is used.
pfs, _ = read_geolife(
    CONFIG['dataset']['raw_data_path'],
    print_progress=True,
)
print(f'\n✓ Loaded {len(pfs)} position fixes from {pfs["user_id"].nunique()} users')

## Step 2: Generate Staypoints

In [None]:
# Section banner (rule / title / rule, one per line).
print('=' * 80, 'GENERATING STAYPOINTS', '=' * 80, sep='\n')

# Forward exactly these staypoint settings from CONFIG as keyword arguments.
pfs, sp = pfs.as_positionfixes.generate_staypoints(
    **{
        option: CONFIG['staypoints'][option]
        for option in (
            'gap_threshold',
            'include_last',
            'print_progress',
            'dist_threshold',
            'time_threshold',
            'n_jobs',
        )
    }
)
print(f'\n✓ Generated {len(sp)} staypoints')

## Step 3: Create Activity Flags

In [None]:
# Add the boolean `is_activity` column using the method/threshold from CONFIG.
sp = sp.as_staypoints.create_activity_flag(
    **{option: CONFIG['activity_flag'][option] for option in ('method', 'time_threshold')}
)
print(f'✓ {sp["is_activity"].sum()} / {len(sp)} are activities')

## Step 4: Filter Users by Quality

In [None]:
print('='*80)
print('USER QUALITY FILTERING')
print('='*80)

# Cache location of the per-user quality table (columns include `user_id`).
quality_file = os.path.join(CONFIG['dataset']['output_dir'], 'quality', 'geolife_slide_filtered.csv')

if Path(quality_file).is_file():
    # Reuse the cached result: trip generation and quality computation are expensive.
    valid_user = pd.read_csv(quality_file)['user_id'].values
    print(f'✓ Loaded {len(valid_user)} users from existing quality file')
else:
    print('Generating quality metrics...')
    pfs, tpls = pfs.as_positionfixes.generate_triplegs(sp)
    sp, tpls, trips = generate_trips(sp, tpls, add_geometry=False)

    quality_filter = {
        'day_filter': CONFIG['user_quality']['day_filter'],
        'window_size': CONFIG['user_quality']['window_size']
    }
    # The threshold keys are only added when configured, because
    # calculate_user_quality switches on the mere presence of 'min_thres'.
    if CONFIG['user_quality'].get('min_thres') is not None:
        quality_filter['min_thres'] = CONFIG['user_quality']['min_thres']
        quality_filter['mean_thres'] = CONFIG['user_quality']['mean_thres']

    # calculate_user_quality(sp, trips, file_path, quality_filter) requires the output
    # path as third argument and writes the quality CSV (user_id + quality) itself,
    # so no second to_csv is needed here. Copies are passed because it mutates its inputs.
    os.makedirs(os.path.dirname(quality_file), exist_ok=True)
    valid_user = calculate_user_quality(sp.copy(), trips.copy(), quality_file, quality_filter)
    print(f'✓ Saved quality file')

sp = sp.loc[sp['user_id'].isin(valid_user)]
print(f'\n✓ {len(valid_user)} valid users, {len(sp)} staypoints')

## Step 5: Keep Only Activities

In [None]:
# Drop every staypoint that was not flagged as an activity.
sp = sp[sp['is_activity'].eq(True)]
print(f'✓ Filtered to {len(sp)} activity staypoints')

## Step 6: Generate Locations (DBSCAN Clustering)

In [None]:
# Section banner (rule / title / rule, one per line).
print('=' * 80, 'GENERATING LOCATIONS', '=' * 80, sep='\n')

# Forward exactly these clustering settings from CONFIG as keyword arguments.
sp, locs = sp.as_staypoints.generate_locations(
    **{
        option: CONFIG['locations'][option]
        for option in ('epsilon', 'num_samples', 'distance_metric', 'agg_level', 'n_jobs')
    }
)

# Staypoints without a location_id are discarded.
sp = sp.loc[sp['location_id'].notna()].copy()
print(f'✓ {sp["location_id"].nunique()} unique locations in {len(sp)} staypoints')

# Save locations: one row per location index, restricted to locations still referenced by `sp`.
locs = locs.loc[~locs.index.duplicated(keep='first')]
filtered_locs = locs.loc[locs.index.isin(sp['location_id'].unique())]
filtered_locs.as_locations.to_csv(
    os.path.join(CONFIG['dataset']['output_dir'], 'locations_geolife.csv')
)
print(f'✓ Saved locations')

## Step 7: Merge Staypoints

In [None]:
# Keep only the columns needed from here on.
sp = sp[['user_id', 'started_at', 'finished_at', 'geom', 'location_id']]

# Merge staypoints using the configured maximum time gap; no triplegs are supplied
# (an empty frame is passed) and the merged row keeps the first location_id.
sp_merged = sp.as_staypoints.merge_staypoints(
    triplegs=pd.DataFrame([]),
    agg={'location_id': 'first'},
    max_time_gap=CONFIG['staypoint_merging']['max_time_gap'],
)

# Duration in whole minutes (floor of seconds / 60).
stay_seconds = sp_merged['finished_at'].sub(sp_merged['started_at']).dt.total_seconds()
sp_merged['duration'] = stay_seconds.floordiv(60)
del stay_seconds

print(f'✓ Merged: {len(sp)} → {len(sp_merged)} staypoints')
sp = sp_merged

## Step 8: Add Temporal Features

In [None]:
# Convert absolute timestamps into per-user relative time features.
sp_time = enrich_time_info(sp)
print(f'✓ Added temporal features: {len(sp_time)} staypoints, {sp_time["user_id"].nunique()} users')

# Persist the intermediate table (without the DataFrame index).
sp_time.to_csv(
    os.path.join(CONFIG['dataset']['output_dir'], 'sp_time_temp_geolife.csv'),
    index=False,
)
print(f'✓ Saved intermediate result')

## Step 9: Sequence Validation & Final Filtering

Find staypoints with sufficient historical context and filter users to ensure all splits have data.

In [None]:
# Section banner (rule / title / rule, one per line).
print('=' * 80, 'SEQUENCE VALIDATION & FINAL FILTERING', '=' * 80, sep='\n')

# Split dataset
train_data, vali_data, test_data = split_dataset(sp_time)

# Encode locations: the encoder is fitted on the training split only; locations unseen
# there are encoded as -1, and all codes are shifted by +2.
enc = OrdinalEncoder(dtype=np.int64, handle_unknown='use_encoded_value', unknown_value=-1)
enc.fit(train_data[['location_id']].to_numpy())
train_data['location_id'] = enc.transform(train_data[['location_id']].to_numpy()) + 2
vali_data['location_id'] = enc.transform(vali_data[['location_id']].to_numpy()) + 2
test_data['location_id'] = enc.transform(test_data[['location_id']].to_numpy()) + 2

# Find valid sequences for each previous_day: one 0/1 column per history length.
previous_day_ls = CONFIG['sequence_generation']['previous_days']
all_ids = sp_time[['id']].copy()

for previous_day in tqdm(previous_day_ls, desc='Finding valid sequences'):
    valid_ids = [
        record_id
        for split_frame in (train_data, vali_data, test_data)
        for record_id in get_valid_sequence(split_frame, previous_day=previous_day)
    ]
    all_ids[f'{previous_day}'] = all_ids['id'].isin(valid_ids).astype(np.int64)

# Get final valid IDs (valid for all previous_day values)
all_ids = all_ids.set_index('id')
final_valid_id = all_ids.loc[all_ids.sum(axis=1) == all_ids.shape[1]].index.values

# Filter users: must have records in all splits
valid_users_train = train_data.loc[train_data['id'].isin(final_valid_id), 'user_id'].unique()
valid_users_vali = vali_data.loc[vali_data['id'].isin(final_valid_id), 'user_id'].unique()
valid_users_test = test_data.loc[test_data['id'].isin(final_valid_id), 'user_id'].unique()
valid_users = set(valid_users_train) & set(valid_users_vali) & set(valid_users_test)

filtered_sp = sp_time.loc[sp_time['user_id'].isin(valid_users)].copy()

# Re-split filtered data
train_data, vali_data, test_data = split_dataset(filtered_sp)

# Re-encode locations (training split only; used for the summary printed below)
enc = OrdinalEncoder(dtype=np.int64, handle_unknown='use_encoded_value', unknown_value=-1)
enc.fit(train_data[['location_id']].to_numpy())
train_data['location_id'] = enc.transform(train_data[['location_id']].to_numpy()) + 2

print(f'✓ Max location ID: {train_data["location_id"].max()}')
print(f'✓ Unique locations: {train_data["location_id"].nunique()}')

# Re-encode user IDs (continuous, starting at 1)
enc_user = OrdinalEncoder(dtype=np.int64)
filtered_sp['user_id'] = enc_user.fit_transform(filtered_sp[['user_id']].to_numpy()) + 1

# Save the valid ids and the filtered dataset
with open(os.path.join(CONFIG['dataset']['output_dir'], 'valid_ids_geolife.pk'), 'wb') as f:
    pickle.dump(final_valid_id, f, protocol=pickle.HIGHEST_PROTOCOL)

filtered_sp.to_csv(
    os.path.join(CONFIG['dataset']['output_dir'], 'dataSet_geolife.csv'),
    index=False,
)

print(f'\n✓ Final: {filtered_sp["user_id"].nunique()} users, {len(filtered_sp)} staypoints')
print(f'✓ Saved: dataSet_geolife.csv and valid_ids_geolife.pk')

---

# PART 2: Transformer Data Generation

Generate final `.pk` files for model training.

## Helper Functions for Sequence Generation

In [None]:
def get_valid_sequence_user(df, previous_day, valid_ids):
    """Get valid sequences for a single user.

    For every record whose ``id`` is in ``valid_ids`` and that lies at least
    ``previous_day`` days after the user's first record, the earlier records
    starting within the preceding ``previous_day`` days form the input history.
    Records with fewer than 2 history entries are skipped.

    Args:
        df: records of one user, ordered chronologically, with the columns
            id, user_id, location_id, start_day, weekday, start_min, duration.
            The frame is not modified.
        previous_day: length of the history window in days.
        valid_ids: iterable of record ids allowed as prediction targets.

    Returns:
        List of dicts with the history arrays ``X``, ``user_X``, ``weekday_X``,
        ``start_min_X``, ``dur_X``, ``diff`` (days between each history record
        and the target) and the target location ``Y``.
    """
    # reset_index without inplace returns a new frame, so neither the index nor
    # the helper column below leak into the caller's (groupby) frame.
    df = df.reset_index(drop=True)
    df['diff_day'] = df['start_day'] - df['start_day'].min()

    # Hash lookup instead of a linear scan of the id array for every row.
    valid_id_set = valid_ids if isinstance(valid_ids, (set, frozenset)) else set(valid_ids)

    data_single_user = []
    for index, row in df.iterrows():
        # skip the warm-up period at the start of the user's data
        if row['diff_day'] < previous_day:
            continue

        # cheap check first: avoids slicing the history for non-target records
        if row['id'] not in valid_id_set:
            continue

        # index is positional after reset_index: all earlier records inside the window
        hist = df.iloc[:index]
        hist = hist.loc[(hist['start_day'] >= (row['start_day'] - previous_day))]

        if len(hist) < 2:
            continue

        data_single_user.append({
            'X': hist['location_id'].values,
            'user_X': hist['user_id'].values,
            'weekday_X': hist['weekday'].values,
            'start_min_X': hist['start_min'].values,
            'dur_X': hist['duration'].values,
            'diff': (row['diff_day'] - hist['diff_day']).astype(int).values,
            'Y': int(row['location_id']),
        })

    return data_single_user


def apply_parallel(dfGrouped, func, n_jobs, **kwargs):
    """Run ``func`` on every group of a groupby object, optionally in parallel.

    With ``n_jobs == 1`` the call is delegated to ``dfGrouped.apply`` and its
    result is returned unchanged. Otherwise each group is dispatched through
    joblib and the list of per-group results is returned. Extra keyword
    arguments are forwarded to ``func`` in both cases.
    """
    if n_jobs != 1:
        task = delayed(func)
        pool = Parallel(n_jobs=n_jobs)
        return pool(task(frame, **kwargs) for _, frame in dfGrouped)

    # Serial path: let pandas iterate over the groups.
    return dfGrouped.apply(func, **kwargs)


# Visual confirmation that the helper cell ran to completion.
print('✓ Sequence generation functions defined')

## Step 10: Generate Final .pk Files

In [None]:
# Section banner (rule / title / rule, one per line).
print('=' * 80, 'GENERATING TRANSFORMER DATA FILES', '=' * 80, sep='\n')

# Load preprocessed data written by the sequence-validation step.
output_dir = CONFIG['dataset']['output_dir']
dataset_path = os.path.join(output_dir, 'dataSet_geolife.csv')
valid_ids_path = os.path.join(output_dir, 'valid_ids_geolife.pk')

ori_data = pd.read_csv(dataset_path)
# NOTE(review): pickle.load executes arbitrary code from the file - only load
# files produced by this notebook.
with open(valid_ids_path, 'rb') as f:
    valid_ids = pickle.load(f)

# Sort chronologically within each user
ori_data = ori_data.sort_values(by=['user_id', 'start_day', 'start_min'])

# Truncate long durations: values above 2 days minus one minute are capped there
ori_data['duration'] = ori_data['duration'].clip(upper=60 * 24 * 2 - 1)

# Split
train_data, vali_data, test_data = split_dataset(ori_data)
print(f'Split: train={len(train_data)}, val={len(vali_data)}, test={len(test_data)}')

# Encode locations: fitted on the training split only; unseen locations become -1
# before the +2 shift.
enc = OrdinalEncoder(dtype=np.int64, handle_unknown='use_encoded_value', unknown_value=-1)
enc.fit(train_data[['location_id']].to_numpy())
train_data['location_id'] = enc.transform(train_data[['location_id']].to_numpy()) + 2
vali_data['location_id'] = enc.transform(vali_data[['location_id']].to_numpy()) + 2
test_data['location_id'] = enc.transform(test_data[['location_id']].to_numpy()) + 2

print(f'Max location ID: {train_data["location_id"].max()}')
print(f'Unique locations: {train_data["location_id"].nunique()}')

# Generate sequences using the first configured history length
previous_day = CONFIG['sequence_generation']['previous_days'][0]
print(f'\nGenerating sequences (history={previous_day} days)...')

# Each call yields one list of records per user; the comprehension flattens them.
train_records = [
    record
    for user_records in apply_parallel(
        train_data.groupby('user_id'),
        get_valid_sequence_user,
        n_jobs=-1,
        previous_day=previous_day,
        valid_ids=valid_ids,
    )
    for record in user_records
]
print(f'Train: {len(train_records)} sequences')

vali_records = [
    record
    for user_records in apply_parallel(
        vali_data.groupby('user_id'),
        get_valid_sequence_user,
        n_jobs=-1,
        previous_day=previous_day,
        valid_ids=valid_ids,
    )
    for record in user_records
]
print(f'Validation: {len(vali_records)} sequences')

test_records = [
    record
    for user_records in apply_parallel(
        test_data.groupby('user_id'),
        get_valid_sequence_user,
        n_jobs=-1,
        previous_day=previous_day,
        valid_ids=valid_ids,
    )
    for record in user_records
]
print(f'Test: {len(test_records)} sequences')

# Save .pk files
train_file = os.path.join(output_dir, f'geolife_transformer_{previous_day}_train.pk')
vali_file = os.path.join(output_dir, f'geolife_transformer_{previous_day}_validation.pk')
test_file = os.path.join(output_dir, f'geolife_transformer_{previous_day}_test.pk')

with open(train_file, 'wb') as f:
    pickle.dump(train_records, f, protocol=pickle.HIGHEST_PROTOCOL)
print(f'\n✓ Saved: {train_file}')

with open(vali_file, 'wb') as f:
    pickle.dump(vali_records, f, protocol=pickle.HIGHEST_PROTOCOL)
print(f'✓ Saved: {vali_file}')

with open(test_file, 'wb') as f:
    pickle.dump(test_records, f, protocol=pickle.HIGHEST_PROTOCOL)
print(f'✓ Saved: {test_file}')

# Final summary
print('\n' + '=' * 80, 'PREPROCESSING COMPLETE!', '=' * 80, sep='\n')
print(f'\nFinal output files in {output_dir}:')
print(f'  - geolife_transformer_{previous_day}_train.pk ({len(train_records)} sequences)')
print(f'  - geolife_transformer_{previous_day}_validation.pk ({len(vali_records)} sequences)')
print(f'  - geolife_transformer_{previous_day}_test.pk ({len(test_records)} sequences)')