# Data Preprocessing Notebook

This notebook will:

✔ Load GeoLife raw data

✔ Clean GPS data

✔ Convert lat/lon → place IDs

✔ Create sequences for HMM

✔ Build graph data for GNN

✔ Save final processed outputs into **`data/processed/`**


## Section 1 — Imports & Setup

This section imports all required libraries and sets up path variables.


In [16]:
import os
import pandas as pd
import numpy as np
import glob
from tqdm import tqdm
from haversine import haversine
import json
import networkx as nx
import random
from sklearn.preprocessing import LabelEncoder
import pickle

# Use only 20% of the data
DATA_SAMPLE_RATIO = 0.3
random.seed(42)  # For reproducibility
np.random.seed(42)

RAW_DATA_PATH = "/home/root495/Inexture/Location Prediction Update/data/raw/data/Geolife Trajectories 1.3/Data"
PROCESSED_PATH = "/home/root495/Inexture/Location Prediction Update/data/processed/"

os.makedirs(PROCESSED_PATH, exist_ok=True)


## Section 2 — Read GeoLife PLT Files

This section loads raw .plt files and converts them into a structured DataFrame.


In [17]:
def read_plt(file_path):
    df = pd.read_csv(
        file_path,
        skiprows=6,
        names=["lat","lon","zero","alt","days","date","time"]
    )
    df["timestamp"] = pd.to_datetime(df["date"] + " " + df["time"])
    return df[["lat","lon","timestamp"]]

all_data = []
user_dirs = glob.glob(RAW_DATA_PATH + "/*")
print(f"Found {len(user_dirs)} user directories")

# Sample 20% of users
num_users_to_sample = max(1, int(len(user_dirs) * DATA_SAMPLE_RATIO))
sampled_users = random.sample(user_dirs, num_users_to_sample)
print(f"Sampling {num_users_to_sample} users ({DATA_SAMPLE_RATIO*100:.0f}% of total)")

for user in tqdm(sampled_users, desc="Loading trajectories"):
    user_id = os.path.basename(user)
    traj_path = os.path.join(user, "Trajectory")
    if os.path.exists(traj_path):
        plt_files = glob.glob(traj_path + "/*.plt")
        for plt in plt_files:
            try:
                df = read_plt(plt)
                df["user"] = user_id
                all_data.append(df)
            except Exception as e:
                print(f"Error reading {plt}: {e}")
                continue

if len(all_data) == 0:
    raise ValueError(f"No data loaded! Check if path exists: {RAW_DATA_PATH}")

raw_df = pd.concat(all_data, ignore_index=True)
print(f"Loaded {len(raw_df)} GPS points from {len(all_data)} trajectory files")
print(f"This represents approximately {DATA_SAMPLE_RATIO*100:.0f}% of the full dataset")
raw_df.head()


Found 182 user directories
Sampling 54 users (30% of total)


Loading trajectories: 100%|██████████| 54/54 [00:38<00:00,  1.39it/s]


Loaded 8699340 GPS points from 8480 trajectory files
This represents approximately 30% of the full dataset


Unnamed: 0,lat,lon,timestamp,user
0,39.95075,116.334333,2007-08-08 00:36:58,146
1,39.951967,116.337633,2007-08-08 00:37:18,146
2,39.952533,116.33845,2007-08-08 00:43:00,146
3,39.952933,116.3409,2007-08-08 00:44:09,146
4,39.953883,116.34105,2007-08-08 00:44:58,146


## Section 3 — Clean the GPS Data

We now clean the dataset:

✔ Sort by timestamp

✔ Remove duplicates

✔ Remove impossible jumps (> 60 m/s)

✔ Split on big time gaps

✔ Remove trajectories with < 10 points


### Sorting and duplicates


In [18]:
raw_df = raw_df.sort_values(["user", "timestamp"])
raw_df = raw_df.drop_duplicates(subset=["user", "timestamp"])


### Remove unrealistic jumps


In [19]:
def clean_user(df, max_speed=60):
    """Optimized version using vectorized operations"""
    df = df.sort_values("timestamp").copy()
    
    # Filter invalid coordinates first
    valid_coords = (
        (df["lat"] >= -90) & (df["lat"] <= 90) &
        (df["lon"] >= -180) & (df["lon"] <= 180)
    )
    df = df[valid_coords].copy()
    
    if len(df) < 2:
        return df[[]]  # Return empty dataframe with same structure
    
    # Vectorized calculation of distances and speeds
    df["lat_prev"] = df["lat"].shift(1)
    df["lon_prev"] = df["lon"].shift(1)
    df["time_prev"] = df["timestamp"].shift(1)
    
    # Calculate time differences in seconds
    df["dt"] = (df["timestamp"] - df["time_prev"]).dt.total_seconds()
    
    # Vectorized haversine distance calculation
    # Convert to radians
    lat1_rad = np.radians(df["lat_prev"].values)
    lat2_rad = np.radians(df["lat"].values)
    lon1_rad = np.radians(df["lon_prev"].values)
    lon2_rad = np.radians(df["lon"].values)
    
    # Haversine formula (vectorized)
    dlat = lat2_rad - lat1_rad
    dlon = lon2_rad - lon1_rad
    a = np.sin(dlat/2)**2 + np.cos(lat1_rad) * np.cos(lat2_rad) * np.sin(dlon/2)**2
    c = 2 * np.arcsin(np.sqrt(a))
    distance_km = 6371 * c  # Earth radius in km
    distance_m = distance_km * 1000
    
    # Calculate speed (m/s)
    df["speed"] = distance_m / df["dt"].replace(0, np.nan)
    df["speed"] = df["speed"].fillna(0)
    
    # Filter by max speed
    mask = (df["speed"] < max_speed) | (df["speed"].isna())
    return df[mask].drop(columns=["lat_prev", "lon_prev", "time_prev", "dt"])

print("Cleaning GPS data (this may take a few minutes with 20% data)...")
cleaned_list = []
for user, group in tqdm(raw_df.groupby("user"), desc="Cleaning users"):
    cleaned = clean_user(group)
    if len(cleaned) > 0:
        cleaned_list.append(cleaned)

if len(cleaned_list) == 0:
    raise ValueError("No data remaining after cleaning!")

cleaned_df = pd.concat(cleaned_list, ignore_index=True)
print(f"Cleaned data: {len(cleaned_df)} GPS points (removed {len(raw_df) - len(cleaned_df)} points)")


Cleaning GPS data (this may take a few minutes with 20% data)...


Cleaning users: 100%|██████████| 54/54 [00:03<00:00, 16.88it/s]


Cleaned data: 8671497 GPS points (removed 18494 points)


### Remove very small trajectories


In [20]:
cleaned_df["traj_id"] = cleaned_df.groupby("user").cumcount()
cleaned_df.to_csv(PROCESSED_PATH + "cleaned_points.csv", index=False)
cleaned_df.head()


Unnamed: 0,lat,lon,timestamp,user,speed,traj_id
0,39.984702,116.318417,2008-10-23 02:53:04,0,0.0,0
1,39.984683,116.31845,2008-10-23 02:53:10,0,0.586148,1
2,39.984686,116.318417,2008-10-23 02:53:15,0,0.56626,2
3,39.984688,116.318385,2008-10-23 02:53:20,0,0.547087,3
4,39.984655,116.318263,2008-10-23 02:53:25,0,2.204602,4


## Section 4 — Convert Lat/Lon → Grid Place IDs

We use 200m x 200m grid encoding. Each grid cell = a place_id.

This satisfies the proposal requirement for grid-based encoding.


In [21]:
def add_grid_place_ids(df, cell_size_m=10000):
    deg_lat = cell_size_m / 111320
    mean_lat = df["lat"].mean()
    deg_lon = cell_size_m / (111320 * np.cos(np.radians(mean_lat)))
    min_lat = df["lat"].min()
    min_lon = df["lon"].min()
    df["row"] = ((df["lat"] - min_lat) / deg_lat).astype(int)
    df["col"] = ((df["lon"] - min_lon) / deg_lon).astype(int)
    df["place_id"] = df["row"].astype(str) + "_" + df["col"].astype(str)
    return df, (min_lat, min_lon, deg_lat, deg_lon)

cleaned_df, grid_meta = add_grid_place_ids(cleaned_df)
cleaned_df.to_csv(PROCESSED_PATH+"cleaned_with_places.csv", index=False)


## Section 4.5 — Filter Frequent Places

Filter to keep only places that appear frequently enough to reduce sparsity and improve HMM training quality.


In [22]:
# Count place visit frequencies
print("Counting place visit frequencies...")
place_counts = cleaned_df["place_id"].value_counts()

print(f"Original unique places: {len(place_counts)}")
print(f"Total place visits: {place_counts.sum()}")

# Filter to keep only frequent places
MIN_VISITS = 20  # Minimum number of visits to keep a place
frequent_places = set(place_counts[place_counts >= MIN_VISITS].index)

print(f"\nPlaces with >= {MIN_VISITS} visits: {len(frequent_places)}")
print(f"Reduction: {(1 - len(frequent_places)/len(place_counts))*100:.1f}% of places removed")

# Calculate coverage
frequent_visits = place_counts[place_counts.index.isin(frequent_places)].sum()
coverage = frequent_visits / place_counts.sum() * 100
print(f"Coverage: {coverage:.1f}% of visits retained")

# Filter cleaned_df to only include frequent places
original_size = len(cleaned_df)
cleaned_df = cleaned_df[cleaned_df["place_id"].isin(frequent_places)].copy()
filtered_size = len(cleaned_df)

print(f"\nFiltered data: {filtered_size:,} GPS points (removed {original_size - filtered_size:,} points)")
print(f"Data retention: {filtered_size/original_size*100:.1f}%")

# Update cleaned_with_places.csv with filtered data
cleaned_df.to_csv(PROCESSED_PATH+"cleaned_with_places.csv", index=False)
print(f"\nUpdated cleaned_with_places.csv with filtered data")


Counting place visit frequencies...
Original unique places: 2297
Total place visits: 8671497

Places with >= 20 visits: 2073
Reduction: 9.8% of places removed
Coverage: 100.0% of visits retained

Filtered data: 8,669,606 GPS points (removed 1,891 points)
Data retention: 100.0%



Updated cleaned_with_places.csv with filtered data


## Section 5 — Create Sequences for HMM

We now convert place_id sequences for each user trajectory.

These sequences will be saved as JSON for HMM training.


In [23]:
# Create sequences from filtered data
sequences = {}
for user, group in cleaned_df.groupby("user"):
    seq = group.sort_values("timestamp")["place_id"].tolist()
    # Keep sequences with at least 10 places after filtering
    if len(seq) > 10:
        sequences[user] = seq

print(f"Created {len(sequences)} sequences for HMM training")
if len(sequences) > 0:
    avg_seq_len = np.mean([len(seq) for seq in sequences.values()])
    print(f"Average sequence length: {avg_seq_len:.1f} places")

with open(PROCESSED_PATH + "place_sequences.json", "w") as f:
    json.dump(sequences, f)
    
print(f"Saved sequences to {PROCESSED_PATH}place_sequences.json")


Created 54 sequences for HMM training
Average sequence length: 160548.3 places
Saved sequences to /home/root495/Inexture/Location Prediction Update/data/processed/place_sequences.json


## Section 6 — Build Graph for GNN

GNN needs:

✔ Nodes = place IDs

✔ Edges = transitions

✔ Edge weights = frequency

✔ Node features = aggregated stats (speed, visits)

We'll create:

- graph_edges.csv

- node_features.csv


### Step 6.1: Build edges


In [24]:
G = nx.DiGraph()
for user, group in cleaned_df.groupby("user"):
    group = group.sort_values("timestamp")
    places = group["place_id"].tolist()
    for i in range(len(places) - 1):
        a, b = places[i], places[i+1]
        if G.has_edge(a, b):
            G[a][b]["weight"] += 1
        else:
            G.add_edge(a, b, weight=1)


Save edges:


In [25]:
edge_list = []
for u, v, w in G.edges(data=True):
    edge_list.append([u, v, w["weight"]])

pd.DataFrame(edge_list, columns=["source","target","weight"])\
  .to_csv(PROCESSED_PATH+"graph_edges.csv", index=False)


### Step 6.2: Build Node Features


In [26]:
node_features = cleaned_df.groupby("place_id").agg({
    "lat": "mean",
    "lon": "mean",
    "speed": "mean"
}).reset_index()
node_features.to_csv(PROCESSED_PATH+"node_features.csv", index=False)


## Section 7 — Save Mapping Metadata

We store the grid conversion metadata required for inverse mapping.


In [27]:
with open(PROCESSED_PATH+"grid_metadata.json","w") as f:
    json.dump({
        "min_lat": grid_meta[0],
        "min_lon": grid_meta[1],
        "deg_lat": grid_meta[2],
        "deg_lon": grid_meta[3],
    }, f)


In [28]:
raw_df.head()

Unnamed: 0,lat,lon,timestamp,user
6104308,39.984702,116.318417,2008-10-23 02:53:04,0
6104309,39.984683,116.31845,2008-10-23 02:53:10,0
6104310,39.984686,116.318417,2008-10-23 02:53:15,0
6104311,39.984688,116.318385,2008-10-23 02:53:20,0
6104312,39.984655,116.318263,2008-10-23 02:53:25,0


In [29]:
node_features.head()

Unnamed: 0,place_id,lat,lon,speed
0,0_1966,13.417182,103.78838,13.447363
1,0_1967,13.400642,103.85978,5.872515
2,0_1968,13.44089,103.929951,10.994278
3,100_2056,22.348841,114.084405,21.895303
4,100_2057,22.414922,114.209031,1.963455


In [30]:
sequences

{'000': ['296_2075',
  '296_2075',
  '296_2075',
  '296_2075',
  '296_2075',
  '296_2075',
  '296_2075',
  '296_2075',
  '296_2075',
  '296_2075',
  '296_2075',
  '296_2075',
  '296_2075',
  '296_2075',
  '296_2075',
  '296_2075',
  '296_2075',
  '296_2075',
  '296_2075',
  '296_2075',
  '296_2075',
  '296_2075',
  '296_2075',
  '296_2075',
  '296_2075',
  '296_2075',
  '296_2075',
  '296_2075',
  '296_2075',
  '296_2075',
  '296_2075',
  '296_2075',
  '296_2075',
  '296_2075',
  '296_2075',
  '296_2075',
  '296_2075',
  '296_2075',
  '296_2075',
  '296_2075',
  '296_2075',
  '296_2075',
  '296_2075',
  '296_2075',
  '296_2075',
  '296_2075',
  '296_2075',
  '296_2075',
  '296_2075',
  '296_2075',
  '296_2075',
  '296_2075',
  '296_2075',
  '296_2075',
  '296_2075',
  '296_2075',
  '296_2075',
  '296_2075',
  '296_2075',
  '296_2075',
  '296_2075',
  '296_2075',
  '296_2075',
  '296_2075',
  '296_2075',
  '296_2075',
  '296_2075',
  '296_2075',
  '296_2075',
  '296_2075',
  '296_2075',

## Section 8 — Summary

✔ cleaned_points.csv — Cleaned GPS points  

✔ cleaned_with_places.csv — GPS + place IDs  

✔ place_sequences.json — For HMM  

✔ graph_edges.csv — For GNN graph  

✔ node_features.csv — Node features for GNN  

✔ grid_metadata.json — Grid parameters
