In [None]:
%matplotlib widget
import os
import pickle
import numpy as np
import pandas as pd

from datetime import datetime, timezone
from sklearn.metrics.pairwise import haversine_distances
from math import radians

In [None]:
# Original path to the San Francisco dataset files
DATASET_PATH = "../original_datasets/SanFranciscoDataset/cabspottingdata.tar/cabspottingdata/"

# Map with all the files in the DATASET_PATH
INPUT_FILES = map(lambda x: DATASET_PATH  + x, os.listdir(DATASET_PATH))

# Paths where to save proccessed files

# Pickle file path with original data, without filtering based on occupancy
ORIGINAL_FILE = "../data/SanFrancisco/original_trajectories_all.pkl"

# Pickle file path with only occupied trajectories
OCCUPIED_FILE = "../data/SanFrancisco/occupied_trajectories_all.pkl"

# Pickle file path with occupied trajectories
NOT_OCCUPIED_FILE = "../data/SanFrancisco/not_occupied_trajectories_all.pkl"

# Pickle file path with all trajectories extracted
TRAJECTORIES_FILE = "../data/SanFrancisco/trajectories_all.pkl"

# Pickle file path for trajectories with additional collumns filtered based on MIN_TRAJECTORY_POINTS
FILTERED_TRAJECTORIES_FILE = "../data/SanFrancisco/filtered_trajectories.pkl"

# Pickle file path for trajectories meant for training, with lat and lon only
TRAIN_TRAJECTORIES_FILE = "../data/SanFrancisco/train_trajectories.pkl"

# Pickle file path for trajectories meant for training
TRAIN_TRAJECTORIES_TIME_DIFF_FILE = "../data/SanFrancisco/train_trajectories_time_diff.pkl"

In [2]:
EARTH_RADIUS_KM = 6371
EARTH_RADIUS_M  = 6371008.7714

# Minimum number of points in a trajectory. A trajectory with less then MIN_TRAJECTORY_POINTS is dropped
MIN_TRAJECTORY_POINTS = 25 

# Parameter to filter trajectories. Used to check if haversine distance between two points is under THRESHOLD_M
THRESHOLD_M = 100

# Parameter to filter trajectories. Allowed sampling time in seconds between trajectory points. Should be allowed 2x sampling rate.
TIME_DIFF_THRESHOLD_S = 120

# Parameter to filter trajectories. Maximum allowed speed between trajectory points in km.
SPEED_KM_THRESHOLD = 150

# The bounding box area of the city. Used to eliminate outlier/spike points outside city region.
CITY_BOUNDING_BOX = {"lat_min": 37.274, "lat_max": 38.1, "lon_min":-122.602, "lon_max": -122.019}

In [3]:
def save_object(obj, file_path):
    """
    Save python object obj in the given file_path with pickle.

    :param object obj: Python object to be saved
    :param str file_path: Path where to save obj
    :return: If the save was successful
    :rtype: bool
    """

    if obj is not None and file_path is not None:

        with open(file_path, 'wb') as file:
            pickle.dump(obj, file)

        return True
        
    return False

In [4]:
def load_object(file_path, encoding=None):
    """
    To be used after calling save_object().
    Loads a python object that was pickled with save_object()

    :param str file_path: Path to the pickled
    :param str encoding: None or type of file encoding
    :return: None if load fails, else the pickled object
    :rtype: None if fails or type of objected pickled with save_object() 
    """
    
    if file_path is not None:

        with open(file_path, 'rb') as file:

            if encoding is not None:
                return pickle.load(file, encoding='latin1')
            else:
                return pickle.load(file)

    return None

In [5]:
def timestamp_to_datetime(unix_timestamp):
    """
    Converts a unix timestamp into date and time.
    
    Example:
        unix_time: 1744097710
        gives
        date: 2025-08-04
        time: 07:35:10

    :param int unix_timestamp: Unix time as int
    :return: Tuple containing the date and time from unix_timestamp
    :rtype: touple
    """
    dt = datetime.fromtimestamp(unix_timestamp, tz=timezone.utc)

    date = dt.strftime("%Y-%m-%d")
    time = dt.strftime('%H:%M:%S')

    return date, time

In [6]:
def process_csv():
    """
    Processes input files from the San Francisco dataset.
    Extracts trajectories based on the "occupancy" field.
    Adds date and time for each point in a trajectory.

    :return: A tuple containing four lists of dataframes:
                original: Original data without filtering based on occupancy
                occupied_trajectories: Trajectories based on occupancy=1
                not_occupied_trajectories: Trajectories based on occupancy=0
                trajectories: All trajectory with occupancy 1 and 0
    """
    current_trajectory = []
    original = []
    trajectories = []
    occupied_trajectories = []
    not_occupied_trajectories = []

    for i, f_name in enumerate(INPUT_FILES):
        df_original = pd.read_csv(f_name, sep=" ", names=["lat", "lon", "occupancy", "timestamp"])
        df_original = df_original.astype({"lat": float, "lon": float, "occupancy": int, "timestamp": int})

        df_copy = df_original.copy()
        df_copy[["date", "time"]] = df_copy["timestamp"].apply(timestamp_to_datetime).apply(pd.Series)

        original.append(df_copy)

        current_trajectory = []
        current_occupancy = df_copy.iloc[0]["occupancy"] if not df_copy.empty else None

        for _, row in df_copy.iterrows():
            if row["occupancy"] == current_occupancy:
                current_trajectory.append([row["lat"], row["lon"], row["timestamp"], row["date"], row["time"]])
            else:
                # Save the current trajectory to the correct list
                if current_trajectory:
                    df_traj = pd.DataFrame(current_trajectory, columns=["lat", "lon", "timestamp", "date", "time"])
                    trajectories.append(df_traj)
                    if current_occupancy == 1:
                        occupied_trajectories.append(df_traj)
                    else:
                        not_occupied_trajectories.append(df_traj)
                # Reset occupancy state
                current_trajectory = [[row["lat"], row["lon"], row["timestamp"], row["date"], row["time"]]]
                current_occupancy = row["occupancy"]

        # Save final trajectory
        if current_trajectory:
            df_traj = pd.DataFrame(current_trajectory, columns=["lat", "lon", "timestamp", "date", "time"])
            trajectories.append(df_traj)

            if current_occupancy == 1:
                occupied_trajectories.append(df_traj)
            else:
                not_occupied_trajectories.append(df_traj)

    return original, occupied_trajectories, not_occupied_trajectories, trajectories

In [7]:
def setup_dataset(from_csv=False,
                  get_original=False, 
                  get_trajectories=False,
                  get_occupied=False,
                  get_not_occupied=False):
    """
    Helper function. Loads the required data based on the function parameters.
    It can return data processed from csv or loaded from pickled files.

    :param bool from_csv: If set to true, reads data from csv files, 
                            otherwise tries to load pickled files.
                            If set to true, ignores next params set to true.
    :param bool get_original: Can be used if from_csv=False. Loads pickled original data
    :param bool get_trajectories:Can be used if from_csv=False. Loads pickled trajectories
    :param bool get_occupied: Can be used if from_csv=False. Loads pickled occupied trajectories
    :param bool get_not_occupied: Can be used if from_csv=False. Loads pickled not occupied trajectories
    """
    original = None
    occupied_trajectories = None
    not_occupied_trajectories = None
    trajectories = None

    if from_csv:
        original, occupied_trajectories, not_occupied_trajectories, trajectories = process_csv()
        save_object(original, ORIGINAL_FILE)
        save_object(occupied_trajectories, OCCUPIED_FILE)
        save_object(not_occupied_trajectories, NOT_OCCUPIED_FILE)
        save_object(trajectories, TRAJECTORIES_FILE)

        return original, occupied_trajectories, not_occupied_trajectories, trajectories
    else:
        if get_original:
            original = load_object(ORIGINAL_FILE)

        if get_trajectories:
            trajectories = load_object(TRAJECTORIES_FILE)

        if get_occupied:
            occupied_trajectories = load_object(OCCUPIED_FILE)

        if get_not_occupied:
            not_occupied_trajectories = load_object(NOT_OCCUPIED_FILE)

        return original, occupied_trajectories, not_occupied_trajectories, trajectories

In [8]:
def haversine_distance_km(lat1, lon1, lat2, lon2):
    """
    Compute great-circle distance between two lat/lon points in km.
    """

    lat1, lon1, lat2, lon2 = map(radians, [lat1, lon1, lat2, lon2])
    return haversine_distances([[lat1, lon1], [lat2, lon2]])[0, 1] * EARTH_RADIUS_KM


In [9]:
def compute_time_dist_speed(filtered_dataset):
    """
    Populates each dataframe trajectory in filtered_dataset list with the columns: time_diff, distance_km, speed_km.

    :param list filtered_dataset: List of dataframes that have lat, lon, timestamp as columns
    :return: Returns a new list where each dataframe has the above mentioned additional columns
    """
    
    filtered_trajectories = []

    for df in filtered_dataset:
        df = df.copy()
        df["lat_next"]  = df["lat"].shift(1)
        df["lon_next"] = df["lon"].shift(1)
        df["time_diff"] = np.abs(df["timestamp"].diff())

        df.loc[1:,"distance_km"] = df[1:].apply(
            lambda row: haversine_distance_km(row["lat"], row["lon"], row["lat_next"], row["lon_next"]), axis=1
        )

        df.loc[1:, "speed_km"] = df.loc[1:,"distance_km"] / (df.loc[1:,"time_diff"]/3600)

        df = df.drop(columns=["lat_next", "lon_next"])
        filtered_trajectories.append(df)
    
    return filtered_trajectories

In [10]:
def trajectory_under_threshold(df, threshold_m=THRESHOLD_M):
    """
    Usefull function to check if a given trajectory is in a given square 
    based on lat and lon min/max values. If the haversine distance in meters
    is under threshold_m returns True, otherwise False.
    """

    max = [ df["lat"].max(), df["lon"].max()] 
    min = [ df["lat"].min(), df["lon"].min()] 

    max_in_radians = [radians(_) for _ in max]
    min_in_radians = [radians(_) for _ in min]


    distance_m = haversine_distances([max_in_radians, min_in_radians]) * EARTH_RADIUS_M

    if distance_m[0, 1] < threshold_m:
        return True
    
    return False

In [11]:
def filter_trajectories_for_training(filtered_trajectories,
                                    time_diff_threshold_s=TIME_DIFF_THRESHOLD_S,
                                    max_min_threshold_m=THRESHOLD_M,
                                    speed_km_threshold=SPEED_KM_THRESHOLD,
                                    city_bounding_box=CITY_BOUNDING_BOX):
    """
    Function to filter a list of dataframe trajectories based on several criteria:
        - time_diff_threshold_s: Allowed sampling time in seconds between trajectory points. Should be allowed 2x sampling rate.
        - max_min_threshold_m: Allowed distance in meters between lat and lon min/max. See trajectory_under_threshold().
        - speed_km_threshold: Maximum allowed speed between trajectory points in km.
        - city_bounding_box: The bounding box area of the city. Used to eliminate outlier/spike points outside city region.
    """
    for_train_trajectories = []

    for df in filtered_trajectories:

        if (df["time_diff"] > time_diff_threshold_s).any():
            continue

        if trajectory_under_threshold(df, threshold_m=max_min_threshold_m):
            continue

        if (df["speed_km"] >= speed_km_threshold).any():
            continue

        mask_lat = df["lat"].between(city_bounding_box["lat_min"], city_bounding_box["lat_max"]).all()
        mask_lon = df["lon"].between(city_bounding_box["lon_min"], city_bounding_box["lon_max"]).all()

        if not mask_lat or not mask_lon:
            continue

        for_train_trajectories.append(df)
    
    return for_train_trajectories


In [19]:
"""
Data Loading
The setup_dataset() function allows you to load original csv files, or based on function parameters to load already processed pickled files.
"""
_, _, _, trajectories = setup_dataset(get_trajectories=True)

##### Trajectory count

In [13]:
traj_lengths = [len(df) for df in trajectories]

less_than_5 = sum(1 for l in traj_lengths if l <= 5)
between_5_10 = sum(1 for l in traj_lengths if 5 <= l <= 10)
between_10_20 = sum(1 for l in traj_lengths if 10 <= l <= 20)
between_20_30 = sum(1 for l in traj_lengths if 20 <= l <= 30)
between_30_40 = sum(1 for l in traj_lengths if 30 <= l <= 40)
over_40 = sum(1 for l in traj_lengths if l >= 40)
over_25 = sum(1 for l in traj_lengths if l >= 25) # Current Sequence Lenght

print(f"Total Trajectories:{len(trajectories)}")

print(f"Less then 5: {less_than_5}")
print(f"Between 5 and 10: {between_5_10}")
print(f"Between 10 and 20: {between_10_20}")
print(f"Between 20 and 30: {between_20_30}")
print(f"Between 30 and 40: {between_30_40}")
print(f"Over 40: {over_40}")
print(f"Over 25: {over_25}")

Total Trajectories:928301
Less then 5: 293261
Between 5 and 10: 341024
Between 10 and 20: 279743
Between 20 and 30: 78556
Between 30 and 40: 26836
Over 40: 33767
Over 25: 85361


##### Trajectory filtering

In [15]:
# Filter based on minimum points needed in a trajectory
# Change MIN_TRAJECTORY_POINTS in the **Configuration parameters** block to set the minimum number of points wanted in a trajectory.
filtered_dataset = [df for df in trajectories if len(df['lat']) >= MIN_TRAJECTORY_POINTS]

In [14]:
print(f"Total Trajectories:{len(trajectories)}")
print(f"Filter on sequence len {len(filtered_dataset)}")

Total Trajectories:928301
Filter on sequence len 85361


In [None]:
filtered_trajectories = compute_time_dist_speed(filtered_dataset)

In [None]:
save_object(filtered_trajectories, FILTERED_TRAJECTORIES_FILE)

True

In [1]:
# filtered_trajectories = load_object(FILTERED_TRAJECTORIES_FILE)

In [19]:
for_train_trajectories = filter_trajectories_for_training(filtered_trajectories)

In [24]:
for_train_trajectories_time_diff = [df.drop(columns=["timestamp", "date", "time"]) for df in for_train_trajectories]

In [27]:
save_object(for_train_trajectories_time_diff, TRAIN_TRAJECTORIES_TIME_DIFF_FILE)

True

In [20]:
print(f"Trajectories after filtering {len(for_train_trajectories)}")

Trajectories after filtering 30351


In [24]:
to_save = [df.drop(columns=["time_diff", "timestamp", "distance_km", "speed_km", "date", "time"]) for df in for_train_trajectories]

In [None]:
save_object(to_save, TRAIN_TRAJECTORIES_FILE)

True