# Porto trajectory processing

This notebook works with the [Taxi Service Trajectory - Prediction Challenge, ECML PKDD 2015](https://archive.ics.uci.edu/dataset/339/taxi+service+trajectory+prediction+challenge+ecml+pkdd+2015) dataset.

In [None]:
%matplotlib widget
import os
import pickle
import ast
import numpy as np
import pandas as pd

from datetime import datetime, timezone
from sklearn.metrics.pairwise import haversine_distances
from math import radians

In [2]:
# Path to the input CSV file. The original train.csv is around 1.8GB. To create a smaller file you can
# extract a subset of its lines, for example:
# $head -n50000 train.csv > train_50000.csv
READ_PATH = "../original_datasets/Porto-UCI/train.csv/train_last_100000.csv"

# Path where the `dataset` list (trajectories before filtering) is pickled
WRITE_PATH_ORG = "../datasets/Porto/porto_uci_31k_org_drop_only.pkl"

# Path where the filtered trajectories are pickled after the derived columns
# (time_diff, timestamp, distance_km, speed_km, missing) have been dropped
WRITE_PATH_TRAJ = "../datasets/Porto/porto_uci_31k_traj_drop_only.pkl"

# Path where the filtered trajectories are pickled without the "missing" and "timestamp" columns
WRITE_PATH_TRAJ_TIME_DIFF = "../datasets/Porto/porto_uci_31k_traj_time_diff.pkl"

# Path of the pickled analysis trajectories; it is only read back at the end of this notebook
WRITE_PATH_TRAJ_ANALYSIS = "../datasets/Porto/porto_uci_analysis.pkl"


# Path for a 60k variant of the analysis pickle.
# NOTE(review): not referenced by any code visible in this notebook - confirm it is still needed
WRITE_PATH_TRAJ_ANALYSIS_60k = "../datasets/Porto/porto_uci_analysis_60k.pkl"

# Columns of the original dataset (train.csv), passed to pandas as the column names
columns = ["TRIP_ID","CALL_TYPE","ORIGIN_CALL","ORIGIN_STAND","TAXI_ID","TIMESTAMP","DAY_TYPE","MISSING_DATA","POLYLINE"]

# Columns not needed in each row of the dataset; they are dropped from every chunk right after reading
columns_to_drop = ['CALL_TYPE', 'ORIGIN_CALL', "ORIGIN_STAND", 'DAY_TYPE']

# Number of CSV rows per chunk, i.e. the `chunksize` parameter of the pandas read_csv() function.
# See: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html
chunk_size = 10000

# CSV file delimiter for columns
delimiter = ","

In [3]:
# Earth radius used to scale unit-sphere haversine distances, in kilometres and in metres
EARTH_RADIUS_KM = 6371
EARTH_RADIUS_M  = 6371008.7714

# Minimum number of points in a trajectory. A trajectory with fewer than MIN_TRAJECTORY_POINTS points is dropped
MIN_TRAJECTORY_POINTS = 25 

# Parameter to filter trajectories. A trajectory whose spatial extent (haversine distance, in metres)
# is under THRESHOLD_M is dropped
THRESHOLD_M = 100

# Parameter to filter trajectories. Allowed time in seconds between consecutive trajectory points.
# Intended to tolerate about twice the sampling interval.
# NOTE(review): not referenced by any code visible in this notebook - confirm it is still needed
TIME_DIFF_THRESHOLD_S = 120

# Parameter to filter trajectories. Maximum allowed speed between consecutive trajectory points in km/h.
SPEED_KM_THRESHOLD = 150

In [4]:
def load_object(file_path, encoding=None):
    """
    To be used after calling save_object().
    Loads a python object that was pickled with save_object().

    :param str file_path: Path to the pickled file. If None, nothing is loaded.
    :param str encoding: None to use pickle's default, otherwise the encoding
        forwarded to pickle.load() for decoding 8-bit strings written by
        Python 2 (for example 'latin1' or 'bytes')
    :return: None if file_path is None, else the unpickled object
    :rtype: None or the type of the object pickled with save_object()
    :raises OSError: If the file cannot be opened
    :raises pickle.UnpicklingError: If the file content is not a valid pickle
    """

    if file_path is None:
        return None

    # NOTE: unpickling can execute arbitrary code, only load files from a trusted source
    with open(file_path, 'rb') as file:

        if encoding is None:
            return pickle.load(file)

        # Forward the requested encoding; it used to be ignored in favour of a hard-coded 'latin1'
        return pickle.load(file, encoding=encoding)

In [5]:
def save_object(obj, file_path):
    """
    Pickle a python object to disk.

    Nothing is written when either argument is None.

    :param object obj: Python object to be saved
    :param str file_path: Path of the file to (over)write
    :return: True if obj was written, False if obj or file_path is None
    :rtype: bool
    """

    # Guard clause: refuse to write when there is nothing to save or nowhere to save it
    if obj is None or file_path is None:
        return False

    with open(file_path, 'wb') as handle:
        pickle.dump(obj, handle)

    return True

In [6]:
def haversine_distance_km(lat1, lon1, lat2, lon2):
    """
    Compute the great-circle distance between two lat/lon points in km.

    The four coordinates are given in degrees; they are converted to radians
    before being handed to sklearn's haversine_distances().

    :param float lat1: Latitude of the first point in degrees
    :param float lon1: Longitude of the first point in degrees
    :param float lat2: Latitude of the second point in degrees
    :param float lon2: Longitude of the second point in degrees
    :return: Distance between the two points in km
    """

    first_point = [radians(lat1), radians(lon1)]
    second_point = [radians(lat2), radians(lon2)]

    # haversine_distances() returns the pairwise matrix; entry [0, 1] is the
    # distance from the first to the second point on the unit sphere
    pairwise = haversine_distances([first_point, second_point])

    return pairwise[0, 1] * EARTH_RADIUS_KM


In [7]:
def polyline_as_list(val):
    """
    The current dataset (train.csv) stores polylines as a string containing a list of lists.
    This function converts the param val into that list of lists.

    :param val: A polyline, either already a list or its string representation
    :return: The parsed list. An empty list is returned when val is missing (None/NaN),
        empty, cannot be parsed as a Python literal, or parses to something that is not a list
    :rtype: list
    """
    if isinstance(val, list):
        return val
    if pd.isna(val) or val in ["", "[]"]:
        return []
    try:
        parsed = ast.literal_eval(val)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # These are the errors literal_eval raises for malformed input; treat it as "no polyline"
        return []

    # A valid literal that is not a list (e.g. "42") is not a polyline; callers rely on getting a list
    return parsed if isinstance(parsed, list) else []

In [8]:
def timestamp_to_datetime(unix_timestamp):
    """
    Split a unix timestamp into a UTC date string and a UTC time string.

    Example:
        unix_time: 1744097710
        gives
        date: 2025-04-08
        time: 07:35:10

    :param int unix_timestamp: Unix time (seconds since the epoch)
    :return: Tuple (date, time) formatted as "%Y-%m-%d" and "%H:%M:%S"
    :rtype: tuple
    """
    # The timestamp is interpreted in UTC, not in the local time zone
    moment = datetime.fromtimestamp(unix_timestamp, tz=timezone.utc)

    return moment.strftime("%Y-%m-%d"), moment.strftime('%H:%M:%S')

In [12]:
# Generator that streams a large CSV file chunk by chunk instead of loading it at once
def csv_dataset_reader(file_path, chunk_size, columns, delimiter, dtype):
    """
    Lazily yield the content of a CSV file as pandas DataFrames.

    :param str file_path: Path to the CSV file
    :param int chunk_size: Number of rows per yielded DataFrame
    :param list columns: Column names assigned to the data (passed as `names`)
    :param str delimiter: Column delimiter of the file
    :param dtype: dtype passed to pandas.read_csv()
    :return: Generator of DataFrames with at most chunk_size rows each
    """
    chunk_iterator = pd.read_csv(
        file_path,
        names=columns,
        delimiter=delimiter,
        dtype=dtype,
        chunksize=chunk_size,
    )
    yield from chunk_iterator

In [13]:
# Chunked reader over READ_PATH; every column is read as a plain python object
data_reader = csv_dataset_reader(
    READ_PATH,
    chunk_size=chunk_size,
    columns=columns,
    delimiter=delimiter,
    dtype=object,
)

In [None]:
def _missing_flag_as_bool(value):
    """Turn a MISSING_DATA cell into a real bool (read as text it is 'True'/'False', which never equals True)."""
    if isinstance(value, str):
        return value.strip().lower() == "true"
    if pd.isna(value):
        return False
    return bool(value)


def _save_trajectory_batch(batch, output_dir, file_counter):
    """Pickle one batch of trajectory DataFrames into output_dir and report it."""
    file_name = f"trajectories_{file_counter:03d}.pkl"
    save_object(batch, os.path.join(output_dir, file_name))
    print(f"Saved file {file_name} with {len(batch)} trajectories")


def process_large_gps_file_batched(
    file_path, chunk_size, columns, columns_to_drop,
    delimiter, dtype, output_dir, batch_size=50000,
    min_points=25, sampling_period_s=15
):
    """
    Convert a large trajectory CSV file into pickled batches of per-trajectory DataFrames.

    The file is read in chunks. Each row whose POLYLINE has at least min_points points
    becomes one DataFrame with one row per point and the columns:
    lat, lon, timestamp, time_diff (s), distance_km, speed_km (km/h), missing,
    trajectory_id, date, time. Distance and speed are measured from the previous
    point, so they are NaN for the first point of a trajectory.
    Batches are written to output_dir as trajectories_000.pkl, trajectories_001.pkl, ...

    :param str file_path: Path to the CSV file
    :param int chunk_size: Number of CSV rows read at once
    :param list columns: Column names of the CSV file; must include POLYLINE, TIMESTAMP and MISSING_DATA
    :param list columns_to_drop: Columns removed from every chunk before processing
    :param str delimiter: Column delimiter of the CSV file
    :param dtype: dtype passed to pandas.read_csv()
    :param str output_dir: Directory for the pickled batches (created if needed)
    :param int batch_size: Number of trajectories per pickle file
    :param int min_points: Trajectories with fewer points are skipped
    :param int sampling_period_s: Seconds between two consecutive points of a polyline
    :return: None
    """
    os.makedirs(output_dir, exist_ok=True)
    reader = csv_dataset_reader(file_path, chunk_size, columns, delimiter, dtype)

    batch = []
    file_counter = 0
    traj_counter = 0

    for data_chunk in reader:
        data_chunk = data_chunk.drop(columns=columns_to_drop)

        for row in data_chunk.itertuples(index=False):
            polyline = polyline_as_list(row.POLYLINE)
            n = len(polyline)
            if n == 0 or n < min_points:
                continue

            # Every polyline point is a [lon, lat] pair
            lats = np.array([pt[1] for pt in polyline], dtype=np.float32)
            lons = np.array([pt[0] for pt in polyline], dtype=np.float32)

            # Only the trip start time is stored; the points are sampled at a fixed period
            timestamps = int(row.TIMESTAMP) + np.arange(n, dtype=np.int64) * sampling_period_s
            timestamps = timestamps.astype(np.int64)
            time_diff = np.diff(timestamps, prepend=timestamps[0])

            distances = np.full(n, np.nan, dtype=np.float32)
            speeds = np.full(n, np.nan, dtype=np.float32)

            # Distance/speed of each point relative to the previous one
            for i in range(1, n):
                distances[i] = haversine_distance_km(lats[i], lons[i], lats[i - 1], lons[i - 1])
                speeds[i] = distances[i] / (time_diff[i] / 3600) if time_diff[i] > 0 else 0

            dates, times = zip(*(timestamp_to_datetime(ts) for ts in timestamps))

            df = pd.DataFrame({
                "lat": lats,
                "lon": lons,
                "timestamp": timestamps,
                "time_diff": time_diff,
                "distance_km": distances,
                "speed_km": speeds,
                "missing": _missing_flag_as_bool(row.MISSING_DATA),
                "trajectory_id": traj_counter,
                "date": list(dates),
                "time": list(times),
            })

            batch.append(df)
            traj_counter += 1

            if len(batch) >= batch_size:
                _save_trajectory_batch(batch, output_dir, file_counter)
                batch.clear()
                file_counter += 1

    # Write remaining data
    if batch:
        _save_trajectory_batch(batch, output_dir, file_counter)

In [15]:
# Convert the complete train.csv into pickled batches of 50000 trajectories each
process_large_gps_file_batched(
    file_path="../original_datasets/Porto-UCI/train.csv/train.csv",
    chunk_size=chunk_size,
    columns=columns,
    columns_to_drop=columns_to_drop,
    delimiter=delimiter,
    dtype=object,
    output_dir="../datasets/Porto/all/",
    batch_size=50000,
)

Saved file trajectories_000.pkl with 50000 trajectories
Saved file trajectories_001.pkl with 50000 trajectories
Saved file trajectories_002.pkl with 50000 trajectories
Saved file trajectories_003.pkl with 50000 trajectories
Saved file trajectories_004.pkl with 50000 trajectories
Saved file trajectories_005.pkl with 50000 trajectories
Saved file trajectories_006.pkl with 50000 trajectories
Saved file trajectories_007.pkl with 50000 trajectories
Saved file trajectories_008.pkl with 50000 trajectories
Saved file trajectories_009.pkl with 50000 trajectories
Saved file trajectories_010.pkl with 50000 trajectories
Saved file trajectories_011.pkl with 50000 trajectories
Saved file trajectories_012.pkl with 50000 trajectories
Saved file trajectories_013.pkl with 50000 trajectories
Saved file trajectories_014.pkl with 50000 trajectories
Saved file trajectories_015.pkl with 50000 trajectories
Saved file trajectories_016.pkl with 50000 trajectories
Saved file trajectories_017.pkl with 50000 traje

In [None]:
# filtered_dataset = [df for df in dataset if len(df['lat']) >= 25]

In [9]:
def trajectory_under_threshold(df, threshold=100):
    """
    Tell whether a trajectory barely moves.

    The haversine distance between the corner (max lat, max lon) and the corner
    (min lat, min lon) of the trajectory is compared with threshold.

    :param DataFrame df: Trajectory with "lat" and "lon" columns in degrees
    :param float threshold: Distance limit in metres
    :return: True if the distance between the two corners is under threshold
    :rtype: bool
    """
    upper_corner = [radians(df["lat"].max()), radians(df["lon"].max())]
    lower_corner = [radians(df["lat"].min()), radians(df["lon"].min())]

    # 6371008.7714 is the Earth radius in metres, scaling the unit-sphere distance
    diagonal_m = haversine_distances([upper_corner, lower_corner])[0, 1] * 6371008.7714

    return bool(diagonal_m < threshold)

In [None]:
# Filter every pickled batch in input_dir and write the surviving trajectories,
# under the same file name, to output_dir.
input_dir = "../datasets/Porto/all/"
output_dir = "../datasets/Porto/all/filtered/"
os.makedirs(output_dir, exist_ok=True)

for fname in sorted(os.listdir(input_dir)):

    if not fname.endswith(".pkl"):
        continue

    file_path = os.path.join(input_dir, fname)
    dataset = load_object(file_path)

    if dataset is None:
        continue

    filtered_trajectories = []

    for traj in dataset:

        # Drop only the trajectories flagged as incomplete, not the whole file.
        # The flag may be stored as text ('True'/'False'), which never equals the bool True.
        missing_flags = traj["missing"].astype(str).str.strip().str.lower()
        if (missing_flags == "true").any():
            continue

        # Unrealistic speed between two consecutive points
        if (traj["speed_km"] >= SPEED_KM_THRESHOLD).any():
            continue

        # Trajectory that barely moves
        if trajectory_under_threshold(traj, threshold=THRESHOLD_M):
            continue

        # Points outside the accepted lon/lat bounding box
        if not traj["lon"].between(-9.0, -8.0).all() or not traj["lat"].between(40.1, 41.7).all():
            continue

        filtered_trajectories.append(traj)

    # Save the filtered trajectories (not the unfiltered dataset) without the "missing" column.
    # DataFrame.drop() returns a new frame, so filtered_trajectories itself keeps the column.
    save_object(
        [traj.drop(columns=["missing"]) for traj in filtered_trajectories],
        os.path.join(output_dir, fname),
    )
    print(f"Saved filtered: {fname}")

Saved filtered: trajectories_000.pkl
Saved filtered: trajectories_001.pkl
Saved filtered: trajectories_002.pkl
Saved filtered: trajectories_003.pkl
Saved filtered: trajectories_004.pkl
Saved filtered: trajectories_005.pkl
Saved filtered: trajectories_006.pkl
Saved filtered: trajectories_007.pkl
Saved filtered: trajectories_008.pkl
Saved filtered: trajectories_009.pkl
Saved filtered: trajectories_010.pkl
Saved filtered: trajectories_011.pkl
Saved filtered: trajectories_012.pkl
Saved filtered: trajectories_013.pkl
Saved filtered: trajectories_014.pkl
Saved filtered: trajectories_015.pkl
Saved filtered: trajectories_016.pkl
Saved filtered: trajectories_017.pkl
Saved filtered: trajectories_018.pkl
Saved filtered: trajectories_019.pkl
Saved filtered: trajectories_020.pkl
Saved filtered: trajectories_021.pkl
Saved filtered: trajectories_022.pkl
Saved filtered: trajectories_023.pkl
Saved filtered: trajectories_024.pkl
Saved filtered: trajectories_025.pkl
Saved filtered: trajectories_026.pkl
S

In [None]:
# Keep only the plausible trajectories of filtered_dataset.
# NOTE(review): filtered_dataset is only defined in a commented-out line of this notebook - confirm where it comes from
filtered_trajectories = []

for traj in filtered_dataset:

    # Reject trajectories with an unrealistic speed or that barely move
    exceeds_speed = (traj["speed_km"] >= 150).any()
    if exceeds_speed or trajectory_under_threshold(traj):
        continue

    # Every point has to lie inside the accepted lon/lat bounding box
    lon_inside = traj["lon"].between(-9.0, -8.0).all()
    lat_inside = traj["lat"].between(40.1, 41.7).all()

    if lon_inside and lat_inside:
        # filtered_trajectories.append(fix_latlon_spikes_sklearn(traj, threshold_meters=300))
        filtered_trajectories.append(traj)

In [None]:
# Number of trajectories that passed all filters
len(filtered_trajectories)

72575

In [None]:
# Copy of the filtered trajectories without the "missing" column (drop() returns new DataFrames)
filtered_trajectories_analisys = [df.drop(columns=["missing"]) for df in filtered_trajectories]

In [None]:
# Pickle the analysis trajectories.
# NOTE(review): this literal path differs from both WRITE_PATH_TRAJ_ANALYSIS and WRITE_PATH_TRAJ_ANALYSIS_60k - confirm which file is intended
save_object(filtered_trajectories_analisys, "../datasets/Porto/porto_uci_analysis_last_60k.pkl")

True

In [None]:
# Same trajectories without the absolute "timestamp" column; "time_diff" is kept
filtered_trajectories_time_diff = [df.drop(columns=["timestamp"]) for df in filtered_trajectories_analisys]

In [None]:
# filtered_trajectories_time_diff = load_object(WRITE_PATH_TRAJ_TIME_DIFF)

In [None]:
# Pickle the trajectories that carry time differences instead of absolute timestamps.
# Alternative kept for reference: save only the lat, lon, time_diff, distance_km and speed_km columns
# save_time_diff = [df[["lat", "lon", "time_diff", "distance_km", "speed_km"]] for df in filtered_trajectories_time_diff]
save_object(filtered_trajectories_time_diff, WRITE_PATH_TRAJ_TIME_DIFF)

In [None]:
# Filtered trajectories with the derived columns and the "missing" flag removed
to_save = [df.drop(columns=["time_diff", "timestamp", "distance_km", "speed_km", "missing"]) for df in filtered_trajectories]

In [None]:
# Pickle the unfiltered trajectories and the reduced filtered trajectories.
# NOTE(review): `dataset` is whatever list was bound to that name last - confirm it is the intended one
save_object(dataset, WRITE_PATH_ORG)
save_object(to_save, WRITE_PATH_TRAJ)

True

In [None]:
# Read a pickled analysis file back as a sanity check.
# NOTE(review): no visible cell writes to WRITE_PATH_TRAJ_ANALYSIS - presumably produced by an earlier run, confirm
test = load_object(WRITE_PATH_TRAJ_ANALYSIS)

In [None]:
# Show the first rows of the first loaded trajectory
test[0].head()

Unnamed: 0,lat,lon,timestamp,time_diff,distance_km,speed_km
0,41.18049,-8.645994,1372637091,,,
1,41.180517,-8.645949,1372637106,15.0,0.004816,1.155908
2,41.180049,-8.646048,1372637121,15.0,0.052695,12.646719
3,41.178888,-8.646804,1372637136,15.0,0.143768,34.504334
4,41.178465,-8.649495,1372637151,15.0,0.230074,55.217828
