# Data Processing Pipeline

In [113]:
# Project modules are imported as module objects first so they can be reloaded below.
import importlib
import preprocess_data
import regularize_tracks
import trajectory_segmentation
# Re-execute the project modules so that edits made on disk are picked up
# without restarting the kernel.
importlib.reload(preprocess_data)
importlib.reload(regularize_tracks)
importlib.reload(trajectory_segmentation)
# The from-imports are placed after the reloads so the names bind to the
# freshly reloaded module contents.
from regularize_tracks import resample_segment

from preprocess_data import process_zip, drop_duplicate_messages, save_parquet_partitioned
from trajectory_segmentation import segment_trajectories
# NOTE(review): download_data is not reloaded above — confirm that is intentional.
from download_data import download_ais_range

import matplotlib.pyplot as plt
import os
import glob
import pandas as pd
import numpy as np
from datetime import datetime, timedelta


## Input Parameters

In [None]:
# Download window. Leave either bound as None to skip the download step.
START_DATE = datetime(year=2025, month=8, day=1)
END_DATE = datetime(year=2025, month=8, day=3)

# Folder that holds the raw ZIP archives
ZIP_DIR = os.path.join("data", "aisdk", "raw")
# Folder that holds the interim Parquet dataset
PARQUET_DIR = os.path.join("data", "aisdk", "interim", "aisdk_2025")

# Output location of the final, processed Parquet data
PARQUET_FILE_FINAL = os.path.join("data", "aisdk", "processed", "aisdk_2025")

## Data Download

In [36]:
# Fetch the archives only when both ends of the date range are configured;
# a None bound skips the download entirely.
if START_DATE and END_DATE:
    download_ais_range(
        start_date=START_DATE,
        end_date=END_DATE,
        output_dir=ZIP_DIR,
    )

Downloading AIS data to: data/ais_aug_2025
Date range: 2025-08-01 → 2025-08-03 (exclusive)

→ 2025-08-01: http://aisdata.ais.dk/aisdk-2025-08-01.zip


KeyboardInterrupt: 

## Data Preprocessing

In [97]:
# The interim dataset directory must exist before anything is written to it.
os.makedirs(PARQUET_DIR, exist_ok=True)

# Collect every August 2025 archive in ZIP_DIR, ordered by file name
pattern = os.path.join(ZIP_DIR, "aisdk-2025-08-*.zip")
zip_files = glob.glob(pattern)
zip_files.sort()

print(f"Found {len(zip_files)} ZIP files to process.")

# Process each archive in turn, passing PARQUET_DIR as the output location
for zip_path in zip_files:
    process_zip(zip_path, PARQUET_DIR)

print("All files processed.")

Found 2 ZIP files to process.

=== Processing ../../../data/ais_aug_2025/aisdk-2025-08-01.zip ===
Output path: data/ais_aug_2025_parquet
Reading ../../../data/ais_aug_2025/aisdk-2025-08-01.zip ...
Columns in raw DF: ['# Timestamp', 'Type of mobile', 'MMSI', 'Latitude', 'Longitude', 'SOG', 'COG', 'Heading', 'Ship type']
Filtered to Ship type == Cargo and dropped column.
Applied geographic bounding box (60, 0, 50, 20).
Filtered to Type of mobile in ['Class A', 'Class B'] and dropped column.
Applied MMSI format and MID filters.
Parsed Timestamp column.
Converted SOG from knots to m/s.
Final columns: ['Timestamp', 'MMSI', 'Latitude', 'Longitude', 'SOG', 'COG', 'Heading', 'UTM_x', 'UTM_y', 'UTM_zone', 'UTM_letter']
Rows after filtering: 3470997
Saving to parquet dataset at data/ais_aug_2025_parquet ...
Parquet save done.
=== Done for ../../../data/ais_aug_2025/aisdk-2025-08-01.zip ===


=== Processing ../../../data/ais_aug_2025/aisdk-2025-08-02.zip ===
Output path: data/ais_aug_2025_parquet

In [98]:
# Read everything stored under PARQUET_DIR back into a single DataFrame.
df = pd.read_parquet(PARQUET_DIR)

In [100]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,Timestamp,Latitude,Longitude,SOG,COG,Heading,UTM_x,UTM_y,UTM_zone,UTM_letter,MMSI,Trajectory
0,2025-08-01 00:02:24,57.112585,12.245682,0.0,167.7,188.0,333212.967255,6333286.0,33,V,205136000,
1,2025-08-01 00:08:20,57.11259,12.245678,0.0,17.2,188.0,333212.747587,6333286.0,33,V,205136000,
2,2025-08-01 00:08:24,57.11259,12.245678,0.0,17.2,188.0,333212.747587,6333286.0,33,V,205136000,
3,2025-08-01 00:14:21,57.112583,12.245682,0.0,135.0,188.0,333212.958265,6333286.0,33,V,205136000,
4,2025-08-01 00:14:24,57.112583,12.245682,0.0,135.0,188.0,333212.958265,6333286.0,33,V,205136000,


## Create trajectories

In [101]:
# Remove duplicate messages first, then split the tracks into trajectories.
df = drop_duplicate_messages(df)
df = segment_trajectories(
    df,
    sog_threshold=0.5,  # speed threshold in m/s (roughly 1 knot)
    position_threshold=50,  # position threshold: 50 meters
    time_threshold=30,  # time threshold: 30 minutes
)

Dropped 6409610 duplicate (Timestamp, MMSI) rows.
Processing ship 1
Processing ship 2
Processing ship 3
Processing ship 4
Processing ship 5
Processing ship 6
Processing ship 7
Processing ship 8
Processing ship 9
Processing ship 10
Processing ship 11
Processing ship 12
Processing ship 13
Processing ship 14
Processing ship 15
Processing ship 16
Processing ship 17
Processing ship 18
Processing ship 19
Processing ship 20
Processing ship 21
Processing ship 22
Processing ship 23
Processing ship 24
Processing ship 25
Processing ship 26
Processing ship 27
Processing ship 28
Processing ship 29
Processing ship 30
Processing ship 31
Processing ship 32
Processing ship 33
Processing ship 34
Processing ship 35
Processing ship 36
Processing ship 37
Processing ship 38
Processing ship 39
Processing ship 40
Processing ship 41
Processing ship 42
Processing ship 43
Processing ship 44
Processing ship 45
Processing ship 46
Processing ship 47
Processing ship 48
Processing ship 49
Processing ship 50
Processin

## Missing values

In [102]:
# Rows where speed (SOG) or course (COG) is missing
mask_missing = df[["SOG", "COG"]].isna().any(axis=1)
# Segments (MMSI, Trajectory) that contain at least one such row
bad_segments = df.loc[mask_missing, ["MMSI", "Trajectory"]].drop_duplicates()

# Every distinct segment in the data
all_segments = df[["MMSI", "Trajectory"]].drop_duplicates()

# Anti-join: keep only the segments that do not appear in bad_segments
clean_segments = (
    all_segments
    .merge(bad_segments, how="left", on=["MMSI", "Trajectory"], indicator=True)
    .loc[lambda merged: merged["_merge"] == "left_only"]
    .drop(columns="_merge")
)

print(clean_segments)

# Restrict the full frame to rows that belong to a clean segment
clean_df = df.merge(clean_segments, how="inner", on=["MMSI", "Trajectory"])

          MMSI  Trajectory
0    207842750           0
1    209014000           1
2    209014000           2
3    209078000           3
4    209184000           4
..         ...         ...
861  636092635         861
862  636093117         862
863  636093288         863
864  636093318         864
865  636093318         865

[836 rows x 2 columns]


In [117]:
# Check how many segments were removed.
# Each frame holds one row per (MMSI, Trajectory) pair, so its length is the
# segment count; printing .shape would show a (rows, columns) tuple instead.
print("Number of segments before cleaning:", len(all_segments))
print("Number of segments after cleaning:", len(clean_segments))
print("Number of removed segments:", len(bad_segments))

# The clean and removed segments must partition the full set of segments.
assert len(clean_segments) + len(bad_segments) == len(all_segments), \
    "clean + removed segments do not add up to the total"

Number of segments before cleaning: (866, 2)
Number of segments after cleaning: (836, 2)
Number of removed segments: (30, 2)


## Resampling and Imputation

In [None]:
# Apply resample_segment to each (MMSI, Trajectory) group, then turn the
# resulting three index levels back into ordinary columns.
df_resampled = (
    clean_df
    .groupby(["MMSI", "Trajectory"])
    .apply(resample_segment)
    .reset_index(names=["MMSI", "Trajectory", "Timestamp"])
)

  df_resampled = clean_df.groupby(["MMSI", "Trajectory"]).apply(resample_segment)
  df_resampled = clean_df.groupby(["MMSI", "Trajectory"]).apply(resample_segment)


In [111]:
# Preview the first rows of the resampled frame.
df_resampled.head()

Unnamed: 0,MMSI,Trajectory,Timestamp,UTM_x,UTM_y,SOG,v_east,v_north
0,207842750,0,2025-08-02 02:34:14,546363.385723,6172981.0,5.195884,-5.021178,-1.336032
1,207842750,0,2025-08-02 02:35:14,546054.417563,6172894.0,5.14588,-4.972856,-1.323175
2,207842750,0,2025-08-02 02:36:14,545746.424308,6172808.0,5.195884,-5.023085,-1.328845
3,207842750,0,2025-08-02 02:37:14,545447.154465,6172725.0,5.195884,-5.022404,-1.331419
4,207842750,0,2025-08-02 02:38:14,545147.878335,6172642.0,5.245888,-5.067139,-1.357736


In [None]:
# Quick check for missing values: count the nulls in every column of the
# resampled frame.
df_resampled.isnull().sum()

MMSI          0
Trajectory    0
Timestamp     0
UTM_x         0
UTM_y         0
SOG           0
v_east        0
v_north       0
dtype: int64

## Degrees conversion: COG to velocity components

In [107]:
# Decompose COG into its vector components.
# The cell is safe to re-run: once COG has been replaced by v_east / v_north,
# executing it again is a no-op instead of raising KeyError on the missing column.
if "COG" in df_resampled.columns:
    cog_radians = np.radians(df_resampled["COG"])  # convert to radians
    df_resampled = df_resampled.assign(
        v_east=df_resampled["SOG"] * np.sin(cog_radians),   # eastward component
        v_north=df_resampled["SOG"] * np.cos(cog_radians),  # northward component
    ).drop(columns=["COG"])  # build a new frame rather than mutating in place
elif not {"v_east", "v_north"}.issubset(df_resampled.columns):
    # Neither the input column nor the derived columns exist: fail loudly.
    raise KeyError("COG")

In [108]:
# Preview the first rows of df_resampled at this point in the pipeline.
df_resampled.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,UTM_x,UTM_y,SOG,v_east,v_north
MMSI,Trajectory,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
207842750,0,2025-08-02 02:34:14,546363.385723,6172981.0,5.195884,-5.021178,-1.336032
207842750,0,2025-08-02 02:35:14,546054.417563,6172894.0,5.14588,-4.972856,-1.323175
207842750,0,2025-08-02 02:36:14,545746.424308,6172808.0,5.195884,-5.023085,-1.328845
207842750,0,2025-08-02 02:37:14,545447.154465,6172725.0,5.195884,-5.022404,-1.331419
207842750,0,2025-08-02 02:38:14,545147.878335,6172642.0,5.245888,-5.067139,-1.357736


In [109]:
# Dimensions of the frame as a (rows, columns) tuple.
df_resampled.shape

(517929, 5)

## Write to a final parquet file

In [None]:
# Write the resampled tracks to the final location, partitioned by vessel
# (MMSI) and trajectory.
save_parquet_partitioned(
    df_resampled,
    out_path=PARQUET_FILE_FINAL,
    partition_cols=["MMSI", "Trajectory"],
)

Saving to parquet dataset at data/ais_aug_2025_parquet_final ...
Parquet save done.
