In [None]:
import importlib

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import utm
from geopy.distance import geodesic
from plotly.subplots import make_subplots

from ais_to_parquet import fn
import veda.interpolation as interpolation
from veda.interpolation import regularize_trajectory, regularize_all_trajectories
from veda.trajectory_segmentation import calculate_max_radius, segment_by_stationary_periods

# Download data

I adjusted the `ais_to_parquet.py` script to also extract ship type. We may also consider including:
- ROT
- Heading
- Destination
- ETA

In [2]:
# One-time CSV -> Parquet conversion; uncomment to (re)generate the parquet files read below.
# fn('data/ais_data/aisdk-2025-11-01.csv', 'data/ais_data/aisdk-2025-11-01.parquet')
# fn('data/ais_data/aisdk-2025-11-02.csv', 'data/ais_data/aisdk-2025-11-02.parquet')
# fn('data/ais_data/aisdk-2025-11-03.csv', 'data/ais_data/aisdk-2025-11-03.parquet')
# fn('data/ais_data/aisdk-2025-11-04.csv', 'data/ais_data/aisdk-2025-11-04.parquet')
# fn('data/ais_data/aisdk-2025-11-05.csv', 'data/ais_data/aisdk-2025-11-05.parquet')
# fn('data/ais_data/aisdk-2025-11-06.csv', 'data/ais_data/aisdk-2025-11-06.parquet')
# fn('data/ais_data/aisdk-2025-11-07.csv', 'data/ais_data/aisdk-2025-11-07.parquet')

In [3]:
# Read each daily parquet extract and stack them into a single frame.
# Only 2025-11-01 and 2025-11-02 are loaded for now; the extracts for
# 2025-11-03 through 2025-11-07 follow the same naming pattern and can be
# appended to this list.
parquet_files = [
    'data/ais_data/aisdk-2025-11-01.parquet',
    'data/ais_data/aisdk-2025-11-02.parquet',
]
dfs = list(map(pd.read_parquet, parquet_files))
df = pd.concat(dfs, ignore_index=True)

In [4]:
# Sanity check on the combined frame: (rows, columns), then a preview of the first rows
print(df.shape)
df.head()

(11957681, 8)


Unnamed: 0,Timestamp,Latitude,Longitude,SOG,COG,Ship type,MMSI,Segment
0,2025-11-01 00:00:54,54.638343,11.375378,0.0,340.8,Undefined,205196000,0
1,2025-11-01 00:03:48,54.638343,11.375378,0.0,340.8,Other,205196000,0
2,2025-11-01 00:03:54,54.638335,11.375375,0.0,340.8,Other,205196000,0
3,2025-11-01 00:06:54,54.638353,11.375373,0.0,340.8,Other,205196000,0
4,2025-11-01 00:09:48,54.638353,11.375373,0.0,340.8,Other,205196000,0


# Preprocessing

In [5]:
# Number of distinct MMSI identifiers in the combined data
print("Total vessels: ", len(df['MMSI'].unique()))

Total vessels:  1608


In [6]:
# Restrict to cargo vessels; the ship-type column is constant afterwards, so drop it
is_cargo = df['Ship type'] == 'Cargo'
cdf = df.loc[is_cargo].drop(columns=['Ship type'])
print("Cargo vessels: ", len(cdf['MMSI'].unique()))

Cargo vessels:  562


## Missing Values

Here I check to see where we have missing values. Since it seems that there are only a few vessels missing SOG and COG values, we could either calculate them manually based on lat/long and time, or we could just drop the affected vessels.

In [7]:
# Number of missing values per column
cdf.isna().sum()

Timestamp       0
Latitude        0
Longitude       0
SOG           295
COG          4447
MMSI            0
Segment         0
dtype: int64

In [8]:
# MMSIs of vessels that have at least one missing COG / SOG reading
m_cog = cdf.loc[cdf['COG'].isnull(), 'MMSI'].unique()
m_sog = cdf.loc[cdf['SOG'].isnull(), 'MMSI'].unique()

# Vessels affected by either kind of gap
m_missing = set(m_cog) | set(m_sog)
print("Number of ships with missing COG or SOG: ", len(m_missing))

Number of ships with missing COG or SOG:  27


Here we need to decide whether we want to drop the ships with missing COG/SOG values, or whether we want to estimate them using positional and temporal data. For now, the affected vessels are dropped entirely in the segmentation step below.

## Trajectory Segmentation Based on Stationary Periods

Split trajectories whenever the ship is stationary for more than 30 minutes. A ship is considered stationary if:
- SOG < 0.5 m/s (roughly 1 knot; SOG is already in m/s after conversion), OR
- Position variance < 50m over the stationary period

In [9]:
# Keep only vessels that never report a missing SOG/COG value
has_missing = cdf['MMSI'].isin(m_missing)
cdf_clean = cdf.loc[~has_missing].copy()

print("Applying trajectory segmentation...")
print(f"Original shape: {cdf_clean.shape}")
print(f"Number of vessels: {cdf_clean['MMSI'].nunique()}")

Applying trajectory segmentation...
Original shape: (3549302, 7)
Number of vessels: 535


In [10]:
# Thresholds handed to the stationary-period segmentation helper
SOG_THRESHOLD = 0.5       # m/s; slightly below 1 knot (1 kn is about 0.514 m/s)
POSITION_THRESHOLD = 50   # 50 meters
TIME_THRESHOLD = 30       # 30 minutes

# Segment trajectories
cdf_segmented = segment_by_stationary_periods(
    cdf_clean,
    sog_threshold=SOG_THRESHOLD,
    position_threshold=POSITION_THRESHOLD,
    time_threshold=TIME_THRESHOLD,
)

# Count once, report twice
n_trajectories = cdf_segmented['Trajectory'].nunique()
n_vessels = cdf_segmented['MMSI'].nunique()

print("\nAfter segmentation:")
print(f"Total trajectories: {n_trajectories}")
print(f"Average trajectories per vessel: {n_trajectories / n_vessels:.2f}")


After segmentation:
Total trajectories: 582
Average trajectories per vessel: 1.10


In [11]:
# Per-trajectory summary: owning vessel, time span, point counts and speed statistics.
# Named aggregation yields flat, explicitly named columns in the listed order.
trajectory_stats = (
    cdf_segmented
    .groupby('Trajectory')
    .agg(
        MMSI=('MMSI', 'first'),
        Start_Time=('Timestamp', 'min'),
        End_Time=('Timestamp', 'max'),
        Num_Points=('Timestamp', 'count'),
        Avg_SOG=('SOG', 'mean'),
        Max_SOG=('SOG', 'max'),
        Count=('Latitude', 'count'),
    )
    .reset_index()
)

# Elapsed time between first and last fix of each trajectory, in hours
elapsed = trajectory_stats['End_Time'] - trajectory_stats['Start_Time']
trajectory_stats['Duration_Hours'] = elapsed.dt.total_seconds() / 3600

points = trajectory_stats['Num_Points']
hours = trajectory_stats['Duration_Hours']

print("Trajectory Statistics:")
print(f"Min points per trajectory: {points.min()}")
print(f"Max points per trajectory: {points.max()}")
print(f"Mean points per trajectory: {points.mean():.2f}")
print(f"\nMin duration (hours): {hours.min():.2f}")
print(f"Max duration (hours): {hours.max():.2f}")
print(f"Mean duration (hours): {hours.mean():.2f}")

trajectory_stats.head(10)

Trajectory Statistics:
Min points per trajectory: 38
Max points per trajectory: 26592
Mean points per trajectory: 5950.73

Min duration (hours): 0.39
Max duration (hours): 47.98
Mean duration (hours): 17.21


Unnamed: 0,Trajectory,MMSI,Start_Time,End_Time,Num_Points,Avg_SOG,Max_SOG,Count,Duration_Hours
0,0,205453000,2025-11-02 03:38:07,2025-11-02 07:47:34,2215,6.446854,8.591215,2215,4.1575
1,1,205453000,2025-11-02 11:08:06,2025-11-02 15:01:49,1980,7.148199,8.694104,1980,3.895278
2,2,205465000,2025-11-02 16:39:50,2025-11-02 23:59:52,2943,4.259572,4.990107,2943,7.333889
3,3,209014000,2025-11-01 01:14:29,2025-11-02 07:02:50,10736,3.691677,5.555995,10736,29.805833
4,4,209190000,2025-11-02 01:48:29,2025-11-02 07:26:13,3133,7.351065,7.665216,3133,5.628889
5,5,209276000,2025-11-01 11:22:34,2025-11-02 09:10:37,11025,7.694304,8.642659,11025,21.800833
6,6,209325000,2025-11-01 00:01:28,2025-11-02 09:26:57,10046,4.698723,6.327661,10046,33.424722
7,7,209336000,2025-11-01 13:54:03,2025-11-01 21:52:25,2739,4.238068,5.555995,2739,7.972778
8,8,209525000,2025-11-01 01:32:11,2025-11-02 11:09:38,16059,4.334073,5.96755,16059,33.624167
9,9,209525000,2025-11-02 23:36:40,2025-11-02 23:59:54,207,0.840756,3.39533,207,0.387222


In [12]:
# Visualize a ship with multiple trajectories to verify segmentation.
# Count trajectories per vessel and keep the vessels that were split at least once.
ships_with_multiple_trajectories = cdf_segmented.groupby('MMSI')['Trajectory'].nunique()
multi_traj_ships = ships_with_multiple_trajectories[ships_with_multiple_trajectories > 1]

if len(multi_traj_ships) > 0:
    # Use the 4th qualifying ship when available; fall back to the last one
    # so the cell does not raise IndexError when fewer than four ships qualify.
    sample_position = min(3, len(multi_traj_ships) - 1)
    sample_mmsi = multi_traj_ships.index[sample_position]
    sample_ship_df = cdf_segmented[cdf_segmented['MMSI'] == sample_mmsi].sort_values('Timestamp')

    print(f"Visualizing MMSI {sample_mmsi} with {sample_ship_df['Trajectory'].nunique()} trajectories")

    # One line per trajectory, colored by trajectory ID
    fig = px.line_map(
        sample_ship_df,
        lat="Latitude",
        lon="Longitude",
        color="Trajectory",
        hover_data=["Timestamp", "SOG"],
        zoom=5,
        title=f"Segmented Trajectories for MMSI {sample_mmsi}"
    )
    # px.line_map draws on a `map` subplot, so the basemap is set through
    # `map_style`; `mapbox_style` only affects the legacy mapbox subplot.
    fig.update_layout(map_style="open-street-map")
    fig.show()
else:
    print("No ships with multiple trajectories found")

Visualizing MMSI 211833390 with 2 trajectories


## Time Series Regularization

Resample trajectories to regular time intervals using linear interpolation. This ensures consistent sampling frequency for RNN training.

In [None]:
# Development aid: pick up edits to veda/interpolation.py without restarting the kernel.
# importlib.reload re-executes the module, but names bound earlier via
# `from veda.interpolation import ...` keep pointing at the old objects, so
# rebind every name that the import cell pulled from this module.
importlib.reload(interpolation)
from veda.interpolation import regularize_trajectory, regularize_all_trajectories

In [29]:
# Resampling step in minutes (e.g., 5 minutes is common for AIS data)
INTERVAL_MINUTES = 5

n_before = len(cdf_segmented)
print(f"Before regularization: {n_before} data points")

# Resample every trajectory onto the fixed time grid
cdf_regular = regularize_all_trajectories(
    cdf_segmented,
    interval_minutes=INTERVAL_MINUTES,
)

n_after = len(cdf_regular)
print(f"After regularization: {n_after} data points")
print(f"Time interval: {INTERVAL_MINUTES} minutes")

Before regularization: 3463323 data points
Regularizing 582 trajectories...
  Processed 100/582 trajectories
  Processed 200/582 trajectories
  Processed 100/582 trajectories
  Processed 200/582 trajectories
  Processed 300/582 trajectories
  Processed 400/582 trajectories
  Processed 300/582 trajectories
  Processed 400/582 trajectories
  Processed 500/582 trajectories
  Completed all 582 trajectories
After regularization: 120479 data points
Time interval: 5 minutes
  Processed 500/582 trajectories
  Completed all 582 trajectories
After regularization: 120479 data points
Time interval: 5 minutes


In [30]:
# Spot-check the resampled spacing on a single trajectory (ID 0)
is_sample = cdf_regular['Trajectory'] == 0
sample_traj = cdf_regular.loc[is_sample].sort_values('Timestamp')

# Gap between consecutive rows, in minutes (first row has no predecessor -> NaN)
time_diffs = sample_traj['Timestamp'].diff().dt.total_seconds().div(60)

print("Sample trajectory time intervals (minutes):")
print(time_diffs.value_counts().head())
print(f"\nMean interval: {time_diffs.mean():.2f} minutes")
print(f"Std interval: {time_diffs.std():.4f} minutes")

Sample trajectory time intervals (minutes):
Timestamp
5.0    49
Name: count, dtype: int64

Mean interval: 5.00 minutes
Std interval: 0.0000 minutes


In [31]:
# Visualize before and after regularization for comparison
if len(multi_traj_ships) > 0:
    # Use the first trajectory of the first multi-trajectory ship
    sample_mmsi = multi_traj_ships.index[0]
    sample_traj_id = cdf_segmented[cdf_segmented['MMSI'] == sample_mmsi]['Trajectory'].iloc[0]

    # Original (irregular) data
    original = cdf_segmented[cdf_segmented['Trajectory'] == sample_traj_id].sort_values('Timestamp')

    # Regularized data
    regularized = cdf_regular[cdf_regular['Trajectory'] == sample_traj_id].sort_values('Timestamp')

    print(f"Trajectory {sample_traj_id} (MMSI {sample_mmsi}):")
    print(f"  Original: {len(original)} points")
    print(f"  Regularized: {len(regularized)} points")

    # Overlay both versions on one map
    fig = go.Figure()

    # Original data
    fig.add_trace(go.Scattermap(
        lat=original['Latitude'],
        lon=original['Longitude'],
        mode='markers+lines',
        name='Original (irregular)',
        marker=dict(size=6, color='blue'),
        line=dict(width=2, color='blue')
    ))

    # Regularized data
    fig.add_trace(go.Scattermap(
        lat=regularized['Latitude'],
        lon=regularized['Longitude'],
        mode='markers+lines',
        name=f'Regularized ({INTERVAL_MINUTES}min)',
        marker=dict(size=6, color='red'),
        line=dict(width=2, color='red')
    ))

    # Scattermap traces render on the `map` subplot; settings placed under
    # `mapbox` are ignored by them, so style/center/zoom must go under `map`.
    fig.update_layout(
        map=dict(
            style="open-street-map",
            center=dict(
                lat=original['Latitude'].mean(),
                lon=original['Longitude'].mean()
            ),
            zoom=8
        ),
        title=f"Comparison: Original vs Regularized Trajectory {sample_traj_id}",
        height=600
    )

    fig.show()

Trajectory 0 (MMSI 205453000):
  Original: 2215 points
  Regularized: 50 points


In [32]:
# Headline counts for the regularized dataset
n_points = len(cdf_regular)
n_trajectories = cdf_regular['Trajectory'].nunique()
n_vessels = cdf_regular['MMSI'].nunique()

print("Regularized Dataset Summary:")
print(f"Total data points: {n_points}")
print(f"Total trajectories: {n_trajectories}")
print(f"Total vessels: {n_vessels}")

# Number of time steps in each trajectory
traj_lengths = cdf_regular.groupby('Trajectory').size()
print("\nTrajectory lengths (number of time steps):")
print(f"  Min: {traj_lengths.min()}")
print(f"  Max: {traj_lengths.max()}")
print(f"  Mean: {traj_lengths.mean():.2f}")
print(f"  Median: {traj_lengths.median():.2f}")

cdf_regular.head()

Regularized Dataset Summary:
Total data points: 120479
Total trajectories: 582
Total vessels: 530

Trajectory lengths (number of time steps):
  Min: 5
  Max: 576
  Mean: 207.01
  Median: 168.50


Unnamed: 0,Timestamp,Latitude,Longitude,SOG,COG,MMSI,Segment,Trajectory
0,2025-11-02 03:38:07,55.066067,7.370117,7.71666,53.7,205453000.0,0.0,0.0
1,2025-11-02 03:43:07,55.0784,7.398958,7.742382,52.9,205453000.0,,0.0
2,2025-11-02 03:48:07,55.090975,7.428625,7.690938,53.8,205453000.0,,0.0
3,2025-11-02 03:53:07,55.103375,7.457942,7.613771,52.95,205453000.0,,0.0
4,2025-11-02 03:58:07,55.115492,7.486558,7.613771,53.7,205453000.0,,0.0


## Convert to UTM Coordinates

Convert latitude/longitude to UTM (Universal Transverse Mercator) coordinates. UTM provides planar coordinates in meters, which is better for:
- Distance calculations
- Velocity computations
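- Neural network training (avoids spherical coordinate issues)

**Caveat:** UTM coordinates are only planar within a single zone. The data below spans zones 31–33, so eastings from different zones are not directly comparable unless all points are projected into one common zone.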

In [34]:
def lat_lon_to_utm(df, force_zone_number=None):
    """
    Convert latitude and longitude to UTM coordinates.

    Parameters:
    - df: DataFrame with 'Latitude' and 'Longitude' columns
    - force_zone_number: optional UTM zone number forwarded to
      utm.from_latlon so that every point is projected into the same zone.
      With the default (None) each point is projected into its own zone, in
      which case UTM_x/UTM_y values from different zones are not directly
      comparable.

    Returns:
    - Copy of df with added 'UTM_x', 'UTM_y', 'UTM_zone', 'UTM_letter' columns
      (the input DataFrame is not modified)
    """
    df = df.copy()

    # Convert each point once; every call returns
    # (easting, northing, zone_number, zone_letter).
    converted = [
        utm.from_latlon(lat, lon, force_zone_number=force_zone_number)
        for lat, lon in zip(df['Latitude'], df['Longitude'])
    ]

    # Transpose the per-point tuples into four columns. An empty frame has
    # nothing to transpose and simply gets four empty columns.
    if converted:
        eastings, northings, zone_numbers, zone_letters = zip(*converted)
    else:
        eastings = northings = zone_numbers = zone_letters = ()

    # Build each column on the frame's own index so rows stay aligned
    df['UTM_x'] = pd.Series(list(eastings), index=df.index, dtype='float64')  # Easting (meters)
    df['UTM_y'] = pd.Series(list(northings), index=df.index, dtype='float64')  # Northing (meters)
    df['UTM_zone'] = pd.Series(list(zone_numbers), index=df.index, dtype='int64')  # Zone number
    df['UTM_letter'] = pd.Series(list(zone_letters), index=df.index, dtype=object)  # Zone letter

    return df

print("Converting to UTM coordinates...")
cdf_utm = lat_lon_to_utm(cdf_regular)

print("Conversion complete!")
print("\nUTM zones in data:")
print(cdf_utm.groupby(['UTM_zone', 'UTM_letter']).size())

# Eastings restart in every zone, so coordinates from different zone numbers
# do not share one planar frame. Make that visible instead of mixing silently.
n_zone_numbers = cdf_utm['UTM_zone'].nunique()
if n_zone_numbers > 1:
    print(
        f"\nWARNING: data spans {n_zone_numbers} UTM zone numbers; "
        "UTM_x/UTM_y are only comparable within a single zone."
    )

cdf_utm.head()

Converting to UTM coordinates...
Conversion complete!

UTM zones in data:
UTM_zone  UTM_letter
31        U              2129
32        U             24860
          V             45995
33        U             41138
          V              6357
dtype: int64


Unnamed: 0,Timestamp,Latitude,Longitude,SOG,COG,MMSI,Segment,Trajectory,UTM_x,UTM_y,UTM_zone,UTM_letter
0,2025-11-02 03:38:07,55.066067,7.370117,7.71666,53.7,205453000.0,0.0,0.0,395914.986518,6103357.0,32,U
1,2025-11-02 03:43:07,55.0784,7.398958,7.742382,52.9,205453000.0,,0.0,397788.064949,6104687.0,32,U
2,2025-11-02 03:48:07,55.090975,7.428625,7.690938,53.8,205453000.0,,0.0,399713.341389,6106043.0,32,U
3,2025-11-02 03:53:07,55.103375,7.457942,7.613771,52.95,205453000.0,,0.0,401614.684729,6107381.0,32,U
4,2025-11-02 03:58:07,55.115492,7.486558,7.613771,53.7,205453000.0,,0.0,403469.525097,6108689.0,32,U


In [35]:
# Extent of the projected coordinates along each axis
easting = cdf_utm['UTM_x']
northing = cdf_utm['UTM_y']

print("UTM Coordinate Statistics:")
print("\nUTM_x (Easting):")
print(f"  Min: {easting.min():.2f} m")
print(f"  Max: {easting.max():.2f} m")
print(f"  Range: {easting.max() - easting.min():.2f} m")

print("\nUTM_y (Northing):")
print(f"  Min: {northing.min():.2f} m")
print(f"  Max: {northing.max():.2f} m")
print(f"  Range: {northing.max() - northing.min():.2f} m")

UTM Coordinate Statistics:

UTM_x (Easting):
  Min: 152950.88 m
  Max: 695387.88 m
  Range: 542437.00 m

UTM_y (Northing):
  Min: 5935455.03 m
  Max: 6541977.93 m
  Range: 606522.90 m


In [36]:
# Visualize a trajectory in both coordinate systems.
# Take the trajectory that owns row 100, clamped so shorter frames do not raise IndexError.
sample_row = min(100, len(cdf_utm) - 1)
sample_traj_id = cdf_utm['Trajectory'].iloc[sample_row]
sample_traj = cdf_utm[cdf_utm['Trajectory'] == sample_traj_id].sort_values('Timestamp')

# Left panel: tile map in lat/lon; right panel: plain x/y axes in meters
fig = make_subplots(
    rows=1, cols=2,
    subplot_titles=('Lat/Lon Coordinates', 'UTM Coordinates (meters)'),
    specs=[[{'type': 'scattermap'}, {'type': 'scatter'}]]
)

# Lat/Lon plot
fig.add_trace(
    go.Scattermap(
        lat=sample_traj['Latitude'],
        lon=sample_traj['Longitude'],
        mode='markers+lines',
        marker=dict(size=8, color='blue'),
        line=dict(width=2, color='blue'),
        name='Lat/Lon'
    ),
    row=1, col=1
)

# UTM plot
fig.add_trace(
    go.Scatter(
        x=sample_traj['UTM_x'],
        y=sample_traj['UTM_y'],
        mode='markers+lines',
        marker=dict(size=8, color='red'),
        line=dict(width=2, color='red'),
        name='UTM'
    ),
    row=1, col=2
)

fig.update_layout(
    # The 'scattermap' subplot is configured through `map`; settings placed
    # under `mapbox` are ignored by it.
    map=dict(
        style="open-street-map",
        center=dict(
            lat=sample_traj['Latitude'].mean(),
            lon=sample_traj['Longitude'].mean()
        ),
        zoom=8
    ),
    # Lock the aspect ratio so one meter east equals one meter north on screen
    xaxis=dict(title='UTM Easting (m)', scaleanchor='y', scaleratio=1),
    yaxis=dict(title='UTM Northing (m)'),
    height=500,
    title=f"Trajectory {sample_traj_id} Comparison"
)

fig.show()

print(f"Trajectory {sample_traj_id}:")
print(f"  Points: {len(sample_traj)}")
print(f"  UTM Zone: {sample_traj['UTM_zone'].iloc[0]}{sample_traj['UTM_letter'].iloc[0]}")

Trajectory 2.0:
  Points: 89
  UTM Zone: 32U


In [51]:
# Preview of the UTM-augmented frame
cdf_utm.head()

Unnamed: 0,Timestamp,Latitude,Longitude,SOG,COG,MMSI,Segment,Trajectory,UTM_x,UTM_y,UTM_zone,UTM_letter
0,2025-11-02 03:38:07,55.066067,7.370117,7.71666,53.7,205453000.0,0.0,0.0,395914.986518,6103357.0,32,U
1,2025-11-02 03:43:07,55.0784,7.398958,7.742382,52.9,205453000.0,,0.0,397788.064949,6104687.0,32,U
2,2025-11-02 03:48:07,55.090975,7.428625,7.690938,53.8,205453000.0,,0.0,399713.341389,6106043.0,32,U
3,2025-11-02 03:53:07,55.103375,7.457942,7.613771,52.95,205453000.0,,0.0,401614.684729,6107381.0,32,U
4,2025-11-02 03:58:07,55.115492,7.486558,7.613771,53.7,205453000.0,,0.0,403469.525097,6108689.0,32,U
