In [132]:
import torch
import RAE
import pandas as pd
import joblib
import numpy as np
from sklearn.preprocessing import StandardScaler
import os
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import hdbscan
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

import plotly.express as px
from pyproj import Transformer

import pyarrow
import pyarrow.parquet

import warnings
warnings.filterwarnings("ignore")


In [121]:
def visualize_trajectories(df, color_by='MMSI', title=None, zoom=5, height=800):
    """Draw ship tracks as lines on an OpenStreetMap basemap.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide 'MMSI', 'Timestamp', 'UTM_x' and 'UTM_y'. 'Trajectory'
        and 'SOG' are used when present.
    color_by : str
        Column used to colour the lines; must exist in ``df``.
    title : str or None
        Figure title. When None, a title is built from the first and last
        'Timestamp' dates (the column must then hold datetime values).
    zoom : int
        Initial map zoom level.
    height : int
        Figure height in pixels.

    Returns
    -------
    plotly.graph_objects.Figure

    Raises
    ------
    KeyError
        If ``color_by`` is not a column of ``df``.
    """
    if color_by not in df.columns:
        raise KeyError(f"color_by column '{color_by}' not found in DataFrame")

    # Coordinates are reprojected from EPSG:25832 to lon/lat (EPSG:4326) for the map.
    transformer = Transformer.from_crs("EPSG:25832", "EPSG:4326", always_xy=True)
    lon, lat = transformer.transform(df['UTM_x'].values, df['UTM_y'].values)

    has_trajectory = 'Trajectory' in df.columns
    has_sog = 'SOG' in df.columns

    vis_cols = ['MMSI', 'Timestamp', 'UTM_x', 'UTM_y']
    if has_trajectory:
        vis_cols.insert(0, 'Trajectory')
    if color_by not in vis_cols:
        vis_cols.append(color_by)
    if has_sog and 'SOG' not in vis_cols:
        vis_cols.append('SOG')

    vis_df = df[vis_cols].copy()
    vis_df['Longitude'] = lon
    vis_df['Latitude'] = lat

    # Generate title if not provided
    if title is None:
        date_min = vis_df['Timestamp'].min().date()
        date_max = vis_df['Timestamp'].max().date()
        title = f"Ship Trajectories - {date_min} to {date_max}"

    # Without a 'Trajectory' column, fall back to one line per vessel so the
    # call does not fail on a missing column.
    line_group = 'Trajectory' if has_trajectory else 'MMSI'
    hover_cols = ["MMSI", "Timestamp", "SOG"] if has_sog else ["MMSI", "Timestamp"]

    # Create visualization with trajectories using lat/lon on a map
    fig = px.line_map(
        vis_df.sort_values('Timestamp'),
        lat="Latitude",
        lon="Longitude",
        color=color_by,
        line_group=line_group,
        hover_data=hover_cols,
        zoom=zoom,
        title=title
    )

    # px.line_map draws on layout.map, so the basemap is chosen with
    # map_style; mapbox_style would configure an unused mapbox subplot.
    fig.update_layout(
        map_style="open-street-map",
        showlegend=False,  # Hide legend since there can be many trajectories
        height=height
    )

    print(f"✓ Visualization complete - colored by '{color_by}'")

    return fig

In [122]:
# Architecture settings; they are handed to RAE.RecurrentAutoencoder when the
# model is rebuilt from the checkpoint.
feature_size, hidden_size, latent_dim = 5, 64, 12
encoder_layers, decoder_layers = 3, 2
dropout = 0.2

# Use the MPS backend when PyTorch reports it as available, otherwise the CPU.
if torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: mps


In [123]:
# Load the persisted scaler payload.
# NOTE(review): joblib.load unpickles its input; only load files from a trusted source.
obj = joblib.load(
    os.path.join("..", "..", "data", "aisdk", "scaler", "scaler_aisdk_2025.pkl")
)

saved_feature_names = None

# The payload is accepted in two forms: a dict of parameters or a scaler object.
if isinstance(obj, dict):
    # Rebuild a scaler from the stored 'mean' / 'scale' arrays.
    scaler = StandardScaler()
    scaler.mean_ = np.asarray(obj['mean'])
    scaler.scale_ = np.asarray(obj['scale'])
    saved_feature_names = obj.get('feature_names', None)
    print("Reconstructed StandardScaler from dict params.")
elif isinstance(obj, StandardScaler):
    # A scaler object carries no separate feature-name list here, so
    # saved_feature_names stays None.
    scaler = obj
    print("Loaded StandardScaler object from joblib.")
else:
    raise TypeError(f"Unexpected scaler payload type: {type(obj)}")

print(
    "Scaler ready. n_features:", scaler.mean_.shape[0],
    "| feature names:",
    (list(saved_feature_names) if saved_feature_names is not None else "None"),
)

Loaded StandardScaler object from joblib.
Scaler ready. n_features: 5 | feature names: None


In [124]:
# Un-scale train and val DataFrames for visualization
def inverse_scale_df(df, scaler, cols, saved_feature_names=None):
    """Return a copy of ``df`` with the given columns mapped back to original units.

    Each selected column is transformed as ``x * scale_ + mean_`` using the
    scaler statistic that belongs to that feature.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding scaled values.
    scaler : object
        Anything exposing array-like ``mean_`` and ``scale_`` attributes.
    cols : sequence of str
        Feature columns, listed in the order the scaler was fitted on.
        Columns absent from ``df`` are skipped.
    saved_feature_names : sequence of str, optional
        Feature names in scaler order. When given, statistics are looked up by
        name; a column missing from this list falls back to its position in
        ``cols``.

    Returns
    -------
    (pandas.DataFrame, list of str)
        The un-scaled copy and the list of columns that were transformed.

    Raises
    ------
    ValueError
        If a column resolves to a feature index the scaler does not have.
    """
    # Remember each column's position in the *requested* list before dropping
    # missing ones; aligning after filtering would shift the statistics of
    # every column that follows a missing one.
    present = [(pos, name) for pos, name in enumerate(cols) if name in df.columns]
    if not present:
        return df.copy(), []
    kept = [name for _, name in present]

    mean = np.asarray(scaler.mean_)
    scale = np.asarray(scaler.scale_)

    if saved_feature_names is not None:
        name_to_idx = {name: i for i, name in enumerate(saved_feature_names)}
    else:
        name_to_idx = {}
    # Prefer lookup by saved name; otherwise use the position in ``cols``.
    idx = [name_to_idx.get(name, pos) for pos, name in present]

    n_features = min(len(mean), len(scale))
    if max(idx) >= n_features:
        raise ValueError(
            f"Columns {kept} need feature index {max(idx)} but the scaler "
            f"only has {n_features} features"
        )

    X = df[kept].to_numpy() * scale[idx] + mean[idx]
    out = df.copy()
    # Replace the columns outright so integer-typed inputs can hold float results.
    out[kept] = X
    return out, kept

In [None]:
# Load the checkpoint
# map_location remaps the stored tensors onto the device selected for this
# session, so a checkpoint written on a different device can still be restored.
checkpoint = torch.load('../../checkpoints/rae/best_rae_model.pth', map_location=device)

# Recreate the model architecture
model = RAE.RecurrentAutoencoder(
    input_dim=feature_size,
    hidden_dim=hidden_size,
    latent_dim=latent_dim,
    num_layers_encoder=encoder_layers,
    num_layers_decoder=decoder_layers,
    dropout=dropout
    ).to(device)

# Load the trained weights
model.load_state_dict(checkpoint['model_state_dict'])
model.eval()  # Set to evaluation mode

print(f"Model loaded! Best val_loss: {checkpoint['val_loss']}")

Model loaded! Best val_loss: 1.5638
Trained for 30 epochs


In [126]:
# Read the training and validation splits from their Parquet locations.
train = pd.read_parquet('../../data/aisdk/train/aisdk_2025')
val = pd.read_parquet('../../data/aisdk/val/aisdk_2025')

In [127]:
def _collect_trajectories(df, feature_cols=('UTM_x', 'UTM_y', 'SOG', 'v_east', 'v_north')):
    """Return one feature array per trajectory, ordered like ``df['Trajectory'].unique()``.

    Rows are grouped by 'Trajectory' in a single pass instead of running one
    full boolean scan of the frame per trajectory id. Within each trajectory
    the rows are sorted by 'Timestamp' and only ``feature_cols`` are kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Trajectory', 'Timestamp' and every name in ``feature_cols``.
    feature_cols : sequence of str
        Columns extracted into each array, in this order.

    Returns
    -------
    list of numpy.ndarray
        One ``(n_points, len(feature_cols))`` array per trajectory id.
    """
    columns = list(feature_cols)
    groups = {
        traj_id: rows
        for traj_id, rows in df.groupby('Trajectory', sort=False, observed=True)
    }
    # Ids that cannot be matched by equality (e.g. NaN) yield an empty array,
    # so the output stays index-aligned with unique().
    no_rows = df.iloc[0:0]
    return [
        groups.get(traj_id, no_rows).sort_values('Timestamp')[columns].values
        for traj_id in df['Trajectory'].unique()
    ]


train_trajectories = _collect_trajectories(train)
print(f"Total training trajectories: {len(train_trajectories)}")

val_trajectories = _collect_trajectories(val)
print(f"Total validationtrajectories: {len(val_trajectories)}")

Total training trajectories: 2160
Total validationtrajectories: 463


In [128]:
class TrajectoryDataset(Dataset):
    """Dataset view over a sequence of per-trajectory feature arrays."""

    def __init__(self, trajectories):
        # Stored as given; conversion to tensors happens on access.
        self.trajectories = trajectories

    def __len__(self):
        """Number of trajectories held."""
        return len(self.trajectories)

    def __getitem__(self, idx):
        """Return trajectory ``idx`` as a float32 tensor."""
        return torch.FloatTensor(self.trajectories[idx])

In [129]:
# Wrap the per-trajectory feature arrays so they can be indexed as datasets.
train_dataset = TrajectoryDataset(train_trajectories)
val_dataset = TrajectoryDataset(val_trajectories)

In [130]:
def pad_trajectories(batch):
    """Collate variable-length trajectories into one zero-padded, length-sorted batch.

    Parameters
    ----------
    batch : list of torch.Tensor
        Sequences whose first dimension is the number of time steps.

    Returns
    -------
    (torch.Tensor, torch.Tensor)
        The padded batch (batch first) and the matching sequence lengths,
        both ordered from longest to shortest. Note that the rows are
        therefore NOT in the order they were passed in.
    """
    seq_lens = torch.tensor([len(seq) for seq in batch])
    stacked = pad_sequence(batch, batch_first=True, padding_value=0.0)

    # Longest sequence first; `order` maps sorted position -> input position.
    sorted_lens, order = torch.sort(seq_lens, descending=True)
    return stacked[order], sorted_lens

In [131]:
def _pad_sorted_with_perm(batch):
    """Pad a batch, sort it longest-first, and also return the sort permutation.

    Returns ``(padded, lengths, perm_idx)`` where ``padded[i]`` is the input
    sequence ``batch[perm_idx[i]]``.
    """
    lengths = torch.tensor([len(traj) for traj in batch])
    padded = pad_sequence(batch, batch_first=True, padding_value=0.0)
    lengths, perm_idx = lengths.sort(descending=True)
    return padded[perm_idx], lengths, perm_idx


def _encode_in_order(loader):
    """Encode every batch of ``loader`` and return latents in dataset order.

    The collate step reorders each batch by length, so the encoder output is
    scattered back to the original positions before concatenation. Without
    this, row ``i`` of the result would not belong to dataset item ``i`` and
    any positional pairing with trajectory ids would be wrong.

    Returns a numpy array of shape ``[N_trajectories, latent_dim]``.
    """
    model.eval()
    chunks = []
    with torch.no_grad():
        for batch, lengths, perm_idx in loader:
            z = model.encode(batch.to(device), lengths.to(device)).cpu()  # [batch_size, latent_dim]
            restored = torch.empty_like(z)
            restored[perm_idx] = z  # sorted row i goes back to input slot perm_idx[i]
            chunks.append(restored)
    return torch.cat(chunks, dim=0).numpy()


encode_loader_t = DataLoader(
    train_dataset,
    batch_size=256,
    shuffle=False,
    collate_fn=_pad_sorted_with_perm
)
train_latents = _encode_in_order(encode_loader_t)  # shape: [N_trajectories, latent_dim]
assert len(train_latents) == len(train_dataset), "one latent per training trajectory expected"
print("Encoded train latents:", train_latents.shape)

encode_loader_v = DataLoader(
    val_dataset,
    batch_size=256,
    shuffle=False,
    collate_fn=_pad_sorted_with_perm
)
val_latents = _encode_in_order(encode_loader_v)  # shape: [N_trajectories, latent_dim]
assert len(val_latents) == len(val_dataset), "one latent per validation trajectory expected"
print("Encoded val latents:", val_latents.shape)

Encoded train latents: (2160, 12)
Encoded val latents: (463, 12)
Encoded val latents: (463, 12)


In [147]:
# Clustering settings. Note that min_clusters is passed as min_cluster_size.
min_clusters, min_samples = 50, 10
metric, csm = 'euclidean', 'eom'

clusterer = hdbscan.HDBSCAN(
    min_cluster_size=min_clusters,
    min_samples=min_samples,
    metric=metric,
    cluster_selection_method=csm,
    prediction_data=True,  # presumably kept for the later approximate_predict call; verify
)

t_labels = clusterer.fit_predict(train_latents)

# Pair each trajectory id with its label by position: row i of train_latents
# is assumed to belong to the i-th unique trajectory id.
traj_ids = list(train['Trajectory'].unique())
train_labels_df = pd.DataFrame({'Trajectory': traj_ids}).assign(ClusterLabel=t_labels)

In [149]:
# Attach the per-trajectory cluster label to every training row.
train_labelled = pd.merge(train, train_labels_df, on='Trajectory', how='left')

In [150]:
# Assign validation latents to the clusters fitted on the training latents.
v_labels, v_probs = hdbscan.approximate_predict(clusterer, val_latents)

traj_ids = list(val['Trajectory'].unique())

# Sanity check: lengths should match
assert len(traj_ids) == len(v_labels), f"Mismatch: {len(traj_ids)} traj_ids vs {len(v_labels)} labels"

# One row per validation trajectory, paired with labels/probabilities by position.
val_labels_df = pd.DataFrame({'Trajectory': traj_ids}).assign(
    ClusterLabel=v_labels,
    ClusterProb=v_probs,
)

In [152]:
# Attach the predicted cluster label and probability to every validation row.
val_labelled = pd.merge(val, val_labels_df, on='Trajectory', how='left')

In [155]:
# Output locations for the cluster-labelled training and validation datasets.
TRAIN_PARQUET = os.path.join("..", "..", "data", "aisdk", "clustered_train", "aisdk_2025")
VAL_PARQUET = os.path.join("..", "..", "data", "aisdk", "clustered_val", "aisdk_2025")

def save_parquet_partitioned(df: pd.DataFrame,
                             out_path: str,
                             partition_cols: list[str] = ["MMSI", "Segment"],
                             existing_data_behavior: str = "delete_matching") -> None:
    """Save DataFrame as a partitioned Parquet dataset.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to write; the index is not stored.
    out_path : str
        Root directory of the dataset.
    partition_cols : list of str
        Columns used to build the partition directories.
    existing_data_behavior : str
        Forwarded to ``pyarrow.parquet.write_to_dataset``. The default,
        ``"delete_matching"``, clears each partition directory before writing
        into it, so running the save again replaces the earlier files instead
        of adding a second copy of the same rows next to them. Pass
        ``"overwrite_or_ignore"`` to keep files already present.
    """
    print(f"Saving to parquet dataset at {out_path} ...")
    table = pyarrow.Table.from_pandas(df, preserve_index=False)
    pyarrow.parquet.write_to_dataset(
        table,
        root_path=out_path,
        # Pass a copy so the shared default list can never be altered downstream.
        partition_cols=list(partition_cols) if partition_cols is not None else None,
        existing_data_behavior=existing_data_behavior,
    )
    print("Parquet save done.")

In [156]:
# Persist both labelled splits, partitioned by vessel and trajectory.
save_parquet_partitioned(
    train_labelled,
    TRAIN_PARQUET,
    ["MMSI", "Trajectory"],
)
save_parquet_partitioned(
    val_labelled,
    VAL_PARQUET,
    ["MMSI", "Trajectory"],
)

Saving to parquet dataset at ../../data/aisdk/clustered_train/aisdk_2025 ...
Parquet save done.
Saving to parquet dataset at ../../data/aisdk/clustered_val/aisdk_2025 ...
Parquet save done.
Parquet save done.
Saving to parquet dataset at ../../data/aisdk/clustered_val/aisdk_2025 ...
Parquet save done.
