In [1]:
import VRAE
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from itertools import product
from concurrent.futures import ProcessPoolExecutor, as_completed
import multiprocessing as mp

from torch.utils.data import Sampler

To Do:
1. define parameters
2. import data
3. split data (data loaders) - remember to think about padding
4. standardize
5. train
6. visualisations :)

In [2]:
# df = pd.read_parquet('../../data/aisdk/processed/aisdk_2025')

In [3]:
# Length of the analysis window in days, measured from the earliest timestamp.
N_DAYS = 5

df = pd.read_parquet('../../data/aisdk/processed/aisdk_2025')

# Convert Timestamp to datetime if it's not already
df['Timestamp'] = pd.to_datetime(df['Timestamp'])

# Earliest timestamp in the dataset; the window below starts here
start_date = df['Timestamp'].min()

# Keep only records from the first N_DAYS days (upper bound is exclusive).
# The previous comment said "3 days" while the code filtered 5.
df = df[df['Timestamp'] < start_date + pd.Timedelta(days=N_DAYS)]

print(f"Date range: {df['Timestamp'].min()} to {df['Timestamp'].max()}")
print(f"Total records: {len(df)}")

Date range: 2025-08-01 00:00:02 to 2025-08-06 00:00:01
Total records: 1141292


In [4]:
# Build one feature matrix per trajectory, with rows ordered by time.
feature_columns = ['UTM_x', 'UTM_y', 'SOG', 'v_east', 'v_north']
trajectories = []

# groupby splits the frame in a single pass instead of re-scanning every row
# once per trajectory id. sort=False yields groups in order of first
# appearance, which is the order Series.unique() produced, so the downstream
# train/validation split sees the trajectories in the same order as before.
# observed=True only matters if 'Trajectory' is categorical: it skips
# categories with no rows. Rows whose id is missing are skipped by groupby;
# the previous loop appended an empty array for a missing id.
for traj_id, traj_data in df.groupby('Trajectory', sort=False, observed=True):
    features = traj_data.sort_values('Timestamp')[feature_columns].values
    trajectories.append(features)

In [5]:
# Number of per-trajectory feature arrays collected.
len(trajectories)

1702

In [6]:
# Split at trajectory level: 30% of the trajectories are held out for validation.
# random_state pins the shuffle so the split is reproducible.
train, val = train_test_split(trajectories, test_size=0.3, random_state=42) # TODO: restore the original test_size
# Disabled second split (validation/test). It reads `temp`, so the first split
# would have to return `train, temp` for this line to work.
# val, test = train_test_split(temp, test_size=0.7, random_state=42) # Change back to 0.5

In [7]:
# Pool every row of every training trajectory so the scaler's statistics come
# from the training data only; validation data is transformed, never fitted.
train_stacked = np.vstack(train)
scaler = StandardScaler().fit(train_stacked)

# Apply the training statistics to each trajectory separately so the
# per-trajectory list structure is kept.
train_s = list(map(scaler.transform, train))
val_s = list(map(scaler.transform, val))
# test_s = list(map(scaler.transform, test))

In [8]:
class TrajectoryDataset(Dataset):
    """Dataset that serves one trajectory per index as a float32 tensor.

    Parameters
    ----------
    trajectories : sequence of array-like
        One numeric array per trajectory. In this notebook each entry is a
        2-D array of standardized features; the class itself does not check
        the shape.
    """

    def __init__(self, trajectories):
        self.trajectories = trajectories

    def __len__(self):
        """Return the number of trajectories."""
        return len(self.trajectories)

    def __getitem__(self, idx):
        """Return trajectory ``idx`` converted to a ``torch.float32`` tensor."""
        # torch.as_tensor with an explicit dtype replaces the legacy
        # torch.FloatTensor(...) constructor. Input that is already float32
        # is wrapped without a copy; anything else is converted.
        return torch.as_tensor(self.trajectories[idx], dtype=torch.float32)

In [9]:
def pad_trajectories(batch):
    """Collate variable-length trajectories into a padded, length-sorted batch.

    Parameters
    ----------
    batch : list of torch.Tensor
        Trajectories whose first dimension is the sequence length.

    Returns
    -------
    tuple of (torch.Tensor, torch.Tensor)
        The batch-first padded tensor (shorter sequences are filled with 0.0)
        with rows ordered from longest to shortest, and the matching lengths
        in the same order.
    """
    seq_lens = torch.tensor([len(seq) for seq in batch])
    sorted_lens, order = torch.sort(seq_lens, descending=True)

    # Pad in the incoming order, then reorder the rows to match sorted_lens.
    stacked = pad_sequence(batch, batch_first=True, padding_value=0.0)
    return stacked.index_select(0, order), sorted_lens

In [10]:
# Wrap the standardized trajectory lists so a DataLoader can index them.
train_dataset = TrajectoryDataset(train_s)
val_dataset = TrajectoryDataset(val_s)
# test_dataset = TrajectoryDataset(test_s)

In [11]:
batch_size = 128 # Change to 32 if computer can't hang

# Settings shared by both loaders: same batch size, and the custom collate
# function that pads each batch and orders it by sequence length.
loader_kwargs = dict(batch_size=batch_size, collate_fn=pad_trajectories)

# Only the training loader shuffles; validation keeps dataset order.
train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
val_loader = DataLoader(val_dataset, shuffle=False, **loader_kwargs)

In [12]:
def train_VRAE(encoder_layers, decoder_layers, latent_dim, hidden_size, beta, learning_rate, batch_size):
    """Train one VRAE configuration and return the model with its loss history.

    Reads the notebook-level names ``feature_size``, ``device``,
    ``num_epochs``, ``train_dataset``, ``val_dataset`` and
    ``pad_trajectories``; the cells defining them must have run first.

    Parameters
    ----------
    encoder_layers, decoder_layers
        Forwarded to ``VRAE.VRAE`` as ``num_layers_encoder`` and
        ``num_layers_decoder``.
    latent_dim
        Forwarded to ``VRAE.VRAE`` as ``latent_dim``.
    hidden_size
        Forwarded to ``VRAE.VRAE`` as ``hidden_dim``.
    beta
        Forwarded as the last argument of ``VRAE.vae_loss`` (presumably the
        weight of the KL term; confirm in the VRAE module).
    learning_rate
        Learning rate of the Adam optimizer.
    batch_size
        Batch size of the training and validation loaders used for this run.

    Returns
    -------
    tuple
        ``(model, train_losses, val_losses)`` where each loss list holds one
        value per epoch: the mean of the per-batch losses.
    """
    # batch_size used to be ignored because the loops iterated the global
    # loaders. Build per-run loaders so the argument takes effect; they use
    # the same datasets, shuffle flags and collate function as the globals.
    run_train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        collate_fn=pad_trajectories
    )
    run_val_loader = DataLoader(
        val_dataset,
        batch_size=batch_size,
        shuffle=False,
        collate_fn=pad_trajectories
    )

    model = VRAE.VRAE(
        input_dim=feature_size,
        hidden_dim=hidden_size,
        latent_dim=latent_dim,
        num_layers_encoder=encoder_layers,
        num_layers_decoder=decoder_layers
    ).to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    train_losses, val_losses = [], []

    for epoch in range(num_epochs):
        # Training
        model.train()
        train_loss = 0
        for batch_data, lengths in run_train_loader:
            batch_data, lengths = batch_data.to(device), lengths.to(device)
            reconstruction, mean, logvar = model(batch_data, lengths)
            # NOTE(review): `lengths` is not passed to vae_loss, so whether the
            # zero-padded time steps are excluded from the loss cannot be
            # told from this notebook. Confirm in the VRAE module.
            loss, _, _ = VRAE.vae_loss(reconstruction, batch_data, mean, logvar, beta)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        # Validation: no gradient tracking and no parameter updates
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for batch_data, lengths in run_val_loader:
                batch_data, lengths = batch_data.to(device), lengths.to(device)
                reconstruction, mean, logvar = model(batch_data, lengths)
                loss, _, _ = VRAE.vae_loss(reconstruction, batch_data, mean, logvar, beta)
                val_loss += loss.item()

        # Mean over batches; a smaller final batch counts as much as a full one
        train_losses.append(train_loss / len(run_train_loader))
        val_losses.append(val_loss / len(run_val_loader))
        print(f"Epoch {epoch+1}/{num_epochs} - Train: {train_losses[-1]:.4f}, Val: {val_losses[-1]:.4f}")

    return model, train_losses, val_losses

In [13]:
# Parameters
# Fixed architecture and schedule settings (not swept).
encoder_layers = 3
decoder_layers = 2
num_epochs = 15

# Hyperparameters
# Swept values: the training cell iterates over the Cartesian product of
# these lists. Note that `hidden_size` and `beta` are lists here, while the
# train_VRAE parameters of the same names each receive a single value.
latent_dimensions = [10, 20, 30]
hidden_size = [32, 50, 64]
# Alternative settings kept for quick switching:
# beta = [0.5, 1, 2]
# latent_dimensions = [20]
# hidden_size = [50]
beta = [1]

# Maybe Hyperparameters
learning_rate = 0.001
# batch_size is currently assigned in the DataLoader cell instead.
# batch_size = 128

feature_size = 5  # UTM_x, UTM_y, SOG, v_east, v_north

# Use the MPS backend when PyTorch reports it as available, otherwise the CPU.
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(f"Using device: {device}")

Using device: mps


In [None]:
# Every (latent_dim, hidden_size, beta) combination of the swept lists
configs = list(product(latent_dimensions, hidden_size, beta))
print(f"Total configurations to train: {len(configs)}")

# Train each configuration in turn, keeping its loss history and model
results = []
for i, (latent_dim, hidden, b) in enumerate(configs):
    print(f"\n[{i+1}/{len(configs)}] Training: latent_dim={latent_dim}, hidden_size={hidden}, beta={b}")

    run_kwargs = dict(
        encoder_layers=encoder_layers,
        decoder_layers=decoder_layers,
        latent_dim=latent_dim,
        hidden_size=hidden,
        beta=b,
        learning_rate=learning_rate,
        batch_size=batch_size,
    )
    model, train_loss, val_loss = train_VRAE(**run_kwargs)

    results.append(dict(
        latent_dim=latent_dim,
        hidden_size=hidden,
        beta=b,
        train_losses=train_loss,
        val_losses=val_loss,
        final_val_loss=val_loss[-1],  # last-epoch value, used for ranking
        model=model,
    ))

# Rank runs by last-epoch validation loss, lowest first
results = sorted(results, key=lambda run: run['final_val_loss'])
print("\n" + "="*60)
print("Best configurations:")
for i, r in enumerate(results[:3]):
    print(f"{i+1}. latent_dim={r['latent_dim']}, hidden_size={r['hidden_size']}, beta={r['beta']}, val_loss={r['final_val_loss']:.4f}")

Total configurations to train: 9

[1/9] Training: latent_dim=10, hidden_size=32, beta=1
Epoch 1/15 - Train: 0.2381, Val: 0.2233
Epoch 1/15 - Train: 0.2381, Val: 0.2233


In [None]:
# Loss curves of the first entry in `results` (the top-ranked run after sorting)
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(10, 5))
for loss_key, curve_label in (('train_losses', 'Training Loss'),
                              ('val_losses', 'Validation Loss')):
    ax.plot(results[0][loss_key], label=curve_label)
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.set_title('VRAE Training Progress')
ax.legend()
ax.grid(True)
plt.show()

In [None]:
# Overlay the loss curves of every trained configuration
import matplotlib.pyplot as plt

fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# Left panel: training curves; right panel: validation curves.
panels = (
    (axes[0], 'train_losses', 'Training Loss', 'All Training Curves'),
    (axes[1], 'val_losses', 'Validation Loss', 'All Validation Curves'),
)
for ax, loss_key, y_label, panel_title in panels:
    for r in results:
        label = f"ld={r['latent_dim']}, hs={r['hidden_size']}, β={r['beta']}"
        ax.plot(r[loss_key], alpha=0.5, label=label)
    ax.set_xlabel('Epoch')
    ax.set_ylabel(y_label)
    ax.set_title(panel_title)
    # Legend goes outside the axes so it does not cover the curves
    ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=8)

plt.tight_layout()
plt.show()

# `results` is sorted by final validation loss, so index 0 is the best run
best_model = results[0]['model']
print(f"\nBest model config: latent_dim={results[0]['latent_dim']}, hidden_size={results[0]['hidden_size']}, beta={results[0]['beta']}")

In [None]:
# Encode the validation set with the best model and plot 2-D projections
# of the latent means.
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt


def _scatter_panel(ax, points, x_label, y_label, title):
    """Draw the first two columns of ``points`` on ``ax`` with shared styling."""
    ax.scatter(points[:, 0], points[:, 1], alpha=0.5, s=10)
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)
    ax.set_title(title)
    ax.grid(True, alpha=0.3)


best_model.eval()
latent_chunks = []
trajectory_labels = []  # batch index of each encoded trajectory

with torch.no_grad():
    for i, (batch_data, lengths) in enumerate(val_loader):
        batch_data = batch_data.to(device)
        lengths = lengths.to(device)

        # Only the latent mean is kept; the other two outputs are discarded
        _, mean, _ = best_model(batch_data, lengths)
        latent_chunks.append(mean.cpu().numpy())
        trajectory_labels += [i] * len(mean)

# One row per validation trajectory
latent_vectors = np.vstack(latent_chunks)
print(f"Latent space shape: {latent_vectors.shape}")

fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Left panel: PCA projection, or the raw coordinates when there is nothing to reduce
if latent_vectors.shape[1] > 2:
    pca = PCA(n_components=2)
    latent_pca = pca.fit_transform(latent_vectors)
    pca_x_label = f'PC1 ({pca.explained_variance_ratio_[0]:.2%} variance)'
    pca_y_label = f'PC2 ({pca.explained_variance_ratio_[1]:.2%} variance)'
    pca_title = 'Latent Space - PCA Projection'
else:
    latent_pca = latent_vectors
    pca_x_label = 'Latent Dimension 1'
    pca_y_label = 'Latent Dimension 2'
    pca_title = 'Latent Space - 2D'
_scatter_panel(axes[0], latent_pca, pca_x_label, pca_y_label, pca_title)

# Right panel: t-SNE, skipped when there are 30 or fewer samples
if len(latent_vectors) <= 30:
    axes[1].text(0.5, 0.5, 'Not enough samples for t-SNE',
                 ha='center', va='center', transform=axes[1].transAxes)
    axes[1].set_title('t-SNE (insufficient data)')
else:
    tsne = TSNE(n_components=2, random_state=42, perplexity=min(30, len(latent_vectors)-1))
    latent_tsne = tsne.fit_transform(latent_vectors)
    _scatter_panel(axes[1], latent_tsne, 't-SNE Dimension 1', 't-SNE Dimension 2',
                   'Latent Space - t-SNE Projection')

plt.tight_layout()
plt.show()

print(f"\nTotal trajectories visualized: {len(latent_vectors)}")
print(f"Latent dimension: {results[0]['latent_dim']}")