In [27]:
import pandas as pd
import numpy as np
import os
import random

import torch
import torch.nn as nn
import wandb
import os
import yaml

from tqdm import tqdm
from pathlib import Path
from dotenv import load_dotenv
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from classification_rnn import ClassificationRNN, DEVICE
from seed import set_seed
from config import flatten_config

In [28]:
BASE = "../../data/aisdk/processed"


# 1 - Create the windows

In [29]:
# TODO: ADD A CLASSIFIER HERE!!
def _get_cluster_id_for_segment(g):
    """
    Assign cluster to a given segment
    PLACEHOLDER FOR NOW
    """
    return random.randint(0, 9)

In [30]:
def make_past_future_windows_np(
    past_len=30,
    future_len=30,
    step=1,
    input_path="data/aisdk/processed/train_trajectories.npz",
    output_path="data/aisdk/processed/windows/train_trajectories.npz"
):
    """
    Load standardized, sorted trajectory data from a parquet file,
    create past/future sliding windows, and save everything as NumPy arrays
    in a single .npz file:

        - past:   (N, past_min, num_features)
        - future: (N, future_min, num_features)
        - cluster: (N,)
        - mmsi:   (N,)
        - traj:   (N,)

    Assumes input parquet is already sorted by [MMSI, Trajectory, Timestamp].
    """
    
    print(f"Loading trajectories from {input_path} ...")
    data = np.load(input_path, allow_pickle=True)
    trajs = data["trajectories"]  # object array, each traj[i] is (T_i, F)
    num_traj = len(trajs)
    print(f"  → Found {num_traj} trajectories")

    total_len = past_len + future_len

    past_list = []
    future_list = []
    cluster_list = []

    for i, traj in enumerate(trajs):
        # traj: (T_i, F)
        T = traj.shape[0]
        if T < total_len:
            continue  # too short for one window

        # number of windows for this trajectory (with stride `step`)
        num_windows = (T - total_len) // step + 1

        cid = _get_cluster_id_for_segment(traj)
        cluster_list.append(cid)

        for w in range(num_windows):
            start = w * step
            mid   = start + past_len
            end   = mid + future_len

            past_window = traj[start:mid]   # (past_len, F)
            future_window = traj[mid:end]   # (future_len, F)

            past_list.append(past_window)
            future_list.append(future_window)

        if (i + 1) % 50 == 0:
            print(f"  → Processed {i+1}/{num_traj} trajectories")

    if not past_list:
        raise RuntimeError("No windows generated. "
                           "Check past_len, future_len, and trajectory lengths.")

    print("Stacking windows into numpy arrays...")
    past = np.stack(past_list)     # (N, past_len, F)
    future = np.stack(future_list) # (N, future_len, F)

    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    print(f"Saving windows to {output_path} ...")
    np.savez_compressed(
        output_path,
        past=past,
        future=future,
        cluster=cluster_list,
        past_len=past_len,
        future_len=future_len,
        step=step,
    )

    print("DONE!")
    print(f"  → Total windows: {past.shape[0]}")



In [31]:
make_past_future_windows_np(
    input_path=os.path.join(BASE, "train_trajectories.npz"),
    output_path=os.path.join(BASE, "windows/train_trajectories.npz"),
)

make_past_future_windows_np(
    input_path  = os.path.join(BASE, "val_trajectories.npz"),
    output_path = os.path.join(BASE, "windows/val_trajectories.npz"),
)

make_past_future_windows_np(
    input_path  = os.path.join(BASE, "test_trajectories.npz"),
    output_path = os.path.join(BASE, "windows/test_trajectories.npz")
)

Loading trajectories from ../../data/aisdk/processed/train_trajectories.npz ...
  → Found 359 trajectories
  → Processed 50/359 trajectories
  → Processed 100/359 trajectories
  → Processed 150/359 trajectories
  → Processed 200/359 trajectories
  → Processed 250/359 trajectories
  → Processed 300/359 trajectories
  → Processed 350/359 trajectories
Stacking windows into numpy arrays...
Saving windows to ../../data/aisdk/processed/windows/train_trajectories.npz ...
DONE!
  → Total windows: 171461
Loading trajectories from ../../data/aisdk/processed/val_trajectories.npz ...
  → Found 77 trajectories
Stacking windows into numpy arrays...
Saving windows to ../../data/aisdk/processed/windows/val_trajectories.npz ...
DONE!
  → Total windows: 36719
Loading trajectories from ../../data/aisdk/processed/test_trajectories.npz ...
  → Found 78 trajectories
  → Processed 50/78 trajectories
Stacking windows into numpy arrays...
Saving windows to ../../data/aisdk/processed/windows/test_trajectories.n

Fetch the data

In [33]:
train_traj = np.load(os.path.join(BASE, "windows/train_trajectories.npz"))
val_traj = np.load(os.path.join(BASE, "windows/val_trajectories.npz"))
test_traj = np.load(os.path.join(BASE, "windows/test_trajectories.npz"))

X_train, X_val = train_traj["past"], val_traj["past"]
y_train, y_val = train_traj["cluster"], val_traj["cluster"]

In [39]:
train_trajj = pd.DataFrame(train_traj)

In [46]:
data = np.load("../../data/aisdk/processed/train_trajectories.npz", allow_pickle=True)
trajs = data["trajectories"]  # object array, each traj[i] is (T_i, F)
num_traj = len(trajs)
print(num_traj)

359


In [47]:
y_train.shape

(315,)