In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from pathlib import Path

**New added features:**
- dv --- Delta speed over ground
- dcourse --- Delta course over ground (degrees)
- ddraft --- Delta draught
- zone ---          ?
- x_km --- Local x coordinate in km (relative to route center)
- y_km --- Local y coordinate in km (relative to route center)
- dist_to_ref --- Distance to average route trajectory (km)
- route_dummy --- ?

In [2]:
tqdm.pandas()

DROP_TRIPS = [10257]

ZONES = [[53.8, 53.5, 8.6, 8.14], [53.66, 53.0, 11.0, 9.5]]

# Distance thresholds for port proximity analysis
R_PORT, R_APP = 5.0, 15.0       # Port radius and approach radius in km
EARTH_R = 6_371.0              # Earth radius in km for distance calculations
RANDOM_STATE = 42

───────────────────────────── Helper Functions ──────────────────────────────

In [3]:
def haversine(lat1, lon1, lat2, lon2):
    """
    Calculate great-circle distance between points using Haversine formula.

    This is essential for maritime data as vessels travel on Earth's curved surface.
    Used for computing distances between GPS coordinates and measuring trajectory
    deviations from reference routes.

    Args:
        lat1, lon1: Latitude and longitude of first point(s) in degrees
        lat2, lon2: Latitude and longitude of second point(s) in degrees

    Returns:
        Distance in kilometers (vectorized for arrays)
    """
    lat1, lon1, lat2, lon2 = map(np.radians, (lat1, lon1, lat2, lon2))

    dlat, dlon = lat2 - lat1, lon2 - lon1
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    return 2 * EARTH_R * np.arcsin(np.sqrt(a))

In [4]:
def load_and_prepare(path: str) -> pd.DataFrame:
    """
    Load and perform initial data preparation.

    This function handles the core preprocessing steps:
    1. Data loading and cleaning (removing problematic trips)
    2. Date parsing for temporal analysis
    3. Delta feature computation for detecting sudden changes
    4. Zone classification

    Delta features are crucial for anomaly detection as they capture:
    - Sudden speed changes (acceleration/deceleration anomalies)
    - Course changes (navigation anomalies)
    - Draft changes (loading/unloading anomalies)

    Args:
        path: Path to the parquet file containing vessel trajectory data

    Returns:
        Preprocessed DataFrame with delta features and zone classifications
    """

    #--- Step 1: Load data and remove problematic trips
    df = pd.read_parquet(path, engine="pyarrow")
    print(f"Loaded {len(df):,} rows, dropping {len(df[df.trip_id.isin(DROP_TRIPS)]):,} rows from {DROP_TRIPS}")
    df = df[~df.trip_id.isin(DROP_TRIPS)].reset_index(drop=True)

    #--- Step 2: Parse datetime columns for temporal analysis
    for col in ("start_time", "end_time", "time_stamp"):
        df[col] = pd.to_datetime(df[col])

    # Create binary anomaly labels for supervised learning
    df["y_true"] = df["is_anomaly"].map({True: 1, False: 0})
    df["route_id"] = df["start_port"] # Route identifier for route-specific processing

    # Compute delta features (changes between consecutive points)
    # These capture sudden behavioral changes that may indicate anomalies
    df = df.sort_values(["trip_id", "time_stamp"])
    df["dv"] = df.groupby("trip_id")["speed_over_ground"].diff().abs().fillna(0)
    df["dcourse"] = df.groupby("trip_id")["course_over_ground"].diff().abs().fillna(0)
    df["ddraft"] = df.groupby("trip_id")["draught"].diff().abs().fillna(0)

    def _in_any_rect(lat: float, lon: float) -> bool:
        """Check if coordinates are within any defined port zone."""
        for lat_max, lat_min, lon_max, lon_min in ZONES:
            if lat_min <= lat <= lat_max and lon_min <= lon <= lon_max:
                return True
        return False

    def zone_label(row) -> int:
        """
        Classify vessel position as port area (0) or open water (1).

        This spatial context is important because:
        - Port areas have different traffic patterns and regulations
        - Anomalous behavior may be normal in ports but suspicious in open water
        - LSTM can learn different behavioral patterns for different zones
        """
        if _in_any_rect(row.latitude, row.longitude):
            return 0
        return 1

    # Apply zone classification with progress tracking
    df["zone"] = df.progress_apply(zone_label, axis=1)
    # Create one-hot encoded zone features for the model
    df = pd.concat([df, pd.get_dummies(df["zone"], prefix="zone")], axis=1)
    return df

───────────────────────────── Route Features ──────────────────────────────

In [5]:
def compute_average_route(df_route: pd.DataFrame, n_points: int = 100) -> np.ndarray:
    """
    Compute average trajectory for a specific route.

    This creates a reference trajectory by:
    1. Resampling each trip to a fixed number of points along the route
    2. Normalizing by cumulative distance fraction (0 to 1)
    3. Averaging corresponding points across all trips

    The reference trajectory is essential for measuring deviations:
    - Normal trips should follow the average route closely
    - Anomalous trips may deviate significantly from the reference

    Args:
        df_route: DataFrame containing all trips for a specific route
        n_points: Number of points to use for the reference trajectory

    Returns:
        Array of shape (n_points, 2) containing [lat, lon] of reference trajectory
    """
    segments = []

    for _, trip in df_route.groupby("trip_id"):
        trip = trip.sort_values("time_stamp")
        lat, lon = trip.latitude.to_numpy(), trip.longitude.to_numpy()

        # Calculate cumulative distance along the trip
        d = haversine(lat[1:], lon[1:], lat[:-1], lon[:-1])
        cum = np.concatenate(([0], np.cumsum(d)))

        if cum[-1] <= 0:  # Skip trips with no movement
            continue

        # Normalize to fraction of total trip distance
        frac = cum / cum[-1]

        # Resample to fixed number of points
        target = np.linspace(0, 1, n_points)
        segments.append(np.vstack([np.interp(target, frac, lat),
                                   np.interp(target, frac, lon)]).T)
    if not segments:
        return np.array([])

    # Average corresponding points across all trips
    return np.mean(np.stack(segments, axis=0), axis=0)

def add_route_specific_features(df: pd.DataFrame, route: str) -> pd.DataFrame:
    """
    Add route-specific features for improved anomaly detection.

    This function creates:
    1. Local coordinate projection (x_km, y_km) for distance calculations
    2. Distance to reference trajectory for measuring route deviations

    Route-specific features are important because:
    - Different routes have different normal patterns
    - Local coordinates make distance calculations more accurate
    - Deviation from reference trajectory is a strong anomaly indicator

    Args:
        df: Full DataFrame with all routes
        route: Specific route to process

    Returns:
        DataFrame subset with route-specific features added
    """

    df_r = df[df.route_id == route].copy()

    # Create local coordinate system centered on route's geographic center
    # This improves distance calculation accuracy over large areas
    lat0, lon0 = df_r.latitude.mean(), df_r.longitude.mean()
    kx = 111.320 * np.cos(np.deg2rad(lat0))     # Longitude to km (varies with latitude)
    ky = 110.574                                # Latitude to km (roughly constant)
    df_r["x_km"] = (df_r.longitude - lon0) * kx
    df_r["y_km"] = (df_r.latitude - lat0) * ky

    # Compute reference trajectory and distance deviations
    avg = compute_average_route(df_r)
    if avg.size == 0: # Handle routes with insufficient data
        df_r["dist_to_ref"] = 0.0
        df_r["route_dummy"] = 1.0
        return df_r

    # Calculate distance to reference trajectory for each point
    idx_map = df_r.index
    frac = np.zeros(len(df_r)) # Fraction along route for each point

    for _, trip in tqdm(df_r.groupby("trip_id"), desc=f"Processing trips for route {route}"):
        pos = idx_map.get_indexer(trip.index)
        lat, lon = trip.latitude.values, trip.longitude.values
        d = haversine(lat[1:], lon[1:], lat[:-1], lon[:-1])

        # Calculate cumulative distance along this trip
        cum = np.concatenate(([0], np.cumsum(d)))
        total = cum[-1] if cum[-1] > 0 else 1
        frac[pos] = cum / total

    # Compute distance from each point to corresponding reference point
    df_r["dist_to_ref"] = [
        haversine(lat, lon, avg[int(f * 99), 0], avg[int(f * 99), 1])
        for lat, lon, f in zip(df_r.latitude, df_r.longitude, frac)
    ]
    df_r["route_dummy"] = 1.0
    return df_r

───────────────────────────── Main Preprocessing ──────────────────────────────

In [6]:
def get_prepared_dfs(data_path: str):
    """
    Process all routes and return list of prepared DataFrames.

    This function applies route-specific processing to each route separately,
    which is important because different routes have different:
    - Geographic characteristics
    - Traffic patterns
    - Normal operational behaviors

    Args:
        data_path: Path to input data file

    Returns:
        List of DataFrames, one per route, with all features computed
    """
    df = load_and_prepare(data_path)

    dfs = []
    for route in df.route_id.unique():
        fr = add_route_specific_features(df, route)
        dfs.append(fr)

    return dfs

def preprocess_df(data_path: str, output_dir: str,
                  name: str = "LSTM_preprocessed") -> pd.DataFrame:
    """
    Complete preprocessing pipeline for vessel trajectory data.

    This is the main function that orchestrates the entire preprocessing:
    1. Load and clean raw data
    2. Compute delta features and zone classifications
    3. Process each route separately for route-specific features
    4. Combine all routes into final dataset
    5. Save preprocessed data for model training

    The resulting dataset is optimized for LSTM training with:
    - Sequential temporal structure (sorted by trip and time)
    - Engineered features capturing behavioral changes
    - Spatial context through zone and distance features
    - Route-specific normalization

    Args:
        data_path: Path to input parquet file
        output_dir: Directory to save processed data
        name: Name for output file (without extension)

    Returns:
        Final preprocessed DataFrame ready for LSTM training
    """
    Path(output_dir).mkdir(exist_ok=True)
    model_path = f"{output_dir}/{name}.parquet"

    dfs = get_prepared_dfs(data_path)
    df_final = pd.concat(dfs, ignore_index=True)
    df_final.sort_values(["trip_id", "time_stamp"], inplace=True)

    df_final.to_parquet(model_path, index=False)
    print(f"Saved processed data to {model_path}")
    return df_final

In [7]:
data_path = "../../data/cleaned/connected_labeled_anomalies.parquet"
output_dir = "data"

df_final = preprocess_df(data_path, output_dir)

Loaded 913,595 rows, dropping 577 rows from [10257]


100%|██████████| 913018/913018 [00:15<00:00, 60204.44it/s]
Processing trips for route KIEL: 100%|██████████| 423/423 [00:00<00:00, 1434.89it/s]
Processing trips for route BREMERHAVEN: 100%|██████████| 702/702 [00:00<00:00, 2313.36it/s]


Saved processed data to data/LSTM_preprocessed.parquet
