In [34]:
from __future__ import annotations
import time
import warnings
from pathlib import Path
from typing import Dict

import joblib
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.svm import OneClassSVM
from sklearn.pipeline import Pipeline
from tqdm.auto import tqdm

In [35]:
tqdm.pandas()

In [36]:
warnings.filterwarnings("ignore", category=UserWarning)

# ───────────────────────────── configuration ────────────────────────────── #
# List of trip IDs to drop from the dataset
DROP_TRIPS      = [10257]
# Base columns used for training the OC-SVM
BASE_COLUMNS    = [
    "speed_over_ground", "dv", "dcourse", "ddraft",
    "zone",
    "x_km", "y_km", "dist_to_ref", "route_dummy"
]

# Geographical zones defined by [lat_max, lat_min, lon_max, lon_min]
ZONES           = [[53.8, 53.5, 8.6, 8.14], [53.66, 53.0, 11.0, 9.5]]
# Grid of 'nu' (hyperparameter for OC-SVM) values to search
NU_GRID         = [0.01, 0.03]
# Fraction of normal points to include in the test set
TEST_FRACTION_N = 0.10
# Radii (in km) defining "port" and "approach" zones (currently unused in logic)
R_PORT, R_APP   = 5.0, 15.0
# Earth radius in km, used for haversine distance calculation
EARTH_R         = 6_371.0

In [37]:
def haversine(lat1, lon1, lat2, lon2):
    """Vectorized haversine distance (km)."""
    lat1, lon1, lat2, lon2 = map(np.radians, (lat1, lon1, lat2, lon2))
    dlat, dlon = lat2 - lat1, lon2 - lon1
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    return 2 * EARTH_R * np.arcsin(np.sqrt(a))

In [38]:
def load_and_prepare(path: str) -> pd.DataFrame:
    """
    Load parquet, drop specified trips, parse dates, compute delta-features,
    map labels and one-hot port zones.
    """
    df = pd.read_parquet(path, engine="pyarrow")
    print(f"Loaded {len(df):,} rows, dropping {len(df[df.trip_id.isin(DROP_TRIPS)]):,} rows from {DROP_TRIPS}")
    df = df[~df.trip_id.isin(DROP_TRIPS)].reset_index(drop=True)

    for col in ("start_time", "end_time", "time_stamp"):
        df[col] = pd.to_datetime(df[col])

    df = df.dropna(subset=["ship_type"]).reset_index(drop=True)
    df["y_true"]   = df["is_anomaly"].map({True: 1, False: 0})
    df["route_id"] = df["start_port"] # Using start_port as route_id for this pipeline

    # per-point deltas
    df = df.sort_values(["trip_id", "time_stamp"])
    df["dv"]      = df.groupby("trip_id")["speed_over_ground"].diff().abs().fillna(0)
    df["dcourse"] = df.groupby("trip_id")["course_over_ground"].diff().abs().fillna(0)
    df["ddraft"]  = df.groupby("trip_id")["draught"].diff().abs().fillna(0)

    # zones
    # port_coords is defined but not used in zone_label; it might be a remnant
    port_coords = (
        df.groupby("start_port")[["start_latitude", "start_longitude"]]
          .first()
          .to_dict("index")
    )

    def _in_any_rect(lat: float, lon: float) -> bool:
        """Checks if a given lat/lon is within any of the defined rectangular zones."""
        for lat_max, lat_min, lon_max, lon_min in ZONES:
            if lat_min <= lat <= lat_max and lon_min <= lon <= lon_max:
                return True
        return False

    def zone_label(row) -> int:
        """Assigns a zone label (0 for within zone, 1 for outside)."""
        if _in_any_rect(row.latitude, row.longitude):
            return 0 # In a defined zone
        return 1 # Outside all defined zones

    df["zone"] = df.progress_apply(zone_label, axis=1)
    # One-hot encode the 'zone' column. Note: this line is creating 'zone_0' and 'zone_1'
    # but only 'zone' (the numerical label) is used in BASE_COLUMNS.
    df = pd.concat([df, pd.get_dummies(df["zone"], prefix="zone")], axis=1)
    return df

In [39]:
def compute_average_route(df_route: pd.DataFrame, n_points: int = 100) -> np.ndarray:
    """
    Compute average trajectory for a route by resampling each trip to n_points
    along cumulative distance fraction, then averaging.
    """
    segments = []
    for _, trip in df_route.groupby("trip_id"):
        trip = trip.sort_values("time_stamp")
        lat, lon = trip.latitude.to_numpy(), trip.longitude.to_numpy()
        d = haversine(lat[1:], lon[1:], lat[:-1], lon[:-1])
        cum = np.concatenate(([0], np.cumsum(d)))
        if cum[-1] <= 0: # Handle cases with no movement within the trip
            continue
        frac   = cum / cum[-1]
        target = np.linspace(0, 1, n_points)
        segments.append(np.vstack([np.interp(target, frac, lat),
                                   np.interp(target, frac, lon)]).T)
    if not segments: # If no valid segments were found for the route
        return np.array([]) # Return an empty array or handle as error
    return np.mean(np.stack(segments, axis=0), axis=0)

In [49]:
def add_route_specific_features(df: pd.DataFrame, route: str) -> pd.DataFrame:
    """
    For a single route:
    • project lat/lon to local x_km, y_km
    • compute distance to average route (dist_to_ref)
    • add constant route_dummy = 1
    """
    df_r = df[df.route_id == route].copy()

    # local projection
    lat0, lon0 = df_r.latitude.mean(), df_r.longitude.mean()
    kx = 111.320 * np.cos(np.deg2rad(lat0)) # Conversion factor for longitude to km
    ky = 110.574 # Conversion factor for latitude to km
    df_r["x_km"] = (df_r.longitude - lon0) * kx
    df_r["y_km"] = (df_r.latitude  - lat0) * ky

    # distance to average trajectory
    avg = compute_average_route(df_r)
    if avg.size == 0: # Handle case where average route couldn't be computed
        df_r["dist_to_ref"] = 0.0 # Or fill with NaN, depending on desired behavior
        df_r["route_dummy"] = 1.0
        return df_r

    idx_map = df_r.index
    frac = np.zeros(len(df_r))
    for _, trip in tqdm(df_r.groupby("trip_id"), desc=f"Processing trips for route {route}"):
        pos = idx_map.get_indexer(trip.index)
        lat, lon = trip.latitude.values, trip.longitude.values
        d = haversine(lat[1:], lon[1:], lat[:-1], lon[:-1])
        cum = np.concatenate(([0], np.cumsum(d)))
        total = cum[-1] if cum[-1] > 0 else 1
        frac[pos] = cum / total

    # Ensure index for avg is within bounds (0 to 99 for 100 points)
    df_r["dist_to_ref"] = [
        haversine(lat, lon, avg[int(f * 99), 0], avg[int(f * 99), 1])
        for lat, lon, f in zip(df_r.latitude, df_r.longitude, frac)
    ]
    df_r["route_dummy"] = 1.0
    return df_r

In [50]:
out_dir: str = "models_per_route"
df = load_and_prepare("all_trips_saved.parquet")

Loaded 912,566 rows, dropping 577 rows from [10257]


  0%|          | 0/911621 [00:00<?, ?it/s]

In [51]:
route = df.route_id.unique()[0]
# route = df[df.route_id == df.route_id.unique()[1]]
route

'KIEL'

In [52]:
Path(out_dir).mkdir(exist_ok=True)
dispatcher: Dict[str, str] = {}

t0 = time.time()
print(f"\n=== Training route: {route} ===")

fr = add_route_specific_features(df, route)
X_norm = fr[fr.y_true == 0][BASE_COLUMNS].fillna(0).values
X_anom = fr[fr.y_true == 1][BASE_COLUMNS].fillna(0).values

if len(X_norm) == 0:
    print("  * No normal points, skipping this route.")


=== Training route: KIEL ===


Processing trips for route KIEL:   0%|          | 0/420 [00:00<?, ?it/s]

In [55]:
# ─── prepare test set: all anomalies + fraction of normals ───
idx_anom    = fr[fr.y_true == 1].index.to_numpy()
n_norm_test = max(1, int(TEST_FRACTION_N * len(X_norm)))
idx_norm    = fr[fr.y_true == 0].sample(n=n_norm_test, random_state=42).index.to_numpy()

X_test = np.vstack([
    fr.loc[idx_anom, BASE_COLUMNS].fillna(0).values,
    fr.loc[idx_norm, BASE_COLUMNS].fillna(0).values
])
y_test = np.concatenate([
    np.ones(len(idx_anom), dtype=int),
    np.zeros(len(idx_norm), dtype=int)
])

# mask for port-zone override
zone = np.concatenate([
    fr.loc[idx_anom, "zone"].to_numpy(),
    fr.loc[idx_norm, "zone"].to_numpy()
]).astype(bool)

In [58]:
# ─── grid-search over ν ───
best = {"auc": -np.inf}
for nu in NU_GRID:
    pipe = Pipeline([
        ("scaler", StandardScaler()),
        ("ocsvm", OneClassSVM(kernel="rbf", gamma="scale", nu=nu, verbose=True, shrinking=False))
    ])

    pipe.fit(X_norm)
    scores_train = -pipe.decision_function(X_norm)
    tau = np.percentile(scores_train, 100 * (1 - nu))

    scores_test = -pipe.decision_function(X_test)
    preds = (scores_test > tau).astype(int)

    # force port-zone → always normal
   ## preds[zone] = 0

    auc = roc_auc_score(y_test, scores_test) if len(np.unique(y_test)) > 1 else 0.0
    print(f"  ν={nu:<4}  τ={tau:6.3f}  AUC={auc:5.3f}")

    if auc > best["auc"]:
        best.update(pipe=pipe, nu=nu, tau=tau, auc=auc)

[LibSVM].*
optimization finished, #iter = 1331
obj = 7555.703096, rho = 21.893586
nSV = 864, nBSV = 738
  ν=0.01  τ= 0.000  AUC=0.983
[LibSVM]..*
optimization finished, #iter = 2726
obj = 98960.589020, rho = 98.176457
nSV = 2430, nBSV = 2367
  ν=0.03  τ= 0.000  AUC=0.988


In [59]:
best

{'auc': 0.987952131049186,
 'pipe': Pipeline(steps=[('scaler', StandardScaler()),
                 ('ocsvm', OneClassSVM(nu=0.03, shrinking=False, verbose=True))]),
 'nu': 0.03,
 'tau': 4.950973137741667e-05}

In [60]:
# ─── final evaluation & save  ───
print(f"\n-> Selected ν={best['nu']}  τ={best['tau']:.3f}  AUC={best['auc']:.3f}")
scores_test = -best["pipe"].decision_function(X_test)
preds = (scores_test > best["tau"]).astype(int)
# preds[zone] = 0

print(confusion_matrix(y_test, preds))
print(classification_report(y_test, preds, digits=3))
print(f"Route {route} done in {time.time() - t0:.1f}s\n")

model_path = Path(out_dir) / f"ocsvm_{route}.pkl"
joblib.dump(
    {
        "pipeline": best["pipe"],
        "features": BASE_COLUMNS,
        "tau": best["tau"],
    },
    model_path,
)
dispatcher[route] = str(model_path)


-> Selected ν=0.03  τ=0.000  AUC=0.988
[[7729  253]
 [  24  554]]
              precision    recall  f1-score   support

           0      0.997     0.968     0.982      7982
           1      0.686     0.958     0.800       578

    accuracy                          0.968      8560
   macro avg      0.842     0.963     0.891      8560
weighted avg      0.976     0.968     0.970      8560

Route KIEL done in 294.2s



In [61]:

# save dispatcher
joblib.dump(dispatcher, Path(out_dir) / "dispatcher.pkl")
print("All models saved, dispatcher.pkl created.")

All models saved, dispatcher.pkl created.
