In [None]:
# LSTM Vessel Trajectory Anomaly Detection Pipeline
import time
import warnings
from pathlib import Path
from typing import Dict, Tuple
import joblib
import numpy as np
import pandas as pd
import sklearn
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.preprocessing import MinMaxScaler
from tqdm.auto import tqdm

# Deep Learning imports
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import LSTM, Dense, Dropout, Input, TimeDistributed
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.utils import Sequence

from sklearn.metrics import (
    precision_score, recall_score, f1_score, roc_auc_score,
    average_precision_score, confusion_matrix, classification_report
)


In [None]:
# Trip IDs flagged for exclusion.
# NOTE(review): not referenced in the visible cells — confirm the exclusion is
# applied upstream (e.g. during preprocessing).
DROP_TRIPS = [10257]

# Feature columns fed to the LSTM models; their order defines the feature axis
# of every sequence built by create_sequences_from_trips.
BASE_COLUMNS = [
    "speed_over_ground", "dv", "dcourse", "ddraft",
    "zone", "x_km", "y_km", "dist_to_ref", "route_dummy"
]

# NOTE(review): ZONES, R_PORT and R_APP are not used in the visible cells;
# presumably bounding boxes and port/approach radii from preprocessing — confirm.
ZONES = [[53.8, 53.5, 8.6, 8.14], [53.66, 53.0, 11.0, 9.5]]
R_PORT, R_APP = 5.0, 15.0
RANDOM_STATE = 42


tqdm.pandas()

# Seed the global RNGs for reproducibility.
tf.random.set_seed(RANDOM_STATE)
np.random.seed(RANDOM_STATE)
# scikit-learn has no global seeding function (`sklearn.random` does not exist,
# so the previous `sklearn.random.seed(...)` call raised AttributeError).
# Estimators created with random_state=None draw from NumPy's global RNG,
# which is seeded on the line above.

# NOTE: this is a plain Python variable and does not configure TensorFlow.
# To actually disable oneDNN optimizations, the *environment variable*
# TF_ENABLE_ONEDNN_OPTS must be set before `import tensorflow` runs.
TF_ENABLE_ONEDNN_OPTS = 0
warnings.filterwarnings("ignore", category=UserWarning)

───────────────────────────── Custom Data Generator ──────────────────────────────

In [None]:
class VesselSequenceGenerator(Sequence):
    """Keras ``Sequence`` that serves pre-built trajectory windows in batches.

    Args:
        data: Indexable collection of sequences (one entry per sample).
        labels: Indexable collection of labels aligned with ``data``.
        sequence_length: Stored on the instance for reference; it is not used
            when building batches because ``data`` is already windowed.
        batch_size: Maximum number of samples per batch.
        shuffle: If True, the sample order is reshuffled at construction time
            and whenever ``on_epoch_end`` is called.
        **kwargs: Forwarded to the ``Sequence`` base class (e.g. ``workers``
            on Keras versions whose base class accepts such options).
    """

    def __init__(self, data, labels, sequence_length, batch_size, shuffle=True, **kwargs):
        # The base-class constructor must run so that Keras can set up its own
        # state; recent Keras versions warn when a subclass skips this call.
        super().__init__(**kwargs)
        self.data = data
        self.labels = labels
        self.sequence_length = sequence_length
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.indices = np.arange(len(data))
        # Apply the initial shuffle (no-op when shuffle=False).
        self.on_epoch_end()

    def __len__(self):
        """Return the number of batches per epoch (last batch may be smaller)."""
        return int(np.ceil(len(self.data) / self.batch_size))

    def __getitem__(self, index):
        """Return the ``(X, y)`` arrays for batch number ``index``."""
        start = index * self.batch_size
        batch_indices = self.indices[start:start + self.batch_size]
        # Element-wise lookup keeps this working for plain lists as well as arrays.
        X = np.array([self.data[i] for i in batch_indices])
        y = np.array([self.labels[i] for i in batch_indices])
        return X, y

    def on_epoch_end(self):
        """Reshuffle the sample order in place when shuffling is enabled."""
        if self.shuffle:
            np.random.shuffle(self.indices)

───────────────────────────── Sequence Creation ──────────────────────────────

In [None]:
def create_sequences_from_trips(df_route: pd.DataFrame, sequence_length: int,
                               overlap_ratio: float = 0.5) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """
    Slice every trip of a route into fixed-length, overlapping windows.

    Rows of each trip are ordered by ``time_stamp``; missing feature values in
    the ``BASE_COLUMNS`` columns are replaced with 0. A window is labelled 1
    when at least one of its rows has ``y_true == 1``, otherwise 0. Trips with
    fewer than ``sequence_length`` rows contribute no windows.

    Args:
        df_route: Frame with ``trip_id``, ``time_stamp``, ``y_true`` and the
            ``BASE_COLUMNS`` feature columns.
        sequence_length: Number of rows per window.
        overlap_ratio: Fraction of a window shared with the next one; the
            resulting stride is at least 1 row.

    Returns:
        Tuple ``(sequences, labels, trip_ids)`` of arrays with one entry per window.
    """
    # Distance (in rows) between the starts of consecutive windows.
    stride = max(1, int(sequence_length * (1 - overlap_ratio)))

    windows, window_labels, window_trip_ids = [], [], []

    for trip_id, trip_frame in tqdm(df_route.groupby('trip_id'), desc="Creating sequences"):
        ordered = trip_frame.sort_values('time_stamp').reset_index(drop=True)

        feature_matrix = ordered[BASE_COLUMNS].fillna(0).values
        point_labels = ordered['y_true'].values

        # Last admissible start index; negative when the trip is too short,
        # in which case the range below is empty.
        last_start = len(feature_matrix) - sequence_length

        for lo in range(0, last_start + 1, stride):
            hi = lo + sequence_length
            windows.append(feature_matrix[lo:hi])
            # A single anomalous point marks the whole window as anomalous.
            window_labels.append(int(np.any(point_labels[lo:hi] == 1)))
            window_trip_ids.append(trip_id)

    return np.array(windows), np.array(window_labels), np.array(window_trip_ids)

───────────────────────────── LSTM Model Definitions ──────────────────────────────

In [None]:
def build_lstm_autoencoder(input_shape: Tuple[int, int], lstm_units: int = 64,
                          dense_units: int = 32, dropout_rate: float = 0.2) -> Model:
    """
    Assemble an (uncompiled) LSTM autoencoder that reconstructs its input sequence.

    Args:
        input_shape: ``(timesteps, features)`` of one input sequence.
        lstm_units: Units of the outer LSTM layers; the inner ones use half.
        dense_units: Width of the dense bottleneck layer.
        dropout_rate: Dropout rate applied after the encoder, bottleneck and
            decoder layers (not after the final LSTM).

    Returns:
        Keras ``Model`` mapping a sequence to a reconstruction of the same shape.
    """
    n_timesteps, n_features = input_shape[0], input_shape[1]
    half_units = lstm_units // 2

    inputs = Input(shape=input_shape)

    # Encoder: compress the sequence into a single vector.
    x = LSTM(lstm_units, return_sequences=True)(inputs)
    x = Dropout(dropout_rate)(x)
    x = LSTM(half_units, return_sequences=False)(x)
    x = Dropout(dropout_rate)(x)

    # Bottleneck
    x = Dense(dense_units, activation='relu')(x)
    x = Dropout(dropout_rate)(x)

    # Decoder: expand the vector back to one step per input timestep.
    x = Dense(half_units, activation='relu')(x)
    x = Dropout(dropout_rate)(x)
    x = tf.keras.layers.RepeatVector(n_timesteps)(x)
    x = LSTM(half_units, return_sequences=True)(x)
    x = Dropout(dropout_rate)(x)
    x = LSTM(lstm_units, return_sequences=True)(x)
    # Linear projection back to the original feature dimension at every timestep.
    outputs = TimeDistributed(Dense(n_features))(x)

    return Model(inputs, outputs)

In [None]:
def build_lstm_classifier(input_shape: Tuple[int, int], lstm_units: int = 64,
                         dense_units: int = 32, dropout_rate: float = 0.2) -> Model:
    """
    Build an (uncompiled) stacked-LSTM binary classifier.

    Args:
        input_shape: ``(timesteps, features)`` of one input sequence.
        lstm_units: Units of the first LSTM layer; the second uses half.
        dense_units: Width of the hidden dense layer.
        dropout_rate: Dropout rate applied after each hidden layer.

    Returns:
        Keras ``Sequential`` model with a single sigmoid output unit.
    """
    model = Sequential([
        # An explicit Input layer replaces the `input_shape=` argument on the
        # first LSTM, which current Keras versions flag as deprecated.
        Input(shape=input_shape),
        LSTM(lstm_units, return_sequences=True),
        Dropout(dropout_rate),
        LSTM(lstm_units // 2, return_sequences=False),
        Dropout(dropout_rate),
        Dense(dense_units, activation='relu'),
        Dropout(dropout_rate),
        Dense(1, activation='sigmoid')
    ])
    return model

In [None]:
def train_lstm_autoencoder(X_train_normal: np.ndarray,
                           X_val: np.ndarray,
                           model_path: str,
                           epochs: int) -> Tuple[Model, Dict]:
    """
    Train the LSTM autoencoder to reconstruct its own input.

    Architecture and optimisation settings come from the module-level
    constants LSTM_UNITS, DENSE_UNITS, DROPOUT_RATE, LEARNING_RATE, PATIENCE
    and BATCH_SIZE, which must be defined before this function is called.

    Args:
        X_train_normal: Training sequences, shape ``(samples, timesteps, features)``.
            The caller is expected to pass normal (non-anomalous) data only.
        X_val: Validation sequences of the same layout. May be empty, in which
            case training runs without validation and the callbacks monitor
            the training loss instead.
        model_path: Checkpoint path; a ``.h5`` substring is replaced by ``.keras``.
        epochs: Maximum number of training epochs.

    Returns:
        Tuple of the trained model and the Keras history dict.
    """
    input_shape = (X_train_normal.shape[1], X_train_normal.shape[2])
    model = build_lstm_autoencoder(input_shape, LSTM_UNITS, DENSE_UNITS, DROPOUT_RATE)

    model.compile(optimizer=Adam(learning_rate=LEARNING_RATE),
                  loss='mse', metrics=['mae'])

    # The caller filters validation data down to normal sequences, which can
    # leave nothing to validate on. Without validation data there is no
    # `val_loss`, so fall back to the training loss for all callbacks.
    has_validation = len(X_val) > 0
    monitor = 'val_loss' if has_validation else 'loss'

    callbacks = [
        EarlyStopping(patience=PATIENCE, restore_best_weights=True, monitor=monitor),
        ModelCheckpoint(model_path.replace('.h5', '.keras'), save_best_only=True, monitor=monitor),
        ReduceLROnPlateau(factor=0.5, patience=PATIENCE//2, min_lr=1e-6, monitor=monitor)
    ]

    history = model.fit(
        X_train_normal, X_train_normal,  # Autoencoder: input = output
        validation_data=(X_val, X_val) if has_validation else None,
        epochs=epochs,
        batch_size=BATCH_SIZE,
        callbacks=callbacks,
        verbose=1
    )

    return model, history.history

In [None]:
def train_lstm_classifier(X_train: np.ndarray, y_train: np.ndarray,
                         X_val: np.ndarray, y_val: np.ndarray,
                         model_path: str,
                         epochs: int) -> Tuple[Model, Dict]:
    """
    Train the LSTM classifier for binary anomaly classification.

    Architecture and optimisation settings come from the module-level
    constants LSTM_UNITS, DENSE_UNITS, DROPOUT_RATE, LEARNING_RATE, PATIENCE
    and BATCH_SIZE, which must be defined before this function is called.

    Args:
        X_train: Training sequences, shape ``(samples, timesteps, features)``.
        y_train: Training labels (0 = normal, 1 = anomaly).
        X_val: Validation sequences.
        y_val: Validation labels.
        model_path: Checkpoint path; a ``.h5`` substring is replaced by ``.keras``.
        epochs: Maximum number of training epochs.

    Returns:
        Tuple of the trained model and the Keras history dict.

    Raises:
        ValueError: If ``y_train`` does not contain both classes, because the
            balanced class weights would require a division by zero.
    """
    input_shape = (X_train.shape[1], X_train.shape[2])
    model = build_lstm_classifier(input_shape, LSTM_UNITS, DENSE_UNITS, DROPOUT_RATE)

    # Handle class imbalance with "balanced" weights: n_samples / (n_classes * n_class_samples)
    n_total = len(y_train)
    n_normal = int(np.sum(y_train == 0))
    n_anomalous = int(np.sum(y_train == 1))
    if n_normal == 0 or n_anomalous == 0:
        raise ValueError(
            "y_train must contain both classes to compute class weights "
            f"(normal={n_normal}, anomalous={n_anomalous})."
        )
    class_weight = {
        0: n_total / (2 * n_normal),
        1: n_total / (2 * n_anomalous)
    }

    # Explicit metric objects are used because the 'precision'/'recall' string
    # shortcuts are not resolved by every Keras version; the explicit names
    # keep the history keys ('precision', 'val_recall', ...) unchanged.
    model.compile(optimizer=Adam(learning_rate=LEARNING_RATE),
                  loss='binary_crossentropy',
                  metrics=['accuracy',
                           tf.keras.metrics.Precision(name='precision'),
                           tf.keras.metrics.Recall(name='recall')])

    callbacks = [
        EarlyStopping(patience=PATIENCE, restore_best_weights=True, monitor='val_loss'),
        ModelCheckpoint(model_path.replace('.h5', '.keras'), save_best_only=True, monitor='val_loss'),
        ReduceLROnPlateau(factor=0.5, patience=PATIENCE//2, min_lr=1e-6, monitor='val_loss')
    ]

    history = model.fit(
        X_train, y_train,
        validation_data=(X_val, y_val),
        epochs=epochs,
        batch_size=BATCH_SIZE,
        callbacks=callbacks,
        class_weight=class_weight,
        verbose=1
    )

    return model, history.history

In [None]:
def evaluate_model(model, X_test: np.ndarray, y_test: np.ndarray,
                   model_type: str = 'classifier',
                   threshold = None) -> Dict:
    """
    Evaluate anomaly detection model (autoencoder or classifier).

    For an autoencoder the anomaly score is the mean squared reconstruction
    error per sequence; for a classifier it is the predicted probability.
    A sample is predicted anomalous when its score is strictly greater than
    the threshold.

    Args:
        model: trained model exposing ``predict``
        X_test: input test data
        y_test: true labels (0 = normal, 1 = anomaly)
        model_type: 'autoencoder'; any other value is treated as 'classifier'
        threshold: optional float to override the default threshold
            (autoencoder: 95th percentile of the reconstruction errors of
            ``X_test`` itself; classifier: 0.5)

    Returns:
        Dict with evaluation metrics, the threshold used, and the per-sample
        predictions and scores (as lists)
    """
    if model_type == 'autoencoder':
        X_pred = model.predict(X_test, verbose=0)
        # Average the squared error over the timestep and feature axes.
        reconstruction_errors = np.mean(np.square(X_test - X_pred), axis=(1, 2))
        y_scores = reconstruction_errors

        if threshold is None:
            # NOTE: derived from the evaluated data itself, so roughly 5% of
            # the samples are always flagged when no threshold is supplied.
            threshold = np.percentile(reconstruction_errors, 95)

        y_pred = (reconstruction_errors > threshold).astype(int)

    else:  # classifier
        y_pred_prob = model.predict(X_test, verbose=0).flatten()
        y_scores = y_pred_prob

        if threshold is None:
            threshold = 0.5

        y_pred = (y_pred_prob > threshold).astype(int)

    # Metrics
    metrics = {
        # Fixed labels keep the matrix 2x2 even when only one class occurs
        # in y_test/y_pred (otherwise it would collapse to 1x1).
        'confusion_matrix': confusion_matrix(y_test, y_pred, labels=[0, 1]).tolist(),
        'precision': precision_score(y_test, y_pred, zero_division=0),
        'recall': recall_score(y_test, y_pred, zero_division=0),
        'f1_score': f1_score(y_test, y_pred, zero_division=0),
        'pr_auc': average_precision_score(y_test, y_scores),
        # ROC AUC is undefined when y_test holds a single class.
        'roc_auc': roc_auc_score(y_test, y_scores) if len(np.unique(y_test)) > 1 else None,
        # zero_division=0 matches the scalar metrics above and avoids
        # undefined-metric warnings for classes that are never predicted.
        'classification_report': classification_report(y_test, y_pred, output_dict=True,
                                                       zero_division=0),
        'threshold_used': threshold,
        'y_pred': y_pred.tolist(),
        'y_scores': y_scores.tolist()
    }

    return metrics


───────────────────────────── Main Training Pipeline ──────────────────────────────


In [None]:
from sklearn.model_selection import StratifiedShuffleSplit


from sklearn.model_selection import StratifiedShuffleSplit

def stratified_trip_split(trip_ids, labels, train_ratio=0.7, val_ratio=0.15, seed=42):
    """
    Split samples into train/val/test by trip, stratified on trip-level anomaly status.

    All samples of one trip end up in the same subset. A trip counts as
    anomalous when at least one of its samples has a truthy label.

    Args:
        trip_ids: Array-like of trip IDs, one per sample
        labels: Array-like of labels (0 or 1), one per sample
        train_ratio: Proportion of trips for training
        val_ratio: Proportion of trips for validation (the remainder is test)
        seed: Random seed for reproducibility

    Returns:
        train_mask, val_mask, test_mask: Boolean masks for the input arrays
    """
    trip_ids = np.asarray(trip_ids)
    labels = np.asarray(labels)

    # Map every sample to the position of its trip in `unique_trips`, then count
    # anomalous samples per trip in one pass (replaces a per-trip boolean mask
    # over all samples, which scaled with trips x samples).
    unique_trips, trip_index = np.unique(trip_ids, return_inverse=True)
    trip_index = trip_index.reshape(-1)
    anomalous_counts = np.bincount(
        trip_index,
        weights=labels.astype(bool).astype(float).reshape(-1),
        minlength=len(unique_trips),
    )
    trip_labels = (anomalous_counts > 0).astype(int)

    # First split: train vs. everything else.
    sss = StratifiedShuffleSplit(n_splits=1, test_size=1 - train_ratio, random_state=seed)
    train_idx, rest_idx = next(sss.split(unique_trips, trip_labels))

    rest_trips = unique_trips[rest_idx]
    rest_labels = trip_labels[rest_idx]

    # Second split: divide the remainder into val and test. The validation share
    # is re-expressed relative to the remainder rather than the full set.
    val_ratio_adjusted = val_ratio / (1 - train_ratio)
    sss_val = StratifiedShuffleSplit(n_splits=1, test_size=1 - val_ratio_adjusted, random_state=seed)
    val_idx, test_idx = next(sss_val.split(rest_trips, rest_labels))

    train_trips = unique_trips[train_idx]
    val_trips = rest_trips[val_idx]
    test_trips = rest_trips[test_idx]

    # Translate trip-level assignments back to sample-level masks.
    train_mask = np.isin(trip_ids, train_trips)
    val_mask = np.isin(trip_ids, val_trips)
    test_mask = np.isin(trip_ids, test_trips)

    return train_mask, val_mask, test_mask

In [None]:
def print_route_results(test_results: dict) -> None:
    """
    Print the headline test metrics, each with a short explanation.

    Args:
        test_results (dict): Evaluation metrics containing the keys
            'precision', 'recall', 'f1_score' and 'pr_auc' (numeric).
            'roc_auc' is optional; it is printed only when present and not None.

    Raises:
        KeyError: If one of the four required metric keys is missing.
    """
    # (explanatory header line, key of the metric printed beneath it)
    metric_rows = [
        ("  * Precision  = TP / (TP + FP)        | Correctly flagged anomalies", 'precision'),
        ("  * Recall     = TP / (TP + FN)        | Detected actual anomalies", 'recall'),
        ("  * F1 Score   = Harmonic mean         | Balance of precision & recall", 'f1_score'),
        ("  * PR AUC     = Precision-Recall AUC  | Better for rare anomalies", 'pr_auc'),
    ]

    for header, key in metric_rows:
        print(header)
        print(f"    → {test_results[key]:.4f}")

    # ROC AUC is None when the test labels held a single class; a missing key
    # is treated the same way instead of raising.
    roc_auc = test_results.get('roc_auc')
    if roc_auc is not None:
        print("  * ROC AUC    = ROC Curve AUC         | Overall separability of classes")
        print(f"    → {roc_auc:.4f}")


In [None]:
def train_lstm_pipeline(data_path: str,
                        output_dir: str,
                        epochs: int,
                        model_type: str = 'classifier'
                        ):
    """
    Main training pipeline for LSTM-based vessel anomaly detection.

    For every distinct ``route_id`` in the data, the pipeline builds sequences,
    splits them by trip into train/val/test, scales the features, trains the
    requested model, evaluates it on the test split and stores the scaler.
    Routes without sequences or without usable training data are skipped.

    Args:
        data_path: Path to the parquet file
        output_dir: Directory to save models and results (created if missing)
        epochs: number of epochs to train the model
        model_type: 'autoencoder'; any other value trains the classifier

    Returns:
        Dict mapping each trained route to its result record (sizes, training
        time, test metrics and training history). The same dict is also
        written to ``training_results_{epochs}.pkl`` in ``output_dir``.
    """
    print(f"Starting LSTM {model_type} training pipeline...")

    # Load and prepare data
    df = pd.read_parquet(data_path)
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    results = {}

    for route in df.route_id.unique():
        print(f"\n=== Training LSTM {model_type} for route: {route} ===")
        t0 = time.time()

        # Prepare route data
        route_df = df[df.route_id == route].copy()

        # Create sequences
        sequences, labels, trip_ids = create_sequences_from_trips(route_df, SEQUENCE_LENGTH, OVERLAP_RATIO)

        if len(sequences) == 0:
            print(f"  * No sequences created for route {route}, skipping.")
            continue

        print(f"  * Created {len(sequences)} sequences")
        print(f"  * Normal sequences: {np.sum(labels == 0)}, Anomalous sequences: {np.sum(labels == 1)}")

        # Split by trip first so the scaler can be fitted on training data only.
        train_mask, val_mask, test_mask = stratified_trip_split(trip_ids, labels)

        # Scale features. The scaler is fitted on the training sequences only;
        # fitting on all sequences would leak validation/test value ranges into
        # training. Validation/test values may therefore fall outside [0, 1].
        scaler = MinMaxScaler()
        n_samples, n_timesteps, n_features = sequences.shape
        scaler.fit(sequences[train_mask].reshape(-1, n_features))
        sequences_scaled = scaler.transform(
            sequences.reshape(-1, n_features)
        ).reshape(n_samples, n_timesteps, n_features)

        X_train, y_train = sequences_scaled[train_mask], labels[train_mask]
        X_val, y_val = sequences_scaled[val_mask], labels[val_mask]
        X_test, y_test = sequences_scaled[test_mask], labels[test_mask]

        # Train model
        model_path = f"{output_dir}/lstm_{model_type}_{route}_{epochs}.h5"

        if model_type == 'autoencoder':
            # Train only on normal sequences for autoencoder
            X_train_normal = X_train[y_train == 0]
            X_val_normal = X_val[y_val == 0]

            if len(X_train_normal) == 0:
                print(f"  * No normal training sequences for route {route}, skipping.")
                continue

            model, history = train_lstm_autoencoder(X_train_normal, X_val_normal, model_path, epochs)
        else:
            if len(np.unique(y_train)) < 2:
                print(f"  * Insufficient class diversity for route {route}, skipping.")
                continue

            model, history = train_lstm_classifier(X_train, y_train, X_val, y_val, model_path, epochs)

        # Evaluate model
        test_results = evaluate_model(model, X_test, y_test, model_type)

        # Save results
        route_results = {
            'route': route,
            'model_type': model_type,
            'training_time': time.time() - t0,
            'n_sequences': len(sequences),
            'n_train': len(X_train),
            'n_val': len(X_val),
            'n_test': len(X_test),
            'test_metrics': test_results,
            'training_history': history
        }

        results[route] = route_results

        # Save scaler
        joblib.dump(scaler, f"{output_dir}/scaler_{route}_{epochs}.pkl")
        print("=" * 5 + f"Results for route {route} with {model_type} " + "=" * 5)
        print(f"  * Training completed in {route_results['training_time']:.1f}s")
        print_route_results(test_results)
        print("=" * 50)

    # Save overall results
    joblib.dump(results, f"{output_dir}/training_results_{epochs}.pkl")
    print(f"\nTraining pipeline completed. Results saved to {output_dir}")

    return results


───────────────────────────── LSTM Training Start ──────────────────────────────

In [None]:
# LSTM-specific parameters.
# These module-level constants are read at call time by the training functions
# defined in earlier cells, so this cell must run before any training call.
SEQUENCE_LENGTH = 50  # Rows (time steps) per sequence window
OVERLAP_RATIO = 0.5   # Fraction of a window shared with the next one (0.5 = 50% overlap)
LSTM_UNITS = 64       # Units of the first LSTM layer; the model builders use half for the second
DENSE_UNITS = 32      # Width of the dense hidden/bottleneck layer
DROPOUT_RATE = 0.2    # Dropout rate used between layers
BATCH_SIZE = 32       # Training batch size
PATIENCE = 15         # EarlyStopping patience in epochs; ReduceLROnPlateau uses PATIENCE // 2
LEARNING_RATE = 0.001 # Initial Adam learning rate

# Input parquet file and base name of the output directories
# (the run cells append "_classifier" / "_autoencoder").
data_path = "data/LSTM_preprocessed.parquet"
output_dir = "lstm_models"

# Maximum number of training epochs. NOTE: with a single epoch, early stopping
# (PATIENCE = 15) can never trigger; raise this value for a real training run.
EPOCHS = 1

In [None]:
# Train one LSTM classifier per route; models, scalers and the results file
# are written to the "<output_dir>_classifier" directory.
results_classifier = train_lstm_pipeline(data_path, f"{output_dir}_classifier", EPOCHS, "classifier")

In [None]:
# Train one LSTM autoencoder per route (fitted on normal sequences only);
# artifacts are written to the "<output_dir>_autoencoder" directory.
results_autoencoder = train_lstm_pipeline(data_path, f"{output_dir}_autoencoder", EPOCHS, "autoencoder")