In [1]:
# LSTM Vessel Trajectory Anomaly Detection Pipeline
import time
import warnings
from pathlib import Path
from typing import Dict, Tuple
import joblib
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.preprocessing import MinMaxScaler
from tqdm.auto import tqdm

# Deep Learning imports
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import LSTM, Dense, Dropout, Input, TimeDistributed
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.utils import Sequence

2025-06-18 18:07:24.490203: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-06-18 18:07:24.527162: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
DROP_TRIPS = [10257]

#Used features
BASE_COLUMNS = [
    "speed_over_ground", "dv", "dcourse", "ddraft",
    "zone", "x_km", "y_km", "dist_to_ref", "route_dummy"
]

ZONES = [[53.8, 53.5, 8.6, 8.14], [53.66, 53.0, 11.0, 9.5]]
R_PORT, R_APP = 5.0, 15.0
RANDOM_STATE = 42


tqdm.pandas()
tf.random.set_seed(RANDOM_STATE)
np.random.seed(RANDOM_STATE)
TF_ENABLE_ONEDNN_OPTS=0
warnings.filterwarnings("ignore", category=UserWarning)

───────────────────────────── Custom Data Generator ──────────────────────────────

In [3]:
class VesselSequenceGenerator(Sequence):
    """Custom data generator for LSTM training with vessel trajectory sequences."""

    def __init__(self, data, labels, sequence_length, batch_size, shuffle=True):
        self.data = data
        self.labels = labels
        self.sequence_length = sequence_length
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.indices = np.arange(len(data))
        self.on_epoch_end()

    def __len__(self):
        return int(np.ceil(len(self.data) / self.batch_size))

    def __getitem__(self, index):
        batch_indices = self.indices[index * self.batch_size:(index + 1) * self.batch_size]
        X = np.array([self.data[i] for i in batch_indices])
        y = np.array([self.labels[i] for i in batch_indices])
        return X, y

    def on_epoch_end(self):
        if self.shuffle:
            np.random.shuffle(self.indices)

───────────────────────────── Sequence Creation ──────────────────────────────

In [4]:
def create_sequences_from_trips(df_route: pd.DataFrame, sequence_length: int,
                               overlap_ratio: float = 0.5) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """
    Create sequences from trip data for LSTM training.
    Returns sequences, labels, and trip_ids for each sequence.
    """
    sequences = []
    labels = []
    trip_ids = []

    step_size = max(1, int(sequence_length * (1 - overlap_ratio)))

    for trip_id, trip_data in tqdm(df_route.groupby('trip_id'), desc="Creating sequences"):
        trip_data = trip_data.sort_values('time_stamp').reset_index(drop=True)

        # Get features and labels
        features = trip_data[BASE_COLUMNS].fillna(0).values
        trip_labels = trip_data['y_true'].values

        # Create overlapping sequences
        for start_idx in range(0, len(features) - sequence_length + 1, step_size):
            end_idx = start_idx + sequence_length
            seq_features = features[start_idx:end_idx]
            seq_labels = trip_labels[start_idx:end_idx]

            # Label sequence as anomalous if any point in sequence is anomalous
            sequence_label = int(np.any(seq_labels == 1))

            sequences.append(seq_features)
            labels.append(sequence_label)
            trip_ids.append(trip_id)

    return np.array(sequences), np.array(labels), np.array(trip_ids)

-───────────────────────────── LSTM Model Definitions ──────────────────────────────

In [5]:
def build_lstm_autoencoder(input_shape: Tuple[int, int], lstm_units: int = 64,
                          dense_units: int = 32, dropout_rate: float = 0.2) -> Model:
    """
    Build LSTM Autoencoder for anomaly detection.
    """
    # Encoder
    input_layer = Input(shape=input_shape)
    encoded = LSTM(lstm_units, return_sequences=True)(input_layer)
    encoded = Dropout(dropout_rate)(encoded)
    encoded = LSTM(lstm_units // 2, return_sequences=False)(encoded)
    encoded = Dropout(dropout_rate)(encoded)

    # Bottleneck
    bottleneck = Dense(dense_units, activation='relu')(encoded)
    bottleneck = Dropout(dropout_rate)(bottleneck)

    # Decoder
    decoded = Dense(lstm_units // 2, activation='relu')(bottleneck)
    decoded = Dropout(dropout_rate)(decoded)
    decoded = tf.keras.layers.RepeatVector(input_shape[0])(decoded)
    decoded = LSTM(lstm_units // 2, return_sequences=True)(decoded)
    decoded = Dropout(dropout_rate)(decoded)
    decoded = LSTM(lstm_units, return_sequences=True)(decoded)
    decoded = TimeDistributed(Dense(input_shape[1]))(decoded)

    autoencoder = Model(input_layer, decoded)
    return autoencoder

In [6]:
def build_lstm_classifier(input_shape: Tuple[int, int], lstm_units: int = 64,
                         dense_units: int = 32, dropout_rate: float = 0.2) -> Model:
    """
    Build LSTM Classifier for anomaly detection.
    """
    model = Sequential([
        LSTM(lstm_units, return_sequences=True, input_shape=input_shape),
        Dropout(dropout_rate),
        LSTM(lstm_units // 2, return_sequences=False),
        Dropout(dropout_rate),
        Dense(dense_units, activation='relu'),
        Dropout(dropout_rate),
        Dense(1, activation='sigmoid')
    ])
    return model

In [7]:
def train_lstm_autoencoder(X_train_normal: np.ndarray, X_val: np.ndarray,
                          model_path: str) -> Tuple[Model, Dict]:
    """
    Train LSTM Autoencoder on normal data only.
    """
    input_shape = (X_train_normal.shape[1], X_train_normal.shape[2])
    model = build_lstm_autoencoder(input_shape, LSTM_UNITS, DENSE_UNITS, DROPOUT_RATE)

    model.compile(optimizer=Adam(learning_rate=LEARNING_RATE),
                  loss='mse', metrics=['mae'])

    callbacks = [
        EarlyStopping(patience=PATIENCE, restore_best_weights=True, monitor='val_loss'),
        ModelCheckpoint(model_path.replace('.h5', '.keras'), save_best_only=True, monitor='val_loss'),
        ReduceLROnPlateau(factor=0.5, patience=PATIENCE//2, min_lr=1e-6, monitor='val_loss')
    ]

    history = model.fit(
        X_train_normal, X_train_normal,  # Autoencoder: input = output
        validation_data=(X_val, X_val),
        epochs=EPOCHS,
        batch_size=BATCH_SIZE,
        callbacks=callbacks,
        verbose=1
    )

    return model, history.history

In [8]:
def train_lstm_classifier(X_train: np.ndarray, y_train: np.ndarray,
                         X_val: np.ndarray, y_val: np.ndarray,
                         model_path: str) -> Tuple[Model, Dict]:
    """
    Train LSTM Classifier for binary anomaly classification.
    """
    input_shape = (X_train.shape[1], X_train.shape[2])
    model = build_lstm_classifier(input_shape, LSTM_UNITS, DENSE_UNITS, DROPOUT_RATE)

    # Handle class imbalance
    class_weight = {
        0: len(y_train) / (2 * np.sum(y_train == 0)),
        1: len(y_train) / (2 * np.sum(y_train == 1))
    }

    model.compile(optimizer=Adam(learning_rate=LEARNING_RATE),
                  loss='binary_crossentropy',
                  metrics=['accuracy', 'precision', 'recall'])

    callbacks = [
        EarlyStopping(patience=PATIENCE, restore_best_weights=True, monitor='val_loss'),
        ModelCheckpoint(model_path.replace('.h5', '.keras'), save_best_only=True, monitor='val_loss'),
        ReduceLROnPlateau(factor=0.5, patience=PATIENCE//2, min_lr=1e-6, monitor='val_loss')
    ]

    history = model.fit(
        X_train, y_train,
        validation_data=(X_val, y_val),
        epochs=EPOCHS,
        batch_size=BATCH_SIZE,
        callbacks=callbacks,
        class_weight=class_weight,
        verbose=1
    )

    return model, history.history

In [9]:
def evaluate_model(model: Model, X_test: np.ndarray, y_test: np.ndarray,
                  model_type: str = 'classifier') -> Dict:
    """
    Evaluate trained model and return metrics.
    """
    if model_type == 'autoencoder':
        # For autoencoder, compute reconstruction error
        X_pred = model.predict(X_test, verbose=0)
        reconstruction_errors = np.mean(np.square(X_test - X_pred), axis=(1, 2))

        # Use threshold to classify anomalies (e.g., 95th percentile of normal data errors)
        # This would typically be computed on validation set
        threshold = np.percentile(reconstruction_errors, 95)
        y_pred = (reconstruction_errors > threshold).astype(int)
        y_scores = reconstruction_errors

    else:  # classifier
        y_pred_prob = model.predict(X_test, verbose=0).flatten()
        y_pred = (y_pred_prob > 0.5).astype(int)
        y_scores = y_pred_prob

    # Calculate metrics
    accuracy = np.mean(y_pred == y_test)

    if len(np.unique(y_test)) > 1:  # Check if both classes are present
        auc_score = roc_auc_score(y_test, y_scores)
        class_report = classification_report(y_test, y_pred, output_dict=True)
        conf_matrix = confusion_matrix(y_test, y_pred)
    else:
        auc_score = 0.0
        class_report = {}
        conf_matrix = np.array([[]])

    return {
        'accuracy': accuracy,
        'auc_score': auc_score,
        'classification_report': class_report,
        'confusion_matrix': conf_matrix,
        'y_pred': y_pred,
        'y_scores': y_scores
    }

───────────────────────────── Main Training Pipeline ──────────────────────────────


In [10]:
def train_lstm_pipeline(data_path: str, output_dir: str, model_type: str = 'classifier'):
    """
    Main training pipeline for LSTM-based vessel anomaly detection.

    Args:
        data_path: Path to the parquet file
        output_dir: Directory to save models and results
        model_type: 'classifier' or 'autoencoder'
    """
    print(f"Starting LSTM {model_type} training pipeline...")

    # Load and prepare data
    df = pd.read_parquet(data_path)
    Path(output_dir).mkdir(exist_ok=True)

    results = {}

    for route in df.route_id.unique():
        print(f"\n=== Training LSTM {model_type} for route: {route} ===")
        t0 = time.time()

        # Prepare route data
        fr =  df[df.route_id == route].copy()

        # Create sequences
        sequences, labels, trip_ids = create_sequences_from_trips(fr, SEQUENCE_LENGTH, OVERLAP_RATIO)

        if len(sequences) == 0:
            print(f"  * No sequences created for route {route}, skipping.")
            continue

        print(f"  * Created {len(sequences)} sequences")
        print(f"  * Normal sequences: {np.sum(labels == 0)}, Anomalous sequences: {np.sum(labels == 1)}")

        # Scale features
        scaler = MinMaxScaler()
        n_samples, n_timesteps, n_features = sequences.shape
        sequences_scaled = scaler.fit_transform(sequences.reshape(-1, n_features)).reshape(n_samples, n_timesteps, n_features)

        # Split data
        unique_trips = np.unique(trip_ids)
        n_train_trips = int(0.7 * len(unique_trips))
        n_val_trips = int(0.15 * len(unique_trips))

        np.random.shuffle(unique_trips)
        train_trips = unique_trips[:n_train_trips]
        val_trips = unique_trips[n_train_trips:n_train_trips + n_val_trips]
        test_trips = unique_trips[n_train_trips + n_val_trips:]

        train_mask = np.isin(trip_ids, train_trips)
        val_mask = np.isin(trip_ids, val_trips)
        test_mask = np.isin(trip_ids, test_trips)

        X_train, y_train = sequences_scaled[train_mask], labels[train_mask]
        X_val, y_val = sequences_scaled[val_mask], labels[val_mask]
        X_test, y_test = sequences_scaled[test_mask], labels[test_mask]

        # Train model
        model_path = f"{output_dir}/lstm_{model_type}_{route}.h5"

        if model_type == 'autoencoder':
            # Train only on normal sequences for autoencoder
            X_train_normal = X_train[y_train == 0]
            X_val_normal = X_val[y_val == 0]

            if len(X_train_normal) == 0:
                print(f"  * No normal training sequences for route {route}, skipping.")
                continue

            model, history = train_lstm_autoencoder(X_train_normal, X_val_normal, model_path)
        else:
            if len(np.unique(y_train)) < 2:
                print(f"  * Insufficient class diversity for route {route}, skipping.")
                continue

            model, history = train_lstm_classifier(X_train, y_train, X_val, y_val, model_path)

        # Evaluate model
        test_results = evaluate_model(model, X_test, y_test, model_type)

        # Save results
        route_results = {
            'route': route,
            'model_type': model_type,
            'training_time': time.time() - t0,
            'n_sequences': len(sequences),
            'n_train': len(X_train),
            'n_val': len(X_val),
            'n_test': len(X_test),
            'test_metrics': test_results,
            'training_history': history
        }

        results[route] = route_results

        # Save scaler
        joblib.dump(scaler, f"{output_dir}/scaler_{route}.pkl")

        print(f"  * Training completed in {route_results['training_time']:.1f}s")
        print(f"  * Test Accuracy: {test_results['accuracy']:.4f}")
        print(f"  * Test AUC: {test_results['auc_score']:.4f}")

    # Save overall results
    joblib.dump(results, f"{output_dir}/training_results.pkl")
    print(f"\nTraining pipeline completed. Results saved to {output_dir}")

    return results


-───────────────────────────── LSTM Training Start ──────────────────────────────

In [11]:
# LSTM-specific parameters
SEQUENCE_LENGTH = 50  # Number of time steps to look back
OVERLAP_RATIO = 0.5   # Overlap between sequences (0.5 = 50% overlap)
LSTM_UNITS = 64       # Number of LSTM units
DENSE_UNITS = 32      # Dense layer units
DROPOUT_RATE = 0.2    # Dropout rate
BATCH_SIZE = 32       # Training batch size
PATIENCE = 15         # Early stopping patience
LEARNING_RATE = 0.001 # Learning rate

data_path = "data/LSTM_preprocessed.parquet"
output_dir = "lstm_models"

In [12]:
EPOCHS = 10          # Maximum training epochs

# Train classifier
results_classifier = train_lstm_pipeline(data_path, f"{output_dir}_classifier", f"classifier_{EPOCHS}")

Starting LSTM classifier_10 training pipeline...

=== Training LSTM classifier_10 for route: KIEL ===


Creating sequences:   0%|          | 0/423 [00:00<?, ?it/s]

  * Created 20789 sequences
  * Normal sequences: 20498, Anomalous sequences: 291
Epoch 1/10
[1m451/451[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 22ms/step - accuracy: 0.7851 - loss: 0.6495 - precision: 0.0184 - recall: 0.3457 - val_accuracy: 0.8332 - val_loss: 0.6622 - val_precision: 0.0430 - val_recall: 0.4074 - learning_rate: 0.0010
Epoch 2/10
[1m451/451[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 21ms/step - accuracy: 0.7794 - loss: 0.6227 - precision: 0.0271 - recall: 0.4645 - val_accuracy: 0.9275 - val_loss: 0.5602 - val_precision: 0.0564 - val_recall: 0.2037 - learning_rate: 0.0010
Epoch 3/10
[1m451/451[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 21ms/step - accuracy: 0.8188 - loss: 0.5991 - precision: 0.0351 - recall: 0.4788 - val_accuracy: 0.9300 - val_loss: 0.5380 - val_precision: 0.0588 - val_recall: 0.2037 - learning_rate: 0.0010
Epoch 4/10
[1m451/451[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 21ms/step - accuracy: 0.8290 - l

Creating sequences:   0%|          | 0/702 [00:00<?, ?it/s]

  * Created 14065 sequences
  * Normal sequences: 14003, Anomalous sequences: 62
Epoch 1/10
[1m307/307[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 23ms/step - accuracy: 0.8473 - loss: 0.7015 - precision: 0.0083 - recall: 0.3328 - val_accuracy: 0.9981 - val_loss: 0.1700 - val_precision: 0.0000e+00 - val_recall: 0.0000e+00 - learning_rate: 0.0010
Epoch 2/10
[1m307/307[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 21ms/step - accuracy: 0.8150 - loss: 0.7351 - precision: 0.0086 - recall: 0.3812 - val_accuracy: 0.6036 - val_loss: 0.5936 - val_precision: 0.0048 - val_recall: 1.0000 - learning_rate: 0.0010
Epoch 3/10
[1m307/307[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 21ms/step - accuracy: 0.7865 - loss: 0.5130 - precision: 0.0161 - recall: 0.7633 - val_accuracy: 0.6737 - val_loss: 0.5172 - val_precision: 0.0058 - val_recall: 1.0000 - learning_rate: 0.0010
Epoch 4/10
[1m307/307[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 21ms/step - accuracy: 0.707

In [13]:
EPOCHS = 10          # Maximum training epochs
# Train autoencoder
results_autoencoder = train_lstm_pipeline(data_path, f"{output_dir}_autoencoder", f"autoencoder_{EPOCHS}")

Starting LSTM autoencoder_10 training pipeline...

=== Training LSTM autoencoder_10 for route: KIEL ===


Creating sequences:   0%|          | 0/423 [00:00<?, ?it/s]

  * Created 20789 sequences
  * Normal sequences: 20498, Anomalous sequences: 291
Epoch 1/10
[1m455/455[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 22ms/step - accuracy: 0.7367 - loss: 0.6865 - precision: 0.0215 - recall: 0.3909 - val_accuracy: 0.3652 - val_loss: 0.7651 - val_precision: 0.0152 - val_recall: 0.8108 - learning_rate: 0.0010
Epoch 2/10
[1m455/455[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 21ms/step - accuracy: 0.6685 - loss: 0.6352 - precision: 0.0273 - recall: 0.5690 - val_accuracy: 0.4899 - val_loss: 0.7507 - val_precision: 0.0165 - val_recall: 0.7027 - learning_rate: 0.0010
Epoch 3/10
[1m455/455[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 21ms/step - accuracy: 0.7257 - loss: 0.6167 - precision: 0.0323 - recall: 0.5625 - val_accuracy: 0.5330 - val_loss: 0.7371 - val_precision: 0.0193 - val_recall: 0.7568 - learning_rate: 0.0010
Epoch 4/10
[1m455/455[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 22ms/step - accuracy: 0.7664 - 

Creating sequences:   0%|          | 0/702 [00:00<?, ?it/s]

  * Created 14065 sequences
  * Normal sequences: 14003, Anomalous sequences: 62
Epoch 1/10
[1m304/304[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 23ms/step - accuracy: 0.8718 - loss: 0.7639 - precision: 0.0048 - recall: 0.1937 - val_accuracy: 0.6685 - val_loss: 0.5549 - val_precision: 0.0152 - val_recall: 0.7857 - learning_rate: 0.0010
Epoch 2/10
[1m304/304[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 21ms/step - accuracy: 0.7341 - loss: 0.6350 - precision: 0.0082 - recall: 0.6234 - val_accuracy: 0.5933 - val_loss: 0.7345 - val_precision: 0.0135 - val_recall: 0.8571 - learning_rate: 0.0010
Epoch 3/10
[1m304/304[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 28ms/step - accuracy: 0.6837 - loss: 0.4931 - precision: 0.0094 - recall: 0.8836 - val_accuracy: 0.6717 - val_loss: 0.5488 - val_precision: 0.0166 - val_recall: 0.8571 - learning_rate: 0.0010
Epoch 4/10
[1m304/304[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 22ms/step - accuracy: 0.7169 - loss