# LSTM Per-Route Training Pipeline with Dispatcher

This notebook trains separate LSTM models for each route and creates a dispatcher system
compatible with the visualization pipeline. Each route gets its own model optimized for
that specific route's characteristics.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import joblib
import warnings
import time
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import tensorflow as tf
from tqdm.auto import tqdm
from typing import Dict

# Import our custom classes
from lstm_autoencoder import LSTMAutoencoder
from lstm_classifier import LSTMClassifier

# Setup
warnings.filterwarnings('ignore')
plt.style.use('seaborn-v0_8')
tqdm.pandas()

# Set random seeds for reproducibility
RANDOM_STATE = 42
tf.random.set_seed(RANDOM_STATE)
np.random.seed(RANDOM_STATE)

print("Setup complete!")

2025-06-23 21:11:46.555355: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-06-23 21:11:46.620691: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Setup complete!


In [None]:
# print("Configuring GPU...")
#
# gpus = tf.config.experimental.list_physical_devices('GPU')
# print(f"GPUs available: {len(gpus)}")
#
# if gpus:
#     try:
#         # Configure GPU
#         for gpu in gpus:
#             tf.config.experimental.set_memory_growth(gpu, True)
#
#         # Key addition: Force all operations to GPU by default
#         tf.config.experimental.set_visible_devices(gpus[0], 'GPU')
#
#         # Set GPU as default device
#         tf.config.set_soft_device_placement(True)
#
#         print(f"GPU configured: {gpus[0]}")
#
#         # Test that new tensors are created on GPU
#         with tf.device('/GPU:0'):
#             test_tensor = tf.constant([1.0, 2.0, 3.0])
#             print(f"Test tensor device: {test_tensor.device}")
#
#     except RuntimeError as e:
#         print(f"GPU configuration error: {e}")
#
# from lstm_autoencoder import LSTMAutoencoder
# from lstm_classifier import LSTMClassifier

## 2. Configuration Parameters

**Modify these parameters to experiment with different configurations:**

In [3]:
# =============================================================================
# CONFIGURATION PARAMETERS - MODIFY THESE TO EXPERIMENT
# =============================================================================

# Data parameters
DATA_PATH = "data/LSTM_preprocessed.parquet"
EXPERIMENT_NAME = "lstm_per_route_v1"

# Feature columns to use (must match data preprocessing)
FEATURE_COLUMNS = [
    "speed_over_ground", "dv", "dcourse", "ddraft",
    "zone", "x_km", "y_km", "dist_to_ref", "route_dummy"
]

# Sequence parameters
SEQUENCE_LENGTH = 50        # Number of time steps in each sequence TODO
OVERLAP_RATIO = 0.5         # Overlap between consecutive sequences (0.5 = 50%) TODO

# Model architecture parameters
LSTM_UNITS = 64             # Number of LSTM units
DENSE_UNITS = 32            # Dense layer units
DROPOUT_RATE = 0.1          # Dropout rate for regularization

# Training parameters
EPOCHS = 50                 # Maximum epochs (early stopping will likely kick in)
BATCH_SIZE = 32             # Training batch size
PATIENCE = 5               # Early stopping patience
VALIDATION_SIZE = 0.2       # Proportion of data for validation

# Model selection
TRAIN_AUTOENCODER = True    # Train LSTM Autoencoder per route
TRAIN_CLASSIFIER = True     # Train LSTM Classifier per route

# Evaluation parameters
AUTOENCODER_THRESHOLD_PERCENTILE = 95  # Threshold for autoencoder (95th percentile)
CLASSIFIER_THRESHOLD = 0.5             # Threshold for classifier

# Output directories (following OC-SVM pattern)
OUTPUT_DIR_AE = "models_per_route_lstm_ae"    # Autoencoder models
OUTPUT_DIR_CLF = "models_per_route_lstm_clf"  # Classifier models

print(f"Configuration loaded for experiment: {EXPERIMENT_NAME}")
print(f"Sequence length: {SEQUENCE_LENGTH}, LSTM units: {LSTM_UNITS}")
print(f"Training epochs: {EPOCHS}, Batch size: {BATCH_SIZE}")
print(f"Output directories: {OUTPUT_DIR_AE}, {OUTPUT_DIR_CLF}")

Configuration loaded for experiment: lstm_per_route_v1
Sequence length: 50, LSTM units: 64
Training epochs: 50, Batch size: 32
Output directories: models_per_route_lstm_ae, models_per_route_lstm_clf


## 3. Data Loading and Route Analysis

In [4]:
# Load preprocessed data
print(f"Loading data from {DATA_PATH}...")
df = pd.read_parquet(DATA_PATH)
print(f"Loaded {len(df):,} rows with {df['trip_id'].nunique()} unique trips")

# Check for required columns
missing_cols = [col for col in FEATURE_COLUMNS if col not in df.columns]
if missing_cols:
    print(f"Warning: Missing columns: {missing_cols}")
    FEATURE_COLUMNS = [col for col in FEATURE_COLUMNS if col in df.columns]
    print(f"Using available columns: {FEATURE_COLUMNS}")

# Analyze routes
route_summary = df.groupby('route_id').agg({
    'trip_id': 'nunique',
    'y_true': ['sum', 'count']
}).round(2)

route_summary.columns = ['trips', 'anomalies', 'total_points']
route_summary['anomaly_rate'] = (route_summary['anomalies'] / route_summary['total_points'] * 100).round(2)

print("\nRoute Analysis:")
print(route_summary)

routes = df.route_id.unique()
print(f"\nWill train models for {len(routes)} routes: {list(routes)}")

Loading data from data/LSTM_preprocessed.parquet...
Loaded 911,989 rows with 1122 unique trips

Route Analysis:
             trips  anomalies  total_points  anomaly_rate
route_id                                                 
BREMERHAVEN    702      621.0         25791          2.41
KIEL           420     3505.0         80407          4.36

Will train models for 2 routes: ['KIEL', 'BREMERHAVEN']


## 4. Sequence Creation Functions

In [5]:
def create_sequences_for_route(df_route: pd.DataFrame, sequence_length: int, 
                              overlap_ratio: float, feature_columns: list) -> tuple:
    """
    Create sequences from route data, ensuring proper temporal ordering.
    
    Args:
        df_route: DataFrame containing data for a single route
        sequence_length: Length of each sequence
        overlap_ratio: Overlap between consecutive sequences
        feature_columns: List of feature column names
        
    Returns:
        tuple: (sequences, labels, trip_ids)
    """
    sequences = []
    labels = []
    trip_ids = []
    step_size = max(1, int(sequence_length * (1 - overlap_ratio)))
    
    for trip_id, trip_data in df_route.groupby('trip_id'):
        trip_data = trip_data.sort_values('time_stamp').reset_index(drop=True)
            
        features = trip_data[feature_columns].fillna(0).values
        trip_labels = trip_data['y_true'].values
        
        # Create overlapping sequences
        for start_idx in range(0, len(features) - sequence_length + 1, step_size):
            end_idx = start_idx + sequence_length
            seq_features = features[start_idx:end_idx]
            seq_labels = trip_labels[start_idx:end_idx]
            
            # Label sequence as anomalous if any point is anomalous
            sequence_label = int(np.any(seq_labels == 1))
            
            sequences.append(seq_features)
            labels.append(sequence_label)
            trip_ids.append(trip_id)
    
    return np.array(sequences), np.array(labels), np.array(trip_ids)

def balance_sequences(X: np.ndarray, y: np.ndarray) -> tuple:
    """
    Balance training data by undersampling normal sequences.
    
    Args:
        X: Sequences array
        y: Labels array
        
    Returns:
        tuple: (balanced_X, balanced_y)
    """
    normal_indices = np.where(y == 0)[0]
    anomaly_indices = np.where(y == 1)[0]
    
    if len(anomaly_indices) == 0:
        return X, y  # No anomalies to balance with
    
    if len(normal_indices) > len(anomaly_indices):
        # Undersample normal to match anomalous count
        np.random.seed(RANDOM_STATE)
        selected_normal = np.random.choice(normal_indices, len(anomaly_indices), replace=False)
        balanced_indices = np.concatenate([selected_normal, anomaly_indices])
        np.random.shuffle(balanced_indices)
        
        return X[balanced_indices], y[balanced_indices]
    
    return X, y

print("Sequence creation functions defined.")

Sequence creation functions defined.


## 5. LSTM Autoencoder Per-Route Training

In [6]:
if TRAIN_AUTOENCODER:
    print("=" * 80)
    print("TRAINING LSTM AUTOENCODER PER ROUTE")
    print("=" * 80)
    
    # Create output directory
    Path(OUTPUT_DIR_AE).mkdir(exist_ok=True)
    dispatcher_ae: Dict[str, str] = {}
    
    for route in routes:
        t0 = time.time()
        print(f"\n=== Training Autoencoder for Route: {route} ===")
        
        # Filter data for this route
        df_route = df[df.route_id == route].copy()
        
        # Create sequences for this route
        X, y, trip_ids = create_sequences_for_route(
            df_route, SEQUENCE_LENGTH, OVERLAP_RATIO, FEATURE_COLUMNS
        )
        
        if len(X) == 0:
            print(f"  * No sequences created for route {route}, skipping.")
            continue
            
        print(f"  * Created {len(X):,} sequences")
        print(f"  * Normal: {np.sum(y == 0):,}, Anomalous: {np.sum(y == 1):,}")
        
        # Split data for this route (preventing trip leakage)
        unique_trips = np.unique(trip_ids)
        train_trips, val_trips = train_test_split(
            unique_trips, test_size=VALIDATION_SIZE, random_state=RANDOM_STATE
        )
        
        train_mask = np.isin(trip_ids, train_trips)
        val_mask = np.isin(trip_ids, val_trips)
        
        X_train, y_train = X[train_mask], y[train_mask]
        X_val, y_val = X[val_mask], y[val_mask]
        
        # For autoencoder, use only normal data for training
        X_train_normal = X_train[y_train == 0]
        X_val_normal = X_val[y_val == 0]
        
        if len(X_train_normal) < 10:  # Need minimum data
            print(f"  * Insufficient normal training data for route {route}, skipping.")
            continue
            
        print(f"  * Training on {len(X_train_normal):,} normal sequences")
        print(f"  * Validating on {len(X_val_normal):,} normal sequences")
        
        # Initialize autoencoder
        autoencoder = LSTMAutoencoder(
            sequence_length=SEQUENCE_LENGTH,
            n_features=len(FEATURE_COLUMNS),
            lstm_units=LSTM_UNITS,
            dense_units=DENSE_UNITS,
            dropout_rate=DROPOUT_RATE
        )
        
        # Train the model
        try:
            history = autoencoder.fit(
                X_train_normal,
                validation_data=(X_val_normal, y_val[y_val == 0]),
                epochs=EPOCHS,
                batch_size=BATCH_SIZE,
                patience=PATIENCE
            )
            
            # Evaluate and set threshold
            reconstruction_errors, _ = autoencoder.predict(X_val)
            threshold = np.percentile(reconstruction_errors, AUTOENCODER_THRESHOLD_PERCENTILE)
            
            # Evaluate model
            metrics = autoencoder.evaluate(X_val, y_val, threshold=threshold)
            
            print(f"  * Threshold: {threshold:.4f}")
            print(f"  * Precision: {metrics['precision']:.3f}")
            print(f"  * Recall: {metrics['recall']:.3f}")
            print(f"  * F1 Score: {metrics['f1_score']:.3f}")
            
            # Save model artifacts (compatible with visualizer)
            model_filename = f"lstm_autoencoder_{route}.pkl"
            model_path = Path(OUTPUT_DIR_AE) / model_filename
            
            model_artifacts = {
                "model": autoencoder,           # The trained model
                "features": FEATURE_COLUMNS,   # Feature names
                "threshold": threshold,        # Anomaly threshold (tau)
                "sequence_length": SEQUENCE_LENGTH,
                "model_type": "autoencoder",
                "metrics": metrics,
                "route": route
            }
            
            joblib.dump(model_artifacts, model_path)
            dispatcher_ae[route] = str(model_path)
            
            print(f"  * Route {route} completed in {time.time() - t0:.1f}s")
            
        except Exception as e:
            print(f"  * Error training route {route}: {e}")
            continue
    
    # Save dispatcher
    dispatcher_path_ae = Path(OUTPUT_DIR_AE) / "dispatcher.pkl"
    joblib.dump(dispatcher_ae, dispatcher_path_ae)
    
    print(f"\nAutoencoder training complete!")
    print(f" Models saved to: {OUTPUT_DIR_AE}")
    print(f"  Dispatcher saved: {dispatcher_path_ae}")
    print(f" Routes trained: {list(dispatcher_ae.keys())}")

else:
    print("Skipping autoencoder training (TRAIN_AUTOENCODER=False)")
    dispatcher_ae = {}

TRAINING LSTM AUTOENCODER PER ROUTE

=== Training Autoencoder for Route: KIEL ===
  * Created 20,752 sequences
  * Normal: 20,350, Anomalous: 402
  * Training on 16,133 normal sequences
  * Validating on 4,217 normal sequences
Epoch 1/50


2025-06-23 21:12:16.588852: E tensorflow/core/util/util.cc:131] oneDNN supports DT_INT32 only on platforms with AVX-512. Falling back to the default Eigen-based implementation if present.


[1m505/505[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m66s[0m 107ms/step - accuracy: 0.7653 - auc: 0.8116 - loss: 0.0098 - precision: 0.9921 - recall: 0.2473 - val_accuracy: 0.9591 - val_auc: 0.9001 - val_loss: 7.8305e-04 - val_precision: 1.0000 - val_recall: 0.2443
Epoch 2/50
[1m505/505[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 101ms/step - accuracy: 0.9260 - auc: 0.8637 - loss: 0.0012 - precision: 1.0000 - recall: 0.2571 - val_accuracy: 0.9422 - val_auc: 0.8748 - val_loss: 5.0068e-04 - val_precision: 1.0000 - val_recall: 0.2443
Epoch 3/50
[1m505/505[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 102ms/step - accuracy: 0.9348 - auc: 0.8729 - loss: 7.5510e-04 - precision: 1.0000 - recall: 0.2589 - val_accuracy: 0.9569 - val_auc: 0.8640 - val_loss: 3.4020e-04 - val_precision: 1.0000 - val_recall: 0.2387
Epoch 4/50
[1m505/505[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 114ms/step - accuracy: 0.9422 - auc: 0.8788 - loss: 6.3419e-04 - precision: 1.00

## 6. LSTM Classifier Per-Route Training

In [None]:
# if TRAIN_CLASSIFIER:
#     print("=" * 80)
#     print("TRAINING LSTM CLASSIFIER PER ROUTE")
#     print("=" * 80)
#
#     # Create output directory
#     Path(OUTPUT_DIR_CLF).mkdir(exist_ok=True)
#     dispatcher_clf: Dict[str, str] = {}
#
#     for route in routes:
#         t0 = time.time()
#         print(f"\n=== Training Classifier for Route: {route} ===")
#
#         # Filter data for this route
#         df_route = df[df.route_id == route].copy()
#
#         # Create sequences for this route
#         X, y, trip_ids = create_sequences_for_route(
#             df_route, SEQUENCE_LENGTH, OVERLAP_RATIO, FEATURE_COLUMNS
#         )
#
#         if len(X) == 0:
#             print(f"  * No sequences created for route {route}, skipping.")
#             continue
#
#         print(f"  * Created {len(X):,} sequences")
#         print(f"  * Normal: {np.sum(y == 0):,}, Anomalous: {np.sum(y == 1):,}")
#
#         # Check for class diversity
#         if len(np.unique(y)) < 2:
#             print(f"  * Insufficient class diversity for route {route}, skipping.")
#             continue
#
#         # Split data for this route (preventing trip leakage)
#         unique_trips = np.unique(trip_ids)
#
#         # Get trip-level labels for stratification
#         trip_labels = []
#         for trip in unique_trips:
#             trip_mask = trip_ids == trip
#             trip_has_anomaly = np.any(y[trip_mask] == 1)
#             trip_labels.append(int(trip_has_anomaly))
#
#         trip_labels = np.array(trip_labels)
#
#         # Stratified split by trip
#         if len(np.unique(trip_labels)) > 1:
#             train_trips, val_trips = train_test_split(
#                 unique_trips, test_size=VALIDATION_SIZE,
#                 stratify=trip_labels, random_state=RANDOM_STATE
#             )
#         else:
#             train_trips, val_trips = train_test_split(
#                 unique_trips, test_size=VALIDATION_SIZE, random_state=RANDOM_STATE
#             )
#
#         train_mask = np.isin(trip_ids, train_trips)
#         val_mask = np.isin(trip_ids, val_trips)
#
#         X_train, y_train = X[train_mask], y[train_mask]
#         X_val, y_val = X[val_mask], y[val_mask]
#
#         # Balance training data
#         X_train, y_train = balance_sequences(X_train, y_train)
#
#         if len(X_train) < 10:  # Need minimum data
#             print(f"  * Insufficient training data for route {route}, skipping.")
#             continue
#
#         print(f"  * Training on {len(X_train):,} sequences (balanced)")
#         print(f"  * Validating on {len(X_val):,} sequences")
#         print(f"  * Train anomaly rate: {np.mean(y_train == 1):.1%}")
#
#         # Initialize classifier
#         classifier = LSTMClassifier(
#             sequence_length=SEQUENCE_LENGTH,
#             n_features=len(FEATURE_COLUMNS),
#             lstm_units=LSTM_UNITS,
#             dense_units=DENSE_UNITS,
#             dropout_rate=DROPOUT_RATE
#         )
#
#         # Train the model
#         try:
#             history = classifier.fit(
#                 X_train, y_train,
#                 validation_data=(X_val, y_val),
#                 epochs=EPOCHS,
#                 batch_size=BATCH_SIZE,
#                 patience=PATIENCE
#             )
#
#             # Evaluate model
#             metrics = classifier.evaluate(X_val, y_val, threshold=CLASSIFIER_THRESHOLD)
#
#             print(f"  * Threshold: {CLASSIFIER_THRESHOLD:.3f}")
#             print(f"  * Accuracy: {metrics['accuracy']:.3f}")
#             print(f"  * Precision: {metrics['precision']:.3f}")
#             print(f"  * Recall: {metrics['recall']:.3f}")
#             print(f"  * F1 Score: {metrics['f1_score']:.3f}")
#
#             # Save model artifacts (compatible with visualizer)
#             model_filename = f"lstm_classifier_{route}.pkl"
#             model_path = Path(OUTPUT_DIR_CLF) / model_filename
#
#             model_artifacts = {
#                 "model": classifier,           # The trained model
#                 "features": FEATURE_COLUMNS,   # Feature names
#                 "threshold": CLASSIFIER_THRESHOLD,  # Classification threshold (tau)
#                 "sequence_length": SEQUENCE_LENGTH,
#                 "model_type": "classifier",
#                 "metrics": metrics,
#                 "route": route
#             }
#
#             joblib.dump(model_artifacts, model_path)
#             dispatcher_clf[route] = str(model_path)
#
#             print(f"  * Route {route} completed in {time.time() - t0:.1f}s")
#
#         except Exception as e:
#             print(f"  * Error training route {route}: {e}")
#             continue
#
#     # Save dispatcher
#     dispatcher_path_clf = Path(OUTPUT_DIR_CLF) / "dispatcher.pkl"
#     joblib.dump(dispatcher_clf, dispatcher_path_clf)
#
#     print(f"\n Classifier training complete!")
#     print(f" Models saved to: {OUTPUT_DIR_CLF}")
#     print(f"️  Dispatcher saved: {dispatcher_path_clf}")
#     print(f" Routes trained: {list(dispatcher_clf.keys())}")
#
# else:
#     print("Skipping classifier training (TRAIN_CLASSIFIER=False)")
#     dispatcher_clf = {}

## 7. Training Summary and Model Testing

In [None]:
print("=" * 80)
print("TRAINING SUMMARY")
print("=" * 80)

print(f"\nExperiment: {EXPERIMENT_NAME}")
print(f"Total routes: {len(routes)}")

if dispatcher_ae:
    print(f"\nAutoencoder models trained: {len(dispatcher_ae)}")
    print(f"Routes: {list(dispatcher_ae.keys())}")
    print(f"Dispatcher: {OUTPUT_DIR_AE}/dispatcher.pkl")

# if dispatcher_clf:
#     print(f"\nClassifier models trained: {len(dispatcher_clf)}")
#     print(f"Routes: {list(dispatcher_clf.keys())}")
#     print(f"Dispatcher: {OUTPUT_DIR_CLF}/dispatcher.pkl")

# Test loading a model (to verify compatibility)
if dispatcher_ae:
    test_route = list(dispatcher_ae.keys())[0]
    print(f"\n=== Testing Model Loading for Route: {test_route} ===")
    
    try:
        # Load model artifacts
        artifacts = joblib.load(dispatcher_ae[test_route])
        
        print(f"== Successfully loaded {artifacts['model_type']} for route {test_route}")
        print(f"   Features: {len(artifacts['features'])}")
        print(f"   Threshold: {artifacts['threshold']:.4f}")
        print(f"   Sequence length: {artifacts['sequence_length']}")
        
        # Test prediction (with dummy data)
        dummy_X = np.random.random((1, SEQUENCE_LENGTH, len(FEATURE_COLUMNS)))
        model = artifacts['model']
        scores, preds = model.predict(dummy_X)
        
        print(f"== Prediction test successful: score={scores[0]:.4f}, pred={preds[0]}")
        
    except Exception as e:
        print(f"❌ Error testing model: {e}")