In [None]:
import numpy as np
import pandas as pd
import sklearn.metrics
import torch
from sklearn.metrics import precision_score, f1_score, matthews_corrcoef
from sklearn.metrics import recall_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from torch import nn
from torch.utils.data import TensorDataset, DataLoader

: 

# LSTM model

In [None]:
class LSTMModel(nn.Module):
    """
    LSTM encoder-decoder (autoencoder) that reconstructs its input sequence.

    The encoder reads a ``(batch, seq_len, input_size)`` sequence. The last
    layer's final hidden state is repeated for every timestep and fed to the
    decoder (initialised with the encoder's hidden/cell states); the decoder
    outputs are projected back to ``input_size`` features. The per-sequence
    reconstruction error can then be used as an anomaly score.

    Args:
        input_size: Number of features per timestep.
        hidden_size: Width of the encoder/decoder hidden state.
        num_layers: Number of stacked LSTM layers in encoder and decoder.
        output_size: Stored on the instance for interface compatibility only;
            the reconstruction always has ``input_size`` features. Defaults to
            ``input_size`` when omitted, so the model can be rebuilt from a
            config that does not carry this key.
        dropout: Dropout between stacked LSTM layers. Forced to 0 when
            ``num_layers == 1``.
    """

    def __init__(self,
                 input_size,
                 hidden_size,
                 num_layers,
                 output_size=None,
                 dropout=0):

        super().__init__()

        if output_size is None:
            output_size = input_size

        # Inter-layer dropout only makes sense for stacked LSTMs.
        lstm_dropout = dropout if num_layers > 1 else 0

        self.encoder = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=lstm_dropout,
            batch_first=True
        )

        self.decoder = nn.LSTM(
            input_size=hidden_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=lstm_dropout,
            batch_first=True
        )

        # Projects decoder states back to the input feature space.
        self.output_layer = nn.Linear(hidden_size, input_size)

        self.hidden_size = hidden_size
        self.output_size = output_size
        self.input_size = input_size
        self.num_layers = num_layers
        self.dropout = dropout

    def forward(self, x):
        """
        Reconstruct ``x``.

        Args:
            x: Tensor of shape ``(batch, seq_len, input_size)``.

        Returns:
            Tensor with the same shape as ``x``.
        """
        # Unpacking three dims also rejects inputs that are not 3-D.
        _, seq_len, _ = x.shape

        # Encode
        _, (hidden, cell) = self.encoder(x)

        # Decoder input: the last layer's final hidden state, repeated for
        # every timestep -> (batch, seq_len, hidden_size)
        encoded = hidden[-1].unsqueeze(1).repeat(1, seq_len, 1)

        # Decode, starting from the encoder's final states
        decoder_output, _ = self.decoder(encoded, (hidden, cell))

        return self.output_layer(decoder_output)

    def get_reconstruction_error(self, x):
        """
        Return the mean squared reconstruction error of each sequence.

        Runs without gradients and with the model temporarily in eval mode, so
        inter-layer dropout cannot make the score stochastic; the previous
        train/eval mode is restored afterwards.

        Returns:
            NumPy array of shape ``(batch,)``.
        """
        was_training = self.training
        self.eval()
        try:
            with torch.no_grad():
                reconstructed = self.forward(x)
                # TODO check proper loss: the notebook's validation loop scores
                # sequences by the max squared error, this method by the mean.
                mse = torch.mean((x - reconstructed) ** 2, dim=(1, 2))
                return mse.cpu().numpy()
        finally:
            self.train(was_training)

    def encode(self, x):
        """
        Return the encoded representation of ``x``: the last encoder layer's
        final hidden state, shape ``(batch, hidden_size)``.

        Runs without gradients and in eval mode (restored afterwards).
        """
        was_training = self.training
        self.eval()
        try:
            with torch.no_grad():
                _, (hidden, _) = self.encoder(x)
                return hidden[-1]
        finally:
            self.train(was_training)


# Configuration

In [None]:
# Parquet file read by load_data_route().
DATA_PATH = "LSTM_preprocessed.parquet"

# Directory that receives the saved per-route model artifacts and the dispatcher file.
OUTPUT_DIR_AE = "models_per_route_lstm_ae"

In [None]:
# Per-timestep input features fed to the model. The raw longitude/latitude
# pair is currently disabled in favour of the x_km / y_km columns.
FEATURE_COLUMNS = [ "speed_over_ground", "course_over_ground",
                    # "longitude", "latitude"
                    "x_km", "y_km"
                    ]

# Sliding-window settings used when trips are cut into sequences.
SEQUENCE_LENGTH = 15  # timesteps per window
SEQUENCE_STEP_LENGTH = 7  # stride between consecutive window starts

# NOTE(review): the train() calls visible in this notebook pass EPOCHS
# explicitly (20 and 10), so this value does not appear to be used - confirm.
EPOCHS = 40
BATCH_SIZE = 32  # batch size of every DataLoader built in Dataset.preprocess
VALIDATION_SIZE = 0.2  # fraction of trips held out for validation

# LSTM autoencoder capacity.
HIDDEN_SIZE = 32
NUM_LAYERS = 2

# Percentile of the validation reconstruction errors used as anomaly threshold.
AUTOENCODER_THRESHOLD_PERCENTILE = 95

# Data utilities

In [None]:
class Dataset:
    """
    Turns a per-point trip DataFrame into scaled, windowed DataLoaders.

    The frame is expected to provide the columns ``trip_id``, ``time_stamp``,
    ``y_true`` and every name in ``FEATURE_COLUMNS``. The instance owns the
    ``MinMaxScaler`` that is fitted on the normal training windows.
    """

    def __init__(self, df):
        self.df = df
        self.scaler = MinMaxScaler()

    def get_valid_trips(self, df):
        """
        Collapse the labelled points into one row per trip.

        Rows with a missing feature or a missing ``y_true`` are dropped; the
        rest is ordered by trip and timestamp.

        Returns:
            DataFrame with columns ``trip_id``, ``features`` (list of feature
            rows) and ``y_true`` (list of labels).
        """
        has_all_values = df[FEATURE_COLUMNS].notna().all(axis=1) & df["y_true"].notna()
        labelled = df[has_all_values].sort_values(['trip_id', 'time_stamp'])

        features_per_trip = (
            labelled.groupby('trip_id')[FEATURE_COLUMNS]
            .apply(lambda x: x.values.tolist())
            .reset_index()
        )
        features_per_trip.columns = ["trip_id", "features"]

        labels_per_trip = labelled.groupby('trip_id')['y_true'].apply(list).reset_index()
        return features_per_trip.merge(labels_per_trip, on='trip_id')

    def scale_sequences(self, sequences, fit=False):
        """
        Scale windows feature-wise with the instance's scaler.

        Args:
            sequences: Array whose last axis is the feature axis.
            fit: When True the scaler is (re)fitted on ``sequences`` first.

        Returns:
            Scaled array with the same shape as ``sequences``. An empty input
            is returned unchanged, because the scaler rejects zero-sample
            arrays (e.g. a route without any anomalous window).

        Raises:
            ValueError: If ``fit`` is True and ``sequences`` is empty.
        """
        print(f"Scaling sequences with shape: {sequences.shape}")
        if sequences.size == 0:
            if fit:
                raise ValueError("Cannot fit the scaler on an empty set of sequences")
            return sequences

        flat = sequences.reshape(-1, sequences.shape[-1])
        if fit:
            self.scaler.fit(flat)

        return self.scaler.transform(flat).reshape(sequences.shape)

    def create_sequences(self, df, seq_length, step=None):
        """
        Cut every trip into fixed-length sliding windows.

        Args:
            df: Output of ``get_valid_trips`` (one row per trip).
            seq_length: Number of timesteps per window.
            step: Stride between window starts; defaults to
                ``SEQUENCE_STEP_LENGTH``.

        Returns:
            ``(windows, labels, trip_ids)`` as NumPy arrays. A window is
            labelled 1 only when every point in it has ``y_true > 0``,
            otherwise 0.
        """
        if step is None:
            step = SEQUENCE_STEP_LENGTH

        xs, ys, trip_ids = [], [], []
        for trip_id, group in df.groupby('trip_id'):
            # np.concatenate (not np.concat) so this also runs on NumPy < 2.0.
            features = np.concatenate(list(group["features"]))
            labels = np.concatenate(list(group["y_true"]))

            # Valid starts are 0 .. len - seq_length inclusive, so the final
            # full window of a trip is kept.
            for start in range(0, len(labels) - seq_length + 1, step):
                window_labels = labels[start:start + seq_length]

                xs.append(features[start:start + seq_length])
                ys.append(1 if np.all(window_labels > 0) else 0)
                trip_ids.append(trip_id)
        return np.array(xs), np.array(ys), np.array(trip_ids)

    def separate_sequences_by_anomaly_type(self, sequences, labels):
        """Split ``sequences`` into (label == 0, label == 1) subsets."""
        normal_sequences = sequences[labels == 0]
        anomaly_sequences = sequences[labels == 1]
        return normal_sequences, anomaly_sequences

    def get_loader(self, sequences, labels, batch_size, shuffle=True):
        """Wrap windows (float32) and labels (long) in a ``DataLoader``."""
        sequences_tensor = torch.tensor(sequences, dtype=torch.float32)
        labels_tensor = torch.tensor(labels, dtype=torch.long)
        dataset = TensorDataset(sequences_tensor, labels_tensor)
        return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

    def preprocess(self, df=None):
        """
        Build the train, validation and anomaly loaders.

        Trips are split into train/validation by trip id so that windows of
        one trip never end up on both sides. Train and validation contain only
        normal windows (label 0); the anomaly loader contains every window
        labelled 1, regardless of the split. The scaler is fitted on the
        training windows only.

        Args:
            df: Per-point frame to process; defaults to the frame passed to
                the constructor.

        Returns:
            ``(train_loader, val_loader, anomaly_loader)``.

        Raises:
            ValueError: If no window could be built from ``df``.
        """
        if df is None:
            df = self.df

        trips = self.get_valid_trips(df)
        sequences, labels, trip_ids = self.create_sequences(trips, SEQUENCE_LENGTH)
        if len(sequences) == 0:
            raise ValueError("No sequences could be built from the given data")

        unique_trip_ids = np.unique(trip_ids)
        train_trips, val_trips = train_test_split(unique_trip_ids, test_size=VALIDATION_SIZE, random_state=42)

        is_normal = labels == 0
        is_anomaly = labels == 1
        train_mask = np.isin(trip_ids, train_trips) & is_normal
        val_mask = np.isin(trip_ids, val_trips) & is_normal

        train_X, train_y = sequences[train_mask], labels[train_mask]
        val_X, val_y = sequences[val_mask], labels[val_mask]
        anomaly_X, anomaly_labels = sequences[is_anomaly], labels[is_anomaly]

        # Fit on the training windows only, then reuse the same scaling.
        train_X = self.scale_sequences(train_X, fit=True)
        val_X = self.scale_sequences(val_X)
        anomaly_X = self.scale_sequences(anomaly_X)

        print(f"Training normal sequences: {len(train_X)}")
        print(f"Validation normal sequences: {len(val_X)}")
        print(f"Anomaly sequences: {len(anomaly_X)}")

        train_loader = self.get_loader(train_X, train_y, BATCH_SIZE)
        val_loader = self.get_loader(val_X, val_y, BATCH_SIZE, shuffle=False)
        anomaly_loader = self.get_loader(anomaly_X, anomaly_labels, BATCH_SIZE, shuffle=False)

        return train_loader, val_loader, anomaly_loader

In [None]:
def load_data_route(route, datapath=DATA_PATH):
    """
    Read the parquet file at ``datapath`` and return the rows of one route.

    The full frame is ordered by trip and timestamp, then the rows whose
    ``start_port`` equals ``route`` are returned as an independent copy.
    Prints the number of rows and of distinct trips that were kept.
    """
    all_points = pd.read_parquet(datapath).sort_values(['trip_id', 'time_stamp'])
    on_route = all_points['start_port'] == route
    df_route = all_points[on_route].copy()

    n_points = len(df_route)
    n_trips = len(df_route['trip_id'].unique())
    print(f"Loaded {n_points} data points for route {route}")
    print(f"Number of unique trips: {n_trips}")

    return df_route

# Training utils

In [None]:
def verbous_metrics(val_labels, preds):
    """Compute the evaluation metrics for the given labels and predictions, then print them."""
    verbose_metrics(get_all_metrics(val_labels, preds))

def get_all_metrics(true_labels, predictions):
    """
    Compute binary classification metrics and confusion-matrix counts.

    Args:
        true_labels: Ground-truth labels (0 = normal, 1 = anomaly); any
            array-like, including a plain Python list.
        predictions: Predicted labels, same length as ``true_labels``.

    Returns:
        Dict with precision, recall, f1, specificity, the four confusion
        counts, the Matthews correlation coefficient and the number of labels
        equal to -1 (``unlabeled_sequences``).
    """
    # Callers may pass plain lists. Without this conversion ``list == 0`` is a
    # single ``False`` and every confusion count below would silently be 0.
    true_labels = np.asarray(true_labels)
    predictions = np.asarray(predictions)

    precision = precision_score(true_labels, predictions, zero_division=0)
    recall = recall_score(true_labels, predictions, zero_division=0)
    f1 = f1_score(true_labels, predictions, zero_division=0)

    tn = np.sum((true_labels == 0) & (predictions == 0))
    fp = np.sum((true_labels == 0) & (predictions == 1))
    fn = np.sum((true_labels == 1) & (predictions == 0))
    tp = np.sum((true_labels == 1) & (predictions == 1))

    # Undefined when there are no negatives at all; reported as 0 then.
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
    mcc = matthews_corrcoef(true_labels, predictions)

    metrics = {
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'specificity': specificity,
        'true_positives': tp,
        'true_negatives': tn,
        'false_positives': fp,
        'false_negatives': fn,
        'mcc': mcc,
        'unlabeled_sequences': np.sum(true_labels == -1),
    }
    return metrics

def verbose_metrics(metrics):
    """
    Print the metrics dict, one ``Title Cased Key: value`` line per entry.

    Integer counts are printed as integers and skipped only when they are 0,
    so a count of exactly 1 is no longer dropped. Other values are printed
    with four decimals and skipped when they are exactly 0 or 1 (presumably to
    hide degenerate scores from single-class evaluations).
    """
    print("Evaluation Metrics:")
    for key, value in metrics.items():
        label = key.replace('_', ' ').title()
        is_count = isinstance(value, (int, np.integer)) and not isinstance(value, bool)
        if is_count:
            if value != 0:
                print(f"{label}: {value}")
        elif value != 0 and value != 1:
            print(f"{label}: {value:.4f}")


In [None]:
def validation(model,
               val_loader,
               loss,
               device,
               threshold = 0,
               validate = True
               ):
    """
    Score every sequence of a loader and evaluate threshold-based predictions.

    The per-sequence score is the maximum element-wise loss between the
    reconstruction and the input, so ``loss`` must return one value per
    element (e.g. ``reduction='none'``). The model's train/eval mode is not
    changed here; the caller is responsible for calling ``model.eval()``.

    Args:
        model: Autoencoder mapping a batch to its reconstruction.
        val_loader: Iterable of ``(sequences, labels)`` batches.
        loss: Element-wise loss callable.
        device: Device the batches are moved to.
        threshold: Error threshold used when ``validate`` is False.
        validate: When True the threshold is recomputed as the
            ``AUTOENCODER_THRESHOLD_PERCENTILE`` percentile of the errors.

    Returns:
        ``(errors, preds, threshold)``: list of per-sequence errors, int array
        of predictions (1 = error above threshold) and the threshold used.
    """
    errors = []
    val_labels = []

    with torch.no_grad():
        for val_X, val_y in val_loader:
            val_X = val_X.to(device)
            reconstructed = model(val_X)

            # Save per-sequence max error
            batch_errors = loss(reconstructed, val_X).amax(dim=(1, 2)).cpu().numpy()
            errors.extend(batch_errors)
            val_labels.extend(val_y.cpu().numpy())

    if not errors:
        # Nothing to score (e.g. a route without anomalous sequences):
        # skip percentile and metric computation.
        print("No sequences to evaluate")
        return errors, np.array([], dtype=int), threshold

    if validate:
        # Calculate threshold based on this loader's errors
        threshold = np.percentile(errors, AUTOENCODER_THRESHOLD_PERCENTILE)
        print(f"Validation threshold: {threshold:.6f}")

    preds = (np.array(errors) > threshold).astype(int)
    # Pass labels as an array so element-wise comparisons in the metrics work.
    verbous_metrics(np.array(val_labels), preds)

    return errors, preds, threshold

In [None]:
from tqdm import tqdm
import sklearn

def train(model,
          train_loader, val_loader, anomaly_loader,
          optimizer,
          loss,
          device,
          EPOCHS=20,
          noise_std=0.1):
    """
    Train the autoencoder as a denoising model and evaluate it every epoch.

    Each batch is corrupted with Gaussian noise before the forward pass while
    the loss is computed against the clean batch. After every epoch the
    threshold is derived from the validation loader and then applied to the
    anomaly loader.

    Args:
        model: Autoencoder to train (already on ``device``).
        train_loader: Loader of normal training sequences.
        val_loader: Loader of normal validation sequences.
        anomaly_loader: Loader of anomalous sequences.
        optimizer: Optimizer over ``model``'s parameters.
        loss: Element-wise loss callable (``reduction='none'``).
        device: Device the batches are moved to.
        EPOCHS: Number of epochs.
        noise_std: Standard deviation of the input noise.

    Returns:
        ``(normal_errors, anomaly_errors)`` from the last epoch; two empty
        lists when ``EPOCHS`` is 0.
    """
    # Defined up front so that EPOCHS=0 returns cleanly.
    normal_errors, anomaly_errors = [], []

    for epoch in range(EPOCHS):
        loss_history = []

        # ----- Training Loop -----
        model.train()
        for batch_X, _ in tqdm(train_loader, desc=f"Epoch {epoch + 1}/{EPOCHS}"):
            batch_X = batch_X.to(device)

            optimizer.zero_grad()

            # Denoising objective: corrupt the input, reconstruct the clean batch.
            noisy_input = batch_X + torch.randn_like(batch_X) * noise_std

            out = model(noisy_input)
            batch_loss = loss(out, batch_X).mean()

            loss_history.append(batch_loss.item())
            batch_loss.backward()
            optimizer.step()

        print(f"Training loss: {np.mean(loss_history):.6f}")

        # ----- Validation on NORMAL data (for threshold calculation) -----
        model.eval()
        print("---Val")
        normal_errors, _, threshold = validation(model, val_loader, loss, device)
        print("---Anom")
        anomaly_errors, _, _ = validation(model, anomaly_loader, loss, device, threshold=threshold, validate=False)

    return normal_errors, anomaly_errors

## Plotting

In [None]:
import matplotlib.pyplot as plt

def plot_errors(normal_errors, anomaly_errors):
    """Overlay log-scaled histograms of the normal and anomaly reconstruction errors."""
    for errors, label in ((normal_errors, "Normal"), (anomaly_errors, "Anomaly")):
        plt.hist(errors, bins=100, alpha=0.5, label=label)

    plt.yscale("log")
    plt.title("Reconstruction Error Distribution (log scale)")
    plt.xlabel("Max Reconstruction Error")
    plt.ylabel("Log Count")
    plt.legend()
    plt.grid(True)
    plt.show()

def plot_boxplot(normal_errors, anomaly_errors):
    """Draw side-by-side box plots of the normal and anomaly reconstruction errors."""
    plt.figure(figsize=(12, 6))

    plt.boxplot([normal_errors, anomaly_errors])
    # Name the boxes through the tick API instead of boxplot's `labels=`
    # keyword, which Matplotlib 3.9 deprecated in favour of `tick_labels=`.
    # Boxes sit at positions 1..N by default.
    plt.xticks([1, 2], ['Normal', 'Anomaly'])
    plt.ylabel('Reconstruction Error')
    plt.title('Reconstruction Error Distribution')
    plt.grid(True, alpha=0.3)
    plt.show()

# Loading data

## Kiel

In [None]:
# Load the rows whose start_port is KIEL and wrap them in a Dataset,
# which owns the scaler for this route.
df_KIEL = load_data_route("KIEL")
dataset_KIEL = Dataset(df_KIEL)

In [None]:
# Build the normal-only train/validation loaders and the anomaly loader; this also fits dataset_KIEL.scaler.
train_loader_K, val_loader_K, anomaly_loader_K = dataset_KIEL.preprocess(df_KIEL)

## Bremerhaven

In [None]:
# Load the rows whose start_port is BREMERHAVEN and wrap them in a Dataset,
# which owns the scaler for this route.
df_BREM = load_data_route("BREMERHAVEN")
dataset_BREM = Dataset(df_BREM)

In [None]:
# Build the normal-only train/validation loaders and the anomaly loader; this also fits dataset_BREM.scaler.
train_loader_B, val_loader_B, anomaly_loader_B = dataset_BREM.preprocess(df_BREM)

# Training

## KIEL

In [None]:
# KIEL autoencoder: one input/output feature per entry of FEATURE_COLUMNS.
model_K = LSTMModel(
    input_size=len(FEATURE_COLUMNS),
    hidden_size=HIDDEN_SIZE,
    num_layers=NUM_LAYERS,
    output_size=len(FEATURE_COLUMNS),
    dropout=0.2
)

optimizer_K = torch.optim.AdamW(model_K.parameters(), lr=0.001, weight_decay=1e-3)
# reduction='none' keeps the element-wise losses: validation() takes the
# per-sequence maximum, train() averages them itself.
loss_K = nn.MSELoss(reduction='none') 
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_K = model_K.to(device)

In [None]:
# Train the KIEL model; the returned errors are those of the last epoch.
# NOTE(review): EPOCHS is passed explicitly (20) instead of using the EPOCHS
# constant from the configuration cell - confirm this is intended.
normal_errors_K, anomaly_errors_K = train(model_K,
        train_loader_K, val_loader_K, anomaly_loader_K,
          optimizer_K,
          loss_K,
          device,
          EPOCHS=20)

In [None]:
# Compare the error distributions of normal and anomalous KIEL sequences.
plot_errors(normal_errors_K, anomaly_errors_K )
plot_boxplot(normal_errors_K, anomaly_errors_K )

## BREMERHAVEN

In [None]:
# BREMERHAVEN autoencoder: one input/output feature per entry of FEATURE_COLUMNS.
model_B = LSTMModel(
    input_size=len(FEATURE_COLUMNS),
    hidden_size=HIDDEN_SIZE,
    num_layers=NUM_LAYERS,
    output_size=len(FEATURE_COLUMNS),
    dropout=0.4
)

optimizer_B = torch.optim.AdamW(model_B.parameters(), lr=0.001, weight_decay=1e-3)
# reduction='none' keeps the element-wise losses: validation() takes the
# per-sequence maximum, train() averages them itself.
loss_B = nn.MSELoss(reduction='none')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move the BREMERHAVEN model (not model_K) to the device; otherwise training
# on CUDA fails because the batches and the model live on different devices.
model_B = model_B.to(device)

In [None]:
# Train the BREMERHAVEN model; the returned errors are those of the last epoch.
# NOTE(review): EPOCHS is passed explicitly (10) instead of using the EPOCHS
# constant from the configuration cell - confirm this is intended.
normal_errors_B, anomaly_errors_B = train(model_B,
        train_loader_B, val_loader_B, anomaly_loader_B,
          optimizer_B,
          loss_B,
          device,
          EPOCHS=10)

In [None]:
# Compare the error distributions of normal and anomalous BREMERHAVEN sequences.
plot_errors(normal_errors_B, anomaly_errors_B)
plot_boxplot(normal_errors_B, anomaly_errors_B)

# Saving models

In [148]:
import joblib
from pathlib import Path

# Directory for the saved artifacts; created if missing (parent directories are not created).
output_dir = Path(OUTPUT_DIR_AE)
output_dir.mkdir(exist_ok=True)

def save_model(model, scaler, threshold, route_name):
    """
    Bundle everything needed for inference on one route and dump it with joblib.

    The model is moved to the CPU in place before its state dict is taken, so
    the caller's model stays on the CPU afterwards. The bundle is written to
    ``<output_dir>/<route_name>_lstm_model.pkl``.

    Returns:
        Path of the written file.
    """
    # Move model to CPU before saving
    model.cpu()

    # NOTE(review): the config does not carry `output_size`; confirm that the
    # loading side can rebuild the model without it.
    model_config = {
        "input_size": model.input_size,
        "hidden_size": model.hidden_size,
        "num_layers": model.num_layers,
        "dropout": model.dropout
    }

    lstm_artifacts = {
        "model_state": model.state_dict(),
        "model_config": model_config,
        "scaler": scaler,
        "threshold": threshold,
        "seq_step_length": SEQUENCE_STEP_LENGTH,
        "sequence_length": SEQUENCE_LENGTH,
        "features": FEATURE_COLUMNS,
        "model_type": "lstm"
    }

    # One file per route
    model_filename = output_dir / f"{route_name}_lstm_model.pkl"
    joblib.dump(lstm_artifacts, model_filename)

    print(f"LSTM model saved to {model_filename}")
    return model_filename

In [152]:
models_filenames = []
# Maps route name -> path of that route's saved model bundle.
dispatcher_lstm = {}

# Thresholds are recomputed from the last epoch's validation errors, using the
# same percentile as validation().
threshold_B = np.percentile(normal_errors_B, AUTOENCODER_THRESHOLD_PERCENTILE)
threshold_K = np.percentile(normal_errors_K, AUTOENCODER_THRESHOLD_PERCENTILE)
# Scalers fitted on each route's training sequences in Dataset.preprocess.
scaler_K = dataset_KIEL.scaler
scaler_B = dataset_BREM.scaler

# Save BREMERHAVEN model (save_model moves the model to the CPU)
model_filename_BREM = save_model(model_B, scaler_B, threshold_B, "BREMERHAVEN")
models_filenames.append(model_filename_BREM)
dispatcher_lstm["BREMERHAVEN"] = str(model_filename_BREM)

# Save KIEL model (save_model moves the model to the CPU)
model_filename_KIEL = save_model(model_K, scaler_K, threshold_K, "KIEL")
models_filenames.append(model_filename_KIEL)
dispatcher_lstm["KIEL"] = str(model_filename_KIEL)

# Save dispatcher
dispatcher_file = output_dir / "dispatcher.pkl"
joblib.dump(dispatcher_lstm, dispatcher_file)
print(f"Dispatcher saved to {dispatcher_file}")

LSTM model saved to models_per_route_lstm_ae/BREMERHAVEN_lstm_model.pkl
LSTM model saved to models_per_route_lstm_ae/KIEL_lstm_model.pkl
Dispatcher saved to models_per_route_lstm_ae/dispatcher.pkl
