In [59]:
import torch
import numpy as np
import pandas as pd
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from sklearn.preprocessing import MinMaxScaler
from tqdm import tqdm

In [60]:
class LSTMModel(nn.Module):
    """
    A PyTorch implementation of a Long Short-Term Memory (LSTM) model for time-series forecasting.
    """

    def __init__(self,
                 input_size,
                 hidden_size,
                 num_layers,
                 output_size,
                 dropout=0):

        super(LSTMModel, self).__init__()

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        # Encoder: Compresses the input sequence
        self.encoder = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=dropout if num_layers > 1 else 0,
            batch_first=True
        )

        # Decoder: Reconstructs the sequence from the encoded representation
        self.decoder = nn.LSTM(
            input_size=hidden_size,  # Input is the encoded representation
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=dropout if num_layers > 1 else 0,
            batch_first=True
        )

        self.output_layer = nn.Linear(hidden_size, input_size)

        self.hidden_size = hidden_size
        self.output_size = output_size
        self.input_size = input_size
        self.num_layers = num_layers
        self.dropout = dropout

        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        batch_size, seq_len, _ = x.shape

        # Encode: Get the final hidden state as the compressed representation
        _, (hidden, cell) = self.encoder(x)

        # Prepare decoder input: repeat the encoded representation for each timestep
        # Use the last hidden state as the encoded representation
        encoded = hidden[-1].unsqueeze(1).repeat(1, seq_len, 1)  # Shape: (batch, seq_len, hidden_size)

        # Decode: Reconstruct the sequence
        decoder_output, _ = self.decoder(encoded)

        # Map back to original feature space
        reconstructed = self.output_layer(decoder_output)

        return reconstructed

    def get_reconstruction_error(self, x):
        """Calculate reconstruction error for anomaly detection"""
        #TODO if its near port be more forgiving 
        
        with torch.no_grad():
            reconstructed = self.forward(x)
            # Calculate MSE for each sequence
            mse = torch.mean((x - reconstructed) ** 2, dim=(1, 2))
            return mse.gpu().numpy() #NOTE cpu

    def encode(self, x):
        """Get the encoded representation of input sequences"""
        with torch.no_grad():
            _, (hidden, _) = self.encoder(x)
            return hidden[-1]  # Return the last layer's hidden state



In [61]:
def create_sequences(data, seq_length):
    """
    Creates sequences from time-series data for training recurrent neural networks.

    Args:
        data (np.array): The input time-series data.
        seq_length (int): The length of each input sequence.

    Returns:
        tuple: A tuple containing two numpy arrays:
               - xs (np.array): Input sequences.
               - ys (np.array): Target values (the next step after the sequence).
    """
    xs, ys = [], []
    for i in range(len(data) - seq_length):
        xs.append(data[i : i + seq_length])
        ys.append(data[i + seq_length])
    return np.array(xs), np.array(ys)

In [62]:
def divide_validation(X, y):
    X_train, y_train, X_val, y_val = [], [], [], []
    # Split the data into training and validation sets


In [17]:
# =============================================================================
# CONFIGURATION PARAMETERS - MODIFY THESE TO EXPERIMENT
# =============================================================================

# Data parameters
DATA_PATH = "LSTM_preprocessed.parquet"
EXPERIMENT_NAME = "lstm_per_route_v1"

# Feature columns to use (must match data preprocessing)
# FEATURE_COLUMNS = [
#     "speed_over_ground", "dv", "dcourse", "ddraft",
#     "zone", "x_km", "y_km", "dist_to_ref", "route_dummy"
# ]

FEATURE_COLUMNS = ['latitude', 'longitude',
                   'speed_over_ground', 'course_over_ground',
                   'zone']


# Sequence parameters
SEQUENCE_LENGTH = 10        # Number of time steps in each sequence

# Training parameters
EPOCHS = 10                 # Maximum epochs (early stopping will likely kick in)
BATCH_SIZE = 16             # Training batch size
PATIENCE = 5               # Early stopping patience
VALIDATION_SIZE = 0.2       # Proportion of data for validation

# Evaluation parameters
AUTOENCODER_THRESHOLD_PERCENTILE = 95  # Threshold for autoencoder (95th percentile)

# Output directories
OUTPUT_DIR_AE = "models_per_route_lstm_ae"    # Autoencoder models

print(f"Configuration loaded for experiment: {EXPERIMENT_NAME}")
print(f"Training epochs: {EPOCHS}, Batch size: {BATCH_SIZE}")
print(f"Output directories: {OUTPUT_DIR_AE}")

Configuration loaded for experiment: lstm_per_route_v1
Training epochs: 10, Batch size: 16
Output directories: models_per_route_lstm_ae


In [18]:
model = LSTMModel(input_size=len(FEATURE_COLUMNS), hidden_size=128, num_layers=1, output_size=len(FEATURE_COLUMNS))
scaler = MinMaxScaler()

In [19]:
df = pd.read_parquet(DATA_PATH)
df.sort_values(['trip_id', 'time_stamp'], inplace=True)

df_KIEL = df[df['start_port'] == 'KIEL'].copy()

def divide_normal_anomaly(df):
    normal_trip_ids = df.groupby('trip_id')['y_true'].all()
    normal_trip_ids = normal_trip_ids[normal_trip_ids == True].index
    df_completely_normal_trips = df[df['trip_id'].isin(normal_trip_ids)].copy()

    anomaly_trip_ids = df.groupby('trip_id')['y_true'].any()
    anomaly_trip_ids = anomaly_trip_ids[anomaly_trip_ids == True].index
    df_trips_with_anomalies = df[df['trip_id'].isin(anomaly_trip_ids)].copy()

    return df_completely_normal_trips, df_trips_with_anomalies

df_normal, df_anomalies = divide_normal_anomaly(df_KIEL)

print(f"Total trips: {len(df_KIEL['trip_id'].unique())}")
print(f"Normal trips: {len(df_normal['trip_id'].unique())}")
print(f"Anomaly trips: {len(df_anomalies['trip_id'].unique())}")

Total trips: 420
Normal trips: 356
Anomaly trips: 62


In [20]:
training_data = df_normal[FEATURE_COLUMNS].values
training_data_scaled = scaler.fit_transform(training_data)

In [21]:
X, y = create_sequences(training_data_scaled, SEQUENCE_LENGTH)

In [22]:
loss_function = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)


In [68]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

# Move model and loss function to CUDA
model = model.to(device)
loss_function = loss_function.to(device)

# Create tensors directly on CUDA
X_train_tensor = torch.from_numpy(X_train).float().to(device)
y_train_tensor = torch.from_numpy(y_train).float().to(device)

# Create dataset and dataloader (tensors are already on CUDA)
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=False)

cuda


In [25]:
model.train()

epoch_pbar = tqdm(range(EPOCHS), desc="Training LSTM")

for epoch in epoch_pbar:
    train_loss = 0.0
    batch_count = 0
    
    for batch_x, _ in train_loader:
        reconstructed = model(batch_x)
        loss = loss_function(reconstructed, batch_x)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        # Accumulate loss
        train_loss += loss.item()
        batch_count += 1
        
        # Update progress bar every 100 batches to show batch progress
        if batch_count % 100 == 0:
            current_avg_loss = train_loss / batch_count
            epoch_pbar.set_postfix({
                'epoch': f'{epoch+1}/{EPOCHS}',
                'batch': f'{batch_count}/{len(train_loader)}',
                'loss': f'{current_avg_loss:.6f}'
            })
    
    # Calculate final average loss for the epoch
    avg_loss = train_loss / batch_count
    epoch_pbar.set_postfix({
        'epoch': f'{epoch+1}/{EPOCHS}',
        'batch': f'{batch_count}/{len(train_loader)}',
        'loss': f'{avg_loss:.6f}'
    })

print("Training completed!")

Training LSTM: 100%|██████████| 10/10 [19:39<00:00, 117.96s/it, epoch=10/10, batch=28362/28362, loss=0.000568]

Training completed!





In [73]:
one_anomaly = df_anomalies[df_anomalies['trip_id'] == df_anomalies['trip_id'].unique()[0]]
one_anomaly = one_anomaly.sort_values('time_stamp')
one_anomaly

Unnamed: 0,trip_id,start_latitude,start_longitude,start_time,end_latitude,end_longitude,end_time,start_port,end_port,time_stamp,...,dv,dcourse,ddraft,zone,zone_0,zone_1,x_km,y_km,dist_to_ref,route_dummy
1237,19585,54.36,10.14,2016-07-04 08:01:00,54.64,18.92,2016-07-06 02:37:00,KIEL,GDYNIA,2016-07-04 08:01:00,...,0.0,0.0,0.0,1,False,True,-263.743269,-31.487677,0.169412,1.0
1238,19585,54.36,10.14,2016-07-04 08:01:00,54.64,18.92,2016-07-06 02:37:00,KIEL,GDYNIA,2016-07-04 08:02:00,...,1.1,5.2,0.0,1,False,True,-263.743269,-31.487677,0.169412,1.0
1239,19585,54.36,10.14,2016-07-04 08:01:00,54.64,18.92,2016-07-06 02:37:00,KIEL,GDYNIA,2016-07-04 08:03:00,...,0.9,0.5,0.0,1,False,True,-263.743269,-31.487677,0.169412,1.0
1240,19585,54.36,10.14,2016-07-04 08:01:00,54.64,18.92,2016-07-06 02:37:00,KIEL,GDYNIA,2016-07-04 08:04:00,...,0.9,3.9,0.0,1,False,True,-263.743269,-31.487677,0.169412,1.0
1241,19585,54.36,10.14,2016-07-04 08:01:00,54.64,18.92,2016-07-06 02:37:00,KIEL,GDYNIA,2016-07-04 08:05:00,...,1.0,1.4,0.0,1,False,True,-263.743269,-31.487677,0.169412,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2878,19585,54.36,10.14,2016-07-04 08:01:00,54.64,18.92,2016-07-06 02:37:00,KIEL,GDYNIA,2016-07-06 02:33:00,...,0.0,0.8,0.0,1,False,True,301.817486,0.578783,25.398605,1.0
2879,19585,54.36,10.14,2016-07-04 08:01:00,54.64,18.92,2016-07-06 02:37:00,KIEL,GDYNIA,2016-07-06 02:34:00,...,0.1,0.5,0.0,1,False,True,301.817486,0.578783,25.398605,1.0
2880,19585,54.36,10.14,2016-07-04 08:01:00,54.64,18.92,2016-07-06 02:37:00,KIEL,GDYNIA,2016-07-06 02:35:00,...,0.1,0.3,0.0,1,False,True,301.817486,0.578783,25.398605,1.0
2881,19585,54.36,10.14,2016-07-04 08:01:00,54.64,18.92,2016-07-06 02:37:00,KIEL,GDYNIA,2016-07-06 02:36:00,...,0.0,0.0,0.0,1,False,True,301.817486,-0.526957,29.041675,1.0


In [50]:
one_anomaly['lstm_prediction'] = 0

# Get reconstruction errors for the anomaly trip
anomaly_tr_features = one_anomaly[FEATURE_COLUMNS].values
anomaly_tr_features_scaled = scaler.transform(anomaly_tr_features)  # Use transform, not fit_transform

X_anomaly, y_anomaly = create_sequences(anomaly_tr_features_scaled, SEQUENCE_LENGTH)
X_anomaly_tensor = torch.from_numpy(X_anomaly).float().to(device)

In [51]:
model.eval()
with torch.no_grad():
    reconstructed = model.forward(X_anomaly_tensor)
    # Calculate MSE for each sequence
    mse = torch.mean((X_anomaly_tensor - reconstructed) ** 2, dim=(1, 2))
    reconstruction_errors = mse.cpu().numpy()

# Calculate threshold (you should ideally calculate this from training data)
threshold = np.percentile(reconstruction_errors, AUTOENCODER_THRESHOLD_PERCENTILE)

In [52]:
anomaly_mask = reconstruction_errors > threshold
anomaly_indices = np.where(anomaly_mask)[0]

# Mark points as anomalous based on which sequences they belong to
for seq_idx in anomaly_indices:
    start_point = seq_idx
    end_point = min(seq_idx + SEQUENCE_LENGTH, len(one_anomaly))
    
    # Mark all points in this sequence as anomalous
    one_anomaly.iloc[start_point:end_point, one_anomaly.columns.get_loc('lstm_prediction')] = 1

In [58]:
print("LSTM Predictions Summary:")
print(f"Total points: {len(one_anomaly)}")
print(f"LSTM predicted anomalies: {(one_anomaly['lstm_prediction'] == 1).sum()}")
print(f"Ground truth anomalies: {(one_anomaly['y_true'] == 1).sum()}")

# Calculate some basic metrics
true_positives = ((one_anomaly['y_true'] == 1) & (one_anomaly['lstm_prediction'] == 1)).sum()
false_positives = ((one_anomaly['y_true'] == 0) & (one_anomaly['lstm_prediction'] == 1)).sum()
false_negatives = ((one_anomaly['y_true'] == 1) & (one_anomaly['lstm_prediction'] == 0)).sum()


print(f"True positives {true_positives}") 
print(f"False positives {false_positives}") 
print(f"False Negatives {false_negatives}") 


LSTM Predictions Summary:
Total points: 1646
LSTM predicted anomalies: 185
Ground truth anomalies: 22
True positives 0
False positives 185
False Negatives 22


In [63]:
torch.save(model.state_dict(), "model.pth")

In [70]:
# Add this to your training notebook after training is complete

import joblib
import numpy as np
from pathlib import Path

# Calculate threshold on training data (you should do this properly)
model.eval()
with torch.no_grad():
    reconstructed = model.forward(X_train_tensor)
    mse = torch.mean((X_train_tensor - reconstructed) ** 2, dim=(1, 2))
    train_reconstruction_errors = mse.cpu().numpy()

threshold = np.percentile(train_reconstruction_errors, AUTOENCODER_THRESHOLD_PERCENTILE)

OutOfMemoryError: CUDA out of memory. Tried to allocate 229.37 GiB. GPU 

In [76]:
model.to('cpu')


LSTMModel(
  (encoder): LSTM(5, 128, batch_first=True)
  (decoder): LSTM(128, 128, batch_first=True)
  (output_layer): Linear(in_features=128, out_features=5, bias=True)
  (fc): Linear(in_features=128, out_features=5, bias=True)
)

In [77]:
# Save everything needed for inference
lstm_artifacts = {
    "model_state": model.state_dict(),  # PyTorch model state
    "scaler": scaler,  # MinMaxScaler fitted on training data
    "threshold": threshold,  # Anomaly threshold
    "model_config": {
        "input_size": len(FEATURE_COLUMNS),
        "hidden_size": 128,  # From your model definition
        "num_layers": 1,     # From your model definition
        "sequence_length": SEQUENCE_LENGTH,
        "threshold_percentile": AUTOENCODER_THRESHOLD_PERCENTILE
    },
    "features": FEATURE_COLUMNS,  # Feature names for compatibility
    "model_type": "lstm"  # Identifier for the visualizer
}

In [78]:
# Save to file
output_dir = Path("models_per_route_lstm_ae")
output_dir.mkdir(exist_ok=True)

# Save for the specific route (KIEL in your case)
route_name = "KIEL"  # or get it from your data
model_filename = output_dir / f"{route_name}_lstm_model.pkl"
joblib.dump(lstm_artifacts, model_filename)

print(f"LSTM model saved to {model_filename}")

# Create dispatcher file for LSTM models
dispatcher_lstm = {
    route_name: str(model_filename)
}

dispatcher_file = output_dir / "dispatcher.pkl"
joblib.dump(dispatcher_lstm, dispatcher_file)
print(f"Dispatcher saved to {dispatcher_file}")

print("To use with visualizer, run:")
print(f"python data_visualizer.py <trip_id> --dispatcher {dispatcher_file} --model-type lstm")

LSTM model saved to models_per_route_lstm_ae/KIEL_lstm_model.pkl
Dispatcher saved to models_per_route_lstm_ae/dispatcher.pkl
To use with visualizer, run:
python data_visualizer.py <trip_id> --dispatcher models_per_route_lstm_ae/dispatcher.pkl --model-type lstm
