In [1]:
import torch
import numpy as np
import pandas as pd
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from sklearn.preprocessing import MinMaxScaler
from tqdm import tqdm

### LSTMModel interface

In [2]:
class LSTMModel(nn.Module):
    """
    A PyTorch implementation of a Long Short-Term Memory (LSTM) model for time-series forecasting.
    """

    def __init__(self,
                 input_size,
                 hidden_size,
                 num_layers,
                 output_size,
                 dropout=0):

        super(LSTMModel, self).__init__()

        self.encoder = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=dropout if num_layers > 1 else 0,
            batch_first=True
        )

        self.decoder = nn.LSTM(
            input_size=hidden_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=dropout if num_layers > 1 else 0,
            batch_first=True
        )

        self.output_layer = nn.Linear(hidden_size, input_size)

        self.hidden_size = hidden_size
        self.output_size = output_size
        self.input_size = input_size
        self.num_layers = num_layers
        self.dropout = dropout
        
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        batch_size, seq_len, _ = x.shape

        # Encode: Get the final hidden state as the compressed representation
        _, (hidden, cell) = self.encoder(x)

        # Prepare decoder input: repeat the encoded representation for each timestep
        # Use the last hidden state as the encoded representation
        encoded = hidden[-1].unsqueeze(1).repeat(1, seq_len, 1)  # Shape: (batch, seq_len, hidden_size)

        # Decode: Reconstruct the sequence
        decoder_output, _ = self.decoder(encoded)

        # Map back to original feature space
        reconstructed = self.output_layer(decoder_output)

        return reconstructed

    def get_reconstruction_error(self, x):
        """Calculate reconstruction error for anomaly detection"""
        with torch.no_grad():
            reconstructed = self.forward(x)
            # Calculate MSE for each sequence
            mse = torch.mean((x - reconstructed) ** 2, dim=(1, 2))
            return mse.cpu().numpy()  # Changed from .gpu() to .cpu()

    def encode(self, x):
        """Get the encoded representation of input sequences"""
        with torch.no_grad():
            _, (hidden, _) = self.encoder(x)
            return hidden[-1]  # Return the last layer's hidden state
    
    def get_score_route(self, sequences, trip_length):
        if len(sequences) == 0:
            return np.zeros(trip_length)
        
        # Convert to tensor and move to same device as model
        X_tensor = torch.from_numpy(sequences).float().to(next(self.parameters()).device)
        
        # Get reconstruction errors
        reconstruction_errors = self.get_reconstruction_error(X_tensor)
        
        # Initialize scores array
        scores = np.zeros(trip_length)
        
        # Map reconstruction errors to trajectory points
        sequence_length = sequences.shape[1]
        
        for i, error in enumerate(reconstruction_errors):
            start_idx = i
            end_idx = min(i + sequence_length, len(scores))
            # Use maximum error for overlapping windows
            scores[start_idx:end_idx] = np.maximum(scores[start_idx:end_idx], error)
        
        return scores

### Configs

In [3]:
# =============================================================================
# CONFIGURATION PARAMETERS - MODIFY THESE TO EXPERIMENT
# =============================================================================

# Data parameters
DATA_PATH = "LSTM_preprocessed.parquet"
EXPERIMENT_NAME = "lstm_per_route_v1"

# FEATURE_COLUMNS = ['latitude', 'longitude',
#                    'speed_over_ground', 'course_over_ground'
#                    ]

FEATURE_COLUMNS = [ "speed_over_ground", "course_over_ground",
                    "x_km", "y_km", "dist_to_ref", "zone"
                  ]

# Sequence parameters
SEQUENCE_LENGTH = 15        # Number of time steps in each sequence

# Training parameters
EPOCHS = 4                 # Maximum epochs
BATCH_SIZE = 16            
VALIDATION_SIZE = 0.2       # Proportion of data for validation

HIDDEN_SIZE = 64
NUM_LAYERS = 1 

# Evaluation parameters
AUTOENCODER_THRESHOLD_PERCENTILE = 95

# Output directories
OUTPUT_DIR_AE = "models_per_route_lstm_ae"    # Autoencoder models

scaler = MinMaxScaler()

print(f"Configuration loaded for experiment: {EXPERIMENT_NAME}")
print(f"Training epochs: {EPOCHS}, Batch size: {BATCH_SIZE}")
print(f"Output directories: {OUTPUT_DIR_AE}")

Configuration loaded for experiment: lstm_per_route_v1
Training epochs: 4, Batch size: 16
Output directories: models_per_route_lstm_ae


### Utils

In [4]:
def create_sequences(data, seq_length):
    """
    Creates sequences from time-series data for training recurrent neural networks.
    """
    xs, ys = [], []
    for i in range(len(data) - seq_length):
        xs.append(data[i : i + seq_length])
        ys.append(data[i + seq_length])
    return np.array(xs), np.array(ys)

def create_sequences_for_trip(trip_data, feature_columns, sequence_length, scaler):
    # Extract and scale features
    features = trip_data[feature_columns].values
    features_scaled = scaler.transform(features)
    
    # Create sequences
    if len(features_scaled) < sequence_length:
        return np.array([])  # Not enough points for even one sequence
    
    sequences = []
    for i in range(len(features_scaled) - sequence_length + 1):
        sequences.append(features_scaled[i:i + sequence_length])
    
    return np.array(sequences)
    
def divide_validation(X, y):
    X_train, y_train, X_val, y_val = [], [], [], []
    # Split the data into training and validation sets

def divide_normal_anomaly(df):
    normal_trip_ids = df.groupby('trip_id')['y_true'].all()
    normal_trip_ids = normal_trip_ids[normal_trip_ids == True].index
    df_completely_normal_trips = df[df['trip_id'].isin(normal_trip_ids)].copy()

    anomaly_trip_ids = df.groupby('trip_id')['y_true'].any()
    anomaly_trip_ids = anomaly_trip_ids[anomaly_trip_ids == True].index
    df_trips_with_anomalies = df[df['trip_id'].isin(anomaly_trip_ids)].copy()


    print(f"Total trips: {len(df['trip_id'].unique())}")
    print(f"Normal trips: {len(df_completely_normal_trips['trip_id'].unique())}")
    print(f"Anomaly trips: {len(df_trips_with_anomalies['trip_id'].unique())}")
    
    return df_completely_normal_trips, df_trips_with_anomalies
    
def get_data_loader(df):
    data_features = df[FEATURE_COLUMNS].values
    data_scaled = scaler.fit_transform(data_features)

    X, y = create_sequences(data_scaled, SEQUENCE_LENGTH)

    X_tensor = torch.from_numpy(X).float()
    y_tensor = torch.from_numpy(y).float()
    
    dataset = TensorDataset(X_tensor, y_tensor)

    loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True, pin_memory=True)

    return loader

def load_data_route(route, datapath = DATA_PATH):
    df = pd.read_parquet(DATA_PATH)
    df.sort_values(['trip_id', 'time_stamp'], inplace=True)
    df_route = df[df['start_port'] == route].copy()

    return df_route

def get_anomaly_example(df_anomalies):
    
    one_anomaly = df_anomalies[df_anomalies['trip_id'] == df_anomalies['trip_id'].unique()[0]].copy()
    one_anomaly['lstm_prediction'] = 0
    
    one_anomaly = one_anomaly.sort_values('time_stamp')
    one_an_scaled = scaler.transform(one_anomaly[FEATURE_COLUMNS].values)
    X_anomaly, y_anomaly = create_sequences(one_an_scaled, SEQUENCE_LENGTH)
    X_anomaly_tensor = torch.from_numpy(X_anomaly).float()

    return one_anomaly, X_anomaly_tensor, y_anomaly
    
def anomaly_check(one_anomaly):
    print("LSTM Predictions Summary:")
    print(f"Total points: {len(one_anomaly)}")
    print(f"LSTM predicted anomalies: {(one_anomaly['lstm_prediction'] == 1).sum()}")
    print(f"Ground truth anomalies: {(one_anomaly['y_true'] == 1).sum()}")

    # Calculate some basic metrics
    true_positives = ((one_anomaly['y_true'] == 1) & (one_anomaly['lstm_prediction'] == 1)).sum()
    false_positives = ((one_anomaly['y_true'] == 0) & (one_anomaly['lstm_prediction'] == 1)).sum()
    false_negatives = ((one_anomaly['y_true'] == 1) & (one_anomaly['lstm_prediction'] == 0)).sum()

    print(f"True positives: {true_positives}")
    print(f"False positives: {false_positives}")
    print(f"False Negatives: {false_negatives}")

In [5]:
def train(model,
          train_loader, 
          loss_function,
          optimizer,
          X_anomaly_tensor, 
          one_anomaly):
    best_val_loss = float('inf')
    patience_counter = 0
    
    # Enable cudnn benchmarking for better performance
    torch.backends.cudnn.benchmark = True
    
    # Mixed precision training for better memory usage
    # scaler_amp = torch.cuda.amp.GradScaler()
    
    for epoch in range(EPOCHS):
        # Training phase
        model.train()
        train_loss = 0.0
        num_batches = 0
    
        pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS}")
        train_reconstruction_errors = []
    
        for batch_idx, (batch_x, _) in enumerate(pbar):
            # Move batch to GPU
            batch_x = batch_x.to(device)
    
            # Mixed precision forward pass
            with torch.cuda.amp.autocast():
                reconstructed = model(batch_x)
                loss = loss_function(reconstructed, batch_x)
    
            # Backward pass
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
    
            # Gradient clipping to prevent exploding gradients
            # scaler_amp.unscale_(optimizer)
            # torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    
            # scaler_amp.step(optimizer)
            # scaler_amp.update()
    
            batch_errors = model.get_reconstruction_error(batch_x)
            train_reconstruction_errors.extend(batch_errors)
            
            train_loss += loss.item()
            num_batches += 1
    
            # Update progress bar
            if batch_idx % 10 == 0:
                pbar.set_postfix({'loss': f'{train_loss/num_batches:.6f}'})
    
            # Clear cache periodically
            if batch_idx % 100 == 0:
                torch.cuda.empty_cache()
    
        # Validation phase
        model.eval()

        # val_loss = 0.0
        # num_val_batches = 0

        # with torch.no_grad():
        #     for batch_x, _ in val_loader:
        #         batch_x = batch_x.to(device)
        #         reconstructed = model(batch_x)
        #         loss = loss_function(reconstructed, batch_x)
        #         val_loss += loss.item()
        #         num_val_batches += 1
        #
        # avg_train_loss = train_loss / num_batches
        # avg_val_loss = val_loss / num_val_batches
        #
        # print(f"Epoch {epoch+1}: Train Loss = {avg_train_loss:.6f}, Val Loss = {avg_val_loss:.6f}")
        #
        # # Learning rate scheduling
        # scheduler.step(avg_val_loss)
        #
        # # Early stopping
        # if avg_val_loss < best_val_loss:
        #     best_val_loss = avg_val_loss
        #     patience_counter = 0
        #     # Save best model
        #     torch.save(model.state_dict(), 'best_model.pth')
        # else:
        #     patience_counter += 1
        #     if patience_counter >= PATIENCE:
        #         print(f"Early stopping at epoch {epoch+1}")
        #         break
    
        # train_reconstruction_errors = model.get_reconstruction_error(X_train_tensor)
        # threshold = np.percentile(train_reconstruction_errors, AUTOENCODER_THRESHOLD_PERCENTILE)
    
        train_reconstruction_errors = np.array(train_reconstruction_errors)
        threshold = np.percentile(train_reconstruction_errors, AUTOENCODER_THRESHOLD_PERCENTILE)
        print(f"Anomaly threshold: {threshold:.6f}")
    
        reconstruction_errors = model.get_reconstruction_error(X_anomaly_tensor)
    
        anomaly_mask = reconstruction_errors > threshold
        anomaly_indices = np.where(anomaly_mask)[0]
    
        # Mark points as anomalous based on which sequences they belong to
        for seq_idx in anomaly_indices:
            start_point = seq_idx
            end_point = min(seq_idx + SEQUENCE_LENGTH, len(one_anomaly))
    
            # Mark all points in this sequence as anomalous
            one_anomaly.iloc[start_point:end_point, one_anomaly.columns.get_loc('lstm_prediction')] = 1
    
        anomaly_check(one_anomaly)
    
    # Load best model
    # model.load_state_dict(torch.load('best_model.pth'))
    print("Training completed!")

def evaluate_model(model, train_loader):
    model.eval()
    train_reconstruction_errors = []
    
    with torch.no_grad():
        for batch_x, _ in train_loader:
            batch_x = batch_x.to(device)
            batch_errors = model.get_reconstruction_error(batch_x)
            train_reconstruction_errors.extend(batch_errors)
    
    train_reconstruction_errors = np.array(train_reconstruction_errors)
    threshold = np.percentile(train_reconstruction_errors, AUTOENCODER_THRESHOLD_PERCENTILE)
    print(f"Anomaly threshold: {threshold:.6f}")

    return threshold

### Data load and devide

In [6]:
df_KIEL = load_data_route("KIEL")
df_BREM = load_data_route("BREMERHAVEN")

print("BREMERHAVEN")
df_normal_B, df_anomalies_B = divide_normal_anomaly(df_BREM)
train_loader_B = get_data_loader(df_normal_B)

print("\nKIEL")
df_normal_K, df_anomalies_K = divide_normal_anomaly(df_KIEL)
train_loader_K = get_data_loader(df_normal_K)

BREMERHAVEN
Total trips: 702
Normal trips: 650
Anomaly trips: 14

KIEL
Total trips: 420
Normal trips: 356
Anomaly trips: 62


In [12]:
anomaly_B, X_anomaly_tensor_B, _ = get_anomaly_example(df_anomalies_B)
anomaly_K, X_anomaly_tensor_K, _ = get_anomaly_example(df_anomalies_K)

### Model configure and train

In [13]:
model_BREM = LSTMModel(input_size=len(FEATURE_COLUMNS), hidden_size=HIDDEN_SIZE, num_layers=NUM_LAYERS, output_size=len(FEATURE_COLUMNS))
loss_function_BREM = nn.MSELoss()
optimizer_BREM = torch.optim.Adam(model_BREM.parameters(), lr=0.001)
# scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=2, factor=0.5)

model_KIEL = LSTMModel(input_size=len(FEATURE_COLUMNS), hidden_size=HIDDEN_SIZE, num_layers=NUM_LAYERS, output_size=len(FEATURE_COLUMNS))
loss_function_KIEL = nn.MSELoss()
optimizer_KIEL = torch.optim.Adam(model_KIEL.parameters(), lr=0.001)

In [26]:
# Move to CUDA
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
model_BREM = model_BREM.to(device)
model_KIEL = model_KIEL.to(device)


X_anomaly_tensor_B = X_anomaly_tensor_B.to(device)
X_anomaly_tensor_K = X_anomaly_tensor_K.to(device)

cuda


In [15]:
train(model_BREM, train_loader_B, loss_function_BREM, optimizer_BREM, X_anomaly_tensor_B, anomaly_B)

Epoch 1/4: 100%|██████████| 21999/21999 [01:31<00:00, 240.84it/s, loss=0.001679]


Anomaly threshold: 0.006242
LSTM Predictions Summary:
Total points: 659
LSTM predicted anomalies: 296
Ground truth anomalies: 228
True positives: 79
False positives: 217
False Negatives: 149


Epoch 2/4: 100%|██████████| 21999/21999 [01:39<00:00, 220.28it/s, loss=0.000423]


Anomaly threshold: 0.001429
LSTM Predictions Summary:
Total points: 659
LSTM predicted anomalies: 614
Ground truth anomalies: 228
True positives: 228
False positives: 386
False Negatives: 0


Epoch 3/4: 100%|██████████| 21999/21999 [01:31<00:00, 239.54it/s, loss=0.000282]


Anomaly threshold: 0.000725
LSTM Predictions Summary:
Total points: 659
LSTM predicted anomalies: 658
Ground truth anomalies: 228
True positives: 228
False positives: 430
False Negatives: 0


Epoch 4/4: 100%|██████████| 21999/21999 [02:03<00:00, 178.42it/s, loss=0.000226]


Anomaly threshold: 0.000493
LSTM Predictions Summary:
Total points: 659
LSTM predicted anomalies: 658
Ground truth anomalies: 228
True positives: 228
False positives: 430
False Negatives: 0
Training completed!


In [16]:
trashhold_BREM = evaluate_model(model_BREM, train_loader_B)

Anomaly threshold: 0.000699


In [17]:
train(model_KIEL, train_loader_K, loss_function_KIEL, optimizer_KIEL, X_anomaly_tensor_K, anomaly_K)

Epoch 1/4: 100%|██████████| 28364/28364 [01:42<00:00, 276.14it/s, loss=0.000351]


Anomaly threshold: 0.000878
LSTM Predictions Summary:
Total points: 1646
LSTM predicted anomalies: 0
Ground truth anomalies: 22
True positives: 0
False positives: 0
False Negatives: 22


Epoch 2/4: 100%|██████████| 28364/28364 [02:30<00:00, 188.22it/s, loss=0.000087]


Anomaly threshold: 0.000152
LSTM Predictions Summary:
Total points: 1646
LSTM predicted anomalies: 25
Ground truth anomalies: 22
True positives: 19
False positives: 6
False Negatives: 3


Epoch 3/4: 100%|██████████| 28364/28364 [02:38<00:00, 178.87it/s, loss=0.000063]


Anomaly threshold: 0.000106
LSTM Predictions Summary:
Total points: 1646
LSTM predicted anomalies: 27
Ground truth anomalies: 22
True positives: 21
False positives: 6
False Negatives: 1


Epoch 4/4: 100%|██████████| 28364/28364 [02:26<00:00, 193.39it/s, loss=0.000052]

Anomaly threshold: 0.000090
LSTM Predictions Summary:
Total points: 1646
LSTM predicted anomalies: 28
Ground truth anomalies: 22
True positives: 21
False positives: 7
False Negatives: 1
Training completed!





In [27]:
trashhold_KIEL = evaluate_model(model_KIEL, train_loader_K)

Anomaly threshold: 0.000075


In [28]:
# # Prepare anomaly data
# anomaly_data = one_anomaly[FEATURE_COLUMNS].values
# anomaly_data_scaled = scaler.transform(anomaly_data)
#
# if len(anomaly_data_scaled) >= SEQUENCE_LENGTH:
#     X_anomaly, _ = create_sequences(anomaly_data_scaled, SEQUENCE_LENGTH)
#
#     # Process in batches to avoid OOM
#     batch_size_test = 64
#     anomaly_mask = []
#
#     for i in range(0, len(X_anomaly), batch_size_test):
#         batch = X_anomaly[i:i+batch_size_test]
#         batch_tensor = torch.FloatTensor(batch).to(device)
#
#         with torch.no_grad():
#             batch_errors = model.get_reconstruction_error(batch_tensor)
#             batch_mask = batch_errors > threshold
#             anomaly_mask.extend(batch_mask)
#
#         # Clear cache after each batch
#         torch.cuda.empty_cache()
#
#     anomaly_mask = np.array(anomaly_mask)
#     anomaly_indices = np.where(anomaly_mask)[0]
#
#     # Initialize prediction column
#     one_anomaly['lstm_prediction'] = 0
#
#     # Mark anomalous sequences
#     for seq_idx in anomaly_indices:
#         start_point = seq_idx
#         end_point = min(seq_idx + SEQUENCE_LENGTH, len(one_anomaly))
#         one_anomaly.iloc[start_point:end_point, one_anomaly.columns.get_loc('lstm_prediction')] = 1

In [29]:
import joblib
from pathlib import Path

# Save to file
output_dir = Path("models_per_route_lstm_ae")
output_dir.mkdir(exist_ok=True)

def save_model(model, scaler, threshold, route_name):
    # Move model to CPU before saving
    model.cpu()
    
    # Save everything needed for inference
    lstm_artifacts = {
        "model_state": model.state_dict(),
        "scaler": scaler,
        "threshold": threshold,
        "model_config": {
            "input_size": len(FEATURE_COLUMNS),
            "hidden_size": HIDDEN_SIZE, 
            "num_layers": NUM_LAYERS,
            "sequence_length": SEQUENCE_LENGTH,
            "threshold_percentile": AUTOENCODER_THRESHOLD_PERCENTILE
        },
        "features": FEATURE_COLUMNS,
        "model_type": "lstm"
    }
    

    # Save for the specific route
    model_filename = output_dir / f"{route_name}_lstm_model.pkl"
    joblib.dump(lstm_artifacts, model_filename)

    print(f"LSTM model saved to {model_filename}")
    return model_filename

In [30]:
models_filenames = []
dispatcher_lstm = {}

# Save BREMERHAVEN model
model_filename_BREM = save_model(model_BREM, scaler, trashhold_BREM, "BREMERHAVEN")
models_filenames.append(model_filename_BREM)
dispatcher_lstm["BREMERHAVEN"] = str(model_filename_BREM)

# Save KIEL model
model_filename_KIEL = save_model(model_KIEL, scaler, trashhold_KIEL, "KIEL")
models_filenames.append(model_filename_KIEL)
dispatcher_lstm["KIEL"] = str(model_filename_KIEL)

# Save dispatcher
dispatcher_file = output_dir / "dispatcher.pkl"
joblib.dump(dispatcher_lstm, dispatcher_file)
print(f"Dispatcher saved to {dispatcher_file}")

LSTM model saved to models_per_route_lstm_ae/BREMERHAVEN_lstm_model.pkl
LSTM model saved to models_per_route_lstm_ae/KIEL_lstm_model.pkl
Dispatcher saved to models_per_route_lstm_ae/dispatcher.pkl


In [31]:
def get_trip_anomaly_predictions(model, trip_data, feature_columns, sequence_length, scaler, threshold):
    """
    Get anomaly predictions for a complete trip
    
    Args:
        model: trained LSTM model
        trip_data: DataFrame containing trip points
        feature_columns: list of feature column names
        sequence_length: length of each sequence
        scaler: fitted MinMaxScaler
        threshold: anomaly threshold
    
    Returns:
        predictions: binary array (0=normal, 1=anomaly) for each point
    """
    # Create sequences
    sequences = create_sequences_for_trip(trip_data, feature_columns, sequence_length, scaler)
    
    if len(sequences) == 0:
        return np.zeros(len(trip_data))
    
    # Get anomaly scores
    scores = model.get_score_route(sequences, trip_data)
    
    # Convert to binary predictions
    predictions = (scores > threshold).astype(int)
    
    return predictions

In [42]:
# Get predictions for the anomaly trip
predictions = get_trip_anomaly_predictions(
    model=model_KIEL,
    trip_data=anomaly_K,
    feature_columns=FEATURE_COLUMNS,
    sequence_length=SEQUENCE_LENGTH,
    scaler=scaler,
    threshold=trashhold_KIEL  # Use the threshold you calculated
)

# Update the lstm_prediction column
anomaly_K['lstm_prediction'] = predictions

In [76]:
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import pandas as pd
import numpy as np
from ipywidgets import interact, widgets, HBox, VBox
from IPython.display import display
import warnings


def create_interactive_anomaly_visualization(df, trip_id=None, title_suffix=""):
    """
    Create an interactive visualization for ship trajectory anomaly detection
    
    Args:
        df: DataFrame with columns ['longitude', 'latitude', 'y_true', 'lstm_prediction', 'time_stamp', etc.]
        trip_id: Specific trip ID to visualize (if None, uses first available)
        title_suffix: Additional text for the title
    """
    
    if trip_id is None:
        trip_id = df['trip_id'].iloc[0] if 'trip_id' in df.columns else "Unknown"
    
    # Filter data for specific trip if trip_id column exists
    if 'trip_id' in df.columns:
        trip_data = df[df['trip_id'] == trip_id].copy()
    else:
        trip_data = df.copy()
    
    # Sort by timestamp if available
    if 'time_stamp' in trip_data.columns:
        trip_data = trip_data.sort_values('time_stamp').reset_index(drop=True)
    
    # Ensure we have the required columns
    required_columns = ['longitude', 'latitude']
    if not all(col in trip_data.columns for col in required_columns):
        print(f"Missing required columns. Available columns: {trip_data.columns.tolist()}")
        return None
    
    # Create y_true column if it doesn't exist (for demo purposes)
    if 'y_true' not in trip_data.columns:
        print("mock y_true")
        trip_data['y_true'] = np.random.choice([0, 1], size=len(trip_data), p=[0.85, 0.15])
    
    # Create lstm_prediction column if it doesn't exist (for demo purposes)
    if 'lstm_prediction' not in trip_data.columns:
        print("mock lstm_prediction")
        trip_data['lstm_prediction'] = np.random.choice([0, 1], size=len(trip_data), p=[0.8, 0.2])
    
    # Prepare data for visualization
    trip_data['point_index'] = range(len(trip_data))
    trip_data['color'] = trip_data['y_true'].map({0: 'blue', 1: 'red'})
    trip_data['anomaly_status'] = trip_data['y_true'].map({0: 'Normal', 1: 'Anomaly'})
    
    # Add LSTM prediction status
    trip_data['lstm_status'] = trip_data['lstm_prediction'].map({0: 'Normal', 1: 'Predicted Anomaly'})
    
    # Create hover text
    hover_text = []
    for idx, row in trip_data.iterrows():
        text = f"Point: {row['point_index']}<br>"
        text += f"Lat: {row['latitude']:.4f}<br>"
        text += f"Lon: {row['longitude']:.4f}<br>"
        text += f"Ground Truth: {row['anomaly_status']}<br>"
        text += f"LSTM Prediction: {row['lstm_status']}<br>"
        
        if 'time_stamp' in row:
            text += f"Time: {row['time_stamp']}<br>"
        if 'speed_over_ground' in row:
            text += f"Speed: {row['speed_over_ground']:.2f}<br>"
        if 'course_over_ground' in row:
            text += f"Course: {row['course_over_ground']:.2f}<br>"
            
        hover_text.append(text)
    
    trip_data['hover_text'] = hover_text
    
    # Create the main plot
    fig = go.Figure()
    
    # Add normal points (blue)
    normal_points = trip_data[trip_data['y_true'] == 0]
    if len(normal_points) > 0:
        fig.add_trace(go.Scattermapbox(
            lat=normal_points['latitude'],
            lon=normal_points['longitude'],
            mode='markers',
            marker=dict(
                size=8,
                color='#3498db',
                opacity=0.8
            ),
            text=normal_points['hover_text'],
            hovertemplate='%{text}<extra></extra>',
            name='Normal Points',
            showlegend=True
        ))
    
    # Add anomaly points (red)
    anomaly_points = trip_data[trip_data['y_true'] == 1]
    if len(anomaly_points) > 0:
        fig.add_trace(go.Scattermapbox(
            lat=anomaly_points['latitude'],
            lon=anomaly_points['longitude'],
            mode='markers',
            marker=dict(
                size=10,
                color='#e74c3c',
                opacity=0.9,
                symbol='circle'
            ),
            text=anomaly_points['hover_text'],
            hovertemplate='%{text}<extra></extra>',
            name='Anomaly Points',
            showlegend=True
        ))
    
    # Add LSTM predictions if different from ground truth
    lstm_only_anomalies = trip_data[
        (trip_data['lstm_prediction'] == 1) & (trip_data['y_true'] == 0)
    ]
    if len(lstm_only_anomalies) > 0:
        fig.add_trace(go.Scattermapbox(
            lat=lstm_only_anomalies['latitude'],
            lon=lstm_only_anomalies['longitude'],
            mode='markers',
            marker=dict(
                size=8,
                color='#FFC0CB',
                opacity=0.8,
                symbol='circle'
            ),
            text=lstm_only_anomalies['hover_text'],
            hovertemplate='%{text}<extra></extra>',
            name='LSTM Predicted Anomalies',
            showlegend=True
        ))
    
    # Mark start point (green)
    start_point = trip_data.iloc[0]
    fig.add_trace(go.Scattermapbox(
        lat=[start_point['latitude']],
        lon=[start_point['longitude']],
        mode='markers',
        marker=dict(
            size=15,
            color='#27ae60',
            symbol='circle',
            opacity=1.0
        ),
        text=f"START<br>{start_point['hover_text']}",
        hovertemplate='%{text}<extra></extra>',
        name='Start Point',
        showlegend=True
    ))
    
    # Mark end point (yellow)
    end_point = trip_data.iloc[-1]
    fig.add_trace(go.Scattermapbox(
        lat=[end_point['latitude']],
        lon=[end_point['longitude']],
        mode='markers',
        marker=dict(
            size=15,
            color='#f1c40f',
            symbol='cros',
            opacity=1.0
        ),
        text=f"END PORT<br>{end_point['hover_text']}",
        hovertemplate='%{text}<extra></extra>',
        name='End Port',
        showlegend=True
    ))
    
    # Calculate center and zoom
    center_lat = trip_data['latitude'].mean()
    center_lon = trip_data['longitude'].mean()
    
    lat_range = trip_data['latitude'].max() - trip_data['latitude'].min()
    lon_range = trip_data['longitude'].max() - trip_data['longitude'].min()
    zoom = max(1, min(15, 12 - np.log(max(lat_range, lon_range, 0.001))))
    
    # Update layout
    fig.update_layout(
        title=dict(
            text=f'Ship Trajectory Anomaly Detection - Trip {trip_id} {title_suffix}',
            x=0.5,
            font=dict(size=16, color='#2c3e50')
        ),
        mapbox=dict(
            style="open-street-map",
            center=dict(lat=center_lat, lon=center_lon),
            zoom=zoom
        ),
        margin=dict(r=0, t=50, l=0, b=0),
        height=600,
        showlegend=True,
        legend=dict(
            yanchor="top",
            y=0.99,
            xanchor="left",
            x=0.01,
            bgcolor="rgba(255, 255, 255, 0.8)",
            bordercolor="rgba(0, 0, 0, 0.2)",
            borderwidth=1
        )
    )
    
    return fig


In [77]:
trip_id =  anomaly_K['trip_id'].unique()[0]
fig = create_interactive_anomaly_visualization(anomaly_K, trip_id)

from pathlib import Path
output_dir = Path("data")
output_dir.mkdir(exist_ok=True)


fig.write_html(f"data/test_trip_{trip_id}_anomalies.html", include_plotlyjs='cdn')


*scattermapbox* is deprecated! Use *scattermap* instead. Learn more at: https://plotly.com/python/mapbox-to-maplibre/


*scattermapbox* is deprecated! Use *scattermap* instead. Learn more at: https://plotly.com/python/mapbox-to-maplibre/


*scattermapbox* is deprecated! Use *scattermap* instead. Learn more at: https://plotly.com/python/mapbox-to-maplibre/


*scattermapbox* is deprecated! Use *scattermap* instead. Learn more at: https://plotly.com/python/mapbox-to-maplibre/


*scattermapbox* is deprecated! Use *scattermap* instead. Learn more at: https://plotly.com/python/mapbox-to-maplibre/

