In [34]:
# Cell: Imports and Setup for BiGRU
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.preprocessing import MinMaxScaler
from torch.utils.data import TensorDataset, DataLoader
from tqdm.notebook import tqdm # For progress bars in notebook

# Assuming 'utils.py' contains visualize_trip_interactive for plotting
# If not available, you would need to implement or remove the visualization calls.
from utils import visualize_trip_interactive


In [35]:
# Load the trajectory parquet file, parse `time_stamp` into datetimes, and
# order the rows chronologically (ties broken by `trip_id`).
df = (
    pd.read_parquet('../data/cleaned/bremerhaven_anomalies_partlabeled.parquet')
    .assign(time_stamp=lambda frame: pd.to_datetime(frame["time_stamp"]))
    .sort_values(by=["time_stamp", "trip_id"])
)

In [36]:
# Keep only rows that carry a label, then flag every trip that contains at
# least one anomalous point.
# NOTE(review): the displayed `is_anomaly` column has dtype `object`; confirm the
# values are real booleans (not strings), otherwise `.any()` would be truthy
# for every trip.
df = df.dropna(subset=['is_anomaly'])
trips_with_anomalies = df['is_anomaly'].groupby(df['trip_id']).any()

# Split the trip ids into the two groups via the per-trip boolean flag.
anomaly_trip_ids = trips_with_anomalies.loc[trips_with_anomalies].index
normal_trip_ids = trips_with_anomalies.loc[~trips_with_anomalies].index

In [40]:
# Inspect the `is_anomaly` label column (values, length and dtype).
df['is_anomaly']

1222     False
1320     False
1321     False
1322     False
1324     False
         ...  
25452    False
25451    False
25454    False
25455    False
25456    False
Name: is_anomaly, Length: 26368, dtype: object

In [32]:
    # Split dataframes
# Rows of trips with at least one flagged point vs. rows of trips with none.
df_anomaly_trips = df.loc[df['trip_id'].isin(anomaly_trip_ids)]
df_normal_trips = df.loc[df['trip_id'].isin(normal_trip_ids)]

# Summarise trip and point counts for both partitions.
print("\n".join([
    f"Trips with anomalies: {len(anomaly_trip_ids)}",
    f"Trips without anomalies: {len(normal_trip_ids)}",
    f"Total points in anomaly trips: {len(df_anomaly_trips)}",
    f"Total points in normal trips: {len(df_normal_trips)}",
]))

Trips with anomalies: 54
Trips without anomalies: 0
Total points in anomaly trips: 26368
Total points in normal trips: 0


In [33]:
# Cell: Helper function to create sequences
def create_sequences(data, seq_length):
    """
    Build sliding-window samples for next-step prediction.

    Every window of `seq_length` consecutive rows becomes one input sample and
    the row immediately after the window becomes its target.

    Args:
        data (np.array): Time-ordered observations, one row per time step.
        seq_length (int): Number of consecutive rows in each input window.

    Returns:
        tuple: `(xs, ys)` as numpy arrays, where `xs[k]` is `data[k : k + seq_length]`
               and `ys[k]` is `data[k + seq_length]`. Both arrays are empty when
               `len(data) <= seq_length`.
    """
    window_starts = range(len(data) - seq_length)
    inputs = [data[start : start + seq_length] for start in window_starts]
    targets = [data[start + seq_length] for start in window_starts]
    return np.array(inputs), np.array(targets)


In [20]:

# Cell: BiGRU Model Definition
class BiGRUModel(nn.Module):
    """
    Bidirectional GRU (BiGRU) model for time series prediction.
    Predicts the next step in a sequence.

    Args:
        input_size (int): Number of features per time step.
        hidden_size (int): Number of hidden units per GRU direction.
        num_layers (int): Number of stacked GRU layers.
        output_size (int): Number of values predicted for each input sequence.
    """
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super().__init__()
        # batch_first=True: inputs/outputs are laid out as (batch, seq_len, features).
        self.gru = nn.GRU(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True)
        # hidden_size * 2 because the forward and backward outputs are concatenated.
        self.fc = nn.Linear(hidden_size * 2, output_size)

    def forward(self, x):
        """
        Predict the next step for a batch of sequences.

        Args:
            x (torch.Tensor): Input of shape (batch, seq_len, input_size).

        Returns:
            torch.Tensor: Predictions of shape (batch, output_size).
        """
        # out holds the last GRU layer's output for every time step; the final
        # hidden state (second return value) is not used.
        out, _ = self.gru(x)
        # NOTE(review): at the last time step the backward direction has only
        # processed one element of the sequence. Kept as-is so existing
        # checkpoints keep producing the same predictions.
        return self.fc(out[:, -1, :])

    def load_model(self, path):
        """
        Loads the saved state dictionary into the model and switches to eval mode.

        Failures are reported on stdout instead of being raised, so that a
        notebook run is not interrupted; callers should check the return value
        before relying on the weights.

        Args:
            path (str): The file path to the saved model state dictionary.

        Returns:
            bool: True if the weights were loaded, False otherwise.
        """
        try:
            # Map tensors onto the device this model currently lives on, so a
            # checkpoint written on another device can still be loaded.
            device = next(self.parameters()).device
            # SECURITY: torch.load unpickles the file; only load trusted checkpoints.
            state_dict = torch.load(path, map_location=device)
            self.load_state_dict(state_dict)
        except FileNotFoundError:
            print(f"Error: Model file not found at {path}")
            return False
        except Exception as e:
            print(f"An error occurred while loading the model: {e}")
            return False

        self.eval()  # Set the model to evaluation mode after loading
        print(f"Model loaded successfully from {path}")
        return True

In [21]:


# Cell: Model Initialization and Hyperparameters
SEED = 42        # Random seed for reproducible weight initialisation and shuffling
NUM_EPOCHS = 10  # Number of training epochs
features = ['latitude', 'longitude', 'speed_over_ground', 'course_over_ground'] # Features used for training
SEQ_LENGTH = 20  # Length of each input sequence for the BiGRU model
BATCH_SIZE = 32  # Batch size for DataLoader during training
HIDDEN_SIZE = 128  # Hidden units per GRU direction
NUM_LAYERS = 1     # Number of stacked GRU layers

# Seed the RNGs before the model is built so that a Restart & Run All gives
# the same initial weights and the same batch order.
np.random.seed(SEED)
torch.manual_seed(SEED)

# Initialize the BiGRU model
# input_size: number of features for each time step
# hidden_size: number of features in the hidden state
# num_layers: number of recurrent layers
# output_size: number of features to predict (same as input_size for next-step prediction)
model = BiGRUModel(input_size=len(features), hidden_size=HIDDEN_SIZE, num_layers=NUM_LAYERS, output_size=len(features))

# Initialize the MinMaxScaler for feature scaling
scaler = MinMaxScaler()


In [None]:

# Cell: Prepare Training Data
# Builds `normal_df` (rows used for training) and `training_data_scaled`
# (the scaled feature matrix, or None when no data is available).
# `normal_df` must contain 'time_stamp', 'trip_id' and the `features` columns.
try:
    # `normal_trip_ids` is a pandas Index: its truth value is ambiguous and
    # raises ValueError, so emptiness must be tested with len().
    if 'df' in locals() and 'normal_trip_ids' in locals() and len(normal_trip_ids) > 0:
        print("Using normal trips identified from previous clustering for training.")
        # Sort by trip first so each trip stays contiguous; sorting by time alone
        # would interleave points of different trips inside one sliding window.
        normal_df = df[df['trip_id'].isin(normal_trip_ids)].sort_values(by=['trip_id', 'time_stamp'])
    else:
        print("Assuming 'normal_df' needs to be loaded or defined from a dedicated normal dataset.")
        # Placeholder for loading your pre-filtered normal trajectory data
        # IMPORTANT: Replace this with your actual data loading for normal trajectories
        # Example: normal_df = pd.read_parquet("path/to/your/normal_trajectories.parquet")
        # Ensure it has 'latitude', 'longitude', 'speed_over_ground', 'course_over_ground'
        # and 'time_stamp' columns.

        # Demonstration fallback: use the first trip of the main DataFrame.
        # NOTE(review): this trip is not guaranteed to be anomaly-free.
        if 'df' in locals() and not df.empty:
            print("Using a subset of the main DataFrame as 'normal_df' for demonstration.")
            normal_df = df[df['trip_id'] == df['trip_id'].iloc[0]].sort_values(by='time_stamp').copy()
        else:
            print("Cannot create dummy normal_df. Please load your normal trajectory dataset.")
            normal_df = pd.DataFrame() # Empty DataFrame to prevent errors


    if not normal_df.empty:
        # Extract features and scale them; the fitted scaler is reused at inference time.
        training_data = normal_df[features].values
        print(f"Original training data shape: {training_data.shape}")
        training_data_scaled = scaler.fit_transform(training_data)
        print(f"Scaled training data shape: {training_data_scaled.shape}")
    else:
        print("Normal trajectory data is empty. Cannot prepare training data.")
        training_data_scaled = None # Set to None to indicate no data
except NameError:
    print("Dependencies (e.g., 'df' or 'normal_trip_ids') not found. Please ensure your normal data is loaded into 'normal_df'.")
    training_data_scaled = None


In [None]:

# Cell: Train the BiGRU Model
# Trains `model` for next-step prediction on sliding windows of the scaled
# training data. Skipped when there are not enough rows to form one window.
if training_data_scaled is not None and len(training_data_scaled) > SEQ_LENGTH:
    X_train, y_train = create_sequences(training_data_scaled, SEQ_LENGTH)
    print(f"Created {len(X_train)} sequences for training.")

    # Convert numpy arrays to PyTorch tensors
    X_train_tensor = torch.from_numpy(X_train).float()
    y_train_tensor = torch.from_numpy(y_train).float()

    # Create DataLoader for batching and shuffling
    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

    print("Building and training the BiGRU model...")
    # Define loss function (Mean Squared Error for regression) and optimizer (Adam)
    loss_function = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    # Ensure training mode: the model may have been left in eval mode by a
    # previous load or inference cell.
    model.train()
    for epoch in tqdm(range(NUM_EPOCHS), desc="Training BiGRU"):
        epoch_loss_sum = 0.0
        for sequences, labels in train_loader:
            outputs = model(sequences)
            loss = loss_function(outputs, labels)

            optimizer.zero_grad() # Clear gradients before backward pass
            loss.backward()       # Perform backpropagation
            optimizer.step()      # Update model weights

            # Weight by batch size so a smaller final batch does not skew the mean.
            epoch_loss_sum += loss.item() * sequences.size(0)
        # Report the mean loss over the whole epoch rather than the last batch only.
        epoch_loss = epoch_loss_sum / len(train_dataset)
        print(f"Epoch {epoch+1}/{NUM_EPOCHS}, Loss: {epoch_loss:.4f}")
    print("Model training complete.")
else:
    print("Model training skipped: Not enough normal data or data not prepared correctly.")


In [None]:

# Cell: Save the trained model
# Writes the model's state dict to disk. The parent directory is created first
# because torch.save fails when it does not exist.
# NOTE(review): this only checks that `model` exists, not that it was trained.
if 'model' in locals() and model is not None:
    Path("data/bigru_model.pth").parent.mkdir(parents=True, exist_ok=True)
    torch.save(model.state_dict(), "data/bigru_model.pth")
    print("Model saved to data/bigru_model.pth")
else:
    print("Model not trained or not found. Skipping save.")

In [None]:
# Cell: Load the trained model
# Restores previously saved weights into a fresh BiGRUModel and binds it to
# `model`. If no checkpoint exists, a model already in memory is kept instead
# of being replaced by an untrained one.
saved_model_path = "data/bigru_model.pth"
loaded_model = BiGRUModel(input_size=len(features), hidden_size=128, num_layers=1, output_size=len(features))
if Path(saved_model_path).is_file():
    # NOTE(review): load_model reports failures by printing; a corrupt
    # checkpoint would still leave `loaded_model` with random weights.
    loaded_model.load_model(saved_model_path)
    # Assign the loaded model back to 'model' so later cells use it
    model = loaded_model
elif 'model' in locals() and model is not None:
    print(f"No saved model found at {saved_model_path}; keeping the model already in memory.")
else:
    print(f"Warning: no saved model found at {saved_model_path} and no model in memory; "
          "falling back to an UNTRAINED model.")
    model = loaded_model


# Cell: Anomaly Detection for a Test Trajectory
# Scores a test trajectory by comparing the model's next-step predictions with
# the observed values; steps whose position or speed deviation exceeds a
# threshold are reported as anomalies.
print("\n--- Performing Anomaly Detection on a Test Trajectory ---")

# For this section, you would typically load a *new* test trajectory
# which might or might not contain anomalies.
# For demonstration, we'll use a portion of the 'normal_df' or a dummy test trip.
# IMPORTANT: Replace this with loading your actual test trajectory data.
# Example: test_df = pd.read_parquet("path/to/your/test_trajectory_with_anomalies.parquet")
# Ensure it has 'time_stamp' and the 'features' columns.

if 'normal_df' in locals() and not normal_df.empty:
    # Using a portion of the normal data as a 'test trip' for demonstration
    # You would replace this with loading your actual test data.
    test_df = normal_df.head(100).copy() # Take first 100 points as a test trip
    test_trip_id = "demonstration_normal_trip" # Assign a dummy ID

    # Explicit writable copy: `.values` can be a read-only view (e.g. with pandas
    # Copy-on-Write), which would make the injection below fail.
    test_data = test_df[features].to_numpy(copy=True)

    # Optional: Introduce a *fake* anomaly for demonstration purposes
    # This simulates a sudden change in Speed Over Ground (SOG), column 2 of `features`.
    # NOTE(review): only `test_data` is modified; `test_df`, which is passed to the
    # visualisation below, still holds the original speeds.
    if len(test_data) > 25:
        print("Injecting a fake anomaly (sudden SOG increase) for demonstration.")
        test_data[20:25, 2] = 50 # SOG value (knots)
    else:
        print("Test data too short to inject fake anomaly.")

    # Scale the test data using the *same scaler* fitted on training data
    test_data_scaled = scaler.transform(test_data)
    X_test, y_test_actual = create_sequences(test_data_scaled, SEQ_LENGTH)

    if len(X_test) > 0:
        X_test_tensor = torch.from_numpy(X_test).float()

        # Set model to evaluation mode
        model.eval()
        with torch.no_grad(): # Disable gradient calculation during inference
            y_test_pred_scaled_tensor = model(X_test_tensor)

        # Inverse transform the scaled predictions and actual values to their original scale
        y_test_pred_real = scaler.inverse_transform(y_test_pred_scaled_tensor.numpy())
        y_test_actual_real = scaler.inverse_transform(y_test_actual)

        # Position deviation in meters. One degree of latitude is ~111 km, but a
        # degree of longitude shrinks with cos(latitude), so the east-west
        # component is scaled accordingly (columns 0/1 are latitude/longitude).
        METERS_PER_DEGREE = 111000
        lat_error_m = (y_test_actual_real[:, 0] - y_test_pred_real[:, 0]) * METERS_PER_DEGREE
        lon_error_m = (
            (y_test_actual_real[:, 1] - y_test_pred_real[:, 1])
            * METERS_PER_DEGREE
            * np.cos(np.radians(y_test_actual_real[:, 0]))
        )
        position_deviation = np.hypot(lat_error_m, lon_error_m)
        # Speed Over Ground (SOG) deviation in knots
        sog_deviation = np.abs(y_test_actual_real[:, 2] - y_test_pred_real[:, 2])

        # Define anomaly thresholds
        POS_THRESHOLD_M = 5000 # 5000 meters deviation
        SOG_THRESHOLD_KN = 5   # 5 knots deviation

        print(f"\n--- Anomaly Report for TripID {test_trip_id} ---")
        # Report anomalies based on defined thresholds
        anomaly_indices = []
        for i in range(len(position_deviation)):
            if position_deviation[i] > POS_THRESHOLD_M or sog_deviation[i] > SOG_THRESHOLD_KN:
                # Prediction i targets row i + SEQ_LENGTH of the test trip (positional offset).
                anomaly_indices.append(i + SEQ_LENGTH)
                print(f"Time Step {i+SEQ_LENGTH}: **ANOMALY DETECTED** "
                      f"(Pos Dev: {position_deviation[i]:.0f}m, SOG Dev: {sog_deviation[i]:.1f}kn)")
            else:
                print(f"Time Step {i+SEQ_LENGTH}: Normal")
        print(f"Total anomalies detected: {len(anomaly_indices)}")

        # Visualize the test trip with detected anomalies
        # NOTE(review): assumes visualize_trip_interactive treats anomaly_indices as
        # row positions within test_df - confirm against utils.py.
        fig = visualize_trip_interactive(test_df, test_trip_id, anomaly_indices=anomaly_indices)
        fig.write_html(f"data/test_trip_{test_trip_id}_bigru_anomalies.html", include_plotlyjs='cdn')
        print(f"Interactive trip visualization with anomalies saved to data/test_trip_{test_trip_id}_bigru_anomalies.html")
    else:
        print("Not enough test data to create sequences for anomaly detection.")

else:
    print("Test trajectory data (or normal_df) is empty. Skipping anomaly detection demonstration.")

