In [None]:
import random

import joblib
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from dtaidistance import dtw
from dtaidistance import dtw_ndim
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import MinMaxScaler
from torch.utils.data import TensorDataset, DataLoader

# Import the Douglas-Peucker function from the simplification library
from simplification.cutil import simplify_coords

### Loading the data for one type of trip

In [None]:
# Read the trip measurements and parse the timestamp column into datetimes.
df = pd.read_parquet("../data/fix_noise.parquet").assign(
    time_stamp=lambda frame: pd.to_datetime(frame["time_stamp"])
)

# Order chronologically (ties broken by trip id), then keep a single departure port.
df = df.sort_values(by=["time_stamp", "trip_id"])
df = df.loc[df["start_port"] == "BREMERHAVEN"]

# Number of measurements available per trip.
df.value_counts("trip_id")

# Clustering with DBSCAN and DTW

We compress the trajectories to reduce the number of points per trajectory.
Because we later use DTW to compute the distance between trajectories, this speeds up the computation.

In [None]:
# Split the measurements into one DataFrame per trip.
trajectories = dict(iter(df.groupby('trip_id')))
print(f"Found {len(trajectories)} unique trajectories.")

# Thin every trajectory with Douglas-Peucker (the 'simplification' library).
print("\nCompressing trajectories...")
compressed_trajectories = {
    trip_id: simplify_coords(
        traj_df[['latitude', 'longitude']].values,
        epsilon=0.000001,  # sensitivity parameter
    )
    for trip_id, traj_df in trajectories.items()
}
print("Trajectory compression complete.")

# Persist the compressed trajectories so they can be reloaded without recomputing.
joblib.dump(compressed_trajectories, "data/compressed_trajectories.pkl")

In [None]:
# Reload the compressed trajectories written by the compression cell.
# joblib files are pickle-based, so only load files you produced yourself.
compressed_trajectories = joblib.load("data/compressed_trajectories.pkl")

In [None]:

# --- 2. Clustering with Standard DTW ---
print("\n--- Part 2: Clustering with Standard DTW ---")
# Every trajectory is a sequence of 2-D (latitude, longitude) points. Convert each one
# to a contiguous float64 array, which is what the C implementation operates on.
trajectories_list = [
    np.ascontiguousarray(points, dtype=np.double)
    for points in compressed_trajectories.values()
]

# dtw.distance_matrix is the API for 1-D series; multi-dimensional sequences such as
# (lat, lon) tracks must go through dtw_ndim so that whole points are compared.
print("Calculating pairwise distances using standard DTW...")
distance_matrix = dtw_ndim.distance_matrix(trajectories_list, use_c=True, show_progress=True)
print("Distance matrix calculation complete.")

# Persist the matrix so the expensive pairwise computation can be skipped on re-runs.
joblib.dump(distance_matrix, "data/distance_matrix.pkl")

In [None]:
# Reload the pairwise DTW distance matrix written by the previous cell.
# joblib files are pickle-based, so only load files you produced yourself.
distance_matrix = joblib.load("data/distance_matrix.pkl")

In [None]:
# Density-based clustering on the precomputed pairwise distance matrix.
# NOTE: 'eps' is highly sensitive and will most likely need tuning for your data.

print("Running DBSCAN clustering...")
dbscan = DBSCAN(metric='precomputed', eps=5.0, min_samples=5)
dbscan.fit(distance_matrix)
cluster_labels = dbscan.labels_

In [None]:

# Identify the "normal" cluster: the largest one, ignoring DBSCAN noise (label -1).
trip_ids = list(compressed_trajectories.keys())
if len(trip_ids) != len(cluster_labels):
    # zip() below would silently truncate and pair trips with the wrong labels.
    raise ValueError(
        f"Got {len(cluster_labels)} cluster labels for {len(trip_ids)} trajectories; "
        "the distance matrix and the compressed trajectories are out of sync."
    )

clustered_labels = cluster_labels[cluster_labels != -1]
if clustered_labels.size > 0:
    # np.unique returns sorted labels and argmax takes the first maximum,
    # so ties resolve to the smallest label id.
    label_values, label_counts = np.unique(clustered_labels, return_counts=True)
    normal_cluster_id = label_values[np.argmax(label_counts)]
    normal_trip_ids = [trip_id for trip_id, label in zip(trip_ids, cluster_labels) if label == normal_cluster_id]
    print(f"\nIdentified normal traffic pattern as Cluster ID: {normal_cluster_id}")
else:
    # Define the id even when nothing clustered, so cells that compare labels
    # against it do not fail with a NameError.
    normal_cluster_id = None
    normal_trip_ids = []
    print("\nNo significant clusters found. Cannot proceed.")

### Plotting the clusters

In [None]:
from utils import visualize_trajectory_clusters_interactive_fancy

In [None]:
# Render every trajectory coloured by its cluster label and export a standalone HTML page.
clusters_html_path = "data/trajectories_clusters.html"
fig = visualize_trajectory_clusters_interactive_fancy(
    compressed_trajectories,
    trip_ids,
    cluster_labels,
)
fig.write_html(clusters_html_path, include_plotlyjs='cdn')

# Anomaly Detection with PyTorch BiGRU

Here we are going to use a BiGRU model to detect anomalies in the trajectories.
It will flag the individual measurements that do not fit the normal pattern.

In [None]:
def create_sequences(data, seq_length):
    """
    Cut a time series into overlapping input windows and their next-step targets.

    Window ``i`` is ``data[i : i + seq_length]`` and its target is
    ``data[i + seq_length]``, so ``len(data) - seq_length`` pairs are produced.

    Args:
        data (np.array): The input time-series data, with time along axis 0.
        seq_length (int): The length of each input sequence (must be >= 0).

    Returns:
        tuple: A tuple containing two numpy arrays:
               - xs (np.array): Input sequences, shape ``(n, seq_length, *data.shape[1:])``.
               - ys (np.array): Target values, shape ``(n, *data.shape[1:])``.
               When ``data`` is too short to yield a single pair, ``n`` is 0 but
               the trailing dimensions are kept, so downstream code still sees
               arrays of the expected rank.

    Raises:
        ValueError: If ``seq_length`` is negative.
    """
    if seq_length < 0:
        raise ValueError(f"seq_length must be non-negative, got {seq_length}")

    data = np.asarray(data)
    n_windows = max(len(data) - seq_length, 0)

    # Row i of the index matrix is [i, i + 1, ..., i + seq_length - 1]; fancy indexing
    # gathers all windows at once and keeps the right shape even when n_windows is 0.
    window_index = np.arange(n_windows)[:, None] + np.arange(seq_length)[None, :]
    xs = data[window_index]
    # Copy so the targets never alias the caller's array.
    ys = data[seq_length:seq_length + n_windows].copy()
    return xs, ys


In [None]:
class BiGRUModel(nn.Module):
    """Bidirectional GRU followed by a linear head applied to the last time step.

    Args:
        input_size (int): Number of features per time step.
        hidden_size (int): Hidden units per GRU direction.
        num_layers (int): Number of stacked GRU layers.
        output_size (int): Number of values produced per input sequence.
    """

    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super().__init__()
        # Bidirectional GRU layer; batch_first=True means inputs are (batch, seq, feature).
        self.gru = nn.GRU(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True)
        # The two directions are concatenated, hence hidden_size * 2 inputs to the head.
        self.fc = nn.Linear(hidden_size * 2, output_size)

    def forward(self, x):
        """Map a batch of sequences ``(batch, seq, input_size)`` to ``(batch, output_size)``."""
        out, _ = self.gru(x)
        # NOTE(review): at the last index the backward direction has only processed the
        # final time step; only the forward half summarises the whole window.
        return self.fc(out[:, -1, :])

    def load_model(self, path):
        """Load weights from ``path`` into this model and switch it to eval mode.

        Failures are reported by printing rather than raising, as before, but the
        outcome is now also returned so callers can avoid using an unloaded model.

        Args:
            path: Location of a state dict saved with ``torch.save(model.state_dict(), path)``.

        Returns:
            bool: True if the weights were loaded, False otherwise.
        """
        try:
            # Load onto the CPU first so checkpoints written on a GPU machine still load on
            # CPU-only hosts; load_state_dict then copies into this model's own parameters.
            state_dict = torch.load(path, map_location="cpu")
            self.load_state_dict(state_dict)
        except FileNotFoundError:
            print(f"Error: Model file not found at {path}")
            return False
        except Exception as e:
            print(f"An error occurred while loading the model: {e}")
            return False

        self.eval()  # Set the model to evaluation mode after loading
        print(f"Model loaded successfully from {path}")
        return True

In [None]:
SEED = 42        # Fixed seed so weight initialisation and batch shuffling are repeatable
NUM_EPOCHS = 10  # Using 10 epochs for demonstration
features = ['latitude', 'longitude', 'speed_over_ground', 'course_over_ground']
SEQ_LENGTH = 20  # Number of consecutive measurements fed to the model per sample
BATCH_SIZE = 32  # Batch size for DataLoader

# Seed PyTorch before the model is built: both the initial weights and the DataLoader's
# shuffling draw from this generator.
torch.manual_seed(SEED)

# The model predicts one value per input feature, so input and output sizes match.
model = BiGRUModel(input_size=len(features), hidden_size=128, num_layers=1, output_size=len(features))
scaler = MinMaxScaler()


In [None]:
if normal_trip_ids:
    print("Preparing training data from normal cluster...")
    # Keep each trip's rows contiguous and chronological. Sorting by time_stamp alone
    # interleaves measurements of trips that were under way at the same time, which
    # would turn consecutive rows into jumps between different vessels' tracks.
    normal_df = df[df['trip_id'].isin(normal_trip_ids)].sort_values(by=['trip_id', 'time_stamp'])
    training_data = normal_df[features].values

    # Scale features to a range between 0 and 1
    training_data_scaled = scaler.fit_transform(training_data)


In [None]:

if normal_trip_ids:
    # Build the windows trip by trip so that no window (or its target) mixes
    # measurements that belong to two different trips.
    window_inputs, window_targets = [], []
    for _, trip_df in normal_df.groupby('trip_id', sort=False):
        if len(trip_df) <= SEQ_LENGTH:
            continue  # too short to yield even one window plus its target
        trip_scaled = scaler.transform(trip_df.sort_values(by='time_stamp')[features].values)
        trip_inputs, trip_targets = create_sequences(trip_scaled, SEQ_LENGTH)
        window_inputs.append(trip_inputs)
        window_targets.append(trip_targets)

    if not window_inputs:
        raise ValueError(
            f"No normal trip has more than {SEQ_LENGTH} measurements; cannot build training windows."
        )
    X_train = np.concatenate(window_inputs)
    y_train = np.concatenate(window_targets)

    # Convert numpy arrays to PyTorch tensors
    X_train_tensor = torch.from_numpy(X_train).float()
    y_train_tensor = torch.from_numpy(y_train).float()

    # Create DataLoader for batching and shuffling
    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

    print("Building and training the BiGRU model...")
    loss_function = nn.MSELoss()  # Mean Squared Error Loss
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)  # Adam optimizer

    # Make sure the model is in training mode even if an earlier cell called eval().
    model.train()
    for epoch in range(NUM_EPOCHS):
        epoch_loss_sum = 0.0
        for sequences, labels in train_loader:
            outputs = model(sequences)
            loss = loss_function(outputs, labels)

            optimizer.zero_grad()  # Clear gradients
            loss.backward()        # Backpropagation
            optimizer.step()       # Update weights

            # Weight by batch size so the smaller final batch does not skew the mean.
            epoch_loss_sum += loss.item() * sequences.size(0)
        # Report the mean loss over the epoch rather than the last batch's loss.
        print(f"Epoch {epoch+1}/{NUM_EPOCHS}, Loss: {epoch_loss_sum / len(train_dataset):.4f}")
    print("Model training complete.")

In [None]:
print("\nDetecting anomalies in a test trajectory...")
# Select a test trip ID, prioritizing a non-normal one if available
test_trip_id = next((tid for tid, label in zip(trip_ids, cluster_labels) if label != normal_cluster_id), trip_ids[-1])
test_df = trajectories[test_trip_id]
# astype() returns a fresh float array, so the injected anomaly below can never
# write through to the underlying DataFrame.
test_data = test_df[features].values.astype(float)

if len(test_data) <= SEQ_LENGTH:
    # Fail with a clear message instead of a shape error deep inside the model.
    raise ValueError(
        f"Trip {test_trip_id} has only {len(test_data)} measurements; "
        f"more than {SEQ_LENGTH} are required to build a test window."
    )

# Create a fake anomaly for demonstration purposes: sudden increase in Speed Over Ground (SOG)
test_data[20:25, 2] = 50

# Scale the test data using the same scaler fitted on training data
test_data_scaled = scaler.transform(test_data)
X_test, y_test_actual = create_sequences(test_data_scaled, SEQ_LENGTH)
X_test_tensor = torch.from_numpy(X_test).float()

# Set model to evaluation mode
model.eval()
with torch.no_grad():  # Disable gradient calculation during inference
    y_test_pred_scaled_tensor = model(X_test_tensor)

# Inverse transform the scaled predictions and actual values to their original scale
y_test_pred_real = scaler.inverse_transform(y_test_pred_scaled_tensor.numpy())
y_test_actual_real = scaler.inverse_transform(y_test_actual)

# Calculate deviations
# Position deviation in meters. One degree of latitude is ~111 km everywhere, but one
# degree of longitude shrinks with cos(latitude), so the east-west error is scaled by it.
METERS_PER_DEGREE = 111000
lat_error_m = (y_test_actual_real[:, 0] - y_test_pred_real[:, 0]) * METERS_PER_DEGREE
lon_error_m = (
    (y_test_actual_real[:, 1] - y_test_pred_real[:, 1])
    * METERS_PER_DEGREE
    * np.cos(np.radians(y_test_actual_real[:, 0]))
)
position_deviation = np.hypot(lat_error_m, lon_error_m)
# Speed Over Ground (SOG) deviation in knots
sog_deviation = np.abs(y_test_actual_real[:, 2] - y_test_pred_real[:, 2])

# Define anomaly thresholds
POS_THRESHOLD_M = 5000  # 5000 meters
SOG_THRESHOLD_KN = 5    # 5 knots

print(f"\n--- Anomaly Report for TripID {test_trip_id} ---")
# Report anomalies based on thresholds; prediction i corresponds to row i + SEQ_LENGTH.
for i in range(len(position_deviation)):
    if position_deviation[i] > POS_THRESHOLD_M or sog_deviation[i] > SOG_THRESHOLD_KN:
        print(f"Time Step {i+SEQ_LENGTH}: **ANOMALY DETECTED** "
              f"(Pos Dev: {position_deviation[i]:.0f}m, SOG Dev: {sog_deviation[i]:.1f}kn)")
    else:
        print(f"Time Step {i+SEQ_LENGTH}: Normal")

In [None]:
# Persist only the model's state dict (its parameters and buffers), not the model object.
torch.save(model.state_dict(), "data/bigru_model.pth")

In [None]:
# Build a fresh network with the same hyperparameters, restore the saved weights into it,
# and make it the active model for the cells below.
model_kwargs = dict(
    input_size=len(features),
    hidden_size=128,
    num_layers=1,
    output_size=len(features),
)
loaded_model = BiGRUModel(**model_kwargs)
loaded_model.load_model("data/bigru_model.pth")
model = loaded_model

## For testing if model works correctly

In [None]:
from utils import visualize_trip_interactive

In [16]:
# Thresholds used for this check (stricter than the demonstration report above).
STRICT_POS_THRESHOLD_M = 3000   # meters
STRICT_SOG_THRESHOLD_KN = 1     # knots
METERS_PER_DEGREE = 111000      # approx. length of one degree of latitude

test_trip_pool = [tid for tid, label in zip(trip_ids, cluster_labels) if label != normal_cluster_id]
if not test_trip_pool:
    print("Warning: No trips outside the normal cluster were found. Testing on a 'normal' trip for demonstration.")
    test_trip_pool = normal_trip_ids

# Only trips with more than SEQ_LENGTH measurements can produce a window plus a target;
# picking a shorter one would fail inside the model with an unhelpful shape error.
trip_lengths = df['trip_id'].value_counts()
test_trip_pool = [tid for tid in test_trip_pool if trip_lengths.get(tid, 0) > SEQ_LENGTH]
if not test_trip_pool:
    raise ValueError(f"No candidate trip has more than {SEQ_LENGTH} measurements to test on.")

test_trip_id = random.choice(test_trip_pool)
print(f"Randomly selected Trip ID {test_trip_id} for testing.")
test_df = df[df['trip_id'] == test_trip_id].sort_values(by='time_stamp')

# Step 4: scale, window and predict
test_data = test_df[features].values
test_data_scaled = scaler.transform(test_data)  # Use the SAME scaler from training
X_test, y_test_actual = create_sequences(test_data_scaled, SEQ_LENGTH)
X_test_tensor = torch.from_numpy(X_test).float()

model.eval()
with torch.no_grad():
    y_test_pred_scaled_tensor = model(X_test_tensor)

y_test_pred_real = scaler.inverse_transform(y_test_pred_scaled_tensor.numpy())
y_test_actual_real = scaler.inverse_transform(y_test_actual)


# Step 4d: Detect Anomalies based on deviation
# A degree of longitude shrinks with cos(latitude), so the east-west error is scaled
# accordingly before combining it with the north-south error.
lat_error_m = (y_test_actual_real[:, 0] - y_test_pred_real[:, 0]) * METERS_PER_DEGREE
lon_error_m = (
    (y_test_actual_real[:, 1] - y_test_pred_real[:, 1])
    * METERS_PER_DEGREE
    * np.cos(np.radians(y_test_actual_real[:, 0]))
)
position_deviation = np.hypot(lat_error_m, lon_error_m)
sog_deviation = np.abs(y_test_actual_real[:, 2] - y_test_pred_real[:, 2])

# Prediction i corresponds to row i + SEQ_LENGTH of the trip, hence the offset.
anomaly_mask = (position_deviation > STRICT_POS_THRESHOLD_M) | (sog_deviation > STRICT_SOG_THRESHOLD_KN)
anomaly_indices = np.where(anomaly_mask)[0] + SEQ_LENGTH

Randomly selected Trip ID 1130240 for testing.


In [17]:
# Draw the selected trip on an interactive map and export it as a standalone HTML page.
trip_map_path = f"data/test_trip{test_trip_id}.html"
fig = visualize_trip_interactive(test_df, test_trip_id)
fig.write_html(trip_map_path, include_plotlyjs='cdn')


--- Visualizing Trip ID: 1130240 (Interactive Map) ---
Return interactive map for Trip ID 1130240.
