<a href="https://colab.research.google.com/github/adadoun/KplerNextDestination/blob/main/NNDestinationProbabilityPrediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import Libraries

In [1]:
!pip install category_encoders

Collecting category_encoders
  Downloading category_encoders-2.6.3-py2.py3-none-any.whl.metadata (8.0 kB)
Downloading category_encoders-2.6.3-py2.py3-none-any.whl (81 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.9/81.9 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: category_encoders
Successfully installed category_encoders-2.6.3


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import pandas as pd
import numpy as np
from collections import defaultdict
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, f1_score
from category_encoders import TargetEncoder
import lightgbm as lgb
import plotly.graph_objects as go
from plotly.subplots import make_subplots
# PyTorch imports
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from typing import List, Tuple, Dict, Any
import warnings
warnings.filterwarnings('ignore')

## Load train/test data

In [5]:
# Load Data
train_df = pd.read_csv('drive/MyDrive/Collab_DATA/KplerData/train_trades_csv_prepared.csv')
test_df = pd.read_csv('drive/MyDrive/Collab_DATA/KplerData/test_trades_csv_prepared.csv')

## Define Utilities functions

 1. torch.nn.Module: This is the base class for all neural network modules in PyTorch.

 2. torch.utils.data.Dataset: This is an abstract class representing a dataset.
    Custom datasets should subclass this and implement __len__ and __getitem__.

 3. torch.nn.Embedding: This layer is used to learn dense vector representations of discrete variables.

 4. torch.nn.Sequential: This is a container that allows for easy sequential composition of layers.

 5. model.train() and model.eval(): These methods set the mode of the model, which affects
    the behavior of certain layers (like Dropout and BatchNorm) during training and inference.

 6. optimizer.zero_grad(): This resets the gradients of all parameters to zero before the backward pass.

 7. loss.backward(): This computes the gradient of the loss with respect to the parameters.

 8. optimizer.step(): This updates the parameters based on the computed gradients.

 9. torch.no_grad(): This context manager disables gradient calculation, which reduces memory usage
    and speeds up computations when you don't need gradients (like during validation/testing).


In [6]:
# Preprocessing function
def preprocess_data(df: pd.DataFrame, is_train: bool = True, cat_encodings: dict = None, scaler: StandardScaler = None) -> tuple:
    """
    Preprocess the data by encoding categorical variables and scaling numerical variables.

    Args:
        df (pd.DataFrame): Input dataframe
        is_train (bool): Flag to indicate if this is training data
        cat_encodings (dict): Dictionary of categorical encodings (if not training)
        scaler (StandardScaler): Fitted StandardScaler object (if not training)

    Returns:
        tuple: Processed dataframe, categorical encodings, scaler, categorical column names, numerical column names
    """
    # Define categorical and numerical columns
    cat_cols = ['vessel_id', 'origin', 'destination', 'origin_h3_res2_index', 'destination_h3_res2_index',
                'product_family', 'vessel_type', 'products', 'flag_name', 'origin_country_code',
                'destination_country_code', 'previous_visited_port_1', 'previous_visited_port_2',
                'previous_visited_port_3', 'probability_level']
    num_cols = ['day_of_week', 'month', 'traded_volume', 'dead_weight', 'vessel_age',
                'origin_draught_change', 'destination_draught_change', 'origin_cargo_volume',
                'destination_cargo_volume', 'od_distance', 'p1_destination_probability',
                'p2_destination_probability', 'p3_destination_probability', 'p4_destination_probability',
                'merged_destination_probability']

    if is_train:
        # Create categorical encodings for training data
        cat_encodings = {col: defaultdict(lambda: 0, {val: i + 1 for i, val in enumerate(df[col].unique())}) for col in cat_cols}
        for col in cat_encodings:
            cat_encodings[col]['<unknown>'] = len(cat_encodings[col])
        # Fit and transform numerical columns
        scaler = StandardScaler()
        df[num_cols] = scaler.fit_transform(df[num_cols].fillna(0))
    else:
        # Transform numerical columns using pre-fitted scaler
        df[num_cols] = scaler.transform(df[num_cols].fillna(0))

    # Encode categorical columns
    for col in cat_cols:
        df[col] = df[col].map(lambda x: cat_encodings[col][x] if x in cat_encodings[col] else cat_encodings[col]['<unknown>'])

    return df, cat_encodings, scaler, cat_cols, num_cols

# Dataset class
class VesselDataset(Dataset):
    """
    Custom Dataset class for vessel data.

    Attributes:
        features (pd.DataFrame): Feature data
        labels (pd.Series): Labels (optional)
    """

    def __init__(self, features: pd.DataFrame, labels: pd.Series = None):
        """
        Initialize the dataset.

        Args:
            features (pd.DataFrame): Feature data
            labels (pd.Series, optional): Labels
        """
        self.features = features
        self.labels = labels

    def __len__(self) -> int:
        """
        Get the length of the dataset.

        Returns:
            int: Number of samples in the dataset
        """
        return len(self.features)

    def __getitem__(self, idx: int) -> tuple:
        """
        Get a sample from the dataset.

        Args:
            idx (int): Index of the sample

        Returns:
            tuple: Features and label (if available) for the sample
        """
        if self.labels is not None:
            return torch.tensor(self.features.iloc[idx].values, dtype=torch.float), torch.tensor(self.labels.iloc[idx], dtype=torch.float)
        return torch.tensor(self.features.iloc[idx].values, dtype=torch.float)

# Model definition
class VesselDestinationModel(nn.Module):
    """
    Neural network model for vessel destination prediction.

    Attributes:
        embeddings (nn.ModuleList): List of embedding layers for categorical features
        main_network (nn.Sequential): Main network layers
        final_layer (nn.Linear): Final linear layer
        activation (nn.Sigmoid): Sigmoid activation function
    """

    def __init__(self, cat_dims: list, num_dims: int, embedding_dims: list = [128, 128, 128, 64, 64, 10, 10, 10, 10, 10, 10, 128, 128, 128, 1],
                 hidden_dims: list = [1024, 512, 256, 128, 64, 32]):
        """
        Initialize the model.

        Args:
            cat_dims (list): Dimensions of categorical features
            num_dims (int): Number of numerical features
            embedding_dims (list): Dimensions for embeddings
            hidden_dims (list): Dimensions of hidden layers
        """
        super(VesselDestinationModel, self).__init__()

        # Create embedding layers for each categorical feature
        self.embeddings = nn.ModuleList([nn.Embedding(dim, embedding_dims[i]) for i, dim in enumerate(cat_dims)])

        total_embedding_dim = sum(embedding_dims)
        self.num_features = num_dims
        total_input_dim = total_embedding_dim + self.num_features

        # Create main network layers
        layers = []
        input_dim = total_input_dim
        for hidden_dim in hidden_dims:
            layers.append(nn.Linear(input_dim, hidden_dim))
            layers.append(nn.ReLU())
            input_dim = hidden_dim

        layers.append(nn.Linear(hidden_dims[-1], 1))

        self.main_network = nn.Sequential(*layers)

        # Final layer for combining main output and element-wise products
        self.final_layer = nn.Linear(3, 1)
        self.activation = nn.Sigmoid()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Forward pass of the model.

        Args:
            x (torch.Tensor): Input tensor

        Returns:
            torch.Tensor: Output predictions
        """
        # Split input into categorical and numerical features
        cat_features = x[:, :len(self.embeddings)].long()
        num_features = x[:, len(self.embeddings):]

        # Apply embeddings to categorical features
        embedded = [emb(cat_features[:, i]) for i, emb in enumerate(self.embeddings)]

        # Extract specific embeddings
        vessel_id_emb = embedded[0]
        origin_emb = embedded[1]
        destination_emb = embedded[2]

        # Compute element-wise products
        origin_dest_product = (origin_emb * destination_emb).sum(dim=1, keepdim=True)
        vessel_dest_product = (vessel_id_emb * destination_emb).sum(dim=1, keepdim=True)

        # Concatenate all embeddings
        embedded = torch.cat(embedded, dim=1)

        # Concatenate embeddings with numerical features
        x = torch.cat([embedded, num_features], dim=1)

        # Pass through main network
        main_output = self.main_network(x)

        # Concatenate main output with element-wise products
        final_input = torch.cat([main_output, origin_dest_product, vessel_dest_product], dim=1)

        # Final layer and activation
        output = self.activation(self.final_layer(final_input))

        return output.squeeze()

# Training function
def train_model(model: nn.Module, train_loader: torch.utils.data.DataLoader, val_loader: torch.utils.data.DataLoader,
                criterion: nn.Module, optimizer: torch.optim.Optimizer, num_epochs: int, device: torch.device) -> None:
    """
    Train the model.

    Args:
        model (nn.Module): The model to train
        train_loader (DataLoader): DataLoader for training data
        val_loader (DataLoader): DataLoader for validation data
        criterion (nn.Module): Loss function
        optimizer (torch.optim.Optimizer): Optimizer
        num_epochs (int): Number of epochs to train
        device (torch.device): Device to train on (CPU or GPU)

    Returns:
        None
    """
    for epoch in range(num_epochs):
        model.train()  # Set model to training mode
        train_loss = 0
        train_correct = 0
        train_total = 0

        for features, labels in train_loader:
            features, labels = features.to(device), labels.to(device)

            optimizer.zero_grad()  # Zero the parameter gradients
            outputs = model(features)  # Forward pass
            loss = criterion(outputs, labels)  # Compute loss
            loss.backward()  # Backward pass
            optimizer.step()  # Update weights

            train_loss += loss.item()
            predicted = (outputs > 0.5).float()  # Binary classification threshold
            train_total += labels.size(0)
            train_correct += (predicted == labels).sum().item()

        train_loss /= len(train_loader)
        train_accuracy = 100 * train_correct / train_total

        # Validation
        model.eval()  # Set model to evaluation mode
        val_loss = 0
        val_correct = 0
        val_total = 0

        with torch.no_grad():  # Disable gradient computation
            for features, labels in val_loader:
                features, labels = features.to(device), labels.to(device)
                outputs = model(features)
                loss = criterion(outputs, labels)

                val_loss += loss.item()
                predicted = (outputs > 0.5).float()
                val_total += labels.size(0)
                val_correct += (predicted == labels).sum().item()

        val_loss /= len(val_loader)
        val_accuracy = 100 * val_correct / val_total

        # Print epoch results
        print(f'Epoch {epoch+1}/{num_epochs}:')
        print(f'Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.2f}%')
        print(f'Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.2f}%')
        print()

print("Libraries imported and functions defined successfully.")

Libraries imported and functions defined successfully.


## Train/Val Split using temporale split as we are using time series data

In [7]:
# Preprocess training data
train_df, cat_encodings, scaler, cat_cols, num_cols = preprocess_data(train_df, is_train=True)

# Split into train and validation sets
train_data, val_data = train_test_split(train_df, test_size=0.2, random_state=42)

# Split the data
# Sort the dataframe by end_date_time
df_sorted = train_df.sort_values('start_date_time')

# Determine the split point (90% train, 10% validation)
split_index = int(len(df_sorted) * 0.95)

# Split the data
train_df = df_sorted.iloc[:split_index]
val_df = df_sorted.iloc[split_index:]



print("Training data preprocessed and split.")
print(f"Shape of training data: {train_data.shape}")
print(f"Shape of validation data: {val_data.shape}")
print(f"Training data date range: {train_df['start_date_time'].min()} to {train_df['start_date_time'].max()}")
print(f"Validation data date range: {val_df['start_date_time'].min()} to {val_df['start_date_time'].max()}")


# Create datasets and dataloaders
train_dataset = VesselDataset(train_data[cat_cols + num_cols], train_data['is_visit'])
val_dataset = VesselDataset(val_data[cat_cols + num_cols], val_data['is_visit'])
train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=256)

print("Datasets and DataLoaders created.")
print(f"Number of batches in train_loader: {len(train_loader)}")
print(f"Number of batches in val_loader: {len(val_loader)}")


Training data preprocessed and split.
Shape of training data: (476392, 33)
Shape of validation data: (119098, 33)
Training data date range: 2023-01-01T00:35:13.000Z to 2023-10-07T04:39:12.000Z
Validation data date range: 2023-10-07T04:39:12.000Z to 2023-10-30T18:35:58.000Z
Datasets and DataLoaders created.
Number of batches in train_loader: 1861
Number of batches in val_loader: 466


## Define and print the model

In [9]:
# Initialize the model
cat_dims = [len(cat_encodings[col]) for col in cat_cols]
num_dims = len(num_cols)
model = VesselDestinationModel(cat_dims, num_dims)

# Training setup
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

print(f"Model initialized. Using device: {device}")
print(model)

Model initialized. Using device: cuda
VesselDestinationModel(
  (embeddings): ModuleList(
    (0): Embedding(8394, 128)
    (1): Embedding(1343, 128)
    (2): Embedding(1549, 128)
    (3): Embedding(501, 64)
    (4): Embedding(601, 64)
    (5): Embedding(12, 10)
    (6): Embedding(13, 10)
    (7): Embedding(2299, 10)
    (8): Embedding(94, 10)
    (9): Embedding(132, 10)
    (10): Embedding(152, 10)
    (11): Embedding(1535, 128)
    (12): Embedding(1512, 128)
    (13): Embedding(1490, 128)
    (14): Embedding(6, 1)
  )
  (main_network): Sequential(
    (0): Linear(in_features=972, out_features=1024, bias=True)
    (1): ReLU()
    (2): Linear(in_features=1024, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=256, bias=True)
    (5): ReLU()
    (6): Linear(in_features=256, out_features=128, bias=True)
    (7): ReLU()
    (8): Linear(in_features=128, out_features=64, bias=True)
    (9): ReLU()
    (10): Linear(in_features=64, out_features=32, bia

## Train the model using predefined hyper-parameters (no hyper optimization is performed for this POC)

In [10]:
num_epochs = 3
train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs, device)

print("Model training completed.")

Epoch 1/3:
Train Loss: 0.3293, Train Accuracy: 90.49%
Val Loss: 0.1198, Val Accuracy: 95.55%

Epoch 2/3:
Train Loss: 0.0800, Train Accuracy: 97.17%
Val Loss: 0.0791, Val Accuracy: 97.18%

Epoch 3/3:
Train Loss: 0.0449, Train Accuracy: 98.38%
Val Loss: 0.0726, Val Accuracy: 97.51%

Model training completed.


## Model Evaluation

### Preprocess and score the model on test data

In [12]:
# Preprocess the sampled test data
test_df_sample, _, _, _, _ = preprocess_data(test_df, is_train=False, cat_encodings=cat_encodings, scaler=scaler)

# Create test dataset and dataloader
test_dataset = VesselDataset(test_df_sample[cat_cols + num_cols])
test_loader = DataLoader(test_dataset, batch_size=256, shuffle=False)

print("Test data preprocessed.")
print(f"Shape of preprocessed test data: {test_df_sample.shape}")

Test data preprocessed.
Shape of preprocessed test data: (2191050, 33)


In [13]:
model.eval()
predictions = []
with torch.no_grad():
    for features in test_loader:
        features = features.to(device)
        outputs = model(features)
        predictions.extend(outputs.cpu().numpy())

# Add predictions to test_df_sample
test_df_sample['predicted_probability'] = predictions


### Build Ranked Destinations list for each vessel

In [14]:
# Step 1: Prepare the data
ranked_destinations = test_df_sample[test_df_sample.is_visit == 0].groupby('vessel_id').apply(
    lambda x: x.sort_values('predicted_probability', ascending=False)
).reset_index(drop=True)

actual_destinations = test_df_sample[test_df_sample['is_visit'] == 1].set_index('vessel_id')['destination'].to_dict()


### Define utilities functions for evaluation

In [15]:
def calculate_reciprocal_rank(group: pd.DataFrame) -> float:
    """
    Calculate the reciprocal rank for a group of predictions for a single vessel.

    The reciprocal rank is 1/rank if the actual destination is in the predictions,
    or 0 if it's not present.

    Args:
        group (pd.DataFrame): A DataFrame containing predictions for a single vessel,
                              sorted by prediction probability in descending order.

    Returns:
        float: The reciprocal rank of the actual destination in the predictions.
    """
    # Get the vessel ID from the first row of the group
    vessel_id = group['vessel_id'].iloc[0]

    # Get the actual destination for this vessel from a pre-defined dictionary
    actual_dest = actual_destinations.get(vessel_id)

    # If there's no actual destination for this vessel, return 0
    if actual_dest is None:
        return 0

    # Find the rows where the predicted destination matches the actual destination
    actual_dest_rows = group[group['destination'] == actual_dest]

    # If the actual destination is not in the predictions, return 0
    if actual_dest_rows.empty:
        return 0

    # Get the rank of the actual destination (adding 1 because index starts at 0)
    rank = group.index.get_loc(actual_dest_rows.index[0]) + 1

    # Return the reciprocal of the rank
    return 1 / rank

def is_in_top_n(group: pd.DataFrame, n: int) -> bool:
    """
    Check if the actual destination is in the top N predictions for a vessel.

    Args:
        group (pd.DataFrame): A DataFrame containing predictions for a single vessel,
                              sorted by prediction probability in descending order.
        n (int): The number of top predictions to consider.

    Returns:
        bool: True if the actual destination is in the top N predictions, False otherwise.
    """
    # Calculate the reciprocal rank for this group
    reciprocal_rank = calculate_reciprocal_rank(group)

    # Check if the reciprocal rank is greater than or equal to 1/n
    # This is equivalent to checking if the rank is less than or equal to n
    return reciprocal_rank >= 1/n if reciprocal_rank > 0 else False

### Compute accuracies and MRR metrics

In [16]:
# Step 3: Calculate metrics
mrr = ranked_destinations.groupby('vessel_id').apply(calculate_reciprocal_rank).mean()
top_1_accuracy = ranked_destinations.groupby('vessel_id').apply(lambda x: is_in_top_n(x, 1)).mean()
top_3_accuracy = ranked_destinations.groupby('vessel_id').apply(lambda x: is_in_top_n(x, 3)).mean()
top_10_accuracy = ranked_destinations.groupby('vessel_id').apply(lambda x: is_in_top_n(x, 10)).mean()

print(f"Mean Reciprocal Rank (MRR): {mrr:.4f}")
print(f"Top 1 Accuracy: {top_1_accuracy:.4f}")
print(f"Top 3 Accuracy: {top_3_accuracy:.4f}")
print(f"Top 10 Accuracy: {top_10_accuracy:.4f}")


Mean Reciprocal Rank (MRR): 0.6560
Top 1 Accuracy: 0.5668
Top 3 Accuracy: 0.7386
Top 10 Accuracy: 0.7888


### Plot Accuracy evolution for different top-k predictions

In [17]:
# Step 4: Plot accuracy evolution from top 1 to top 10
accuracies = [ranked_destinations.groupby('vessel_id').apply(lambda x: is_in_top_n(x, i)).mean() for i in range(1, 11)]

fig1 = go.Figure(data=go.Scatter(
    x=list(range(1, 11)),
    y=accuracies,
    mode='lines+markers+text',
    text=[f'{acc:.2%}' for acc in accuracies],
    textposition='top center'
))
fig1.update_layout(
    title='Accuracy Evolution: Top 1 to Top 10',
    xaxis_title='Top N Predictions',
    yaxis_title='Accuracy',
    yaxis_tickformat='.0%',
    xaxis=dict(tickmode='linear', tick0=1, dtick=1)
)
fig1.show()


### Plot accuracy evolution over the number of past trips

In [18]:
# Step 6: Accuracy evolution by number of past trips
train_trips = train_df[train_df['is_visit'] == 1]['vessel_id'].value_counts().reset_index()
train_trips.columns = ['vessel_id', 'num_past_trips']

combined_data = pd.merge(ranked_destinations, train_trips, on='vessel_id', how='left')
combined_data = combined_data[combined_data['num_past_trips'] <= 40]

accuracy_by_past_trips = combined_data.groupby(['num_past_trips', 'vessel_id']).apply(lambda x: is_in_top_n(x, 1)).groupby('num_past_trips').mean().reset_index()
accuracy_by_past_trips.columns = ['num_past_trips', 'top_1_accuracy']

fig3 = go.Figure()

fig3.add_trace(go.Scatter(
    x=accuracy_by_past_trips['num_past_trips'],
    y=accuracy_by_past_trips['top_1_accuracy'],
    mode='lines+markers+text',
    name='Top 1 Accuracy',
    text=[f'{acc:.2%}' for acc in accuracy_by_past_trips['top_1_accuracy']],
    textposition='top center'
))

fig3.update_layout(
    title='Evolution of Top 1 Accuracy by Number of Past Trips (Up to 40 trips)',
    xaxis_title='Number of Past Trips',
    yaxis_title='Top 1 Accuracy',
    yaxis_tickformat='.0%',
    xaxis=dict(tickmode='linear', tick0=0, dtick=1)
)

fig3.show()



In [20]:
def get_original_labels(encoded_values: Dict[Any, int], encoding_dict: Dict[Any, int]) -> Dict[int, Any]:
    """
    Reverse the encoding dictionary to get original labels.

    Args:
        encoded_values (Dict[Any, int]): Dictionary of encoded values.
        encoding_dict (Dict[Any, int]): Original encoding dictionary.

    Returns:
        Dict[int, Any]: Dictionary mapping encoded values back to original labels.
    """
    return {v: k for k, v in encoding_dict.items()}

# Get original labels for vessel types
vessel_type_labels = get_original_labels(cat_encodings['vessel_type'], cat_encodings['vessel_type'])
# Get original labels for product families
product_family_labels = get_original_labels(cat_encodings['product_family'], cat_encodings['product_family'])

def compute_top_1_accuracy_by_dimension(dimension: str, top_n: int, label_dict: Dict[int, Any]) -> pd.Series:
    """
    Compute top 1 accuracy for different categories within a specified dimension.

    Args:
        dimension (str): The column name of the dimension to analyze.
        top_n (int): The number of top categories to consider.
        label_dict (Dict[int, Any]): Dictionary to map encoded values back to original labels.

    Returns:
        pd.Series: A series containing the top 1 accuracy for each category.
    """
    # Get the top N categories for the specified dimension
    top_categories = ranked_destinations[dimension].value_counts().nlargest(top_n).index

    accuracies = {}
    for category in top_categories:
        # Filter the dataset for the current category
        subset = ranked_destinations[ranked_destinations[dimension] == category]
        # Compute accuracy for this category
        accuracy = subset.groupby('vessel_id').apply(lambda x: is_in_top_n(x, 1)).mean()
        # Get the original label for this category
        label = label_dict.get(category, f"Unknown ({category})")
        accuracies[label] = accuracy

    return pd.Series(accuracies)

# Compute top 1 accuracy for top 4 vessel types
vessel_type_accuracy = compute_top_1_accuracy_by_dimension('vessel_type', 4, vessel_type_labels)
# Compute top 1 accuracy for top 5 product families
product_family_accuracy = compute_top_1_accuracy_by_dimension('product_family', 5, product_family_labels)

# Create a subplot figure with two rows
fig = make_subplots(rows=2, cols=1,
                    subplot_titles=('Top 1 Accuracy by Vessel Type', 'Top 1 Accuracy by Product Family'),
                    vertical_spacing=0.3)  # Increase vertical spacing between subplots

# Add bar chart for vessel type accuracy
fig.add_trace(go.Bar(
    x=vessel_type_accuracy.index,
    y=vessel_type_accuracy.values,
    text=[f'{acc:.2%}' for acc in vessel_type_accuracy.values],
    textposition='auto',
    marker_color='royalblue'
), row=1, col=1)

# Add bar chart for product family accuracy
fig.add_trace(go.Bar(
    x=product_family_accuracy.index,
    y=product_family_accuracy.values,
    text=[f'{acc:.2%}' for acc in product_family_accuracy.values],
    textposition='auto',
    marker_color='lightsalmon'
), row=2, col=1)

# Update layout of the figure
fig.update_layout(
    height=1000,  # Increase overall height of the figure
    title_text="Top 1 Accuracy by Vessel Type and Product Family",
    showlegend=False
)

# Update x-axis labels
fig.update_xaxes(title_text="Vessel Type", row=1, col=1, tickangle=45)
fig.update_xaxes(title_text="Product Family", row=2, col=1, tickangle=45)

# Update y-axis labels and format
fig.update_yaxes(title_text="Top 1 Accuracy", tickformat='.0%', range=[0, 1], row=1, col=1)
fig.update_yaxes(title_text="Top 1 Accuracy", tickformat='.0%', range=[0, 1], row=2, col=1)

# Ensure y-axis starts at 0 and ends at 100%
fig.update_yaxes(range=[0, 1])

# Display the figure
fig.show()

# Print additional statistics
print("\nTop 1 Accuracy by Vessel Type:")
print(vessel_type_accuracy)
print("\nTop 1 Accuracy by Product Family:")
print(product_family_accuracy)

# Calculate and print overall statistics
print("\nOverall Statistics:")
print(f"Average Top 1 Accuracy across Vessel Types: {vessel_type_accuracy.mean():.2%}")
print(f"Average Top 1 Accuracy across Product Families: {product_family_accuracy.mean():.2%}")
print(f"Vessel Type with highest accuracy: {vessel_type_accuracy.idxmax()} ({vessel_type_accuracy.max():.2%})")
print(f"Product Family with highest accuracy: {product_family_accuracy.idxmax()} ({product_family_accuracy.max():.2%})")


Top 1 Accuracy by Vessel Type:
Crude/Oil Products Tanker       0.581787
Chemical/Oil Products Tanker    0.524555
Products Tanker                 0.573770
Crude Oil Tanker                0.594436
dtype: float64

Top 1 Accuracy by Product Family:
clean petroleum products    0.542557
chem/bio                    0.550938
crude oil/condensate        0.634290
dirty petroleum products    0.559211
minor bulks                 0.500000
dtype: float64

Overall Statistics:
Average Top 1 Accuracy across Vessel Types: 56.86%
Average Top 1 Accuracy across Product Families: 55.74%
Vessel Type with highest accuracy: Crude Oil Tanker (59.44%)
Product Family with highest accuracy: crude oil/condensate (63.43%)
