<a href="https://colab.research.google.com/github/adadoun/KplerNextDestination/blob/main/AssociationRuleTechnique.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# library Import
import pandas as pd
from collections import defaultdict
import numpy as np
import pandas as pd
import numpy as np
from collections import defaultdict
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from typing import List, Tuple

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# Load the training data
train_df = pd.read_csv('drive/MyDrive/Collab_DATA/KplerData/train_trades_csv_prepared.csv')
test_df = pd.read_csv('drive/MyDrive/Collab_DATA/KplerData/test_trades_csv_prepared.csv')

## Create Ranking destinations based on association rules probabilities


In [7]:
# Create a DataFrame with predictions
ranked_destinations = test_df[['vessel_id', 'destination', 'is_visit', 'vessel_type', 'product_family']].copy()
ranked_destinations['predicted_probability'] = test_df['merged_destination_probability']

# Filter predictions (is_visit = 0) and actual destinations (is_visit = 1)
ranked_destinations_filtered = ranked_destinations[ranked_destinations['is_visit'] == 0]
test_df_sample = ranked_destinations[ranked_destinations['is_visit'] == 1]

# Create a dictionary of actual destinations
actual_destinations = test_df_sample.set_index('vessel_id')['destination'].to_dict()

## Create Utilities Functions

In [8]:
def get_top_n_predictions(group: pd.DataFrame, n: int) -> List[str]:
    """
    Get the top N predictions for a group of destinations based on predicted probability.

    Args:
        group (pd.DataFrame): A DataFrame containing 'destination' and 'predicted_probability' columns.
        n (int): The number of top predictions to return.

    Returns:
        List[str]: A list of the top N predicted destinations.
    """
    return group.nlargest(n, 'predicted_probability')['destination'].tolist()

def calculate_accuracy_and_ranks(n: int) -> Tuple[float, List[float]]:
    """
    Calculate accuracy and get ranks for Mean Reciprocal Rank (MRR) calculation.

    Args:
        n (int): The number of top predictions to consider.

    Returns:
        Tuple[float, List[float]]: A tuple containing the accuracy and a list of ranks for MRR.
    """
    # Group predictions by vessel_id and get top N predictions for each
    top_n_predictions = ranked_destinations_filtered.groupby('vessel_id').apply(lambda x: get_top_n_predictions(x, n))

    accuracies = []
    ranks = []
    for vessel_id, preds in top_n_predictions.items():
        actual_dest = actual_destinations.get(vessel_id)
        if actual_dest in preds:
            accuracies.append(1)
            ranks.append(1 / (preds.index(actual_dest) + 1))
        else:
            accuracies.append(0)
            ranks.append(0)

    return np.mean(accuracies), ranks

def calculate_accuracy_for_top_n(n: int) -> float:
    """
    Calculate accuracy for top N predictions.

    Args:
        n (int): The number of top predictions to consider.

    Returns:
        float: The accuracy of the top N predictions.
    """
    # Group predictions by vessel_id and get top N predictions for each
    top_n_predictions = ranked_destinations_filtered.groupby('vessel_id').apply(lambda x: get_top_n_predictions(x, n))

    # Calculate accuracy
    accuracy = sum(actual_destinations.get(vessel_id) in preds for vessel_id, preds in top_n_predictions.items()) / len(actual_destinations)

    return accuracy

def calculate_mrr() -> float:
    """
    Calculate the Mean Reciprocal Rank (MRR) for the predictions.

    Returns:
        float: The Mean Reciprocal Rank.
    """
    def get_reciprocal_rank(group: pd.DataFrame) -> float:
        """
        Calculate the reciprocal rank for a single group (vessel).

        Args:
            group (pd.DataFrame): A DataFrame containing predictions for a single vessel.

        Returns:
            float: The reciprocal rank for the vessel.
        """
        vessel_id = group.name
        actual_dest = actual_destinations.get(vessel_id)
        if actual_dest is None:
            return 0

        # Sort predictions by probability in descending order
        sorted_predictions = group.sort_values('predicted_probability', ascending=False).reset_index(drop=True)

        # Find the rank of the actual destination
        rank = sorted_predictions[sorted_predictions['destination'] == actual_dest].index.min()
        if pd.isna(rank):
            return 0

        # Calculate reciprocal rank (add 1 to rank because indexing starts at 0)
        return 1 / (rank + 1)

    # Calculate reciprocal ranks for all vessels
    reciprocal_ranks = ranked_destinations_filtered.groupby('vessel_id').apply(get_reciprocal_rank)

    # Print some debug information
    print(f"Number of vessels: {len(reciprocal_ranks)}")
    print(f"Number of non-zero ranks: {(reciprocal_ranks > 0).sum()}")
    print(f"Average reciprocal rank: {reciprocal_ranks.mean():.4f}")

    return reciprocal_ranks.mean()

## Compute Accuracy

In [9]:
# Calculate Top 1, Top 3, and Top 10 Accuracies
top_1_accuracy, ranks_1 = calculate_accuracy_and_ranks(1)
top_3_accuracy, _ = calculate_accuracy_and_ranks(3)
top_10_accuracy, _ = calculate_accuracy_and_ranks(10)

# Calculate Mean Reciprocal Rank
mrr = calculate_mrr()

# Print the results
print(f"Top 1 Accuracy: {top_1_accuracy:.4f}")
print(f"Top 3 Accuracy: {top_3_accuracy:.4f}")
print(f"Top 10 Accuracy: {top_10_accuracy:.4f}")
print(f"Mean Reciprocal Rank (MRR): {mrr:.4f}")

Number of vessels: 6246
Number of non-zero ranks: 5038
Average reciprocal rank: 0.3293
Top 1 Accuracy: 0.2147
Top 3 Accuracy: 0.3809
Top 10 Accuracy: 0.5621
Mean Reciprocal Rank (MRR): 0.3293


## Plot Evolution of the accuracy for different top K predictions

In [10]:
# Calculate accuracies for top 1 to top 10
accuracies = [calculate_accuracy_for_top_n(i) for i in range(1, 11)]

# 1. Plot accuracy evolution from top 1 to top 10
fig1 = go.Figure(data=go.Scatter(
    x=list(range(1, 11)),
    y=accuracies,
    mode='lines+markers+text',
    text=[f'{acc:.2%}' for acc in accuracies],
    textposition='top center'
))
fig1.update_layout(
    title='Accuracy Evolution: Top 1 to Top 10',
    xaxis_title='Top N Predictions',
    yaxis_title='Accuracy',
    yaxis_tickformat='.0%',
    xaxis=dict(tickmode='linear', tick0=1, dtick=1)
)
fig1.show()

## Plot the evolution of accuracy (top 1) over the number of potential destinations

In [12]:
def calculate_top_1_accuracy(group):
    """
    Calculate the top 1 accuracy for a group of predictions.

    Args:
        group (pd.DataFrame): A group of predictions for a single vessel.

    Returns:
        bool: True if the top predicted destination matches the actual destination, False otherwise.
    """
    return actual_destinations.get(group.name) == group.nlargest(1, 'predicted_probability')['destination'].iloc[0]

# Calculate the number of potential destinations and top 1 accuracy for each vessel
ranked_destinations['num_potential_destinations'] = ranked_destinations.groupby('vessel_id')['vessel_id'].transform('count')
ranked_destinations['top_1_accuracy'] = ranked_destinations.groupby('vessel_id').apply(calculate_top_1_accuracy)

# Filter to include only up to 1000 potential destinations
ranked_destinations = ranked_destinations[ranked_destinations['num_potential_destinations'] <= 1000]

# Define bin parameters for grouping
bin_size = 100
max_samples = 1000
bins = range(0, max_samples + bin_size, bin_size)

# Group accuracy data into bins and calculate mean accuracy and count for each bin
accuracy_by_bin = ranked_destinations.groupby(pd.cut(ranked_destinations['num_potential_destinations'], bins=bins))['top_1_accuracy'].agg(['mean', 'count']).reset_index()
accuracy_by_bin['bin_midpoint'] = accuracy_by_bin['num_potential_destinations'].apply(lambda x: x.mid)

# Create subplots: line plot and histogram
fig = make_subplots(rows=2, cols=1,
                    subplot_titles=("Line Plot: Top 1 Accuracy vs Number of Potential Destinations (up to 1000)",
                                    "Histogram: Distribution of Potential Destinations (up to 1000)"),
                    vertical_spacing=0.1)

# Add line plot for accuracy
fig.add_trace(
    go.Scatter(x=accuracy_by_bin['bin_midpoint'], y=accuracy_by_bin['mean'], mode='lines+markers',
               name='Top 1 Accuracy', text=[f'{acc:.2%}' for acc in accuracy_by_bin['mean']],
               hovertemplate='Potential Destinations: %{x}<br>Accuracy: %{text}<br>Count: %{customdata}',
               customdata=accuracy_by_bin['count']),
    row=1, col=1
)

# Add histogram for distribution of potential destinations
fig.add_trace(
    go.Bar(x=accuracy_by_bin['bin_midpoint'], y=accuracy_by_bin['count'],
           name='Sample Count', hovertemplate='Potential Destinations: %{x}<br>Count: %{y}'),
    row=2, col=1
)

# Update layout and axes
fig.update_layout(height=800, title_text="Top 1 Accuracy vs Number of Potential Destinations (up to 1000)")
fig.update_xaxes(title_text="Number of Potential Destinations", row=1, col=1, tickmode='array', tickvals=list(range(0, 1001, 100)))
fig.update_xaxes(title_text="Number of Potential Destinations", row=2, col=1, tickmode='array', tickvals=list(range(0, 1001, 100)))
fig.update_yaxes(title_text="Top 1 Accuracy", tickformat='.0%', row=1, col=1)
fig.update_yaxes(title_text="Count", row=2, col=1)

# Display the figure
fig.show()

# Print summary statistics
print(f"Total number of vessels (up to 1000 potential destinations): {len(ranked_destinations['vessel_id'].unique())}")
print(f"Average number of potential destinations per vessel: {ranked_destinations['num_potential_destinations'].mean():.2f}")
print(f"Median number of potential destinations per vessel: {ranked_destinations['num_potential_destinations'].median():.2f}")
print(f"Range of potential destinations: {ranked_destinations['num_potential_destinations'].min()} to {ranked_destinations['num_potential_destinations'].max()}")
print(f"Overall Top 1 Accuracy: {ranked_destinations['top_1_accuracy'].mean():.2%}")
print(f"Correlation between number of potential destinations and accuracy: {accuracy_by_bin['bin_midpoint'].corr(accuracy_by_bin['mean']):.4f}")

# Additional statistics for vessels with more than 1000 potential destinations
vessels_over_1000 = ranked_destinations[ranked_destinations['num_potential_destinations'] > 1000]
print(f"\nNumber of vessels with more than 1000 potential destinations: {len(vessels_over_1000['vessel_id'].unique())}")
if not vessels_over_1000.empty:
    print(f"Average accuracy for vessels with >1000 potential destinations: {vessels_over_1000['top_1_accuracy'].mean():.2%}")






Total number of vessels (up to 1000 potential destinations): 6299
Average number of potential destinations per vessel: 446.24
Median number of potential destinations per vessel: 449.00
Range of potential destinations: 1 to 992
Overall Top 1 Accuracy: 26.78%
Correlation between number of potential destinations and accuracy: 0.3707

Number of vessels with more than 1000 potential destinations: 0


## Compute accuracy for different vessel types and different traded product family

In [13]:
def compute_top_1_accuracy_by_dimension(dimension: str, top_n: int) -> pd.Series:
    """
    Compute top 1 accuracy for different categories within a specified dimension.

    Args:
        dimension (str): The column name of the dimension to analyze.
        top_n (int): The number of top categories to consider.

    Returns:
        pd.Series: A series containing the top 1 accuracy for each category.
    """
    # Get the top N categories for the specified dimension
    top_categories = ranked_destinations[dimension].value_counts().nlargest(top_n).index

    # Initialize a dictionary to store accuracies
    accuracies = {}

    # Compute accuracy for each top category
    for category in top_categories:
        # Filter the dataset for the current category
        subset = ranked_destinations[ranked_destinations[dimension] == category]
        # Compute and store the mean accuracy for this category
        accuracies[category] = subset.groupby('vessel_id').apply(calculate_top_1_accuracy).mean()

    # Return the accuracies as a pandas Series
    return pd.Series(accuracies)

# Compute top 1 accuracy for top 4 vessel types
vessel_type_accuracy = compute_top_1_accuracy_by_dimension('vessel_type', 4)

# Compute top 1 accuracy for top 5 product families
product_family_accuracy = compute_top_1_accuracy_by_dimension('product_family', 5)

# Create a subplot figure with two rows
fig3 = make_subplots(rows=2, cols=1, subplot_titles=('Top 1 Accuracy by Vessel Type', 'Top 1 Accuracy by Product Family'))

# Add bar chart for vessel type accuracy
fig3.add_trace(go.Bar(
    x=vessel_type_accuracy.index,
    y=vessel_type_accuracy.values,
    text=[f'{acc:.2%}' for acc in vessel_type_accuracy.values],
    textposition='auto'
), row=1, col=1)

# Add bar chart for product family accuracy
fig3.add_trace(go.Bar(
    x=product_family_accuracy.index,
    y=product_family_accuracy.values,
    text=[f'{acc:.2%}' for acc in product_family_accuracy.values],
    textposition='auto'
), row=2, col=1)

# Update layout of the figure
fig3.update_layout(height=800, title_text="Top 1 Accuracy by Vessel Type and Product Family")

# Update x-axis labels
fig3.update_xaxes(title_text="Vessel Type", row=1, col=1)
fig3.update_xaxes(title_text="Product Family", row=2, col=1)

# Update y-axis labels and format
fig3.update_yaxes(title_text="Top 1 Accuracy", tickformat='.0%', row=1, col=1)
fig3.update_yaxes(title_text="Top 1 Accuracy", tickformat='.0%', row=2, col=1)

# Display the figure
fig3.show()
