<a href="https://colab.research.google.com/github/adadoun/KplerNextDestination/blob/main/AssociationRuleTechnique.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
from collections import defaultdict
import numpy as np
import pandas as pd
import numpy as np
from collections import defaultdict
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Load the training data
train_df = pd.read_csv('drive/MyDrive/Collab_DATA/train_trades_csv_prepared.csv')
test_df = pd.read_csv('drive/MyDrive/Collab_DATA/test_trades_csv_prepared.csv')

In [17]:
# Create a DataFrame with predictions
ranked_destinations = test_df[['vessel_id', 'destination', 'is_visit', 'vessel_type', 'product_family']].copy()
ranked_destinations['predicted_probability'] = test_df['merged_destination_probability']

# Filter predictions (is_visit = 0) and actual destinations (is_visit = 1)
ranked_destinations_filtered = ranked_destinations[ranked_destinations['is_visit'] == 0]
test_df_sample = ranked_destinations[ranked_destinations['is_visit'] == 1]

# Create a dictionary of actual destinations
actual_destinations = test_df_sample.set_index('vessel_id')['destination'].to_dict()

In [19]:
# Function to get top N predictions (unchanged)
def get_top_n_predictions(group, n):
    return group.nlargest(n, 'predicted_probability')['destination'].tolist()

# Modified function to calculate accuracy and get ranks for MRR
def calculate_accuracy_and_ranks(n):
    top_n_predictions = ranked_destinations_filtered.groupby('vessel_id').apply(lambda x: get_top_n_predictions(x, n))
    accuracies = []
    ranks = []
    for vessel_id, preds in top_n_predictions.items():
        actual_dest = actual_destinations.get(vessel_id)
        if actual_dest in preds:
            accuracies.append(1)
            ranks.append(1 / (preds.index(actual_dest) + 1))
        else:
            accuracies.append(0)
            ranks.append(0)
    return np.mean(accuracies), ranks

# Calculate accuracy for top N predictions
def calculate_accuracy_for_top_n(n):
    top_n_predictions = ranked_destinations_filtered.groupby('vessel_id').apply(lambda x: get_top_n_predictions(x, n))
    accuracy = sum(actual_destinations.get(vessel_id) in preds for vessel_id, preds in top_n_predictions.items()) / len(actual_destinations)
    return accuracy

# Function to calculate Mean Reciprocal Rank
def calculate_mrr():
    def get_reciprocal_rank(group):
        vessel_id = group.name
        actual_dest = actual_destinations.get(vessel_id)
        if actual_dest is None:
            return 0
        # Sort predictions by probability in descending order
        sorted_predictions = group.sort_values('predicted_probability', ascending=False).reset_index(drop=True)
        # Find the rank of the actual destination
        rank = sorted_predictions[sorted_predictions['destination'] == actual_dest].index.min()
        if pd.isna(rank):
            return 0
        # Calculate reciprocal rank (add 1 to rank because indexing starts at 0)
        return 1 / (rank + 1)

    reciprocal_ranks = ranked_destinations_filtered.groupby('vessel_id').apply(get_reciprocal_rank)

    # Print some debug information
    print(f"Number of vessels: {len(reciprocal_ranks)}")
    print(f"Number of non-zero ranks: {(reciprocal_ranks > 0).sum()}")
    print(f"Average reciprocal rank: {reciprocal_ranks.mean():.4f}")

    return reciprocal_ranks.mean()



In [20]:
# Calculate Top 1, Top 3, and Top 10 Accuracies
#top_1_accuracy, ranks_1 = calculate_accuracy_and_ranks(1)
#top_3_accuracy, _ = calculate_accuracy_and_ranks(3)
#top_10_accuracy, _ = calculate_accuracy_and_ranks(10)

# Calculate Mean Reciprocal Rank
mrr = calculate_mrr()

# Print the results
print(f"Top 1 Accuracy: {top_1_accuracy:.4f}")
print(f"Top 3 Accuracy: {top_3_accuracy:.4f}")
print(f"Top 10 Accuracy: {top_10_accuracy:.4f}")
print(f"Mean Reciprocal Rank (MRR): {mrr:.4f}")

Number of vessels: 6246
Number of non-zero ranks: 5038
Average reciprocal rank: 0.3293
Top 1 Accuracy: 0.2147
Top 3 Accuracy: 0.3809
Top 10 Accuracy: 0.5621
Mean Reciprocal Rank (MRR): 0.3293


In [9]:
# Calculate accuracies for top 1 to top 10
accuracies = [calculate_accuracy_for_top_n(i) for i in range(1, 11)]

# 1. Plot accuracy evolution from top 1 to top 10
fig1 = go.Figure(data=go.Scatter(
    x=list(range(1, 11)),
    y=accuracies,
    mode='lines+markers+text',
    text=[f'{acc:.2%}' for acc in accuracies],
    textposition='top center'
))
fig1.update_layout(
    title='Accuracy Evolution: Top 1 to Top 10',
    xaxis_title='Top N Predictions',
    yaxis_title='Accuracy',
    yaxis_tickformat='.0%',
    xaxis=dict(tickmode='linear', tick0=1, dtick=1)
)
fig1.show()

In [13]:
def calculate_top_1_accuracy(group):
    return actual_destinations.get(group.name) == group.nlargest(1, 'predicted_probability')['destination'].iloc[0]

ranked_destinations['num_negative_samples'] = ranked_destinations.groupby('vessel_id')['vessel_id'].transform('count')
ranked_destinations['top_1_accuracy'] = ranked_destinations.groupby('vessel_id').apply(calculate_top_1_accuracy)

# Filter to include only up to 1000 negative samples
ranked_destinations = ranked_destinations[ranked_destinations['num_negative_samples'] <= 1000]

bin_size = 100
max_samples = 1000
bins = range(0, max_samples + bin_size, bin_size)

accuracy_by_bin = ranked_destinations.groupby(pd.cut(ranked_destinations['num_negative_samples'], bins=bins))['top_1_accuracy'].agg(['mean', 'count']).reset_index()
accuracy_by_bin['bin_midpoint'] = accuracy_by_bin['num_negative_samples'].apply(lambda x: x.mid)

# Create subplots: line plot and histogram
fig = make_subplots(rows=2, cols=1,
                    subplot_titles=("Line Plot: Top 1 Accuracy vs Number of Negative Samples (up to 1000)",
                                    "Histogram: Distribution of Negative Samples (up to 1000)"),
                    vertical_spacing=0.1)

# Line plot
fig.add_trace(
    go.Scatter(x=accuracy_by_bin['bin_midpoint'], y=accuracy_by_bin['mean'], mode='lines+markers',
               name='Top 1 Accuracy', text=[f'{acc:.2%}' for acc in accuracy_by_bin['mean']],
               hovertemplate='Negative Samples: %{x}<br>Accuracy: %{text}<br>Count: %{customdata}',
               customdata=accuracy_by_bin['count']),
    row=1, col=1
)

# Histogram
fig.add_trace(
    go.Bar(x=accuracy_by_bin['bin_midpoint'], y=accuracy_by_bin['count'],
           name='Sample Count', hovertemplate='Negative Samples: %{x}<br>Count: %{y}'),
    row=2, col=1
)

# Update layout
fig.update_layout(height=800, title_text="Top 1 Accuracy vs Number of Negative Samples (up to 1000)")
fig.update_xaxes(title_text="Number of Negative Samples", row=1, col=1, tickmode='array', tickvals=list(range(0, 1001, 100)))
fig.update_xaxes(title_text="Number of Negative Samples", row=2, col=1, tickmode='array', tickvals=list(range(0, 1001, 100)))
fig.update_yaxes(title_text="Top 1 Accuracy", tickformat='.0%', row=1, col=1)
fig.update_yaxes(title_text="Count", row=2, col=1)

fig.show()

# Print some statistics
print(f"Total number of vessels (up to 1000 negative samples): {len(ranked_destinations['vessel_id'].unique())}")
print(f"Average number of negative samples per vessel: {ranked_destinations['num_negative_samples'].mean():.2f}")
print(f"Median number of negative samples per vessel: {ranked_destinations['num_negative_samples'].median():.2f}")
print(f"Range of negative samples: {ranked_destinations['num_negative_samples'].min()} to {ranked_destinations['num_negative_samples'].max()}")
print(f"Overall Top 1 Accuracy: {ranked_destinations['top_1_accuracy'].mean():.2%}")
print(f"Correlation between number of negative samples and accuracy: {accuracy_by_bin['bin_midpoint'].corr(accuracy_by_bin['mean']):.4f}")

# Additional statistics for vessels with more than 1000 negative samples
vessels_over_1000 = ranked_destinations[ranked_destinations['num_negative_samples'] > 1000]
print(f"\nNumber of vessels with more than 1000 negative samples: {len(vessels_over_1000['vessel_id'].unique())}")
if not vessels_over_1000.empty:
    print(f"Average accuracy for vessels with >1000 negative samples: {vessels_over_1000['top_1_accuracy'].mean():.2%}")






Total number of vessels (up to 1000 negative samples): 6299
Average number of negative samples per vessel: 446.24
Median number of negative samples per vessel: 449.00
Range of negative samples: 1 to 992
Overall Top 1 Accuracy: 26.73%
Correlation between number of negative samples and accuracy: 0.3694

Number of vessels with more than 1000 negative samples: 0


In [18]:
# 3. Compute top 1 accuracy for different dimensions
def compute_top_1_accuracy_by_dimension(dimension, top_n):
    top_categories = ranked_destinations[dimension].value_counts().nlargest(top_n).index
    accuracies = {}
    for category in top_categories:
        subset = ranked_destinations[ranked_destinations[dimension] == category]
        accuracies[category] = subset.groupby('vessel_id').apply(calculate_top_1_accuracy).mean()
    return pd.Series(accuracies)

vessel_type_accuracy = compute_top_1_accuracy_by_dimension('vessel_type', 4)
product_family_accuracy = compute_top_1_accuracy_by_dimension('product_family', 5)

fig3 = make_subplots(rows=2, cols=1, subplot_titles=('Top 1 Accuracy by Vessel Type', 'Top 1 Accuracy by Product Family'))

fig3.add_trace(go.Bar(
    x=vessel_type_accuracy.index,
    y=vessel_type_accuracy.values,
    text=[f'{acc:.2%}' for acc in vessel_type_accuracy.values],
    textposition='auto'
), row=1, col=1)

fig3.add_trace(go.Bar(
    x=product_family_accuracy.index,
    y=product_family_accuracy.values,
    text=[f'{acc:.2%}' for acc in product_family_accuracy.values],
    textposition='auto'
), row=2, col=1)

fig3.update_layout(height=800, title_text="Top 1 Accuracy by Vessel Type and Product Family")
fig3.update_xaxes(title_text="Vessel Type", row=1, col=1)
fig3.update_xaxes(title_text="Product Family", row=2, col=1)
fig3.update_yaxes(title_text="Top 1 Accuracy", tickformat='.0%', row=1, col=1)
fig3.update_yaxes(title_text="Top 1 Accuracy", tickformat='.0%', row=2, col=1)

fig3.show()
