In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from geopy.distance import great_circle, geodesic
from sklearn.cluster import DBSCAN, KMeans
import requests
from datetime import datetime as dt
from datetime import date
from prefect import task, flow
import logging
import pyodbc
import os
from dotenv import load_dotenv
# import pymssql
import time
from concurrent.futures import ThreadPoolExecutor
import string
import logging

In [2]:
## Logging setup
main_dir = './Algorithm_V1/Van-Route Optimization/Omnibiz'
#Create logs directory
os.makedirs(f'{main_dir}', exist_ok=True)



#Configure logging to save to a file
logging.basicConfig(
    filename=f'{main_dir}/pipeline.log',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logging.getLogger('prefect').setLevel(logging.INFO)


### Function: Data Generation

In [3]:
#StockPoint LatLong
@task
def fetch_stockpoint_data():
    warehouse_df = pd.DataFrame({
        "StockPointName" : "Ijora-Causeway MFC",
        "StockPointId" : 1647113,
        "StockPoint_Latitude" : 6.46965,
        "StockPoint_Longitude" : 3.36888
    }, index = [0])

    return warehouse_df


#Generate 10 unique 6-character alphanumeric VehicleNumbers
@task
def generate_alphanumeric_vehicle_number(size=6):
    chars = string.ascii_uppercase + string.digits  # A-Z, 0-9
    return ''.join(np.random.choice(list(chars), size=size))


#available vehicles
@task
def fetch_vehicle_data():
    vehicle_numbers = [generate_alphanumeric_vehicle_number() for _ in range(10)]
    # Ensure uniqueness
    while len(set(vehicle_numbers)) < 10:
        vehicle_numbers = [generate_alphanumeric_vehicle_number() for _ in range(10)]

    vehicle_df = pd.DataFrame({
        'VehicleNumber': vehicle_numbers,
        'VehicleCapacity': np.random.randint(1500, 1701, size=10)
    })

    vehicle_df["Total_Loaded_Quantity"] = 0

    return vehicle_df


In [18]:
#Orders_Ready_For_Dispatch
@task
def fetch_order_data():
    causeway = pd.read_excel('./recommendation_output/2025-06-20/OmniHub Apapa Lagos - CAUSEWAY_2025-06-20.xlsx', sheet_name='All Recommendation')
    causeway = causeway[['Latitude', 'Longitude']].drop_duplicates().rename(columns = {'Latitude':"Customer_Latitude", 'Longitude':"Customer_Longitude"})
    causeway["Customer_Latitude"] = causeway["Customer_Latitude"].astype(float)
    causeway["Customer_Longitude"] = causeway["Customer_Longitude"].astype(float)

    return causeway
    

### Function: Clustering

In [5]:
@task
def calculate_pathway_distance(row, another_parameter1 = None, another_parameter2 = None):
    if another_parameter1 == None:
        #Specify coordinates for two locations
        start_coords = (row['StockPoint_Latitude'], row['StockPoint_Longitude'])
        end_coords = (row['Customer_Latitude'], row['Customer_Longitude'])

        #Format coordinates for OSRM API
        url = f"http://router.project-osrm.org/route/v1/driving/{start_coords[1]},{start_coords[0]};{end_coords[1]},{end_coords[0]}?overview=false"

        #Send the request to the OSRM API
        response = requests.get(url)
        data = response.json()

        #Extract distance in meters
        try:
            distance_meters = data['routes'][0]['distance']
        except:
            return np.nan
        distance_km = distance_meters / 1000  # Convert to kilometers
        # print(f"Distance: {distance_km} km")

        return distance_km
    
    else:
        #Specify coordinates for two locations
        start_coords = row
        end_coords = (another_parameter1, another_parameter2)

        #Format coordinates for OSRM API
        url = f"http://router.project-osrm.org/route/v1/driving/{start_coords[1]},{start_coords[0]};{end_coords[1]},{end_coords[0]}?overview=false"

        # Send the request to the OSRM API
        response = requests.get(url)
        data = response.json()

        #Extract distance in meters
        try:
            distance_meters = data['routes'][0]['distance']
        except:
            return np.nan
        distance_km = distance_meters / 1000  # Convert to kilometers
        #print(f"Distance: {distance_km} km")

        return distance_km


@task
def calculate_batch_distances(current_location, customers, batch_size=20):
    base_url = "http://router.project-osrm.org/route/v1/driving/"
    distances = pd.Series(np.nan, index=customers.index)
    
    #Prepare source and destinations
    source = f"{current_location[1]},{current_location[0]}"  #lon,lat
    destinations = customers[['Customer_Longitude', 'Customer_Latitude']].values.tolist()
    
    def fetch_distance(batch_dests, batch_indices):
        #Format coordinates for OSRM multi-point route
        coords = [source] + [f"{lon},{lat}" for lon, lat in batch_dests]
        url = base_url + ";".join(coords) + "?overview=false"
        
        try:
            response = requests.get(url, timeout=30)
            response.raise_for_status()
            data = response.json()
            #Extract leg distances (between consecutive points)
            leg_distances = [leg['distance'] / 1000 for leg in data['routes'][0]['legs']]  #km
            return batch_indices, leg_distances
        except:
            return batch_indices, [np.nan] * len(batch_dests)
    
    #Process in batches with parallel requests
    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = []
        for i in range(0, len(destinations), batch_size):
            batch_dests = destinations[i:i + batch_size]
            batch_indices = customers.index[i:i + batch_size]
            futures.append(executor.submit(fetch_distance, batch_dests, batch_indices))
            time.sleep(0.1)  #Avoid overwhelming public API
        
        for future in futures:
            batch_indices, batch_distances = future.result()
            distances.loc[batch_indices] = batch_distances
    
    return distances


In [6]:
@task
def optimal_kmeans_clusters(coords, max_k=10): #max_k was formerly 7
    # Compute inertia (sum of squared distances) for a range of k values
    distortions = []

    if max_k > len(coords):
        max_k = len(coords)

    for k in range(1, max_k + 1):
        kmeans = KMeans(n_clusters=k, random_state=42).fit(coords)
        
        distortions.append(kmeans.inertia_)
    
    # Automatically detect the "elbow" (where inertia starts decreasing slower)
    # Compute the second derivative to find the elbow point
    deltas = np.diff(distortions)
    deltas2 = np.diff(deltas)  # 2nd derivative
    # optimal_k = np.argmin(deltas2) + 2  # Add 2 because we applied diff() twice

    #This simply gives the k number that returned the least inertia
    optimal_k = distortions.index(min(distortions)) + 1 #Added 1 because python numbering/indexing starts from zero

    return optimal_k


@task
def get_customer_cluster(final_order_df, eps_km=2, min_samples=5, max_k=10):   #max_k formerly 7
    kms_per_radian = 6371.0088  
    epsilon = eps_km / kms_per_radian 
    db = DBSCAN(eps=epsilon, min_samples=min_samples, algorithm='ball_tree', metric='haversine')

    customer_coords = np.radians(final_order_df[['Customer_Latitude', 'Customer_Longitude']])

    customer_coords['Customer_Latitude'] = customer_coords['Customer_Latitude'].astype(str)
    customer_coords = customer_coords[customer_coords['Customer_Latitude'].str.lower() != 'nan']

    customer_coords['Customer_Latitude'] = customer_coords['Customer_Latitude'].astype(float)

    #Temp Resolution for final_order_df
    final_order_df['Customer_Latitude'] = final_order_df['Customer_Latitude'].astype(str)
    final_order_df = pd.DataFrame(final_order_df[final_order_df['Customer_Latitude'].str.lower() != 'nan'])
    final_order_df['Customer_Latitude'] = final_order_df['Customer_Latitude'].astype(float)

    #cluster assignment
    final_order_df['cluster'] = db.fit_predict(customer_coords)

    final_order_df['is_noise'] = (final_order_df['cluster'] == -1).astype(int)
    noise_points = pd.DataFrame(final_order_df[final_order_df['is_noise'] == 1])
    clustered_points =pd.DataFrame(final_order_df[final_order_df['is_noise'] == 0])

    #Using 50th percentile to consider where most of the points are within (neglecting 50%)
    max_noise_distance_benchmark = np.quantile(noise_points["Distance_From_StockPoint"].unique(), 0.50) #noise_points["Distance_From_StockPoint"].unique().quantile(0.70)  

    #Using 500m to give a sense of how clubbed we want the clusters to be
    proposed_number_of_clusters = round(max_noise_distance_benchmark / 2)
    if proposed_number_of_clusters < 1:  #incase the division cannot be rounded up to 1
        proposed_number_of_clusters = 1

    
    if not noise_points.empty: #and not clustered_points.empty:
        noise_coords = np.radians(noise_points[['Customer_Latitude', 'Customer_Longitude']].values)
        clustered_coords = np.radians(clustered_points[['Customer_Latitude', 'Customer_Longitude']].values)
        optimal_clusters = optimal_kmeans_clusters(noise_coords, max_k=proposed_number_of_clusters)   #formerly clustered_coords
        
        kmeans = KMeans(n_clusters=optimal_clusters, random_state=42)
        noise_clusters = kmeans.fit_predict(noise_coords)

        # final_order_df.loc[noise_points.index, 'cluster'] = noise_clusters + final_order_df['cluster'].max() + 1
        final_order_df['cluster'].loc[noise_points.index] = noise_clusters + final_order_df['cluster'].max() + 1

    
    return final_order_df


@task
def calculate_cluster_quantity(final_order_df):
    cluster_info_df = final_order_df.groupby('cluster').agg({
        'Quantity': 'sum',
        'Customer_Latitude': 'mean',
        'Customer_Longitude': 'mean',
        'StockPoint_Latitude': 'first',
        'StockPoint_Longitude': 'first'
    }).reset_index()
    
    cluster_info_df['Distance_From_StockPoint'] = cluster_info_df.apply(calculate_pathway_distance, axis=1)
    
    return cluster_info_df


### Run Logic

In [19]:
orders_csv = fetch_order_data()
warehouse_csv = fetch_stockpoint_data()
vehicle_df = fetch_vehicle_data()

In [20]:
orders_df = orders_csv.copy()
orders_df['BusinessId'] = warehouse_csv['StockPointId'][0]
orders_df = pd.DataFrame(pd.merge(left = orders_df, right = warehouse_csv, left_on = "BusinessId", right_on = "StockPointId", how = "inner"))

orders_df['Distance_From_StockPoint'] = orders_df.apply(calculate_pathway_distance, axis=1) 

orders_df.dropna(axis = 0, how = "all", inplace = True) 

In [21]:
final_order_df = get_customer_cluster(orders_df, eps_km=2, min_samples=5)

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  final_order_df['cluster'].loc[noise_points.index] = noise_clusters + final_order_df['cluster'].max() + 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/index

In [26]:
final_order_df.cluster.value_counts()
final_order_df.columns

Index(['Customer_Latitude', 'Customer_Longitude', 'BusinessId',
       'StockPointName', 'StockPointId', 'StockPoint_Latitude',
       'StockPoint_Longitude', 'Distance_From_StockPoint', 'cluster',
       'is_noise'],
      dtype='object')

In [27]:

from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
def evaluate_unsupervised_clustering(df):
    # Usage:
    X = df[['Customer_Latitude', 'Customer_Longitude']].values
    labels = df['cluster'].values
    scores = {
        "Silhouette Score":  silhouette_score(X, labels).round(2),
        "Davies-Bouldin Index": davies_bouldin_score(X, labels).round(2),
        "Calinski-Harabasz Score": calinski_harabasz_score(X, labels).round(2)
    }

    for key in scores:
        print(f"{key}: {scores[key]}")
    return scores

In [28]:
evaluate_unsupervised_clustering(final_order_df)

Silhouette Score: 0.32
Davies-Bouldin Index: 0.4
Calinski-Harabasz Score: 31.68


{'Silhouette Score': np.float64(0.32),
 'Davies-Bouldin Index': np.float64(0.4),
 'Calinski-Harabasz Score': np.float64(31.68)}

In [None]:
@flow(log_prints = True)
def execute_algorithm():
    pd.set_option('display.max_columns', None)
    # stock_csv = fetch_current_stock_data()
    orders_csv = fetch_order_data()
    warehouse_csv = fetch_stockpoint_data()
    vehicle_df = fetch_vehicle_data()

    # van_recommendation_csv = f"C:/Users/ME/OneDrive - Mplify Limited/Omnibiz Africa/Projects/GIT/Personal/Van-Route Optimization/Omnibiz/Van_Recommendation {current_date}.csv"


    # orders_df = pd.DataFrame(pd.merge(left = orders_csv, right = warehouse_csv, left_on = "BusinessId", right_on = "StockPointId", how = "inner"))
    orders_df['Distance_From_StockPoint'] = orders_df.apply(calculate_pathway_distance, axis=1) 
    # orders_df = pd.DataFrame(stock_check(stock_csv, orders_df))  
    # orders_df = customer_order_clubbing_check(orders_df, vehicle_df)

    orders_df.dropna(axis = 0, how = "all", inplace = True)  

    final_order_df = get_customer_cluster(orders_df, eps_km=2, min_samples=5)

    cluster_info_df = calculate_cluster_quantity(final_order_df)

    # van_recommendations = assign_vans_to_clusters(cluster_info_df, final_order_df, vehicle_df)
 
    # van_recommendations = calculate_van_route(van_recommendations)

    # print("Total Customers: ", van_recommendations["CustomerId"].nunique())
    # print("Total Vehicles: ", van_recommendations["VehicleNumber"].nunique())
    # print("Shortest Distance Covered: ", van_recommendations["Van_Min_Distance"].min())
    # print("Longest Distance Covered: ", van_recommendations["Van_Max_Distance"].max())

    # #save the loading & routing solution to a file
    # van_recommendations.to_csv(van_recommendation_csv, index = False)



In [None]:
execute_algorithm()