In [57]:
# Import required packages
import random
import pandas as pd
from faker import Faker
import datetime
from datetime import date
import numpy as np
import warnings


In [58]:
order_size=4# 1 to 5, 1 for smaller dataset


# how many years do we want to simulate?
simulation_duration= 3 #years

# Define number of locations of each type
num_ports = 1
num_customer_dcs = 2
num_customer_stores = 30
num_carrier_warehouses = 1

# Define number of trailers of each type
num_trailers = 40
num_refrigs = 5
num_city_delivs = 30
#define number of drivers
num_drivers = 55 # number of driver

pct_class1_driver=0.7 # % of class one driver


### Set Dataframes for Locations, Orders , Equipments, Drivers and Vacations

In [59]:
warnings.filterwarnings('ignore')

# Initialize Faker
fake = Faker()

# Set seed for reproducibility
random.seed(42)

# Function to generate random latitude and longitude around a given point
def generate_coords(lat, long, delta=0.1):
    return round(lat + random.uniform(-delta, delta), 4), round(long + random.uniform(-delta, delta), 4)

# Create Locations DataFrame
locations_data = {
    'Location Name': [],
    'Location Type': ['Port']*num_ports + ['Customer DC']*num_customer_dcs + ['Customer Store']*num_customer_stores + ['Carrier Warehouse']*num_carrier_warehouses,
    'Location Latitude': [],
    'Location Longitude': []
}

for i, loc_type in enumerate(locations_data['Location Type']):
    loc_name = f'{loc_type} Location {i+1}'
    locations_data['Location Name'].append(loc_name)
    lat, long = generate_coords(45.5019, -73.5674)
    locations_data['Location Latitude'].append(lat)
    locations_data['Location Longitude'].append(long)

locations_df = pd.DataFrame(locations_data)

# Create Trailers DataFrame
trailers_data = {
    'Equipment ID': [f'TR-{str(i).zfill(3)}' for i in range(1, num_trailers + 1)] + [f'RE-{str(i).zfill(3)}' for i in range(1, num_refrigs + 1)] + [f'CD-{str(i).zfill(3)}' for i in range(1, num_city_delivs + 1)],
    'Equipment Type': ['Trailer']*num_trailers + ['Refrigerated Truck']*num_refrigs + ['City Delivery']*num_city_delivs,
    'Capacity': [2000]*num_trailers + [2000]*num_refrigs + [700]*num_city_delivs
}

trailers_df = pd.DataFrame(trailers_data)

# Create Orders DataFrame
orders_data = {
    'Order ID': [],
    'Pickup Date': [],
    'Lead Time': [],
    'Volume': [],
    'Event': []   
}

# Generate dates and orders for the past 2 years, with 75-120 orders each day
start_date = (datetime.datetime.today() - datetime.timedelta(days=simulation_duration* 365)).date()
dates = pd.date_range(start=start_date, end=datetime.datetime.today().date() + datetime.timedelta(days=14))

# Define trend and seasonality patterns
trend_pattern = [i for i in range(100, 250, 5)]
seasonality_pattern = [0.8, 0.9, 1.1, 1.2, 1.3, 1.4, 1.3, 1.2, 1.1, 1.0, 0.9, 0.8]

for i, date in enumerate(dates):
    # Calculate trend value based on the pattern
    trend_value = trend_pattern[i % len(trend_pattern)]

    # Calculate seasonality value based on the pattern
    seasonality_value = seasonality_pattern[date.month - 1]

    # Calculate the number of orders based on trend and seasonality
    factor= 5/order_size
    num_orders = int((trend_value * seasonality_value)/ factor)

    for _ in range(num_orders):
        orders_data['Order ID'].append(len(orders_data['Order ID']) + 1)
        orders_data['Pickup Date'].append(date.date())
        orders_data['Lead Time'].append(random.randint(0, 5))
        orders_data['Volume'].append(random.randint(200, 2000))
        orders_data['Event'].append(0)

orders_df = pd.DataFrame(orders_data)

# Add other fields based on initial fields
orders_df['Delivery Type'] = orders_df['Lead Time'].apply(lambda x: 'Not Direct' if x > 1 else 'Direct')
orders_df['Delivery Date'] = (pd.to_datetime(orders_df['Pickup Date']) + pd.to_timedelta(orders_df['Lead Time'], unit='d')).dt.date
orders_df['Pickup Address'] = random.choices(locations_df[locations_df['Location Type'].isin(['Port', 'Customer DC'])]['Location Name'].values, k=len(orders_df))
orders_df['Delivery Location'] = random.choices(locations_df[locations_df['Location Type'] == 'Customer Store']['Location Name'].values, k=len(orders_df))

# Set all orders to 'Not Picked up'
orders_df['Status'] = 'Not Picked up'

# Add Trailer Requirement based on Volume
def trailer_requirement(volume):
    if volume < 700:
        return 'City Delivery'
    elif random.random() < 0.05:   # 5% probability of needing a refrigerated trailer
        return 'Refrigerated Truck'
    else:
        return 'Trailer'

orders_df['Trailer Requirement'] = orders_df['Volume'].apply(trailer_requirement)


# Driver information
drivers_data = {
    'Driver ID': [f'DR-{str(i).zfill(3)}' for i in range(1, num_drivers + 1)],
    'Driver Name': [fake.name() for _ in range(num_drivers)],
    'Hired Date': [fake.date_between(start_date='-5y', end_date='today') for _ in range(num_drivers)],
    'License Type': ['Class 1' if random.random() < pct_class1_driver else 'Class 2' for _ in range(num_drivers)],
}

drivers_df = pd.DataFrame(drivers_data)
# Add seniority column to drivers_df
today = datetime.date.today()
drivers_df['Seniority'] = (today - pd.to_datetime(drivers_df['Hired Date']).dt.date).dt.days // 365

# Function to assign vacation days based on seniority
def assign_vacation_days(seniority):
    if seniority < 1:
        return 2
    elif seniority < 2:
        return 3
    elif seniority < 3:
        return 4
    elif seniority < 4:
        return 5
    else:
        return 6

# Assign vacation days
drivers_df['Vacation Days'] = drivers_df['Seniority'].apply(assign_vacation_days)

# Initialize vacations DataFrame
vacations_df = pd.DataFrame(columns=['Driver ID', 'Start Date', 'End Date'])

# Generate random vacation dates
for _, driver in drivers_df.iterrows():
    for _ in range(driver['Vacation Days']):
        start_date = fake.date_between(start_date='-1y', end_date='today')
        end_date = start_date + datetime.timedelta(days=random.randint(1, 7))  # vacation lasts 1-7 days
        vacations_df = vacations_df.append({
            'Driver ID': driver['Driver ID'],
            'Start Date': start_date,
            'End Date': end_date
        }, ignore_index=True)


### Equipement Unavailability
sometimes, equipments might be out of order 

In [60]:
# Initialize Unavailability DataFrame
unavailability_data = {
    'Date': [],
    'Type': [],  # 'Equipment'
    'ID': []  # Equipment ID
}
unavailability_df = pd.DataFrame(unavailability_data)

# Each day, randomly select 5% of equipment to be unavailable
dates = pd.date_range(start=orders_df['Pickup Date'].min(), end=orders_df['Pickup Date'].max())

for date in dates:
    # Equipment
    num_equipment_unavailable = round(0.05 * len(trailers_df))  # 5% of total equipment
    unavailable_equipment = trailers_df.sample(num_equipment_unavailable)['Equipment ID'].tolist()
    
    for equipment_id in unavailable_equipment:
        unavailability_df = unavailability_df.append({
            'Date': date,
            'Type': 'Equipment',
            'ID': equipment_id
        }, ignore_index=True)

unavailability_df.head()


Unnamed: 0,Date,Type,ID
0,2020-08-07,Equipment,CD-024
1,2020-08-07,Equipment,CD-013
2,2020-08-07,Equipment,CD-018
3,2020-08-07,Equipment,TR-019
4,2020-08-08,Equipment,CD-021


### Lets schedule the drivers that are not on vacation

In [61]:
# Initialize Schedule DataFrame
schedule_data = {
    'Date': [],
    'Driver ID': [],
    'Equipment ID': [],
    'Equipment Available': [],
    'License Type': [],
    'Equipment Type': []
}

schedule_df = pd.DataFrame(schedule_data)

# Sort drivers DataFrame so that 'Class 1' drivers come first
sorted_drivers_df = drivers_df.sort_values(by='License Type', ascending=False)

# Determine the earliest and latest dates from the orders
start_date = orders_df['Pickup Date'].min()
end_date = min(orders_df['Delivery Date'].max(), datetime.date.today())

# Assign equipment to drivers based on their license type
for _, driver in sorted_drivers_df.iterrows():
    # Assign equipment to each driver for every day in the order date range
    date_range = pd.date_range(start=start_date, end=end_date)

    for date in date_range:
        # Check if this driver is on vacation or unavailable on this day
        is_on_vacation = not vacations_df[(vacations_df['Driver ID'] == driver['Driver ID']) &
                                          (vacations_df['Start Date'] <= date) &
                                          (vacations_df['End Date'] >= date)].empty
        is_unavailable = not unavailability_df[(unavailability_df['Date'] == date) &
                                                (unavailability_df['Type'] == 'Driver') &
                                                (unavailability_df['ID'] == driver['Driver ID'])].empty

        if is_on_vacation or is_unavailable:
            continue

        # Choose a random available equipment that the driver is qualified to operate
        if driver['License Type'] == 'Class 1':  
            available_equipment = trailers_df[
                ~trailers_df['Equipment ID'].isin(schedule_df[schedule_df['Date'] == date]['Equipment ID'])
            ]
        else:  # 'Class 2'
            available_equipment = trailers_df[
                (trailers_df['Equipment Type'] == 'City Delivery') &
                ~trailers_df['Equipment ID'].isin(schedule_df[schedule_df['Date'] == date]['Equipment ID'])
            ]

        # If there is no available equipment, note 'No Equipment' for this driver
        if available_equipment.empty:
            assigned_equipment_id = None
            assigned_equipment_type = None
            equipment_available = False
        else:
            assigned_equipment = available_equipment.sample(1).iloc[0]
            assigned_equipment_id = assigned_equipment['Equipment ID']
            assigned_equipment_type = assigned_equipment['Equipment Type']
            equipment_available = True

        # Add the assignment to the schedule
        schedule_df = schedule_df.append({
            'Date': date,
            'Driver ID': driver['Driver ID'],
            'Equipment ID': assigned_equipment_id,
            'Equipment Available': equipment_available,
            'License Type': driver['License Type'],
            'Equipment Type': assigned_equipment_type
        }, ignore_index=True)

#update.. date should be in datetime format
schedule_df['Date'] = schedule_df['Date'].dt.date


### Drivers on the schedule can now be matched with orders
!!! The next cell might take a long time to run depending on your data size

In [62]:
# Initialize new columns for the Driver ID, Equipment ID, and Actual Delivery Date in the order DataFrame
orders_df['Driver ID'] = None
orders_df['Equipment ID'] = None
orders_df['Actual Delivery Date'] = orders_df['Delivery Date']
orders_df['Not Delivered Reason'] = None
orders_df['Actual Pickup Date'] = orders_df['Pickup Date']
orders_df['Event'] = 0



# Initialize a list to hold unassigned orders
unassigned_orders = []

for date in sorted(orders_df['Pickup Date'].unique()):
    # Skip if the date is in the future
    if date > datetime.date.today():
        continue

    # Get orders for this date
    orders_for_date = orders_df[orders_df['Pickup Date'] == date].copy()

    # Get available assignments for this date
    available_assignments = schedule_df[schedule_df['Date'] == date]

    # If no available assignments for this date, reschedule unassigned orders
    if len(available_assignments) == 0:
        for _, order in orders_for_date.iterrows():
            unassigned_orders.append(order)
            orders_df.at[order.name, 'Event'] = 3
            orders_df.at[order.name, 'Actual Pickup Date'] = date + datetime.timedelta(days=1)
            orders_df.at[order.name, 'Status'] = "Rescheduled"
    # Loop through each driver available this day
    for driver_id in available_assignments['Driver ID'].unique():
        # Get assignments for this driver
        driver_assignments = available_assignments[available_assignments['Driver ID'] == driver_id]

        # Loop through each equipment this driver can use
        for _, assignment in driver_assignments.iterrows():
            # Get orders that require this type of equipment
            suitable_orders = orders_for_date[orders_for_date['Trailer Requirement'] == assignment['Equipment Type']]
            # Define the possible choices and their corresponding probabilities
            choices = [2, 3, 4]
            probabilities = [0.1,0.6, 0.3]  # 60% chance for 3, 20% chance for 4, and 20% chance for 5

             # Use np.random.choice to select a number of orders based on the defined probabilities
            num_orders = np.random.choice(choices, p=probabilities)

               # Select the determined number of orders
            selected_orders = suitable_orders.sample(min(len(suitable_orders), num_orders))
            # Randomly select 2 to 5 orders for this driver and this equipment
            selected_orders = suitable_orders.sample(min(len(suitable_orders), np.random.randint(2, 6)))

            # Assign the driver and equipment to the selected orders
            for _, order in selected_orders.iterrows():
                orders_df.at[order.name, 'Driver ID'] = driver_id
                orders_df.at[order.name, 'Equipment ID'] = assignment['Equipment ID']
                orders_df.at[order.name, 'Status'] = 'Delivered'
                # Remove assigned orders from orders_for_date
                orders_for_date = orders_for_date.drop(order.name)

    # Add unassigned orders to the unassigned_orders list
    unassigned_orders.append(orders_for_date)

# Concatenate all unassigned orders into a single DataFrame for easy processing
unassigned_orders_df = pd.concat(unassigned_orders)

# For unassigned orders that have been unassigned for more than, for example, 2 days, outsource them
for _, order in unassigned_orders_df.iterrows():
    if (date - order['Pickup Date']).days > 2:
        orders_df.at[order.name, 'Driver ID'] = 'Outsourced'
        orders_df.at[order.name, 'Equipment ID'] = 'Outsourced'
        orders_df.at[order.name, 'Status'] = 'Delivered'
        orders_df.at[order.name, 'Event'] = 3


        unassigned_orders_df = unassigned_orders_df.drop(order.name)


# set future 'Actual_Delivery_Date' to None
orders_df.loc[orders_df['Actual Delivery Date'] > datetime.date.today(), 'Actual Delivery Date'] = None


# Validation

In [63]:
# Logical consistency checks:

# 1. Check if 'Driver ID' and 'Equipment ID' are filled when 'Status' is 'Delivered'
delivered_orders = orders_df['Status'] == 'Delivered'
missing_driver_or_equipment = orders_df['Driver ID'].isnull() | orders_df['Equipment ID'].isnull()
inconsistent_delivered_orders = delivered_orders & missing_driver_or_equipment

# 2. Check if 'Not Delivered Reason' is filled when 'Status' is 'Not Delivered'
not_delivered_orders = orders_df['Status'] == 'Not Delivered' "Not Picked up"
missing_not_delivered_reason = orders_df['Not Delivered Reason'].isnull()
inconsistent_not_delivered_orders = not_delivered_orders & missing_not_delivered_reason

# 3. Check if 'Actual Pickup Date' is later than 'Pickup Date', ignoring rows where 'Actual Pickup Date' is null
actual_pickup_not_null = orders_df['Actual Pickup Date'].notnull()
inconsistent_pickup_dates = actual_pickup_not_null & (orders_df['Actual Pickup Date'] < orders_df['Pickup Date'])

# 4. Check if 'Actual Delivery Date' is later than 'Delivery Date'
inconsistent_delivery_dates = orders_df['Actual Delivery Date'] < orders_df['Delivery Date']

# 5. Check if 'Actual Pickup Date' is later than 'Actual Delivery Date'
inconsistent_actual_dates = orders_df['Actual Pickup Date'] > orders_df['Actual Delivery Date']


# Combine all checks
inconsistencies = pd.DataFrame({
    'Inconsistent Delivered Orders': inconsistent_delivered_orders,
    'Inconsistent Not Delivered Orders': inconsistent_not_delivered_orders,
    'Inconsistent Pickup Dates': inconsistent_pickup_dates,
    'Inconsistent Delivery Dates': inconsistent_delivery_dates,
    'Inconsistent Actual Dates': inconsistent_actual_dates,
})
# Calculate the number of assigned orders
assigned_orders = (~orders_df['Driver ID'].isnull()) & (~orders_df['Equipment ID'].isnull())
not_assigned_orders = (orders_df['Driver ID'].isnull()) & (orders_df['Equipment ID'].isnull())

# Add to the inconsistencies DataFrame
inconsistencies['Assigned Orders'] = assigned_orders
inconsistencies[' notAssigned Orders'] =not_assigned_orders

# Display number of inconsistencies for each check
print(inconsistencies.sum())

# Display number of inconsistencies for each check
inconsistencies.sum()
# Check if there are any orders where the 'Pickup Date' is later than the 'Delivery Date'
invalid_delivery_dates = orders_df[orders_df['Pickup Date'] > orders_df['Delivery Date']]

# Check if there are any orders where the 'Actual Delivery Date' is in the future
future_actual_delivery_dates = orders_df[orders_df['Actual Delivery Date'] > datetime.date.today()]

# Print the results
if not invalid_delivery_dates.empty:
    print("There are orders with 'Pickup Date' later than 'Delivery Date'.")
else:
    print("All orders have 'Pickup Date' earlier than or equal to 'Delivery Date'.")

if not future_actual_delivery_dates.empty:
    print("There are orders with 'Actual Delivery Date' in the future.")
else:
    print("All orders have 'Actual Delivery Date' in the past or today.")


Inconsistent Delivered Orders             0
Inconsistent Not Delivered Orders         0
Inconsistent Pickup Dates                 0
Inconsistent Delivery Dates               0
Inconsistent Actual Dates                 0
Assigned Orders                      163062
 notAssigned Orders                    2850
dtype: int64
All orders have 'Pickup Date' earlier than or equal to 'Delivery Date'.
All orders have 'Actual Delivery Date' in the past or today.


### Create events , cost and profit

In [67]:
# Initialize new columns for Cost and Revenue in the order DataFrame
orders_df['Cost'] = 0.0
orders_df['Revenue'] = 0.0

# Generate events and calculate cost and revenue
for i, order in orders_df.iterrows():
    # Skip orders with a pickup date in the future
    if order['Pickup Date'] > datetime.date.today():
        continue

    # Get driver experience
    driver_experience_values = drivers_df.loc[drivers_df['Driver ID'] == order['Driver ID'], 'Seniority'].values

    if driver_experience_values.size > 0:
        driver_experience = driver_experience_values[0]
    else:
        driver_experience = 0  # or some other default value

    # Basic cost and revenue for delivered orders
    if order['Status'] == 'Delivered':
        orders_df.at[i, 'Cost'] = 5 * order['Volume'] + 200  # cost per unit of volume + overhead cost
        orders_df.at[i, 'Revenue'] = 10 * order['Volume']  # revenue per unit of volume

    # Generate event based on correlations and adjust cost and revenue for non-rescheduled orders
    if order['Event'] != 3:
        # Cancellation
        if order['Volume'] > 1900 and random.random() < (0.05 + (0.02 if driver_experience < 2 else 0)):
            orders_df.at[i, 'Event'] = 4
            orders_df.at[i, 'Driver ID'] = None
            orders_df.at[i, 'Equiment ID'] = None
            orders_df.at[i, 'Status'] = "Canceled"
            orders_df.at[i, 'Cost'] += 5000  # arbitrary penalty for cancellation

        # Damages
        elif order['Trailer Requirement'] == 'Refrigerated Truck' and random.random() < (0.30 + (0.05 if driver_experience < 2 else 0)):
            orders_df.at[i, 'Event'] = 5
            orders_df.at[i, 'Cost'] += 2000  # arbitrary penalty for damage

        # Accident
        elif driver_experience < 1 and random.random() < 0.21:
            if (order['Trailer Requirement'] == 'Refrigerated Truck' and random.random() < 0.15) or random.random() < 0.10 :
                orders_df.at[i, 'Event'] = 6
                orders_df.at[i, 'Cost'] += 5000  # arbitrary penalty for accident

        # Complaint
        elif order['Volume'] > 1000 and random.random() < (0.40 + (0.10 if driver_experience < 2 else 0)):
            orders_df.at[i, 'Event'] = 2
            orders_df.at[i, 'Cost'] += 500  # arbitrary penalty for complaint

        # Delay
        elif order['Lead Time'] > 3 and random.random() < (0.6 + (0.1 if driver_experience < 2 else 0)):
            orders_df.at[i, 'Event'] = 1
            orders_df.at[i, 'Cost'] += 100  # arbitrary penalty for delay

events_data = {
    'Event ID': [0, 1, 2, 3, 4, 5,6],
    'Event Description': ['No issues', 'Delay', 'Complaint','Rescheduled', 'Cancellation', 'Damages', 'Accidents']
}
event_description_df = pd.DataFrame(events_data)


In [69]:
### Save to Excel
with pd.ExcelWriter('EfficientRide_Dataset.xlsx', engine='openpyxl') as writer:
    locations_df.to_excel(writer, sheet_name='Locations', index=False)
    trailers_df.to_excel(writer, sheet_name='Trailers Pool', index=False)
    orders_df.to_excel(writer, sheet_name='Orders', index=False)
    drivers_df.to_excel(writer, sheet_name='Drivers', index=False)
    schedule_df.to_excel(writer, sheet_name='Schedule', index=False)
    vacations_df.to_excel(writer, sheet_name='Vacations', index=False)
    event_description_df.to_excel(writer, sheet_name='Event Description', index=False)
    unavailability_df.to_excel(writer, sheet_name='unavailability', index=False)