# EfficientRide Logistics Dataset Overview

The dataset represents the day-to-day operations of EfficientRide Logistics. Here's a summary of its components:

## Orders

Each entry in the dataset represents an individual order. Key information recorded includes:
- Order ID
- Pickup date
- Lead time
- Volume
- Status (Delivered, On The Way, Not Picked Up)

## Delivery

Additional details related to the delivery are included:
- Delivery type (Direct or Not Direct)
- Delivery date
- Pickup address
- Delivery location
- Trailer requirement for each order

## Equipment and Drivers

Each order is associated with specific equipment and a driver. The dataset includes:
- Unique ID for each piece of equipment
- Unique ID for each driver

## Events, Cost, and Profit

The dataset also records:
- Any issues that occurred during the delivery process (Events)
- The cost incurred for each order
- The profit generated from each order

## Location Data

Detailed location data is included, with:
- Latitude and longitude for each location
- Location types: Ports, Customer DC, Customer Stores, and Carrier Warehouses

## Driver Information

Detailed information about each driver is included, such as:
- Hire date
- The type of license they hold

This dataset provides a comprehensive view of EfficientRide Logistics' operations, from order pickup to delivery. It demonstrates the company's meticulous approach to recording data, highlighting its commitment to operational efficiency and customer service.



In [None]:
# Import required packages
import random
import pandas as pd
from faker import Faker
import datetime

In [None]:

# Define number of locations of each type
num_ports = 1
num_customer_dcs = 2
num_customer_stores = 30
num_carrier_warehouses = 1
# Define number of equipment units of each type
num_trailers = 30       # 'Trailer' units (TR-xxx IDs)
num_refrigs = 5         # 'Refrigerated Truck' units (RE-xxx IDs)
num_city_delivs = 40    # 'City Delivery' units (CD-xxx IDs)
# Define number of drivers and the probability that a generated driver
# receives a 'Trailer' license (the remainder receive a 'Truck' license)
num_drivers = 100
pct_trailer_driver=0.5


In [None]:

# Initialize Faker
fake = Faker()

# Set seeds for reproducibility.
# `random` drives the coordinates, volumes, lead times and assignments.
# Faker draws names and hire dates from its own generator, which the global
# `random.seed` call does not touch, so it is seeded separately.
random.seed(42)
Faker.seed(42)


# Function to generate random latitude and longitude around a given point
def generate_coords(lat, long, delta=0.1):
    """Return a (latitude, longitude) pair jittered around the given point.

    Each coordinate is shifted by an independent uniform draw from
    [-delta, delta] and rounded to four decimal places. The latitude offset
    is drawn before the longitude offset.
    """
    jittered_lat = lat + random.uniform(-delta, delta)
    jittered_long = long + random.uniform(-delta, delta)
    return round(jittered_lat, 4), round(jittered_long, 4)

# Create Locations DataFrame
# One entry per location, grouped by type in a fixed order.
location_types = (
    ['Port'] * num_ports
    + ['Customer DC'] * num_customer_dcs
    + ['Customer Store'] * num_customer_stores
    + ['Carrier Warehouse'] * num_carrier_warehouses
)

location_names = []
location_latitudes = []
location_longitudes = []

# The running number in the name counts across all types, starting at 1.
for position, loc_type in enumerate(location_types, start=1):
    location_names.append(f'{loc_type} Location {position}')
    # Every location is scattered around the same reference point.
    latitude, longitude = generate_coords(45.5019, -73.5674)
    location_latitudes.append(latitude)
    location_longitudes.append(longitude)

locations_data = {
    'Location Name': location_names,
    'Location Type': location_types,
    'Location Latitude': location_latitudes,
    'Location Longitude': location_longitudes
}

locations_df = pd.DataFrame(locations_data)


# Create Trailers DataFrame
# (ID prefix, equipment type, number of units, capacity per unit)
equipment_specs = [
    ('TR', 'Trailer', num_trailers, 2000),
    ('RE', 'Refrigerated Truck', num_refrigs, 2000),
    ('CD', 'City Delivery', num_city_delivs, 700),
]

trailers_data = {
    'Equipment ID': [],
    'Equipment Type': [],
    'Capacity': []
}

for id_prefix, equipment_type, unit_count, unit_capacity in equipment_specs:
    # IDs are numbered from 1 within each type and zero-padded to three digits.
    for unit_number in range(1, unit_count + 1):
        trailers_data['Equipment ID'].append(f'{id_prefix}-{str(unit_number).zfill(3)}')
        trailers_data['Equipment Type'].append(equipment_type)
        trailers_data['Capacity'].append(unit_capacity)

trailers_df = pd.DataFrame(trailers_data)

# Create Orders DataFrame
orders_data = {
    'Order ID': [],
    'Pickup Date': [],
    'Lead Time': [],
    'Volume': []
}

# One entry per calendar day, from two years back through two weeks ahead
start_date = datetime.datetime.today() - datetime.timedelta(days=2 * 365)
dates = pd.date_range(start=start_date, end=datetime.datetime.today() + datetime.timedelta(days=14))

# Base daily order counts (cycled day by day) and one multiplier per calendar month
trend_pattern = [85, 90, 95, 100, 105, 110, 115, 120]
seasonality_pattern = [0.8, 0.9, 1.1, 1.2, 1.3, 1.4, 1.3, 1.2, 1.1, 1.0, 0.9, 0.8]

# Order IDs are consecutive integers starting at 1 across all days
next_order_id = 1

for day_number, date in enumerate(dates):
    # Base count for the day, taken cyclically from the trend pattern
    trend_value = trend_pattern[day_number % len(trend_pattern)]

    # Multiplier for the month the day falls in
    seasonality_value = seasonality_pattern[date.month - 1]

    # Daily order count is the truncated product of the two
    num_orders = int(trend_value * seasonality_value)

    for order_id in range(next_order_id, next_order_id + num_orders):
        orders_data['Order ID'].append(order_id)
        orders_data['Pickup Date'].append(date)
        # Lead time is drawn before volume for every order
        orders_data['Lead Time'].append(random.randint(0, 5))
        orders_data['Volume'].append(random.randint(200, 2000))

    next_order_id += num_orders

orders_df = pd.DataFrame(orders_data)

# Add other fields based on initial fields
# Lead times above one day are flagged 'Not Direct'; 0 or 1 day is 'Direct'.
orders_df['Delivery Type'] = orders_df['Lead Time'].apply(lambda x: 'Not Direct' if x > 1 else 'Direct')

# Delivery date = pickup date + lead time in days, reduced to a plain date.
orders_df['Delivery Date'] = (pd.to_datetime(orders_df['Pickup Date']) + pd.to_timedelta(orders_df['Lead Time'], unit='d')).dt.date

# Pickups start at a Port or Customer DC; deliveries end at a Customer Store.
pickup_location_names = locations_df[locations_df['Location Type'].isin(['Port', 'Customer DC'])]['Location Name'].values
orders_df['Pickup Address'] = random.choices(pickup_location_names, k=len(orders_df))
store_location_names = locations_df[locations_df['Location Type'] == 'Customer Store']['Location Name'].values
orders_df['Delivery Location'] = random.choices(store_location_names, k=len(orders_df))

# Status is derived from the pickup date alone. The reference date is read
# once so that every row is classified against the same day, instead of
# calling datetime.today() again for each row.
today = datetime.datetime.today().date()
orders_df['Status'] = orders_df['Pickup Date'].apply(
    lambda x: 'Delivered' if x.date() < today
    else ('On The Way' if x.date() == today else 'Not Picked Up')
)


# Add Trailer Requirement based on Volume
def trailer_requirement(volume):
    """Return the equipment type an order of the given volume requires.

    Volumes below 700 always map to 'City Delivery' and consume no random
    draw. Any other volume draws once: with 5% probability the result is
    'Refrigerated Truck', otherwise 'Trailer'.
    """
    if volume < 700:
        return 'City Delivery'

    needs_refrigeration = random.random() < 0.05
    return 'Refrigerated Truck' if needs_refrigeration else 'Trailer'

# Equipment type required by each order, derived from its volume
orders_df['Trailer Requirement'] = orders_df['Volume'].apply(trailer_requirement)

# Driver information
# The columns are generated one after another (IDs, names, hire dates,
# license types) so the sequence of random draws stays the same.
driver_ids = [f'DR-{str(number).zfill(3)}' for number in range(1, num_drivers + 1)]
driver_names = [fake.name() for _ in range(num_drivers)]
hire_dates = [fake.date_between(start_date='-5y', end_date='today') for _ in range(num_drivers)]

license_types = []
for _ in range(num_drivers):
    # One draw per driver: below the threshold -> 'Trailer', otherwise 'Truck'
    if random.random() < pct_trailer_driver:
        license_types.append('Trailer')
    else:
        license_types.append('Truck')

drivers_data = {
    'Driver ID': driver_ids,
    'Driver Name': driver_names,
    'Hired Date': hire_dates,
    'License Type': license_types,
}

drivers_df = pd.DataFrame(drivers_data)

# Initialize Schedule DataFrame
schedule_data = {
    'Date': [],
    'Driver ID': [],
    'Equipment ID': []
}

# NOTE(review): the order-scheduling cell further down rebuilds `schedule_df`
# from scratch, so this equipment-level schedule appears to be superseded
# there. Confirm whether it is still needed.

# Rows are collected in a list and turned into a DataFrame once at the end.
# DataFrame.append was removed in pandas 2.0, and re-filtering a growing
# frame on every iteration made the loop quadratic.
schedule_rows = []
scheduled_equipment_days = set()   # (date, equipment id) pairs already filled
drivers_booked_on = {}             # date -> set of driver ids already assigned

# Ordered, de-duplicated driver list so that random.choice is reproducible
# under the seeded `random` module (iterating a set of strings is not).
all_driver_ids = list(dict.fromkeys(drivers_df['Driver ID']))

# Assign drivers to equipment
for equipment_id in trailers_df['Equipment ID']:
    # Each piece of equipment is scheduled daily from a random start date until today.
    # NOTE(review): the original comment describes a 6-month period, but Faker
    # appears to read a lowercase 'm' as minutes ('M' being months). Verify
    # against the Faker date_time documentation before relying on the window length.
    first_day = pd.to_datetime(fake.date_between(start_date='-6m', end_date='today'))
    date_range = pd.date_range(start=first_day, end=datetime.datetime.today())

    for date in date_range:
        # Skip if this equipment has already been assigned a driver for this day
        if (date, equipment_id) in scheduled_equipment_days:
            continue

        # A driver can take only one assignment per day
        booked = drivers_booked_on.setdefault(date, set())
        available_drivers = [driver_id for driver_id in all_driver_ids if driver_id not in booked]

        # If there are no available drivers, skip this day
        if not available_drivers:
            continue

        assigned_driver = random.choice(available_drivers)
        booked.add(assigned_driver)
        scheduled_equipment_days.add((date, equipment_id))

        # Add the assignment to the schedule
        schedule_rows.append({
            'Date': date,
            'Driver ID': assigned_driver,
            'Equipment ID': equipment_id
        })

schedule_df = pd.DataFrame(schedule_rows, columns=list(schedule_data))




This code is responsible for scheduling orders. The order data includes information like the pickup date, volume, and type of trailer required. The schedule will include the date of the order, the order ID, the driver assigned to the order, and the equipment (trailer or truck) used for the order.

An empty DataFrame schedule_df is created, which will later be filled with scheduling details for each order.

For each order in orders_df, the code checks whether the pickup date is today or earlier. Orders whose pickup date is still in the future are skipped, and the loop moves on to the next order.

The code retrieves a subset of equipment available for the order based on the trailer requirement. If there is no equipment available that fits the order's needs, the code skips the current iteration of the loop.

A piece of equipment is randomly selected from the available options, and the code checks if there's enough capacity on the chosen equipment for the order. It does this by summing the volumes of all orders already assigned to that equipment on the pickup date and comparing it to the equipment's capacity. If there isn't enough capacity left for the current order, the loop iteration is skipped.

The code generates a set of drivers who are qualified to operate the chosen equipment.

For each driver, the code checks if they are on vacation or already assigned to another order on the pickup date. If so, the driver is removed from the set of available drivers.

If there are no available drivers left after the above checks, the loop iteration is skipped.

A driver is then randomly selected from the available options.

If the order's delivery type is "Not Direct", the location is set to "Carrier Warehouse" on the pickup date. Otherwise, for "Direct" deliveries, the location is set to the actual delivery location.

The order details, including the order ID, pickup date, assigned driver, and assigned equipment, are appended to the schedule DataFrame.

For "Not Direct" deliveries, another row is added to the schedule for the delivery from the Carrier Warehouse to the Customer Store on the delivery date.

By the end of this process, every order that can be fulfilled (given the constraints of driver availability, equipment availability and capacity) has been assigned a driver and equipment, and these details have been added to schedule_df. The schedule also reflects whether the delivery is direct or not, with non-direct deliveries being routed through the carrier's warehouse.






In [None]:
# Initialize Schedule DataFrame
schedule_data = {
    'Date': [],
    'Order ID': [],
    'Driver ID': [],
    'Equipment ID': [],
    'Volume': [],
    'Location': []
}

# Reference date: orders with a later pickup date are not scheduled.
today = datetime.datetime.today().date()

# NOTE(review): no visible cell creates `vacations_df`. If an earlier cell
# defines it, that frame is used unchanged; otherwise fall back to an empty
# vacation table so scheduling can run. Confirm where vacations should come from.
if 'vacations_df' not in globals():
    vacations_df = pd.DataFrame(columns=['Driver ID', 'Start Date', 'End Date'])

# --- Lookups built once, instead of re-filtering DataFrames for every order ---

# Equipment rows grouped by equipment type
equipment_by_type = {}
for _, equipment in trailers_df.iterrows():
    equipment_by_type.setdefault(equipment['Equipment Type'], []).append(equipment)

# Driver IDs grouped by license type (kept in frame order for reproducibility)
drivers_by_license = {}
for driver_id, license_type in zip(drivers_df['Driver ID'], drivers_df['License Type']):
    drivers_by_license.setdefault(license_type, []).append(driver_id)

# Vacation periods grouped by driver
vacations_by_driver = {}
for _, vacation in vacations_df.iterrows():
    vacations_by_driver.setdefault(vacation['Driver ID'], []).append(
        (vacation['Start Date'], vacation['End Date'])
    )

# --- Running state of the pickup-day schedule ---
schedule_rows = []        # collected rows; DataFrame.append was removed in pandas 2.0
volume_loaded = {}        # (pickup date, equipment id) -> volume already assigned
drivers_booked_on = {}    # pickup date -> driver ids already assigned that day
pickup_assignments = {}   # order id -> (driver id, equipment id) of the pickup leg

# Assign drivers to orders based on equipment requirement, availability, and capacity
for _, order in orders_df.iterrows():
    pickup_date = order['Pickup Date']

    # Only assign drivers to orders whose pickup date is today or earlier
    if pickup_date.date() > today:
        continue

    # Get the equipment matching this order's requirement
    candidate_equipment = equipment_by_type.get(order['Trailer Requirement'])

    # If there's no matching equipment, skip this order
    if not candidate_equipment:
        continue

    # Choose a random equipment. The seeded `random` module is used rather
    # than DataFrame.sample so the choice is covered by random.seed.
    assigned_equipment = random.choice(candidate_equipment)
    equipment_id = assigned_equipment['Equipment ID']

    # Check if the equipment has enough capacity left on the pickup date
    load_key = (pickup_date, equipment_id)
    already_loaded = volume_loaded.get(load_key, 0)
    if assigned_equipment['Capacity'] - already_loaded < order['Volume']:
        continue

    # Trailers and refrigerated trucks need a 'Trailer' license, anything else a 'Truck' license
    if assigned_equipment['Equipment Type'] in ['Trailer', 'Refrigerated Truck']:
        licensed_drivers = drivers_by_license.get('Trailer', [])
    else:
        licensed_drivers = drivers_by_license.get('Truck', [])

    # Drop drivers who are on vacation or already assigned to another order on the pickup date
    booked = drivers_booked_on.setdefault(pickup_date, set())
    available_drivers = [
        driver_id for driver_id in licensed_drivers
        if driver_id not in booked
        and not any(start <= pickup_date <= end
                    for start, end in vacations_by_driver.get(driver_id, ()))
    ]

    # If there are no available drivers, skip this order
    if not available_drivers:
        continue

    # Choose a random driver
    assigned_driver = random.choice(available_drivers)

    # "Not Direct" orders go to the Carrier Warehouse on the pickup date;
    # "Direct" orders go straight to the delivery location
    if order['Delivery Type'] == 'Not Direct':
        location = 'Carrier Warehouse'
    else:
        location = order['Delivery Location']

    # Record the assignment
    booked.add(assigned_driver)
    volume_loaded[load_key] = already_loaded + order['Volume']
    pickup_assignments[order['Order ID']] = (assigned_driver, equipment_id)
    schedule_rows.append({
        'Date': pickup_date,
        'Order ID': order['Order ID'],
        'Driver ID': assigned_driver,
        'Equipment ID': equipment_id,
        'Volume': order['Volume'],
        'Location': location
    })

# For "Not Direct" deliveries, add another row in the schedule for the delivery
# from the Carrier Warehouse to the Customer Store on the Delivery Date
not_direct_orders = orders_df[orders_df['Delivery Type'] == 'Not Direct']
for _, order in not_direct_orders.iterrows():
    # Orders that were skipped above (future pickup, no capacity, no driver)
    # have no pickup leg; looking one up unconditionally raised IndexError.
    pickup_leg = pickup_assignments.get(order['Order ID'])
    if pickup_leg is None:
        continue

    # Reuse the driver and equipment assigned on the Pickup Date
    assigned_driver, assigned_equipment = pickup_leg

    schedule_rows.append({
        # Stored as a Timestamp so the Date column holds a single type
        'Date': pd.Timestamp(order['Delivery Date']),
        'Order ID': order['Order ID'],
        'Driver ID': assigned_driver,
        'Equipment ID': assigned_equipment,
        'Volume': order['Volume'],
        'Location': order['Delivery Location']
    })

schedule_df = pd.DataFrame(schedule_rows, columns=list(schedule_data))


1. The code begins by dropping the 'Driver ID' and 'Equipment ID' columns from `orders_df` if they already exist. This is done to prevent duplicate columns when we merge `orders_df` with `schedule_df`.

2. The 'Date' column in `schedule_df` is converted to datetime to ensure consistency across dataframes.

3. The `orders_df` is then merged with `schedule_df` on the 'Order ID' and 'Pickup Date'. The merge is a left merge, meaning all rows from `orders_df` will be included in the final dataframe, and matching rows from `schedule_df` will be added where available. 

4. After the merge, the redundant 'Date' column from `schedule_df` is dropped.

5. Three new columns are added to `orders_df`: 'Event', 'Cost', and 'Profit'. 'Event' is initialized to 0, indicating 'No issue'. 'Cost' and 'Profit' are calculated using arbitrary formulas involving the 'Volume' and 'Lead Time' of the orders.

6. The code then iterates over each order in `orders_df`. If the order's status is not 'Delivered', it skips the current iteration.

7. For each delivered order, the code first retrieves the seniority of the assigned driver from `drivers_df`. If no matching driver is found, the driver's experience is defaulted to 0.

8. The code then generates an event based on various conditions related to the order's volume, the trailer requirement, and the driver's experience. For example, if the order's volume is greater than 1900, there's a 5% chance that the event will be set to 3, indicating a cancellation. Depending on the event, a penalty is also deducted from the order's profit.

9. Finally, an event description dataframe (`event_description_df`) is created, mapping each event number to a description.

The end result of this section is an updated `orders_df` that includes the assigned driver and equipment for each order, the cost and profit of each order, and any event that occurred during the delivery of the order. This allows for further analysis of the logistics operations.

In [None]:
# Drop columns if they already exist in orders_df (avoids duplicates when this cell is re-run)
orders_df = orders_df.drop(columns=['Driver ID', 'Equipment ID'], errors='ignore')

# Convert 'Date' in schedule_df to datetime
schedule_df['Date'] = pd.to_datetime(schedule_df['Date'])

# Attach the driver and equipment of the pickup-day schedule row to each order.
# Left merge: orders without a schedule row keep NaN in both columns.
orders_df = orders_df.merge(schedule_df[['Date', 'Order ID', 'Driver ID', 'Equipment ID']],
                            how='left',
                            left_on=['Order ID', 'Pickup Date'],
                            right_on=['Order ID', 'Date'])

orders_df.drop(columns='Date', inplace=True)

orders_df['Event'] = 0  # default to 'No issue'
orders_df['Cost'] = orders_df['Volume'] * 0.1 + orders_df['Lead Time'] * 50  # arbitrary cost calculation
orders_df['Profit'] = orders_df['Volume'] * 0.2 - orders_df['Cost']  # arbitrary profit calculation

# Driver experience lookup, built once instead of filtering drivers_df per order.
# A 'Seniority' column is used when present; otherwise it is derived from
# 'Hired Date'.
# NOTE(review): seniority is assumed to be measured in years (the accident
# rule below tests `< 2`). Confirm the intended unit.
if 'Seniority' in drivers_df.columns:
    seniority_values = drivers_df['Seniority']
else:
    hired_on = pd.to_datetime(drivers_df['Hired Date'])
    seniority_values = (pd.Timestamp.today().normalize() - hired_on).dt.days / 365.25
seniority_by_driver = dict(zip(drivers_df['Driver ID'], seniority_values))

# Generate events
for i, order in orders_df.iterrows():
    # Events are only generated for delivered orders
    if order['Status'] != 'Delivered':
        continue

    # Get driver experience; orders without a matching driver default to 0
    driver_experience = seniority_by_driver.get(order['Driver ID'], 0)

    # Generate event based on correlations. The rules are tried in this order
    # and at most one event is recorded per order.
    if order['Volume'] > 1900 and random.random() < 0.05:  # 5% chance of cancellation for very high volume orders
        orders_df.at[i, 'Event'] = 3
        orders_df.at[i, 'Profit'] -= 5000  # arbitrary penalty for cancellation
    elif order['Trailer Requirement'] == 'Refrigerated Truck' and random.random() < 0.02:  # 2% chance of damage for refrigerated truck orders
        orders_df.at[i, 'Event'] = 4
        orders_df.at[i, 'Profit'] -= 2000  # arbitrary penalty for damage
    elif driver_experience < 2 and random.random() < 0.01:  # 1% chance of accident for inexperienced drivers
        orders_df.at[i, 'Event'] = 5
        orders_df.at[i, 'Profit'] -= 5000  # arbitrary penalty for accident
    elif order['Volume'] > 1000 and random.random() < 0.1:  # 10% chance of complaint for high volume orders
        orders_df.at[i, 'Event'] = 2
        orders_df.at[i, 'Profit'] -= 500  # arbitrary penalty for complaint
    elif order['Lead Time'] > 3 and random.random() < 0.2:  # 20% chance of delay for long lead time orders
        orders_df.at[i, 'Event'] = 1
        orders_df.at[i, 'Profit'] -= 100  # arbitrary penalty for delay

# Create Event Description DataFrame (event code -> human-readable label)
event_description_df = pd.DataFrame({
    'Event': range(6),
    'Description': ['No issue', 'Delay', 'Complaint', 'Cancellation', 'Damage', 'Accident']
})



In [None]:
# Save to Excel
# (sheet name, frame) pairs, written in this order into a single workbook
workbook_sheets = [
    ('Locations', locations_df),
    ('Trailers Pool', trailers_df),
    ('Orders', orders_df),
    ('Drivers', drivers_df),
    ('Schedule', schedule_df),
    ('Vacations', vacations_df),
    ('Event Description', event_description_df),
]

with pd.ExcelWriter('EfficientRide_Dataset2.xlsx', engine='openpyxl') as writer:
    for sheet_title, frame in workbook_sheets:
        # Row index is omitted from every sheet
        frame.to_excel(writer, sheet_name=sheet_title, index=False)
