<a href="https://colab.research.google.com/github/bhanulk/supplier_reliability_prediction/blob/main/dataset_generation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import random
from datetime import timedelta, datetime

# Set seed for reproducibility
np.random.seed(42)
random.seed(42)

# Parameters
n_suppliers = 150
n_orders = 15000
start_date = datetime(2022, 1, 1)
end_date = datetime(2023, 12, 31)

# Supplier metadata
regions = ["Aluva", "Kochi", "Kothamanglam", "Muvattupuzha","Baypur","Palakkad"]
categories = ["Raw_Materials", "Packaging", "Components", "MRO"]
supplier_ids = [f"SUPP_{i:04d}" for i in range(1, n_suppliers+1)]

supplier_meta = {
    sup: {
        "Region": random.choice(regions),
        "Item_Category": random.choice(categories),
        "Capacity_per_month": np.random.randint(500, 5000),
        "Min_Order_Qty": np.random.randint(50, 200),
        "Max_Order_Qty": np.random.randint(2000, 6000)
    }
    for sup in supplier_ids
}

# Helper function
def random_date(start, end):
    """Generate random datetime between `start` and `end`"""
    delta = end - start
    return start + timedelta(days=random.randint(0, delta.days))

def generate_order(i):
    PO_ID = f"PO_{i:05d}"
    supplier = random.choice(supplier_ids)
    meta = supplier_meta[supplier]

    Order_Date = random_date(start_date, end_date)
    Promised_Lead_Time_days = np.random.randint(5, 30)
    Requested_Delivery_Date = Order_Date + timedelta(days=int(Promised_Lead_Time_days))

    # Actual delivery with possible delay or cancellation
    if np.random.rand() < 0.1:  # 10% chance of cancellation
        Delivery_Date = None
        Order_Status = "Cancelled"
        Quantity_Delivered = 0
        Shipping_Delay_days = None
    else:
        delay = int(np.random.choice([0, 0, 0, 1, 2, 3, 5, 7, 10],
                                     p=[0.5,0.1,0.1,0.1,0.05,0.05,0.05,0.03,0.02]))
        Delivery_Date = Requested_Delivery_Date + timedelta(days=delay)
        Order_Status = "Delivered" if delay < 15 else "Late"
        Quantity_Delivered = np.random.randint(meta["Min_Order_Qty"], meta["Max_Order_Qty"])
        Shipping_Delay_days = delay

    Quantity_Ordered = np.random.randint(meta["Min_Order_Qty"], meta["Max_Order_Qty"])

    # Pricing
    Negotiated_Price = round(np.random.uniform(10, 100), 2)
    Unit_Price = round(Negotiated_Price * np.random.uniform(0.95, 1.1), 2)

    # Defects
    Defective_Units = int(np.random.poisson(lam=Quantity_Delivered * 0.02)) if Quantity_Delivered > 0 else None

    # Compliance
    Compliance = "Yes" if np.random.rand() > 0.1 else "No"

    # Delay reason
    Reason_for_delay = None
    if Delivery_Date and Delivery_Date > Requested_Delivery_Date:
        Reason_for_delay = random.choice(["Logistics", "Raw_Material_Shortage", "Customs", "Weather", "Supplier_Issue"])

    # Other fields
    Urgency = random.choice(["Normal", "Urgent", "Critical"])
    Recorded_Communication_ResponseTime_hrs = round(np.random.uniform(1, 48), 1)

    return [
        PO_ID, supplier, Order_Date.strftime("%Y-%m-%d"), Requested_Delivery_Date.strftime("%Y-%m-%d"),
        Promised_Lead_Time_days,
        Delivery_Date.strftime("%Y-%m-%d") if Delivery_Date else None,
        Order_Status, Quantity_Ordered, Quantity_Delivered, Unit_Price, Negotiated_Price,
        Defective_Units, Compliance, Reason_for_delay, meta["Region"], meta["Item_Category"],
        Urgency, Recorded_Communication_ResponseTime_hrs, meta["Capacity_per_month"],
        meta["Min_Order_Qty"], meta["Max_Order_Qty"], Shipping_Delay_days
    ]

# Generate dataset
orders = [generate_order(i) for i in range(1, n_orders+1)]

columns = [
    "PO_ID", "Supplier_ID", "Order_Date", "Requested_Delivery_Date", "Promised_Lead_Time_days",
    "Delivery_Date", "Order_Status", "Quantity_Ordered", "Quantity_Delivered", "Unit_Price",
    "Negotiated_Price", "Defective_Units", "Compliance", "Reason_for_delay", "Region",
    "Item_Category", "Urgency", "Recorded_Communication_ResponseTime_hrs", "Capacity_per_month",
    "Min_Order_Qty", "Max_Order_Qty","Shipping_Delay_days"
]

synthetic_df = pd.DataFrame(orders, columns=columns)

# Save to CSV
file_path = "synthetic_supplier_dataset.csv"
synthetic_df.to_csv(file_path, index=False)

file_path, synthetic_df.head()


('synthetic_supplier_dataset.csv',
       PO_ID Supplier_ID  Order_Date Requested_Delivery_Date  \
 0  PO_00001   SUPP_0108  2023-11-05              2023-11-23   
 1  PO_00002   SUPP_0145  2023-06-20              2023-07-04   
 2  PO_00003   SUPP_0067  2022-07-29              2022-08-13   
 3  PO_00004   SUPP_0068  2023-02-10              2023-03-03   
 4  PO_00005   SUPP_0077  2023-04-14              2023-05-04   
 
    Promised_Lead_Time_days Delivery_Date Order_Status  Quantity_Ordered  \
 0                       18          None    Cancelled              2732   
 1                       14          None    Cancelled               490   
 2                       15    2022-08-14    Delivered              3004   
 3                       21    2023-03-03    Delivered              4047   
 4                       20    2023-05-04    Delivered              4229   
 
    Quantity_Delivered  Unit_Price  ...  Compliance  Reason_for_delay  \
 0                   0       65.67  ...         

In [2]:
# Fill NaN values in 'Defective_Units' with 0
synthetic_df['Defective_Units'].fillna(0, inplace=True)

# Calculate Defective Rate
synthetic_df['Defective_Rate'] = synthetic_df['Defective_Units'] / synthetic_df['Quantity_Delivered']

# Handle cases where Quantity_Delivered is 0 to avoid division by zero
synthetic_df['Defective_Rate'].replace([np.inf, -np.inf], np.nan, inplace=True)
synthetic_df['Defective_Rate'].fillna(0, inplace=True)
file_path = "synthetic_supplier_dataset.csv"
synthetic_df.to_csv(file_path, index=False)

display(synthetic_df.head())

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  synthetic_df['Defective_Units'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  synthetic_df['Defective_Rate'].replace([np.inf, -np.inf], np.nan, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermed

Unnamed: 0,PO_ID,Supplier_ID,Order_Date,Requested_Delivery_Date,Promised_Lead_Time_days,Delivery_Date,Order_Status,Quantity_Ordered,Quantity_Delivered,Unit_Price,...,Reason_for_delay,Region,Item_Category,Urgency,Recorded_Communication_ResponseTime_hrs,Capacity_per_month,Min_Order_Qty,Max_Order_Qty,Shipping_Delay_days,Defective_Rate
0,PO_00001,SUPP_0108,2023-11-05,2023-11-23,18,,Cancelled,2732,0,65.67,...,,Kochi,MRO,Critical,19.1,3675,186,5882,,0.0
1,PO_00002,SUPP_0145,2023-06-20,2023-07-04,14,,Cancelled,490,0,34.28,...,,Baypur,Raw_Materials,Urgent,17.4,3587,73,2335,,0.0
2,PO_00003,SUPP_0067,2022-07-29,2022-08-13,15,2022-08-14,Delivered,3004,3264,60.12,...,Customs,Palakkad,Raw_Materials,Normal,7.0,1159,196,3369,1.0,0.020527
3,PO_00004,SUPP_0068,2023-02-10,2023-03-03,21,2023-03-03,Delivered,4047,4328,51.65,...,,Palakkad,Components,Normal,24.7,2486,196,5219,0.0,0.021719
4,PO_00005,SUPP_0077,2023-04-14,2023-05-04,20,2023-05-04,Delivered,4229,453,45.86,...,,Palakkad,Packaging,Urgent,37.7,4590,58,5304,0.0,0.015453


In [3]:
# Define thresholds for defective rate
low_defective_threshold = 0.01
high_defective_threshold = 0.05

# Create 'Product_Quality' feature based on 'Defective_Rate' and 'Compliance'
def categorize_quality(row):
    if row['Compliance'] == 'Yes' and row['Defective_Rate'] <= low_defective_threshold:
        return 'High Quality'
    elif row['Compliance'] == 'No' or row['Defective_Rate'] >= high_defective_threshold:
        return 'Low Quality'
    else:
        return 'Medium Quality'

synthetic_df['Product_Quality'] = synthetic_df.apply(categorize_quality, axis=1)

# Encode 'Product_Quality' numerically
quality_mapping = {'Low Quality': 0, 'Medium Quality': 1, 'High Quality': 2}
synthetic_df['Product_Quality_Encoded'] = synthetic_df['Product_Quality'].map(quality_mapping)
file_path = "synthetic_supplier_dataset.csv"
synthetic_df.to_csv(file_path, index=False)
display(synthetic_df.head())

Unnamed: 0,PO_ID,Supplier_ID,Order_Date,Requested_Delivery_Date,Promised_Lead_Time_days,Delivery_Date,Order_Status,Quantity_Ordered,Quantity_Delivered,Unit_Price,...,Item_Category,Urgency,Recorded_Communication_ResponseTime_hrs,Capacity_per_month,Min_Order_Qty,Max_Order_Qty,Shipping_Delay_days,Defective_Rate,Product_Quality,Product_Quality_Encoded
0,PO_00001,SUPP_0108,2023-11-05,2023-11-23,18,,Cancelled,2732,0,65.67,...,MRO,Critical,19.1,3675,186,5882,,0.0,High Quality,2
1,PO_00002,SUPP_0145,2023-06-20,2023-07-04,14,,Cancelled,490,0,34.28,...,Raw_Materials,Urgent,17.4,3587,73,2335,,0.0,High Quality,2
2,PO_00003,SUPP_0067,2022-07-29,2022-08-13,15,2022-08-14,Delivered,3004,3264,60.12,...,Raw_Materials,Normal,7.0,1159,196,3369,1.0,0.020527,Medium Quality,1
3,PO_00004,SUPP_0068,2023-02-10,2023-03-03,21,2023-03-03,Delivered,4047,4328,51.65,...,Components,Normal,24.7,2486,196,5219,0.0,0.021719,Medium Quality,1
4,PO_00005,SUPP_0077,2023-04-14,2023-05-04,20,2023-05-04,Delivered,4229,453,45.86,...,Packaging,Urgent,37.7,4590,58,5304,0.0,0.015453,Medium Quality,1


In [4]:
# Convert date columns to datetime objects
synthetic_df['Requested_Delivery_Date'] = pd.to_datetime(synthetic_df['Requested_Delivery_Date'])
synthetic_df['Delivery_Date'] = pd.to_datetime(synthetic_df['Delivery_Date'])

# Calculate Delivery Delay
synthetic_df['Delivery_Delay_days'] = (synthetic_df['Delivery_Date'] - synthetic_df['Requested_Delivery_Date']).dt.days

# Fill NaN values in 'Delivery_Delay_days' with 0 (assuming no delivery means no delay for calculation purposes, or could be a specific value like -1)
# Let's fill with 0 for now, representing no delay if cancelled or not delivered
synthetic_df['Delivery_Delay_days'].fillna(0, inplace=True)
file_path = "synthetic_supplier_dataset.csv"
synthetic_df.to_csv(file_path, index=False)
display(synthetic_df[['Requested_Delivery_Date', 'Delivery_Date', 'Delivery_Delay_days']].head())

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  synthetic_df['Delivery_Delay_days'].fillna(0, inplace=True)


Unnamed: 0,Requested_Delivery_Date,Delivery_Date,Delivery_Delay_days
0,2023-11-23,NaT,0.0
1,2023-07-04,NaT,0.0
2,2022-08-13,2022-08-14,1.0
3,2023-03-03,2023-03-03,0.0
4,2023-05-04,2023-05-04,0.0


In [5]:
# Create unique identifiers for Item_Category
item_category_map = {category: i for i, category in enumerate(synthetic_df['Item_Category'].unique())}
synthetic_df['Item_Category_ID'] = synthetic_df['Item_Category'].map(item_category_map)

# Create unique identifiers for Product_Quality
product_quality_map = {quality: i for i, quality in enumerate(synthetic_df['Product_Quality'].unique())}
synthetic_df['Product_Quality_ID'] = synthetic_df['Product_Quality'].map(product_quality_map)

file_path = "synthetic_supplier_dataset.csv"
synthetic_df.to_csv(file_path, index=False)
display(synthetic_df[['Item_Category', 'Item_Category_ID', 'Product_Quality', 'Product_Quality_ID']].head())

Unnamed: 0,Item_Category,Item_Category_ID,Product_Quality,Product_Quality_ID
0,MRO,0,High Quality,0
1,Raw_Materials,1,High Quality,0
2,Raw_Materials,1,Medium Quality,1
3,Components,2,Medium Quality,1
4,Packaging,3,Medium Quality,1
