In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# Configuration
NUM_ROWS = 1000
SEED = 42  # fixed seed so a fresh kernel regenerates the exact same dataset
rng = np.random.default_rng(SEED)

product_categories = ['Electronics', 'Apparel', 'Home Goods', 'Books', 'Groceries', 'Toys']
carriers = ['FedEx', 'UPS', 'DHL', 'USPS']
order_statuses = ['Delivered', 'Shipped', 'Pending', 'Canceled']
cities = ['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix', 'Philadelphia', 'San Antonio', 'San Diego', 'Dallas', 'San Jose']
states = ['NY', 'CA', 'IL', 'TX', 'AZ', 'PA']

# State that each city in `cities` belongs to. Drawing city and state
# independently would produce impossible pairs such as "New York, TX".
city_to_state = {
    'New York': 'NY',
    'Los Angeles': 'CA',
    'Chicago': 'IL',
    'Houston': 'TX',
    'Phoenix': 'AZ',
    'Philadelphia': 'PA',
    'San Antonio': 'TX',
    'San Diego': 'CA',
    'Dallas': 'TX',
    'San Jose': 'CA',
}

# Generate Data
data = {}

# Generate Order IDs (sequential, starting at 10001)
data['Order ID'] = range(10001, 10001 + NUM_ROWS)

# Generate Dates
# rng.integers excludes the upper bound, so the ranges below are:
#   order date    = start_date + 0..364 days
#   shipment date = order date + 1..2 days
#   delivery date = shipment date + 2..6 days
start_date = datetime(2023, 1, 1)
dates = pd.Timestamp(start_date) + pd.to_timedelta(rng.integers(0, 365, size=NUM_ROWS), unit='D')
shipment_dates = dates + pd.to_timedelta(rng.integers(1, 3, size=NUM_ROWS), unit='D')
delivery_dates = shipment_dates + pd.to_timedelta(rng.integers(2, 7, size=NUM_ROWS), unit='D')
data['Order Date'] = dates
data['Shipment Date'] = shipment_dates
data['Delivery Date'] = delivery_dates
# NOTE(review): every row gets shipment and delivery dates regardless of its
# 'Order Status' (including 'Pending' and 'Canceled'). Confirm whether that is
# intended for the demo before blanking them out.

# Generate Categorical Data
data['Product Category'] = rng.choice(product_categories, size=NUM_ROWS)
data['Carrier'] = rng.choice(carriers, size=NUM_ROWS)
data['Order Status'] = rng.choice(order_statuses, size=NUM_ROWS, p=[0.75, 0.15, 0.05, 0.05])  # Weighted distribution

# Generate Location Data (state is derived from the city so the pair is always valid)
data['Customer City'] = rng.choice(cities, size=NUM_ROWS)
data['Customer State'] = [city_to_state[city] for city in data['Customer City']]

# Generate Numerical Data
data['Quantity'] = rng.integers(1, 6, size=NUM_ROWS)  # 1..5 units
data['Unit Price'] = np.round(rng.uniform(10.0, 500.0, size=NUM_ROWS), 2)
data['Shipping Cost'] = np.round(rng.uniform(5.0, 20.0, size=NUM_ROWS), 2)

# Create DataFrame
df = pd.DataFrame(data)

# Add Calculated Columns
# Total Revenue, rounded to cents so float artifacts such as
# 30.299999999999997 never reach the CSV.
df['Total Revenue'] = (df['Quantity'] * df['Unit Price']).round(2)

# Shipping Time (in days)
df['Shipping Time (Days)'] = (df['Delivery Date'] - df['Shipment Date']).dt.days


# Introduce Data Inconsistencies (for cleansing demo)
def _inject_typo(frame, column, correct, typo, frac, generator):
    """Replace `correct` with `typo` in a random subset of `frame[column]`.

    Only rows whose current value equals `correct` are eligible, so a typo is
    always a misspelling of the value it replaces and typos for different
    values can never overwrite each other.

    Parameters:
        frame: DataFrame to modify in place.
        column: name of the column receiving the typo.
        correct: correctly spelled value to corrupt.
        typo: misspelled replacement value.
        frac: target share of ALL rows in `frame` to corrupt; capped at the
            number of eligible rows.
        generator: numpy random Generator used to pick the rows.

    Returns:
        The number of rows that were changed.
    """
    eligible = frame.index[frame[column] == correct].to_numpy()
    n_typos = min(int(round(frac * len(frame))), len(eligible))
    if n_typos > 0:
        chosen = generator.choice(eligible, size=n_typos, replace=False)
        frame.loc[chosen, column] = typo
    return n_typos


# Intentionally misspell some carrier names
_inject_typo(df, 'Carrier', 'FedEx', 'FedEX', 0.03, rng)
_inject_typo(df, 'Carrier', 'UPS', 'UPSS', 0.02, rng)

# Intentionally misspell some statuses
_inject_typo(df, 'Order Status', 'Delivered', 'delivered', 0.02, rng)
_inject_typo(df, 'Order Status', 'Shipped', 'shippedd', 0.01, rng)

# Save to CSV
file_path = 'supply_chain_data.csv'
df.to_csv(file_path, index=False)

print(f"Synthetic dataset with {NUM_ROWS} rows has been successfully created and saved to {file_path}")

Synthetic dataset with 1000 rows has been successfully created and saved to supply_chain_data.csv
