In [1]:
import itertools
import os
import random
from datetime import datetime, timedelta

import numpy as np
import pandas as pd

In [2]:
# Seed every RNG this notebook draws from so a fresh "Restart & Run All"
# regenerates identical CSVs. Two independent generators are in play:
#   * NumPy's global RNG  - all np.random.* calls
#   * the stdlib `random` - random.sample() when picking inventory pairs
# Seeding only NumPy leaves the inventory SKU/warehouse pairs different on
# every run.
SEED = 42
np.random.seed(SEED)
random.seed(SEED)

In [3]:
# Every generated CSV is written under this directory. The trailing slash is
# required: file paths are built with plain string concatenation
# (`output_dir + "name.csv"`).
output_dir = "../data/synthetic_data/"
# DataFrame.to_csv does not create missing parent directories, so create the
# target up front; exist_ok makes re-running this cell harmless.
os.makedirs(output_dir, exist_ok=True)

In [4]:
# SKU master: 258 SKUs (SKU001..SKU258). Each gets a uniformly random category,
# a subcategory drawn from that category's list, and a default lead time of
# 5-30 days (inclusive).
subcategories = {
    'Electronics': ['Smartphones', 'Laptops', 'Tablets'],
    'Fashion': ['Shoes', 'Jackets', 'Shirts'],
    'Home': ['Kettles', 'Furniture', 'Decor'],
    'Toys': ['Action Figures', 'Board Games', 'Puzzles'],
    'Sports': ['Bats', 'Balls', 'Gloves'],
    'Beauty': ['Skincare', 'Makeup', 'Perfume'],
    'Books': ['Fiction', 'Non-fiction', 'Comics'],
    'Automotive': ['Accessories', 'Tools', 'Parts'],
    'Grocery': ['Snacks', 'Beverages', 'Condiments'],
    'Garden': ['Tools', 'Plants', 'Decor'],
}
# The category list is exactly the dict's keys, in insertion order.
categories = list(subcategories)

sku_columns = ['sku_id', 'category', 'subcategory', 'lead_time_default']
sku_list = []
for i in range(1, 259):
    # Draw order matters for seeded reproducibility:
    # category -> subcategory -> lead time.
    category = np.random.choice(categories)
    subcategory = np.random.choice(subcategories[category])
    default_lead_time = np.random.randint(5, 31)  # upper bound is exclusive
    sku_list.append((f'SKU{i:03}', category, subcategory, default_lead_time))

sku_master = pd.DataFrame(sku_list, columns=sku_columns)
sku_master.to_csv(output_dir + "sku_master.csv", index=False)

In [5]:
# Suppliers: SUP001..SUP015. Each supplier has a 50% chance of carrying its
# own lead-time override (5-29 days); otherwise the override is left missing.
N_SUPPLIERS = 15
supplier_list = [f'SUP{i:03}' for i in range(1, N_SUPPLIERS + 1)]
supplier_names = [f'Supplier_{i}' for i in range(1, N_SUPPLIERS + 1)]
# The coin flip (rand) is evaluated first; randint is only drawn when it succeeds.
lead_time_override = [
    int(np.random.randint(5, 30)) if np.random.rand() < 0.5 else None
    for _ in range(N_SUPPLIERS)
]
suppliers = pd.DataFrame({
    'supplier_id': supplier_list,
    'supplier_name': supplier_names,
    # Nullable Int64 keeps the overrides as whole days. A plain int/None list
    # is upcast to float64, which would write values such as "12.0" to the CSV.
    'lead_time_days_override': pd.array(lead_time_override, dtype="Int64"),
})
suppliers.to_csv(output_dir + "suppliers.csv", index=False)

In [6]:
# Inventory: stock and pricing for a random subset of SKU/warehouse pairs.
skus = sku_master['sku_id'].tolist()
warehouse_list = [f"WH{i}" for i in range(1, 16)]  # WH1..WH15

# Full cross product of SKUs and warehouses.
all_pairs = list(itertools.product(skus, warehouse_list))

# Pick 574 pairs without replacement, so no pair is picked twice.
# NOTE(review): this draws from the stdlib `random` module, whose state is
# independent of np.random.seed - reproducibility here depends on
# random.seed(...) having been called.
chosen_pairs = random.sample(all_pairs, 574)

# One record per chosen pair. Draw order per record: stock, reserved stock,
# unit cost, price jitter.
inventory_columns = [
    'sku_id', 'warehouse_id', 'current_stock',
    'reserved_stock', 'unit_cost', 'unit_price',
]
inventory_list = []
for sku, wh in chosen_pairs:
    current_stock = np.random.randint(50, 1200)
    # Reserved stock stays strictly below a fifth of what is on hand.
    reserved_limit = current_stock // 5
    reserved_stock = np.random.randint(0, reserved_limit)
    unit_cost = np.random.randint(10, 1000)
    # 30% markup plus integer jitter in [-5, 4], truncated to a whole number.
    price_jitter = np.random.randint(-5, 5)
    unit_price = int(unit_cost * 1.3 + price_jitter)
    inventory_list.append(
        (sku, wh, current_stock, reserved_stock, unit_cost, unit_price)
    )

inventory = pd.DataFrame(inventory_list, columns=inventory_columns)

# Persist to disk.
inventory.to_csv(output_dir + "inventory.csv", index=False)

In [7]:
# Supplier <-> SKU mapping: every SKU is assigned between 1 and 5 distinct
# suppliers, chosen without replacement from the supplier table.
supplier_sku_pairs = []
for sku in sku_master['sku_id']:
    num_suppliers = np.random.randint(1, 6)  # upper bound is exclusive
    chosen_suppliers = np.random.choice(
        suppliers['supplier_id'], num_suppliers, replace=False
    )
    supplier_sku_pairs.extend([sku, supplier_id] for supplier_id in chosen_suppliers)

supplier_sku_map = pd.DataFrame(supplier_sku_pairs, columns=['sku_id', 'supplier_id'])
supplier_sku_map.to_csv(output_dir + "supplier_sku_map.csv", index=False)

# Purchase orders: each PO picks a random SKU, one of the suppliers mapped to
# that SKU, an order date in the 180 days starting 2025-01-01, and an expected
# delivery date of the SKU's default lead time +/- 3 days. About 80% of POs
# also get an actual delivery date (expected date shifted by -1..+4 days); the
# rest are left without one.
N_PURCHASE_ORDERS = 1678
PO_START_DATE = datetime(2025, 1, 1)
PO_WINDOW_DAYS = 180

# Build the per-SKU lookups once instead of re-filtering both DataFrames with
# a boolean mask on every iteration. `setdefault` keeps the first row for a
# SKU, and suppliers are appended in row order, so every lookup returns
# exactly what the masked selections returned.
sku_ids = sku_master['sku_id'].values
lead_time_by_sku = {}
for sku_id, default_lead_time in zip(sku_master['sku_id'], sku_master['lead_time_default']):
    lead_time_by_sku.setdefault(sku_id, default_lead_time)
suppliers_by_sku = {}
for sku_id, supplier_id in zip(supplier_sku_map['sku_id'], supplier_sku_map['supplier_id']):
    suppliers_by_sku.setdefault(sku_id, []).append(supplier_id)

po_list = []
for _ in range(N_PURCHASE_ORDERS):
    sku = np.random.choice(sku_ids)

    # Only suppliers mapped to this SKU are eligible.
    possible_suppliers = suppliers_by_sku[sku]
    supplier = np.random.choice(possible_suppliers)

    po_date = PO_START_DATE + timedelta(days=np.random.randint(0, PO_WINDOW_DAYS))
    lead_time = lead_time_by_sku[sku]
    # int(): timedelta needs a plain Python int for the day count.
    exp_delivery = po_date + timedelta(days=int(lead_time + np.random.randint(-3, 4)))

    if np.random.rand() < 0.8:
        actual_delivery = exp_delivery + timedelta(days=np.random.randint(-1, 5))
        actual_delivery = actual_delivery.date()
    else:
        # No actual delivery date recorded for this PO.
        actual_delivery = None

    quantity_ordered = np.random.randint(100, 2000)
    po_list.append([sku, supplier, po_date.date(), exp_delivery.date(), actual_delivery, quantity_ordered])

purchase_orders = pd.DataFrame(
    po_list,
    columns=['sku_id', 'supplier_id', 'po_date', 'expected_delivery_date', 'actual_delivery_date', 'quantity_ordered'],
)
purchase_orders.to_csv(output_dir + "purchase_orders.csv", index=False)

In [8]:
# Sales: 5656 order lines. Each line picks one random inventory row (so the
# SKU/warehouse pair always exists in inventory), an order date in the 180
# days starting 2025-01-01, a quantity of 1-19 units, and a promotion flag
# that is True about 20% of the time. The price is copied from the inventory
# row and is not affected by the promotion flag.
start_date = datetime(2025, 1, 1)
sales_columns = [
    'order_date', 'sku_id', 'quantity_sold',
    'unit_price', 'warehouse_id', 'promotion_flag',
]

sales_list = []
for _ in range(5656):
    # Draw order per line: inventory row, day offset, quantity, promotion coin flip.
    inv_row = inventory.sample(1).iloc[0]
    order_day = start_date + timedelta(days=np.random.randint(0, 180))
    quantity_sold = np.random.randint(1, 20)
    promotion_flag = np.random.rand() < 0.2
    sales_list.append((
        order_day.date(),
        inv_row['sku_id'],
        quantity_sold,
        int(inv_row['unit_price']),
        inv_row['warehouse_id'],
        promotion_flag,
    ))

sales = pd.DataFrame(sales_list, columns=sales_columns)
sales.to_csv(output_dir + "sales.csv", index=False)