In [1]:
import pandas as pd
import numpy as np
import os
import random
from datetime import datetime, timedelta

In [2]:
# Dataset configuration: everything a reader might want to tune lives here.
NUM_CUSTOMERS = 100_000    # Number of customers
NUM_PRODUCTS = 10_000      # Number of products
NUM_ORDERS = 1_000_000     # Number of orders
OUTPUT_DIR = "./data/input"     # Where CSVs will be saved
# Bounds (ISO "YYYY-MM-DD") used when drawing random order dates.
START_DATE = "2024-01-01"
END_DATE = "2025-01-01"

# Seed the stdlib and NumPy global RNGs that the generation cells draw from,
# so a fresh Restart & Run All regenerates identical CSVs.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

In [3]:
# Make sure the CSV destination exists; an already-present directory is fine.
os.makedirs(name=OUTPUT_DIR, exist_ok=True)

In [4]:
# Generate Customers
# One row per customer: a sequential id starting at 1, a "Customer_<id>"
# display name, and a region picked at random from `regions`.

regions = ["East", "West", "North", "South"]

customer_numbers = np.arange(1, NUM_CUSTOMERS + 1)
customer_names = [f"Customer_{number}" for number in customer_numbers.tolist()]
customer_regions = np.random.choice(regions, NUM_CUSTOMERS)

customers = pd.DataFrame(
    {
        "customer_id": customer_numbers,
        "name": customer_names,
        "region": customer_regions,
    }
)

customers_csv = f"{OUTPUT_DIR}/customers.csv"
customers.to_csv(customers_csv, index=False)
print(f"Generated customers.csv with {len(customers)} rows")

Generated customers.csv with 100000 rows


In [7]:
# Generate Products
# One row per product: a sequential id starting at 1, a
# "<brand> <tier> <id>" name, a random category, and a price drawn from
# uniform(50, 2000) rounded to two decimals.

categories = [
    "Smartphones",
    "Laptops",
    "Headphones",
    "Gaming Consoles",
    "Home Decor",
    "Books",
    "Fitness Gear",
    "Kitchen Appliances",
]
brands = [
    "Zenith",
    "Aurelia",
    "NovaTech",
    "Kronos",
    "Lumina",
    "Solara",
    "Vertex",
    "Eclipse",
]
_tiers = ["X", "Pro", "Plus", "Max", "Elite"]


def _product_label(number):
    """Build one product name; the brand is drawn before the tier."""
    brand = random.choice(brands)
    tier = random.choice(_tiers)
    return f"{brand} {tier} {number}"


product_numbers = np.arange(1, NUM_PRODUCTS + 1)
product_labels = [_product_label(number) for number in range(1, NUM_PRODUCTS + 1)]
# Category is drawn before price so the NumPy RNG is consumed in the same order.
product_categories = np.random.choice(categories, NUM_PRODUCTS)
product_prices = np.round(np.random.uniform(50, 2000, NUM_PRODUCTS), 2)

products = pd.DataFrame(
    {
        "product_id": product_numbers,
        "product_name": product_labels,
        "category": product_categories,
        "price": product_prices,
    }
)

products_csv = f"{OUTPUT_DIR}/products.csv"
products.to_csv(products_csv, index=False)
print(f"Generated products.csv with {len(products)} rows")

Generated products.csv with 10000 rows


In [9]:
# Generate Orders
#
# Each order gets a sequential id starting at 1, a random customer id and
# product id within the configured ranges, a quantity of 1-5, and a date
# that is START_DATE plus a random whole number of days.
# Day offsets are drawn from 0..date_range inclusive, so END_DATE itself
# can appear as an order date.

start_dt = datetime.strptime(START_DATE, "%Y-%m-%d")
end_dt = datetime.strptime(END_DATE, "%Y-%m-%d")
date_range = (end_dt - start_dt).days

order_ids = np.arange(1, NUM_ORDERS + 1)
customer_ids = np.random.randint(1, NUM_CUSTOMERS + 1, size=NUM_ORDERS)
product_ids = np.random.randint(1, NUM_PRODUCTS + 1, size=NUM_ORDERS)
quantities = np.random.randint(1, 6, size=NUM_ORDERS)  # upper bound is exclusive

# Vectorised date arithmetic: one datetime64[D] array instead of building a
# Python datetime (and later calling strftime) for every one of NUM_ORDERS rows.
day_offsets = np.random.randint(0, date_range + 1, NUM_ORDERS)
order_dates = np.datetime64(start_dt.date(), "D") + day_offsets.astype("timedelta64[D]")


orders = pd.DataFrame({
    "order_id": order_ids,
    "customer_id": customer_ids,
    "product_id": product_ids,
    "quantity": quantities,
    # Day-resolution datetime64 values render as "YYYY-MM-DD" strings.
    "order_date": np.datetime_as_string(order_dates, unit="D")
})

orders.to_csv(f"{OUTPUT_DIR}/orders.csv", index=False)
print(f"Generated orders.csv with {len(orders)} rows")

Generated orders.csv with 1000000 rows
