In [2]:
%pip install faker
import pandas as pd
import numpy as np
import random
from faker import Faker
from datetime import datetime, timedelta

Collecting faker
  Downloading faker-37.6.0-py3-none-any.whl.metadata (15 kB)
Downloading faker-37.6.0-py3-none-any.whl (1.9 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.9 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.8/1.9 MB[0m [31m23.1 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m29.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faker
Successfully installed faker-37.6.0


In [3]:
# Fixed seed so that Restart & Run All regenerates the same synthetic dataset.
SEED = 42
random.seed(SEED)  # seeds the stdlib generator behind random.choice / randint / uniform
Faker.seed(SEED)   # seeds the generator Faker uses for the names produced by `fake`

fake = Faker()

# Parameters: number of rows to generate for each table
num_customers = 200
num_products = 100
num_orders = 5000


In [4]:
# --- Customers Table ---
# One row per customer: a sequential id starting at 1, a Faker-generated
# name, and a segment and region each picked at random from the lists below.
segments = ["Consumer", "Corporate", "Small Business"]
regions = ["North", "South", "East", "West"]

# Elements of each row are evaluated left to right, so the name is drawn
# first, then the segment, then the region.
customers = [
    [customer_id, fake.name(), random.choice(segments), random.choice(regions)]
    for customer_id in range(1, num_customers + 1)
]

customers_df = pd.DataFrame(
    customers,
    columns=["CustomerID", "Name", "Segment", "Region"],
)

In [5]:
# --- Products Table ---
# Maps each category to the sub-categories a product in it may take.
categories = {
    "Electronics": ["Laptop", "Phone", "Tablet", "Monitor"],
    "Furniture": ["Chair", "Desk", "Cabinet", "Couch"],
    "Office Supplies": ["Pen", "Notebook", "Paper", "Stapler"],
    "Clothing": ["Shirt", "Jeans", "Jacket", "Shoes"]
}

# Category names in dict insertion order; the list is the same for every row.
category_names = list(categories)

products = []
for product_id in range(1, num_products + 1):
    # Pick a category, then one of its sub-categories, then a price
    # between 10 and 2000 rounded to two decimals.
    chosen_category = random.choice(category_names)
    chosen_subcategory = random.choice(categories[chosen_category])
    unit_price = round(random.uniform(10, 2000), 2)
    products.append([product_id, chosen_category, chosen_subcategory, unit_price])

products_df = pd.DataFrame(
    products,
    columns=["ProductID", "Category", "SubCategory", "Price"],
)

In [6]:
# --- Orders Table ---
# Each order references one random customer and one random product, and gets
# a random date between start_date and end_date (inclusive), a quantity of
# 1-5, and a discount between 0 and 0.3 rounded to two decimals.
orders = []
start_date = datetime(2022, 1, 1)
end_date = datetime(2024, 12, 31)

# Loop-invariant values, computed once instead of once per order.
date_span_days = (end_date - start_date).days
# ProductID -> Price lookup. Replaces building a boolean mask over the whole
# products_df for every order. Relies on ProductID being unique in products_df.
price_by_product = products_df.set_index("ProductID")["Price"]

for i in range(1, num_orders + 1):
    # The order of the random.* calls is kept as-is so a seeded run keeps
    # producing the same rows.
    cust_id = random.randint(1, num_customers)
    prod_id = random.randint(1, num_products)
    order_date = start_date + timedelta(days=random.randint(0, date_span_days))
    quantity = random.randint(1, 5)
    discount = round(random.uniform(0, 0.3), 2)
    price = price_by_product.at[prod_id]
    # Sales is the discounted line total; profit is sales times a random
    # factor between 0.05 and 0.3.
    sales = round(quantity * price * (1 - discount), 2)
    profit = round(sales * random.uniform(0.05, 0.3), 2)
    orders.append([i, cust_id, prod_id, order_date, quantity, discount, sales, profit])

orders_df = pd.DataFrame(orders, columns=["OrderID", "CustomerID", "ProductID", "OrderDate", "Quantity", "Discount", "Sales", "Profit"])

In [7]:
# --- Save to CSV ---
# Write each table to a CSV file in the current working directory, without
# the DataFrame index column.
for table, csv_name in (
    (customers_df, "customers.csv"),
    (products_df, "products.csv"),
    (orders_df, "orders.csv"),
):
    table.to_csv(csv_name, index=False)

print("Synthetic dataset generated: customers.csv, products.csv, orders.csv")

Synthetic dataset generated: customers.csv, products.csv, orders.csv
