In [3]:
import pandas as pd
import numpy as np
from faker import Faker
import datetime as dt
import random

# Set up faker
fake = Faker()

# Set random seed for reproducibility
np.random.seed(42)
random.seed(42)

# Generate customer data
def generate_customers(num_customers=1000):
    customers = []
    for i in range(num_customers):
        customers.append({
            'customer_id': i,
            'name': fake.name(),
            'email': fake.email(),
            'address': fake.address(),
            'phone': fake.phone_number(),
            'registration_date': fake.date_between(start_date='-2y', end_date='today')
        })
    return pd.DataFrame(customers)

# Generate product data
def generate_products(num_products=100):
    categories = ['Electronics', 'Clothing', 'Home & Kitchen', 'Books', 'Sports']
    products = []
    for i in range(num_products):
        category = random.choice(categories)
        products.append({
            'product_id': i,
            'name': fake.bs(),
            'category': category,
            'price': round(random.uniform(10, 1000), 2),
            'stock': random.randint(0, 1000)
        })
    return pd.DataFrame(products)

# Generate order data with time series component
def generate_orders(num_orders=5000, num_customers=1000, num_products=100):
    orders = []
    order_items = []
    
    # Create a seasonal pattern with higher sales in Nov-Dec
    start_date = dt.datetime.now() - dt.timedelta(days=365)
    
    for i in range(num_orders):
        # Create seasonal effect
        days_ago = random.gammavariate(1.5, 100) % 365  # More recent orders more likely
        order_date = dt.datetime.now() - dt.timedelta(days=days_ago)
        month = order_date.month
        
        # Seasonal adjustment - more orders during holidays
        if month in [11, 12]:  # Nov, Dec
            if random.random() > 0.3:  # 70% chance to keep order in these months
                continue
                
        customer_id = random.randint(0, num_customers-1)
        
        order_id = i
        orders.append({
            'order_id': order_id,
            'customer_id': customer_id,
            'order_date': order_date,
            'status': random.choice(['Completed', 'Shipped', 'Processing', 'Cancelled']),
            'total': 0  # Will update after generating items
        })
        
        # Generate 1-5 items per order
        num_items = random.randint(1, 5)
        order_total = 0
        
        for j in range(num_items):
            product_id = random.randint(0, num_products-1)
            quantity = random.randint(1, 3)
            price = round(random.uniform(10, 1000), 2)  # We'll join with actual prices later
            item_total = price * quantity
            order_total += item_total
            
            order_items.append({
                'order_id': order_id,
                'product_id': product_id,
                'quantity': quantity,
                'price': price,
                'total': item_total
            })
        
        # Update order total
        orders[-1]['total'] = round(order_total, 2)
            
    return pd.DataFrame(orders), pd.DataFrame(order_items)

# Generate the data
customers_df = generate_customers()
products_df = generate_products()
orders_df, order_items_df = generate_orders()

# Save to CSV files (our data source)
customers_df.to_csv('customers.csv', index=False)
products_df.to_csv('products.csv', index=False)
orders_df.to_csv('orders.csv', index=False)
order_items_df.to_csv('order_items.csv', index=False)

print(f"Generated data: {len(customers_df)} customers, {len(products_df)} products, {len(orders_df)} orders, {len(order_items_df)} order items")

Generated data: 1000 customers, 100 products, 4204 orders, 12547 order items
