In [2]:
import pandas as pd
import numpy as np
from faker import Faker
import random
from datetime import datetime, timedelta

In [3]:
# Configuration: seed every random source used below so that a fresh
# Restart & Run All regenerates the same dataset.
fake = Faker()
# Faker draws from its own random generator, which the numpy / random seeds
# below do not touch; without this the customer names change on every run.
Faker.seed(42)
np.random.seed(42)
random.seed(42)

# Simulation parameters
num_customers = 14_645
num_days = 1_800
start_date = datetime(year=2020, month=1, day=1)

# States (regions) with an uneven distribution: (name, weight) pairs,
# kept in this order
regions = dict([
    ("California", 1.3),
    ("Texas", 1.2),
    ("New York", 1.0),
    ("Florida", 1.1),
    ("Wisconsin", 0.8),
    ("Colorado", 0.6),
    ("Alabama", 0.5),
])

# Payment methods
payment_methods = [
    "Cash",
    "Credit Card",
    "Debit Card",
    "Mobile App",
]

# Categories and products: (category, product names) pairs, kept in this order
categories = dict([
    ("Beverages", ["Apple Juice", "Orange Soda", "Iced Tea", "Mineral Water"]),
    ("Bakery", ["Bread Loaf", "Croissant", "Muffin", "Bagel"]),
    ("Deli", ["Chicken Wrap", "Ham Sandwich", "Turkey Roll", "Veggie Wrap"]),
    ("Snacks", ["Potato Chips", "Granola Bar", "Chocolate Cookie", "Popcorn"]),
    ("Dairy", ["Milk", "Cheese Block", "Yogurt", "Butter"]),
    ("Frozen Foods", ["Frozen Pizza", "Ice Cream", "Frozen Vegetables", "Frozen Fries"]),
    ("Produce", ["Banana", "Apple", "Carrot", "Lettuce"]),
    ("Meat", ["Chicken Breast", "Ground Beef", "Pork Chops", "Bacon"]),
    ("Seafood", ["Salmon Fillet", "Shrimp Pack", "Tuna Can", "Crab Meat"]),
    ("Pantry", ["Pasta", "Rice", "Canned Beans", "Tomato Sauce"]),
    ("Condiments", ["Ketchup", "Mustard", "Soy Sauce", "Mayonnaise"]),
    ("Cereal", ["Corn Flakes", "Granola", "Oats", "Choco Cereal"]),
    ("Cleaning Supplies", ["Dish Soap", "Bleach", "Glass Cleaner", "Sponges"]),
    ("Personal Care", ["Toothpaste", "Shampoo", "Soap", "Deodorant"]),
    ("Baby Products", ["Diapers", "Baby Wipes", "Baby Lotion", "Formula"]),
    ("Pet Supplies", ["Dog Food", "Cat Litter", "Pet Shampoo", "Treats"]),
    ("Baking", ["Flour", "Sugar", "Yeast", "Baking Powder"]),
    ("Breakfast", ["Pancake Mix", "Maple Syrup", "Instant Coffee", "Tea Bags"]),
    ("International", ["Tortillas", "Noodles", "Salsa Verde", "Curry Paste"]),
    ("Health", ["Vitamins", "Protein Bar", "Electrolyte Drink", "Energy Gel"]),
])

# Products: one row per (category, item) pair, numbered from 1 in catalog
# order, each with a random price between 1.0 and 5.0 rounded to 2 decimals.
catalog_entries = [
    (item_name, category_name)
    for category_name, item_names in categories.items()
    for item_name in item_names
]
products = [
    {
        "product_id": number,
        "name": item_name,
        "category": category_name,
        # One uniform draw per product, in catalog order
        "price": round(np.random.uniform(1.0, 5.0), 2),
    }
    for number, (item_name, category_name) in enumerate(catalog_entries, start=1)
]
df_products = pd.DataFrame(products)

# Customers: each one gets a weighted-random home region, a fake name and a
# weighted-random customer type.
region_names = list(regions.keys())
region_weights = list(regions.values())
type_labels = ["Regular", "New", "Loyal"]
type_weights = [0.5, 0.3, 0.2]

customers = []
for customer_number in range(1, num_customers + 1):
    # Draw order matters for reproducibility: region, then name, then type
    home_region = random.choices(region_names, weights=region_weights, k=1)[0]
    full_name = fake.name()
    segment = random.choices(type_labels, weights=type_weights)[0]
    customers.append(
        dict(
            customer_id=customer_number,
            name=full_name,
            region=home_region,
            customer_type=segment,
        )
    )
df_customers = pd.DataFrame(customers)

# Sales
# Each (day, region) batch is drawn in one go with NumPy arrays. Sampling one
# row at a time with DataFrame.sample() and re-filtering df_customers for every
# sale costs a full pandas operation per row, which is infeasible for the tens
# of millions of rows simulated here. The sampling distributions are unchanged,
# but the random stream (and therefore the exact rows produced for a given
# seed) differs from a row-by-row implementation.
store_id = 1
base_daily_lambda = 1456  # Poisson mean of daily sales for a region of weight 1.0

# Loop-invariant lookup arrays, built once
product_ids = df_products["product_id"].to_numpy()
product_categories = df_products["category"].to_numpy()
product_prices = df_products["price"].to_numpy()

# region -> (customer ids, "is Loyal" flags), aligned position by position
customers_by_region = {
    region_name: (
        group["customer_id"].to_numpy(),
        (group["customer_type"] == "Loyal").to_numpy(),
    )
    for region_name, group in df_customers.groupby("region")
}
regions_without_customers = [name for name in regions if name not in customers_by_region]
if regions_without_customers:
    raise ValueError(f"No customers available for regions: {regions_without_customers}")

payment_options = np.array(payment_methods, dtype=object)
loyal_payment_weights = [0.2, 0.4, 0.2, 0.2]
default_payment_weights = [0.4, 0.3, 0.2, 0.1]

sales_chunks = []
for day in range(num_days):
    date = start_date + timedelta(days=day)
    # Linear growth from 1.0 on the first day towards 1.4 at the end of the period
    growth_factor = 1 + (day / num_days) * 0.4
    is_weekend = date.weekday() >= 5

    for region, weight in regions.items():
        base_sales = int(np.random.poisson(lam=base_daily_lambda * weight) * growth_factor)
        if is_weekend:
            # Weekend bump of 5 to 14 extra sales (upper bound is exclusive)
            base_sales += np.random.randint(5, 15)
        if base_sales <= 0:
            continue

        region_customer_ids, region_is_loyal = customers_by_region[region]
        product_idx = np.random.randint(0, len(product_ids), size=base_sales)
        customer_idx = np.random.randint(0, len(region_customer_ids), size=base_sales)
        category = product_categories[product_idx]
        is_loyal = region_is_loyal[customer_idx]

        # Base quantity of 1 to 3 units, plus regional and loyalty adjustments
        quantity = np.random.randint(1, 4, size=base_sales)
        if region == "Wisconsin":
            quantity = quantity + (category == "Dairy")
        if region == "Florida":
            quantity = quantity + (category == "Beverages")
        quantity = quantity + is_loyal

        # Loyal customers use a different payment-method mix
        payment_method = np.where(
            is_loyal,
            np.random.choice(payment_options, size=base_sales, p=loyal_payment_weights),
            np.random.choice(payment_options, size=base_sales, p=default_payment_weights),
        )

        sales_chunks.append(pd.DataFrame({
            "product_id": product_ids[product_idx],
            "customer_id": region_customer_ids[customer_idx],
            "store_id": store_id,
            "region": region,
            "date": date,
            "quantity": quantity,
            "payment_method": payment_method,
            # Price is looked up by index here, which replaces a merge on product_id
            "price": product_prices[product_idx],
        }))

if sales_chunks:
    df_sales = pd.concat(sales_chunks, ignore_index=True)
else:
    # No sales simulated (e.g. num_days == 0): keep the expected schema
    df_sales = pd.DataFrame(columns=[
        "product_id", "customer_id", "store_id", "region",
        "date", "quantity", "payment_method", "price",
    ])

# Sequential sale ids starting at 1, placed as the first column
df_sales.insert(0, "sale_id", np.arange(1, len(df_sales) + 1))
df_sales["total"] = df_sales["quantity"] * df_sales["price"]

# Save each table as a CSV file, without the index column
tables_to_save = (
    ("products.csv", df_products),
    ("customers.csv", df_customers),
    ("sales.csv", df_sales),
)
for csv_name, table in tables_to_save:
    table.to_csv(csv_name, index=False)

print("✅ Archivos generados con regiones y métodos de pago: products.csv, customers.csv, sales.csv")




KeyboardInterrupt



In [None]:
# Reload the generated tables from disk. CSV stores dates as plain text, so the
# sales "date" column is parsed back into datetimes on read; otherwise it
# would come back as strings.
df_products = pd.read_csv('products.csv')
df_customers = pd.read_csv('customers.csv')
df_sales = pd.read_csv('sales.csv', parse_dates=['date'])

In [None]:
# Row count of the products table
df_products.shape[0]

In [None]:
# Row count of the customers table
df_customers.shape[0]

In [None]:
# Row count of the sales table
df_sales.shape[0]

In [None]:
# Preview the sales table (the cell's last expression is rendered as its output)
df_sales