In [1]:
import pandas as pd
import numpy as np
from faker import Faker
import random
from datetime import datetime, timedelta

In [6]:
# Configuración
fake = Faker()
np.random.seed(42)
random.seed(42)

# Parámetros
num_customers = 12
num_days = 18
start_date = datetime(2020, 1, 1)

# Estados (regiones) con distribución desigual
regions = {
    "California": 1.3,
    "Texas": 1.2,
    "New York": 1.0,
    "Florida": 1.1,
    "Wisconsin": 0.8,
    "Colorado": 0.6,
    "Alabama": 0.5
}

# Métodos de pago
payment_methods = ["Cash", "Credit Card", "Debit Card", "Mobile App"]

# Categorías y productos
categories = {
    "Beverages": ["Apple Juice", "Orange Soda", "Iced Tea", "Mineral Water"],
    "Bakery": ["Bread Loaf", "Croissant", "Muffin", "Bagel"],
    "Deli": ["Chicken Wrap", "Ham Sandwich", "Turkey Roll", "Veggie Wrap"],
    "Snacks": ["Potato Chips", "Granola Bar", "Chocolate Cookie", "Popcorn"],
    "Dairy": ["Milk", "Cheese Block", "Yogurt", "Butter"],
    "Frozen Foods": ["Frozen Pizza", "Ice Cream", "Frozen Vegetables", "Frozen Fries"],
    "Produce": ["Banana", "Apple", "Carrot", "Lettuce"],
    "Meat": ["Chicken Breast", "Ground Beef", "Pork Chops", "Bacon"],
    "Seafood": ["Salmon Fillet", "Shrimp Pack", "Tuna Can", "Crab Meat"],
    "Pantry": ["Pasta", "Rice", "Canned Beans", "Tomato Sauce"]

}

# Productos
products = []
product_id = 1
for category, items in categories.items():
    for item in items:
        price = round(np.random.uniform(1.0, 5.0), 2)
        products.append({
            "product_id": product_id,
            "name": item,
            "category": category,
            "price": price
        })
        product_id += 1
df_products = pd.DataFrame(products)

# Clientes
customers = []
for i in range(1, num_customers + 1):
    region = random.choices(list(regions.keys()), weights=list(regions.values()), k=1)[0]
    customers.append({
        "customer_id": i,
        "name": fake.name(),
        "region": region,
        "customer_type": random.choices(["Regular", "New", "Loyal"], weights=[0.5, 0.3, 0.2])[0]
    })
df_customers = pd.DataFrame(customers)

# Ventas
sales = []
sale_id = 1
store_id = 1

for day in range(num_days):
    date = start_date + timedelta(days=day)
    for region, weight in regions.items():
        growth_factor = 1 + (day / num_days) * 0.4
        base_sales = int(np.random.poisson(lam=1456 * weight) * growth_factor)
        if date.weekday() >= 5:
            base_sales += np.random.randint(5, 15)

        for _ in range(base_sales):
            product = df_products.sample(1).iloc[0]
            customer = df_customers[df_customers["region"] == region].sample(1).iloc[0]
            quantity = np.random.randint(1, 4)

            # Ajustes según región
            if region == "Wisconsin" and product["category"] == "Dairy":
                quantity += 1
            if region == "Florida" and product["category"] == "Beverages":
                quantity += 1
            if customer["customer_type"] == "Loyal":
                quantity += 1

            payment_method = random.choices(
                payment_methods,
                weights=[0.2, 0.4, 0.2, 0.2] if customer["customer_type"] == "Loyal" else [0.4, 0.3, 0.2, 0.1],
                k=1
            )[0]

            sales.append({
                "sale_id": sale_id,
                "product_id": product["product_id"],
                "customer_id": customer["customer_id"],
                "store_id": store_id,
                "region": region,
                "date": date,
                "quantity": quantity,
                "payment_method": payment_method
            })
            sale_id += 1

df_sales = pd.DataFrame(sales)
df_sales = df_sales.merge(df_products[["product_id", "price"]], on="product_id")
df_sales["total"] = df_sales["quantity"] * df_sales["price"]

# Guardar
df_products.to_csv("products.csv", index=False)
df_customers.to_csv("customers.csv", index=False)
df_sales.to_csv("sales.csv", index=False)

print("✅ Archivos generados con regiones y métodos de pago: products.csv, customers.csv, sales.csv")



ValueError: a must be greater than 0 unless no samples are taken

In [26]:
import pandas as pd
import numpy as np
from faker import Faker
import random
from datetime import datetime, timedelta

# Configuración
fake = Faker()
np.random.seed(42)
random.seed(42)

# Parámetros
num_customers = 19440
num_days = 1800
start_date = datetime(2024, 10, 1)

# Estados (regiones) con distribución desigual
regions = {
    "California": 1.3,
    "Texas": 1.2,
    "New York": 1.0,
    "Florida": 1.1,
    "Wisconsin": 0.8,
    "Colorado": 0.6,
    "Alabama": 0.5
}

# Métodos de pago
payment_methods = ["Cash", "Credit Card", "Debit Card", "Mobile App"]

# Categorías y productos
categories = {
    "Beverages": ["Apple Juice", "Orange Soda", "Iced Tea", "Mineral Water"],
    "Bakery": ["Bread Loaf", "Croissant", "Muffin", "Bagel"],
    "Deli": ["Chicken Wrap", "Ham Sandwich", "Turkey Roll", "Veggie Wrap"],
    "Snacks": ["Potato Chips", "Granola Bar", "Chocolate Cookie", "Popcorn"],
    "Dairy": ["Milk", "Cheese Block", "Yogurt", "Butter"],
    "Frozen Foods": ["Frozen Pizza", "Ice Cream", "Frozen Vegetables", "Frozen Fries"],
    "Produce": ["Banana", "Apple", "Carrot", "Lettuce"],
    "Meat": ["Chicken Breast", "Ground Beef", "Pork Chops", "Bacon"],
    "Seafood": ["Salmon Fillet", "Shrimp Pack", "Tuna Can", "Crab Meat"],
    "Pantry": ["Pasta", "Rice", "Canned Beans", "Tomato Sauce"]

}

# Productos
products = []
product_id = 1
for category, items in categories.items():
    for item in items:
        price = round(np.random.uniform(1.0, 5.0), 2)
        products.append({
            "product_id": product_id,
            "name": item,
            "category": category,
            "price": price
        })
        product_id += 1
df_products = pd.DataFrame(products)

# Clientes
customers = []
for i in range(1, num_customers + 1):
    region = random.choices(list(regions.keys()), weights=list(regions.values()), k=1)[0]
    customers.append({
        "customer_id": i,
        "name": fake.name(),
        "region": region,
        "customer_type": random.choices(["Regular", "New", "Loyal"], weights=[0.5, 0.3, 0.2])[0]
    })
df_customers = pd.DataFrame(customers)

# Ventas
sales = []
sale_id = 1
store_id = 1

for day in range(num_days):
    date = start_date + timedelta(days=day)
    for region, weight in regions.items():
        growth_factor = 1 + (day / num_days) * 0.4
        base_sales = int(np.random.poisson(lam=10 * weight) * growth_factor)
        if date.weekday() >= 5:
            base_sales += np.random.randint(5, 15)

        for _ in range(base_sales):
            product = df_products.sample(1).iloc[0]
            customer = df_customers[df_customers["region"] == region].sample(1).iloc[0]
            quantity = np.random.randint(1, 4)

            # Ajustes según región
            if region == "Wisconsin" and product["category"] == "Dairy":
                quantity += 1
            if region == "Florida" and product["category"] == "Beverages":
                quantity += 1
            if customer["customer_type"] == "Loyal":
                quantity += 1

            payment_method = random.choices(
                payment_methods,
                weights=[0.2, 0.4, 0.2, 0.2] if customer["customer_type"] == "Loyal" else [0.4, 0.3, 0.2, 0.1],
                k=1
            )[0]

            sales.append({
                "sale_id": sale_id,
                "product_id": product["product_id"],
                "customer_id": customer["customer_id"],
                "store_id": store_id,
                "region": region,
                "date": date,
                "quantity": quantity,
                "payment_method": payment_method
            })
            sale_id += 1

df_sales = pd.DataFrame(sales)
df_sales = df_sales.merge(df_products[["product_id", "price"]], on="product_id")
df_sales["total"] = df_sales["quantity"] * df_sales["price"]

# Guardar
df_products.to_csv("products.csv", index=False)
df_customers.to_csv("customers.csv", index=False)
df_sales.to_csv("sales.csv", index=False)

print("✅ Archivos generados con regiones y métodos de pago: products.csv, customers.csv, sales.csv")


✅ Archivos generados con regiones y métodos de pago: products.csv, customers.csv, sales.csv


In [27]:
df_products = pd.read_csv('products.csv')
df_customers = pd.read_csv('customers.csv')
df_sales = pd.read_csv('sales.csv')

In [28]:
len(df_products)

40

In [29]:
len(df_customers)

19440

In [30]:
len(df_sales)

168025

In [31]:
df_sales

Unnamed: 0,sale_id,product_id,customer_id,store_id,region,date,quantity,payment_method,price,total
0,1,30,11313,1,California,2024-10-01,4,Cash,1.19,4.76
1,45,30,8208,1,Wisconsin,2024-10-01,3,Cash,1.19,3.57
2,173,30,11189,1,Florida,2024-10-03,2,Cash,1.19,2.38
3,263,30,11796,1,California,2024-10-05,2,Cash,1.19,2.38
4,288,30,6511,1,Texas,2024-10-05,1,Credit Card,1.19,1.19
...,...,...,...,...,...,...,...,...,...,...
168020,167877,40,15696,1,California,2029-09-03,1,Mobile App,2.76,2.76
168021,167891,40,15909,1,New York,2029-09-03,1,Credit Card,2.76,2.76
168022,167950,40,12080,1,Texas,2029-09-04,4,Mobile App,2.76,11.04
168023,167960,40,12145,1,New York,2029-09-04,1,Credit Card,2.76,2.76
