In [None]:
import pandas as pd
import numpy as np

# Seed NumPy's global random state so every run generates the same dataset.
np.random.seed(42)

# Simulation settings
n_carts = 10_000  # number of carts to simulate
mean_items_per_cart = 4  # Poisson mean of the number of line items per cart
start_date = pd.Timestamp("2025-01-01")  # date given to the earliest carts

# One dict per cart line item is appended here as the carts are generated.
cart_list = []

# Function to compute sale probability based on price
def sale_probability(price, max_price=150, max_prob=0.8):
    """Return the chance that an item with the given price is on sale.

    The price is scaled linearly against ``max_price``, clamped to the
    range [0, 1], and multiplied by ``max_prob``. More expensive items are
    therefore more likely to be discounted, up to ``max_prob``.

    Args:
        price: Regular price of the item.
        max_price: Price at (and above) which the probability saturates.
            Must be positive.
        max_prob: Probability returned for prices at or above ``max_price``.

    Returns:
        A value between 0 and ``max_prob``.

    Raises:
        ValueError: If ``max_price`` is not positive.
    """
    if max_price <= 0:
        raise ValueError("max_price must be positive")
    # Clamp on both sides: without the lower bound a negative price would
    # yield a negative "probability".
    scaled = min(max(price / max_price, 0), 1)
    return scaled * max_prob

# Generate the carts one by one, appending one row per line item to
# cart_list. The random draws happen in a fixed order so that the seeded
# output is reproducible.
for cart_idx in range(n_carts):
    cart_id = 1000 + cart_idx
    customer_id = np.random.randint(1, 500)  # upper bound is exclusive
    # Each run of 20 consecutive carts shares a date; the next run is one
    # day later.
    date = start_date + pd.Timedelta(days=cart_idx // 20)

    # Cart size is Poisson distributed, with a floor of one item.
    n_products = max(np.random.poisson(mean_items_per_cart), 1)

    for _ in range(n_products):
        # 1 + Poisson(1), so the base quantity is never below one.
        base_quantity = 1 + np.random.poisson(1)

        regular_price = round(np.random.uniform(5, 150), 2)

        # Pricier items are more likely to be discounted.
        on_sale = np.random.rand() < sale_probability(regular_price)
        if not on_sale:
            sale_price = np.nan  # no sale price for full-price items
            quantity = base_quantity
        else:
            discount = np.random.uniform(0.1, 0.6)
            sale_price = round(regular_price * (1 - discount), 2)
            # Discounted items get an extra Poisson(1) bump in quantity.
            quantity = base_quantity + np.random.poisson(1)

        # product_id is drawn last, while the row dict is being built.
        cart_list.append({
            "cart_id": cart_id,
            "customer_id": customer_id,
            "product_id": np.random.randint(100, 500),
            "quantity": quantity,
            "date": date,
            "regular_price": regular_price,
            "sale_price": sale_price,
        })

# Turn the collected line-item rows into a table.
df = pd.DataFrame(cart_list)

# Corrupt a handful of randomly chosen cells: 0.1% of the row count, and
# never fewer than one. The same cell may be picked more than once.
n_errors = max(1, int(0.001 * len(df)))
for _ in range(n_errors):
    row_idx = np.random.randint(0, len(df))
    # Positional indices of the columns eligible for corruption.
    col_idx = np.random.choice([1, 2, 3, 5, 6])
    column_name = df.columns[col_idx]
    # Id/count columns receive a -999 sentinel; any other column is blanked.
    uses_sentinel = column_name in ("customer_id", "product_id", "quantity")
    df.iat[row_idx, col_idx] = -999 if uses_sentinel else np.nan

# Write the result to disk without the index column.
df.to_csv("sales_by_carts.csv", index=False)
print("Dataset created with Poisson-distributed cart sizes and sale likelihood based on price.")


Dataset created with Poisson-distributed cart sizes and sale likelihood based on price.
