In [1]:
# 1.1 Basic imports
import pandas as pd
import numpy as np
import datetime as dt
import random

In [2]:
# Ensure reproducibility
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
random.seed(RANDOM_SEED)

In [3]:
# 2.1 Desired dataset size
NUM_ROWS = 100_000  # as specified

# 2.2 Demographics to include
DEMOGRAPHIC_COLUMNS = ['IncomeTier', 'Region', 'AgeGroup', 'Gender']

# 2.3 Custom period: specify your start and end dates here
# Example placeholder; change to your needed custom period
START_DATE = '2020-01-01'
END_DATE   = '2024-12-31'

# 2.4 Profit margin breakdown dimensions you want modeled
MARGIN_BREAKDOWNS = ['Channel', 'TimePeriod', 'Region', 'SKU']

In [4]:
# 3.1.1 Generate monthly periods
date_range = pd.date_range(start=START_DATE, end=END_DATE, freq='MS')  # first of month
len(date_range)

60

In [5]:
regions = ['Northeast', 'Midwest', 'South', 'West', 'International']

In [6]:
channels = ['Grocery', 'Convenience', 'Online', 'Club', 'Foodservice']

In [7]:
skus = [
    'Chocolate', 'Vanilla', 'Strawberry', 'Mint', 'CookieDough',
    'NonDairy_Chocolate', 'LowCal_Protein_Scoop'
]

In [8]:
income_tiers = ['Low', 'Lower-Middle', 'Upper-Middle', 'High']
age_groups = ['18-24', '25-34', '35-44', '45-54', '55-64', '65+']
genders = ['Female', 'Male', 'Other/Non-binary']

In [9]:
def simulate_base_sales(date, region, channel, sku):
    """
    Return a base sales figure for the record, before random noise.
    This is illustrative; you can tune multipliers or use actual historical data.
    """

    # Time trend: simulate some seasonality, growth/decline
    month = date.month
    # Example seasonality: peak in summer months
    seasonality = 1 + 0.2 * np.sin((month - 1) / 12 * 2 * np.pi)

    # Region multiplier
    region_mult = {
        'Northeast': 1.0,
        'Midwest': 0.9,
        'South': 1.1,
        'West': 1.2,
        'International': 0.5
    }.get(region, 1.0)

    # Channel multiplier
    channel_mult = {
        'Grocery': 1.0,
        'Convenience': 0.7,
        'Online': 0.4,
        'Club': 0.8,
        'Foodservice': 0.2
    }.get(channel, 1.0)

    # SKU popularity multiplier
    sku_pop = {
        'Chocolate': 1.2,
        'Vanilla':   1.0,
        'Strawberry': 0.8,
        'Mint': 0.6,
        'CookieDough': 1.1,
        'NonDairy_Chocolate': 0.5,
        'LowCal_Protein_Scoop': 0.7
    }.get(sku, 1.0)

    # Base scale value
    base = 1000  # adjust scale as desired

    sales = base * seasonality * region_mult * channel_mult * sku_pop
    return sales

In [10]:
def simulate_price_and_margin(sku, channel):
    """
    Rough synthetic logic: price varies by SKU and channel; margin varies.
    Outputs unit_price and margin_percentage.
    """
    # Base price by SKU
    sku_price = {
        'Chocolate': 6.0,
        'Vanilla': 5.5,
        'Strawberry': 5.5,
        'Mint': 5.5,
        'CookieDough': 6.5,
        'NonDairy_Chocolate': 7.0,
        'LowCal_Protein_Scoop': 6.5
    }.get(sku, 6.0)

    # Channel price modifier
    channel_price_mod = {
        'Grocery': 1.0,
        'Convenience': 1.1,
        'Online': 1.0,
        'Club': 0.9,
        'Foodservice': 0.8
    }.get(channel, 1.0)

    unit_price = sku_price * channel_price_mod

    # Margin base by channel (percentage)
    channel_margin_base = {
        'Grocery': 0.30,
        'Convenience': 0.25,
        'Online': 0.35,
        'Club': 0.28,
        'Foodservice': 0.20
    }.get(channel, 0.30)

    # SKU effect on margin
    sku_margin_effect = {
        'Chocolate': 0.0,
        'Vanilla': 0.0,
        'Strawberry': -0.02,
        'Mint': -0.03,
        'CookieDough': +0.02,
        'NonDairy_Chocolate': +0.03,
        'LowCal_Protein_Scoop': +0.01
    }.get(sku, 0.0)

    margin_pct = channel_margin_base + sku_margin_effect

    # sanity clamp
    margin_pct = max(0.05, min(margin_pct, 0.6))
    return unit_price, margin_pct

In [11]:
records = []

for _ in range(NUM_ROWS):
    # Random pick time, region, channel, sku
    date = random.choice(date_range)
    region = random.choice(regions)
    channel = random.choice(channels)
    sku = random.choice(skus)

    # Demographics
    income = random.choice(income_tiers)
    age = random.choice(age_groups)
    gender = random.choice(genders)

    # Simulate base sales and add noise
    base_sales = simulate_base_sales(date, region, channel, sku)
    # Add random noise +/- 20%
    sales = base_sales * np.random.uniform(0.8, 1.2)

    # Determine unit price and margin
    unit_price, margin_pct = simulate_price_and_margin(sku, channel)
    # Compute revenue and profit
    revenue = sales * unit_price
    profit = revenue * margin_pct

    records.append({
        'Date': date,
        'Region': region,
        'Channel': channel,
        'SKU': sku,
        'IncomeTier': income,
        'AgeGroup': age,
        'Gender': gender,
        'UnitsSold': sales,
        'UnitPrice': unit_price,
        'Revenue': revenue,
        'MarginPct': margin_pct,
        'Profit': profit
    })

# Create DataFrame
df = pd.DataFrame(records)
df.head()

Unnamed: 0,Date,Region,Channel,SKU,IncomeTier,AgeGroup,Gender,UnitsSold,UnitPrice,Revenue,MarginPct,Profit
0,2023-05-01,Northeast,Grocery,NonDairy_Chocolate,Upper-Middle,25-34,Female,557.164506,7.0,3900.151545,0.33,1287.05001
1,2020-09-01,Northeast,Foodservice,Chocolate,High,18-24,Female,234.205017,4.8,1124.184083,0.2,224.836817
2,2020-06-01,Midwest,Convenience,CookieDough,Low,55-64,Female,833.039593,7.15,5956.233088,0.27,1608.182934
3,2023-10-01,International,Club,Vanilla,High,55-64,Male,332.628286,4.95,1646.510016,0.28,461.022804
4,2024-04-01,Northeast,Convenience,NonDairy_Chocolate,High,35-44,Male,362.211132,7.7,2789.025713,0.28,780.9272


In [12]:
df['YearMonth'] = df['Date'].dt.to_period('M').astype(str)

In [13]:
monthly_summary = df.groupby('YearMonth').agg({
    'UnitsSold': 'sum',
    'Revenue': 'sum',
    'Profit': 'sum'
}).reset_index()
monthly_summary.head()

Unnamed: 0,YearMonth,UnitsSold,Revenue,Profit
0,2020-01,814058.2,4809986.0,1377007.0
1,2020-02,919894.4,5471172.0,1564613.0
2,2020-03,963508.0,5738972.0,1644464.0
3,2020-04,1017506.0,6020270.0,1719451.0
4,2020-05,946641.7,5623927.0,1614046.0


In [14]:
df['Region'].value_counts(normalize=True)
df['Channel'].value_counts(normalize=True)

Channel
Grocery        0.20099
Club           0.20011
Convenience    0.20001
Foodservice    0.19965
Online         0.19924
Name: proportion, dtype: float64

In [15]:
total_revenue = df['Revenue'].sum()
total_profit = df['Profit'].sum()
total_units = df['UnitsSold'].sum()

print(f"Total revenue: ${total_revenue:,.2f}")
print(f"Total profit: ${total_profit:,.2f}")
print(f"Total units sold: {total_units:,.0f}")

Total revenue: $292,608,660.92
Total profit: $83,978,011.42
Total units sold: 49,282,473


In [16]:
# Save a CSV
df.to_csv('halo_top_synthetic_sales_100k.csv', index=False)

# Optionally save a sample to preview
df.sample(1000).to_csv('halo_top_sample_1k.csv', index=False)