In [1]:
# cell 1
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random

In [2]:
# cell 2

# Option flags
USE_LAST_YEAR = True
USE_LAST_5_YEARS = False

# Today
today = datetime.today()

if USE_LAST_YEAR:
    start_date = datetime(today.year - 1, 1, 1)
    end_date = datetime(today.year - 1, 12, 31)
elif USE_LAST_5_YEARS:
    start_date = datetime(today.year - 5, 1, 1)
    end_date = datetime(today.year - 1, 12, 31)
else:
    raise ValueError("Set one of USE_LAST_YEAR or USE_LAST_5_YEARS to True.")

In [3]:
# cell 3

def generate_dates(start, end, freq='M'):
    return pd.date_range(start=start, end=end, freq=freq)

dates = generate_dates(start_date, end_date)

# Example products
products = [
    "Good Humor Classic Bar",
    "Good Humor Strawberry Shortcake Bar",
    "Good Humor Chocolate Eclair Bar",
]

# Regions for demo
regions = ["Northeast", "Midwest", "South", "West"]

# Age groups for demo
age_groups = ["<18", "18-34", "35-54", "55+"]

# Gender categories for demo
genders = ["Male", "Female", "Non-binary", "Prefer not to say"]

rows = []
for date in dates:
    for product in products:
        for region in regions:
            # Synthetic units sold
            base_units = random.randint(500, 2000)
            # add some seasonality: summer months higher
            if date.month in [6, 7, 8]:
                base_units = int(base_units * 1.3)
            units_sold = base_units

            # Synthetic pricing
            price_per_unit = round(random.uniform(1.50, 3.00), 2)  # dollars
            revenue = round(units_sold * price_per_unit, 2)

            # Synthetic cost, e.g., 60-80% of revenue
            cost_ratio = random.uniform(0.60, 0.80)
            cost = round(revenue * cost_ratio, 2)

            profit = round(revenue - cost, 2)
            profit_margin = round(profit / revenue if revenue else 0, 4)

            # Demographic distribution of units: random split but summing to units_sold
            # Age group split
            age_dist = np.random.dirichlet(np.ones(len(age_groups)), size=1)[0]
            # Gender split
            gender_dist = np.random.dirichlet(np.ones(len(genders)), size=1)[0]

            for ag, ag_frac in zip(age_groups, age_dist):
                for g, g_frac in zip(genders, gender_dist):
                    # units for this demo subgroup
                    subgroup_units = int(round(units_sold * ag_frac * g_frac))
                    if subgroup_units == 0:
                        continue
                    rows.append({
                        "date": date,
                        "product": product,
                        "region": region,
                        "units_sold": subgroup_units,
                        "price_per_unit": price_per_unit,
                        "revenue": round(subgroup_units * price_per_unit, 2),
                        "cost": round(subgroup_units * price_per_unit * cost_ratio, 2),
                        "profit": round(subgroup_units * price_per_unit * (1 - cost_ratio), 2),
                        "profit_margin": profit_margin,
                        "age_group": ag,
                        "gender": g,
                    })

df = pd.DataFrame(rows)
df.head()

  return pd.date_range(start=start, end=end, freq=freq)


Unnamed: 0,date,product,region,units_sold,price_per_unit,revenue,cost,profit,profit_margin,age_group,gender
0,2025-01-31,Good Humor Classic Bar,Northeast,65,2.73,177.45,119.37,58.08,0.3273,<18,Male
1,2025-01-31,Good Humor Classic Bar,Northeast,3,2.73,8.19,5.51,2.68,0.3273,<18,Female
2,2025-01-31,Good Humor Classic Bar,Northeast,87,2.73,237.51,159.78,77.73,0.3273,<18,Non-binary
3,2025-01-31,Good Humor Classic Bar,Northeast,196,2.73,535.08,359.96,175.12,0.3273,<18,Prefer not to say
4,2025-01-31,Good Humor Classic Bar,Northeast,66,2.73,180.18,121.21,58.97,0.3273,18-34,Male


In [4]:
# cell 4

monthly_summary = df.groupby(
    ["date", "product", "region"], as_index=False
).agg({
    "units_sold": "sum",
    "revenue": "sum",
    "cost": "sum",
    "profit": "sum",
})

# compute weighted profit margin
monthly_summary["profit_margin"] = (
    monthly_summary["profit"] / monthly_summary["revenue"]
).round(4)

monthly_summary.head()

Unnamed: 0,date,product,region,units_sold,revenue,cost,profit,profit_margin
0,2025-01-31,Good Humor Chocolate Eclair Bar,Midwest,1920,4646.4,3147.81,1498.59,0.3225
1,2025-01-31,Good Humor Chocolate Eclair Bar,Northeast,1066,2014.74,1339.53,675.21,0.3351
2,2025-01-31,Good Humor Chocolate Eclair Bar,South,830,2025.2,1327.22,697.98,0.3446
3,2025-01-31,Good Humor Chocolate Eclair Bar,West,641,1487.12,1150.8,336.32,0.2262
4,2025-01-31,Good Humor Classic Bar,Midwest,1227,3398.79,2522.93,875.86,0.2577


In [5]:
# cell 5

# Filter to a single year if needed
# Example: last year only
if USE_LAST_YEAR:
    filter_start = datetime(today.year - 1, 1, 1)
    filter_end = datetime(today.year - 1, 12, 31)
else:
    filter_start = start_date
    filter_end = end_date

mask = (df["date"] >= filter_start) & (df["date"] <= filter_end)
df_period = df.loc[mask]

# Aggregate by age group
age_summary = df_period.groupby("age_group", as_index=False).agg({
    "units_sold": "sum",
    "revenue": "sum",
    "cost": "sum",
    "profit": "sum"
})
age_summary["profit_margin"] = (age_summary["profit"] / age_summary["revenue"]).round(4)

# Aggregate by gender
gender_summary = df_period.groupby("gender", as_index=False).agg({
    "units_sold": "sum",
    "revenue": "sum",
    "cost": "sum",
    "profit": "sum"
})
gender_summary["profit_margin"] = (gender_summary["profit"] / gender_summary["revenue"]).round(4)

age_summary.head(), gender_summary.head()

(  age_group  units_sold    revenue      cost    profit  profit_margin
 0     18-34       45710  103056.10  72360.28  30695.82         0.2979
 1     35-54       47799  111644.73  78567.32  33077.41         0.2963
 2       55+       51362  117087.23  81516.95  35570.28         0.3038
 3       <18       43546   97816.10  68871.10  28945.00         0.2959,
               gender  units_sold    revenue      cost    profit  profit_margin
 0             Female       49779  114197.98  79357.35  34840.63         0.3051
 1               Male       48804  111580.64  77989.77  33590.87         0.3010
 2         Non-binary       43024   97571.38  68545.91  29025.47         0.2975
 3  Prefer not to say       46810  106254.16  75422.62  30831.54         0.2902)

In [6]:
# cell 6

# Full detailed dataset
df.to_csv("good_humor_sales_demographics_detailed.csv", index=False)

# Monthly summary
monthly_summary.to_csv("good_humor_sales_monthly_summary.csv", index=False)

# Demographic summaries
age_summary.to_csv("good_humor_sales_by_age.csv", index=False)
gender_summary.to_csv("good_humor_sales_by_gender.csv", index=False)

print("CSV files saved.")

CSV files saved.
