In [1]:
# 1. Standard imports
import numpy as np
import pandas as pd
import datetime
import random

In [2]:
# 2. Reproducibility
# A single seed is fed to both random sources this notebook draws from:
# the stdlib `random` module and NumPy's legacy global generator.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

In [3]:
# 3. Dataset size (number of synthetic rows to generate)
n_rows = 10_000

# 4. Flavors for Carte D'Or – an illustrative set; swap in the real range if known
flavors = [
    "Vanilla", "Chocolate", "Stracciatella", "Hazelnut", "Salted Caramel",
    "Mint", "Coffee", "Berry", "Pistachio", "Cookies & Cream",
]

# 5. Example locations by region or country
# Order matters: the sampling cell supplies one positional weight per entry.
locations = [
    "UK", "Germany", "France", "Australia", "Netherlands",
    "Spain", "Italy", "USA", "Canada", "Brazil",
]

# 6. Genders (order likewise matches the positional weights used when sampling)
genders = [
    "Female",
    "Male",
    "Non-binary",
    "Prefer not to say",
]

In [4]:
# 7. Age distribution
# Draw from a normal distribution (mean 38, sd 14), truncate each draw to a
# whole number of years, then clamp the result into the 18–80 range.
age_draws = np.random.normal(loc=38, scale=14, size=n_rows)
ages = np.clip(age_draws.astype(int), 18, 80)

In [5]:
# 8. Location choices with basic weighting
# One weight per entry of `locations`, in the same order; they sum to 1.
location_weights = [
    0.15,  # UK
    0.12,  # Germany
    0.12,  # France
    0.10,  # Australia
    0.10,  # Netherlands
    0.08,  # Spain
    0.08,  # Italy
    0.10,  # USA
    0.08,  # Canada
    0.07,  # Brazil
]
locations_sample = np.random.choice(
    a=locations, size=n_rows, replace=True, p=location_weights
)

In [6]:
# 9. Gender choices; adjust weights if needed
# One weight per entry of `genders`, in the same order; they sum to 1.
gender_weights = [
    0.48,  # Female
    0.48,  # Male
    0.02,  # Non-binary
    0.02,  # Prefer not to say
]
genders_sample = np.random.choice(
    a=genders, size=n_rows, replace=True, p=gender_weights
)

In [7]:
# 10. Household income synthesis (annual USD equivalent; adjust currency if desired)
# Log-normal draw, i.e. right-skewed: median exp(10.5) ≈ 36k USD,
# mean exp(10.5 + 0.5**2 / 2) ≈ 41k USD.
income_draws = np.random.lognormal(mean=10.5, sigma=0.5, size=n_rows)
income = income_draws.round(-2)  # negative decimals: round to the nearest hundred

In [8]:
# 11. Date range: last 12 months from today
# NOTE: end_date is read from the wall clock, so the generated dates shift
# from one day to the next even though the random seed is fixed.
end_date = datetime.date.today()
start_date = end_date - datetime.timedelta(days=365)

# random.randint includes both endpoints, so the offsets cover every day
# from start_date through end_date.
day_offsets = [random.randint(0, 365) for _ in range(n_rows)]
dates = [start_date + datetime.timedelta(days=offset) for offset in day_offsets]

In [9]:
# 12. Flavor assigned per transaction
# No weights are passed, so every flavor is equally likely.
flavors_sample = np.random.choice(a=flavors, size=n_rows, replace=True)

In [10]:
# 13. Units sold per transaction
# Basket sizes 1–6; half of all transactions are a single unit.
unit_options = [1, 2, 3, 4, 5, 6]
unit_weights = [0.5, 0.2, 0.1, 0.1, 0.05, 0.05]
units_sold = np.random.choice(unit_options, size=n_rows, p=unit_weights)

In [11]:
# 14. Base price per unit in USD (example)
base_price = 3.5  # typical price: $3.50 per unit
# Each flavor gets one fixed offset, drawn uniformly from -0.50 to +0.50
flavor_price_adj = {name: random.uniform(-0.5, 0.5) for name in flavors}

# 15. Price per row
# The price depends only on the flavor, so round it once per flavor and
# look it up for every row.
flavor_unit_price = {
    name: round(base_price + adj, 2)
    for name, adj in flavor_price_adj.items()
}
prices = np.array([flavor_unit_price[fl] for fl in flavors_sample])

# 16. Revenue per row (left unrounded here)
revenue = prices * units_sold

In [12]:
# 17. Profit margin percentage
# Base margin of 0.14 (14%), shifted per flavor by a fixed offset drawn
# uniformly from -0.03 to +0.03 (i.e. ±3 percentage points).
base_margin = 0.14
flavor_margin_adj = {name: random.uniform(-0.03, 0.03) for name in flavors}

# The margin depends only on the flavor: build the lookup once, then map rows.
flavor_margin = {
    name: base_margin + adj
    for name, adj in flavor_margin_adj.items()
}
margins = np.array([flavor_margin[fl] for fl in flavors_sample])

# 18. Profit dollar amount (uses the revenue as it stands before rounding below)
profit = revenue * margins

# 19. Round metrics
margins_percent = np.round(margins * 100, decimals=2)
profit = np.round(profit, decimals=2)
revenue = np.round(revenue, decimals=2)

In [13]:
# 20. Build DataFrame
# Gather the per-row sequences generated above, one entry per output column
# (insertion order is the column order).
column_data = {
    "sale_date": dates,
    "location": locations_sample,
    "age": ages,
    "gender": genders_sample,
    "household_income": income,
    "flavor": flavors_sample,
    "units_sold": units_sold,
    "unit_price_usd": prices,
    "revenue_usd": revenue,
    "profit_margin_pct": margins_percent,
    "profit_usd": profit,
}
df = pd.DataFrame(column_data)

# Preview the first rows
df.head()

Unnamed: 0,sale_date,location,age,gender,household_income,flavor,units_sold,unit_price_usd,revenue_usd,profit_margin_pct,profit_usd
0,2026-01-20,France,44,Female,49000.0,Vanilla,3,3.89,11.67,14.22,1.66
1,2025-04-25,UK,36,Female,19600.0,Hazelnut,4,3.44,13.76,16.28,2.24
2,2025-03-11,UK,47,Non-binary,38400.0,Coffee,3,3.65,10.95,12.28,1.34
3,2025-07-17,UK,59,Male,22800.0,Hazelnut,1,3.44,3.44,16.28,0.56
4,2025-07-02,Netherlands,34,Female,18100.0,Hazelnut,3,3.44,10.32,16.28,1.68


In [14]:
# Summary statistics for every column, non-numeric ones included
df.describe(include="all")

Unnamed: 0,sale_date,location,age,gender,household_income,flavor,units_sold,unit_price_usd,revenue_usd,profit_margin_pct,profit_usd
count,10000,10000,10000.0,10000,10000.0,10000,10000.0,10000.0,10000.0,10000.0,10000.0
unique,366,10,,4,,10,,,,,
top,2026-02-12,UK,,Female,,Pistachio,,,,,
freq,41,1468,,4854,,1041,,,,,
mean,,,37.9983,,40974.24,,2.1789,3.614815,7.875185,13.987848,1.100809
std,,,13.031979,,22140.546217,,1.504434,0.242756,5.468889,1.129183,0.772139
min,,,18.0,,3900.0,,1.0,3.08,3.08,12.28,0.4
25%,,,28.0,,25400.0,,1.0,3.44,3.6425,13.04,0.53
50%,,,37.0,,36100.0,,2.0,3.62,6.16,14.12,0.8
75%,,,47.0,,51100.0,,3.0,3.81,10.95,14.24,1.6


In [15]:
# 21. Extract month
# sale_date holds plain datetime.date objects; convert once and use the
# vectorised .dt accessor rather than a per-row Python lambda.
df["month"] = pd.to_datetime(df["sale_date"]).dt.month

# 22. Add seasonal revenue multiplier
# Jun–Aug 1.2x, May 1.1x, Mar/Apr/Sep 1.0x, Oct–Feb 0.9x
season_multiplier = {1: 0.9, 2: 0.9, 3: 1.0, 4: 1.0, 5: 1.1, 6: 1.2,
                     7: 1.2, 8: 1.2, 9: 1.0, 10: 0.9, 11: 0.9, 12: 0.9}

df["season_mult"] = df["month"].map(season_multiplier)
# .map leaves NaN for any key missing from the dict; fail loudly instead of
# letting NaN flow into revenue and profit.
assert df["season_mult"].notna().all(), "month without a seasonal multiplier"

# Derive revenue and profit from their inputs instead of scaling the stored
# values in place, so re-running this cell does not compound the multiplier.
# This is the same formula the location-pricing cell uses.
df["revenue_usd"] = np.round(
    df["unit_price_usd"] * df["units_sold"] * df["season_mult"], 2
)
df["profit_usd"] = np.round(
    df["revenue_usd"] * (df["profit_margin_pct"] / 100), 2
)
# profit_margin_pct is deliberately left untouched: revenue and profit scale
# by the same factor, so the margin does not change, and recomputing it from
# cent-rounded amounts would only add rounding noise.

In [16]:
# 23. Example: location-specific price adjustment (USD per unit)
# Most markets carry a premium over the base price; Brazil is priced below it.
location_price_adj = {
    "UK": 0.2,
    "Germany": 0.1,
    "France": 0.15,
    "Australia": 0.25,
    "Netherlands": 0.1,
    "Spain": 0.05,
    "Italy": 0.05,
    "USA": 0.3,
    "Canada": 0.25,
    "Brazil": -0.1
}

df["price_adj_loc"] = df["location"].map(location_price_adj)
# .map leaves NaN for any location missing from the dict; fail loudly instead
# of letting NaN flow into price, revenue and profit.
assert df["price_adj_loc"].notna().all(), "location without a price adjustment"

# Start from `prices`, the per-row base price array that unit_price_usd was
# built from, rather than from the column itself: adding to the column in
# place would stack another adjustment on every re-run of this cell.
base_unit_price = pd.Series(prices, index=df.index)
df["unit_price_usd"] = np.round(base_unit_price + df["price_adj_loc"], 2)
df["revenue_usd"] = np.round(df["unit_price_usd"] * df["units_sold"] * df["season_mult"], 2)
df["profit_usd"] = np.round(df["revenue_usd"] * (df["profit_margin_pct"] / 100), 2)

In [17]:
# 24. Save to CSV
# Written relative to the working directory; the row index is left out.
output_path = "carte_dor_synthetic_sales_dataset.csv"
df.to_csv(path_or_buf=output_path, index=False)
print(f"Saved synthetic dataset to {output_path}")

Saved synthetic dataset to carte_dor_synthetic_sales_dataset.csv
