In [1]:
# core libraries
import numpy as np
import pandas as pd

# for dates
from datetime import datetime

# optional: reproducibility
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

In [2]:
# 3a) custom date range (change as needed)
start_date = "2023-01-01"
end_date   = "2023-12-31"

# 3b) number of transactions to simulate
n_transactions = 100_000  # adjust up/down

# 3c) demographics distributions: 
#    proportions must sum to 1 when explicitly defined; otherwise code will normalize
income_levels = {
    "Low": 0.25,
    "Middle": 0.50,
    "High": 0.25
}

age_groups = {
    "0-12": 0.15,
    "13-18": 0.10,
    "19-34": 0.30,
    "35-54": 0.30,
    "55+": 0.15
}

regions_states = {
    "NY": 0.10,
    "CA": 0.15,
    "TX": 0.12,
    "FL": 0.10,
    "Other": 0.53
}

genders = {
    "Male": 0.49,
    "Female": 0.49,
    "Non-binary/Other": 0.02
}

# 3d) product categories example
product_categories = [
    "Action Figures",
    "Dolls",
    "Board Games",
    "STEM Toys",
    "Outdoor",
    "Video Games",
    "Puzzles",
    "Building Sets"
]

In [3]:
def random_choice_from_dist(options, dist, size):
    """
    options: list of keys
    dist: dict mapping option->prob
    size: number of draws
    """
    probs = np.array([dist[k] for k in options], dtype=float)
    probs = probs / probs.sum()
    return np.random.choice(options, size=size, p=probs)

def random_dates(start, end, n):
    """Return n random dates between start and end (inclusive)"""
    start_u = pd.Timestamp(start).value // 10**9
    end_u   = pd.Timestamp(end).value // 10**9
    rand_u = np.random.randint(start_u, end_u + 1, n)
    return pd.to_datetime(rand_u, unit='s')

In [4]:
# 5a) transaction ids
transaction_ids = np.arange(1, n_transactions + 1)

# 5b) transaction dates
dates = random_dates(start_date, end_date, n_transactions)

# 5c) product categories
categories = random_choice_from_dist(product_categories,
                                     {cat: 1/len(product_categories) for cat in product_categories},
                                     n_transactions)

# 5d) sales amount simulation
# Example: base sale amounts by category, plus randomness
base_price = {
    "Action Figures": 20,
    "Dolls": 25,
    "Board Games": 30,
    "STEM Toys": 40,
    "Outdoor": 35,
    "Video Games": 60,
    "Puzzles": 15,
    "Building Sets": 50
}

# Random variation: normal around base price, ensure positive
sale_amounts = np.array([
    max(1, np.random.normal(base_price[cat], base_price[cat]*0.3))
    for cat in categories
])

In [5]:
# 6a) cost as a fraction of sales, varying by category
# Example cost ratios; these are synthetic
cost_ratio = {
    "Action Figures": 0.55,
    "Dolls": 0.50,
    "Board Games": 0.45,
    "STEM Toys": 0.60,
    "Outdoor": 0.55,
    "Video Games": 0.65,
    "Puzzles": 0.40,
    "Building Sets": 0.58
}

costs = np.array([sale_amounts[i] * cost_ratio[cat] for i, cat in enumerate(categories)])
profits = sale_amounts - costs
profit_margin = profits / sale_amounts  # fraction

In [6]:
# 7a) Income level
income = random_choice_from_dist(list(income_levels.keys()), income_levels, n_transactions)

# 7b) Age group
age_group = random_choice_from_dist(list(age_groups.keys()), age_groups, n_transactions)

# 7c) Region/state
region = random_choice_from_dist(list(regions_states.keys()), regions_states, n_transactions)

# 7d) Gender
gender = random_choice_from_dist(list(genders.keys()), genders, n_transactions)

In [7]:
df = pd.DataFrame({
    "transaction_id": transaction_ids,
    "transaction_date": dates,
    "product_category": categories,
    "sale_amount": sale_amounts.round(2),
    "cost": costs.round(2),
    "profit": profits.round(2),
    "profit_margin": profit_margin.round(4),
    "income_level": income,
    "age_group": age_group,
    "region_state": region,
    "gender": gender
})

df.head()

Unnamed: 0,transaction_id,transaction_date,product_category,sale_amount,cost,profit,profit_margin,income_level,age_group,region_state,gender
0,1,2023-09-26 12:36:44,Outdoor,34.47,18.96,15.51,0.45,Middle,35-54,Other,Female
1,2,2023-07-06 06:41:18,Puzzles,12.79,5.12,7.68,0.6,Low,19-34,Other,Female
2,3,2023-09-27 23:57:30,STEM Toys,24.86,14.92,9.94,0.4,Middle,13-18,Other,Male
3,4,2023-11-07 20:42:47,Dolls,22.75,11.37,11.37,0.5,Middle,35-54,TX,Male
4,5,2023-09-02 00:03:08,STEM Toys,31.2,18.72,12.48,0.4,Low,13-18,NY,Male


In [8]:
# 9a) ensure no negative prices or costs
assert (df["sale_amount"] > 0).all()
assert (df["cost"] >= 0).all()

# 9b) profit calculations sanity
assert np.allclose(df["profit"], df["sale_amount"] - df["cost"], atol=0.01)

# 9c) date range sanity
print(df["transaction_date"].min(), df["transaction_date"].max())

2023-01-01 00:00:37 2023-12-30 23:59:28


In [9]:
output_csv = "synthetic_toysrus_sales.csv"
df.to_csv(output_csv, index=False)
print(f"Exported to {output_csv}")

Exported to synthetic_toysrus_sales.csv
