In [1]:
# Cell 1: imports and basic settings
import numpy as np
import pandas as pd
import datetime
import random

# For reproducibility
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
random.seed(RANDOM_SEED)

In [2]:
# Cell 2: settings
NUM_ROWS = 5_000   # adjust within 1kâ€“10k if desired

In [3]:
# Cell 3: define states/regions (example from franchise trade areas list; adjust as needed)
states = [
    "AL", "AR", "AZ", "DE", "FL", "IN", "KS", "KY", "LA", "MI", "MN", "MO", 
    "MS", "NJ", "NE", "NV", "OK", "PA", "SC", "TX", "UT", "WI", 
    "GA", "IL", "MD", "NC", "OH", "TN", "VA",  # additional from selected trade areas
    # Note: exclude states listed as not available if you want strictly accurate
]

In [4]:
# Cell 4: generate mock store IDs per state
def generate_store_ids(states, avg_stores_per_state=3):
    store_list = []
    for st in states:
        # random number of stores per state, at least 1
        n = max(1, int(np.random.poisson(avg_stores_per_state)))
        for i in range(n):
            store_list.append(f"{st}_S{i+1:03d}")
    return store_list

store_ids = generate_store_ids(states, avg_stores_per_state=2)
len(store_ids)

63

In [5]:
# Cell 5: generate date range
start_date = datetime.date(2023, 1, 1)
end_date = datetime.date(2024, 12, 31)

# Weekly periods
dates = pd.date_range(start=start_date, end=end_date, freq='W-MON')  # weekly on Mondays
len(dates)

105

In [6]:
# Cell 6: prepare DataFrame skeleton
store_date_pairs = pd.MultiIndex.from_product([store_ids, dates], names=["store_id", "week_start"]).to_frame(index=False)

# If too many rows, downsample
if len(store_date_pairs) > NUM_ROWS:
    store_date_pairs = store_date_pairs.sample(n=NUM_ROWS, random_state=RANDOM_SEED).reset_index(drop=True)

df = store_date_pairs.copy()
len(df)

5000

In [7]:
# Cell 7: simulate base weekly sales
# Mean weekly sales around 19k, but vary per store: add store-level multiplier
store_multiplier = {sid: np.random.normal(1.0, 0.3) for sid in store_ids}  # +-30% variation

def simulate_weekly_sales(row):
    base = 19_200  # base weekly avg
    mult = store_multiplier[row['store_id']]
    # seasonal effect: higher in summer months
    month = row['week_start'].month
    if month in [6,7,8]:  # summer boost
        season_mult = 1.3
    elif month in [12,1]:  # holiday boost
        season_mult = 1.2
    else:
        season_mult = 1.0
    # noise
    noise = np.random.normal(0, 3_000)
    return max(0, base * mult * season_mult + noise)

df['weekly_sales'] = df.apply(simulate_weekly_sales, axis=1)

In [8]:
# Cell 8: simulate profit margin percentage
def simulate_margin():
    # mean 15%, sd 3%, constrained between 5% and 25%
    m = np.random.normal(0.15, 0.03)
    return float(np.clip(m, 0.05, 0.25))

df['profit_margin_pct'] = [simulate_margin() for _ in range(len(df))]
df['weekly_profit'] = df['weekly_sales'] * df['profit_margin_pct']

In [9]:
# Cell 9: define categories
age_groups = ['<18', '18-34', '35-54', '55+']
income_levels = ['Low', 'Middle', 'High']
loyalty_levels = ['None', 'Occasional', 'Regular', 'VIP']

# Simulate demographic dominance per store
store_demo = {}
for sid in store_ids:
    # random distribution weights, normalized
    age_w = np.random.dirichlet(np.ones(len(age_groups)))
    inc_w = np.random.dirichlet(np.ones(len(income_levels)))
    loyalty_w = np.random.dirichlet(np.ones(len(loyalty_levels)))
    store_demo[sid] = {
        'age_weights': age_w,
        'income_weights': inc_w,
        'loyalty_weights': loyalty_w
    }

def sample_category(weights, categories):
    return np.random.choice(categories, p=weights)

df['age_group'] = df['store_id'].apply(lambda s: sample_category(store_demo[s]['age_weights'], age_groups))
df['income_level'] = df['store_id'].apply(lambda s: sample_category(store_demo[s]['income_weights'], income_levels))
df['loyalty_level'] = df['store_id'].apply(lambda s: sample_category(store_demo[s]['loyalty_weights'], loyalty_levels))

In [10]:
# Cell 10: extract state
df['state'] = df['store_id'].str.split('_').str[0]

In [11]:
# Cell 11: sanity checks
print(df[['weekly_sales', 'weekly_profit', 'profit_margin_pct']].describe())

# count rows by age group
print(df['age_group'].value_counts())

# sample
df.head()

       weekly_sales  weekly_profit  profit_margin_pct
count   5000.000000    5000.000000        5000.000000
mean   21417.854278    3199.758312           0.149371
std     7556.460537    1326.902558           0.030611
min        0.000000       0.000000           0.050000
25%    16452.134432    2262.324184           0.128956
50%    21072.737692    3079.249683           0.149234
75%    26039.663637    3966.147906           0.169997
max    56353.951111   10597.186613           0.250000
age_group
<18      1309
18-34    1257
55+      1251
35-54    1183
Name: count, dtype: int64


Unnamed: 0,store_id,week_start,weekly_sales,profit_margin_pct,weekly_profit,age_group,income_level,loyalty_level,state
0,PA_S001,2024-07-08,28582.272059,0.154137,4405.599834,55+,Low,VIP,PA
1,MO_S001,2024-10-21,19531.296834,0.165615,3234.676828,18-34,Middle,Occasional,MO
2,PA_S002,2024-10-14,21012.976993,0.162783,3420.557408,18-34,Low,VIP,PA
3,GA_S001,2024-08-19,25143.85778,0.151218,3802.214512,55+,High,Regular,GA
4,MD_S004,2024-03-11,21778.258583,0.192978,4202.715715,18-34,Middle,,MD


In [12]:
# Cell 12: optional expansion into subrows by loyalty level
def expand_by_loyalty(row):
    loyalty_weights = store_demo[row['store_id']]['loyalty_weights']
    records = []
    for lvl, w in zip(loyalty_levels, loyalty_weights):
        rec = row.copy()
        rec['loyalty_level'] = lvl
        rec['weekly_sales'] = row['weekly_sales'] * w
        rec['weekly_profit'] = rec['weekly_sales'] * row['profit_margin_pct']
        records.append(rec)
    return records

# Example: expand first 10 rows to see result
sample_expanded = []
for i, r in df.head(10).iterrows():
    sample_expanded.extend(expand_by_loyalty(r))

expanded_df = pd.DataFrame(sample_expanded)
expanded_df.head()

Unnamed: 0,store_id,week_start,weekly_sales,profit_margin_pct,weekly_profit,age_group,income_level,loyalty_level,state
0,PA_S001,2024-07-08,9499.63424,0.154137,1464.249831,55+,Low,,PA
0,PA_S001,2024-07-08,2723.316683,0.154137,419.765213,55+,Low,Occasional,PA
0,PA_S001,2024-07-08,7631.298008,0.154137,1176.269163,55+,Low,Regular,PA
0,PA_S001,2024-07-08,8728.023129,0.154137,1345.315627,55+,Low,VIP,PA
1,MO_S001,2024-10-21,2900.085872,0.165615,480.297885,18-34,Middle,,MO


In [13]:
# Cell 13: save dataset
OUTPUT_PATH = "jenis_sales_demo_dataset.csv"
df.to_csv(OUTPUT_PATH, index=False)
print("Saved to", OUTPUT_PATH)

Saved to jenis_sales_demo_dataset.csv
