In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Single seeded generator used by every stochastic helper in this notebook
# (make_stores, dirichlet_mix, assign_store_demographics, generate_weekly_sales).
# Because the generator is shared, results depend on the order cells are run in.
rng = np.random.default_rng(42)

# Widen pandas' display limits so the wide demographic frames show all columns.
pd.set_option("display.max_columns", 200)
pd.set_option("display.width", 160)


In [2]:
# Retailers to simulate. Each name must also be a key of `company_logic`,
# because generate_weekly_sales looks the company up there.
companies = ["Waterstones", "WHSmith"]  # change these if you want different companies

# Regions a store can be assigned to (make_stores picks one uniformly at random).
regions_england = [
    "London", "South East", "South West", "East of England", "West Midlands",
    "East Midlands", "North West", "North East", "Yorkshire and the Humber"
]

# Titles on sale; every entry needs a weight in `title_popularity`.
titles = [
    "Trainspotting", "Porno", "Filth", "Glue",
    "Skagboys", "The Acid House", "Marabou Stork Nightmares", "Crime"
]

# Product formats; every entry needs a price, cost ratio and preference weight
# in `base_list_price`, `base_cost_ratio`, `format_base_pref` and each
# company's `format_mix_bias`.
formats = ["Paperback", "Hardback", "eBook", "Audiobook"]


In [3]:
# Per-company simulation parameters, read by generate_weekly_sales:
#   positioning           - free-text description (not used in any calculation shown here)
#   avg_discount          - mean of the normally distributed discount drawn per sales line
#   footfall_multiplier   - scales a store's base weekly footfall
#   conversion_multiplier - scales the store's demand score
#   format_mix_bias       - per-format multiplier applied to the title/format choice weights
company_logic = {
    "Waterstones": {
        "positioning": "Book-led retailer; higher discovery browsing; stronger fiction depth.",
        "avg_discount": 0.08,
        "footfall_multiplier": 1.10,
        "conversion_multiplier": 1.05,
        "format_mix_bias": {"Paperback": 1.20, "Hardback": 1.00, "eBook": 0.70, "Audiobook": 0.75},
    },
    "WHSmith": {
        "positioning": "Convenience-led; travel hubs/high-street; more impulse buys; broader non-book mix.",
        "avg_discount": 0.12,
        "footfall_multiplier": 1.25,
        "conversion_multiplier": 0.90,
        "format_mix_bias": {"Paperback": 1.05, "Hardback": 0.85, "eBook": 0.85, "Audiobook": 1.10},
    }
}


In [4]:
# Baseline list price per format; generate_weekly_sales jitters it per sales
# line and reports it in the `list_price_gbp` column.
base_list_price = {
    "Paperback": 9.99,
    "Hardback": 18.99,
    "eBook": 6.99,
    "Audiobook": 14.99
}

# Baseline unit cost per format, expressed as a fraction of the list price
# (generate_weekly_sales computes unit_cost = list_price * cost_ratio).
base_cost_ratio = {
    "Paperback": 0.45,
    "Hardback": 0.42,
    "eBook": 0.25,
    "Audiobook": 0.30
}


In [5]:
def _store_id_prefixes(company_names):
    """Return {company name: upper-case prefix}, with prefixes unique across companies."""
    names = list(company_names)
    longest = max((len(name) for name in names), default=0)
    # Start at two letters (the historical format) and widen only on collision.
    for width in range(2, max(2, longest) + 1):
        prefixes = {name: name[:width].upper() for name in names}
        if len(set(prefixes.values())) == len(names):
            return prefixes
    raise ValueError(
        "company names must be distinct (ignoring case) to build unique store ids"
    )


def make_stores(n_per_company=70):
    """Build a synthetic store roster for every company in ``companies``.

    Each store gets a region drawn uniformly from ``regions_england``, a city
    tier drawn with fixed probabilities, and a baseline weekly footfall equal
    to a tier base value times a regional boost times Gaussian noise
    (mean 1.0, sd 0.10), truncated to an int and floored at 250.

    Store ids are ``<PREFIX>-<NNN>``. The prefix is the first two letters of
    the company name, widened automatically if two companies would otherwise
    share it, so ``store_id`` is always safe to use as a join key.

    Parameters
    ----------
    n_per_company : int, default 70
        Number of stores generated per company.

    Returns
    -------
    pandas.DataFrame
        Columns ``store_id``, ``company``, ``region``, ``city_tier`` and
        ``base_weekly_footfall``; one row per store. The columns are present
        even when no stores are generated.

    Raises
    ------
    ValueError
        If the company names cannot be given unique id prefixes.
    """
    city_tiers = ["Tier 1 (major city)", "Tier 2 (large town)", "Tier 3 (small town)"]
    tier_probs = [0.35, 0.45, 0.20]
    # Loop-invariant lookup, built once instead of once per store.
    tier_base_footfall = {
        "Tier 1 (major city)": 3800,
        "Tier 2 (large town)": 2200,
        "Tier 3 (small town)": 1200,
    }
    id_prefixes = _store_id_prefixes(companies)

    rows = []
    for company in companies:
        prefix = id_prefixes[company]
        for i in range(n_per_company):
            # Draw order (region, tier, noise) is kept fixed so a given seed
            # always yields the same roster.
            region = rng.choice(regions_england)
            tier = rng.choice(city_tiers, p=tier_probs)

            tier_base = tier_base_footfall[tier]

            region_boost = 1.25 if region == "London" else (1.10 if region in ["South East", "North West"] else 1.00)

            base_footfall = int(tier_base * region_boost * rng.normal(1.0, 0.10))
            rows.append({
                "store_id": f"{prefix}-{i:03d}",
                "company": company,
                "region": region,
                "city_tier": tier,
                "base_weekly_footfall": max(250, base_footfall)
            })

    return pd.DataFrame(
        rows,
        columns=["store_id", "company", "region", "city_tier", "base_weekly_footfall"],
    )

# Generate the store roster (70 stores per company) and preview the first rows.
stores = make_stores(n_per_company=70)
stores.head()


Unnamed: 0,store_id,company,region,city_tier,base_weekly_footfall
0,WA-000,Waterstones,London,Tier 2 (large town),2956
1,WA-001,Waterstones,North West,Tier 2 (large town),1947
2,WA-002,Waterstones,West Midlands,Tier 2 (large town),2130
3,WA-003,Waterstones,Yorkshire and the Humber,Tier 1 (major city),3475
4,WA-004,Waterstones,West Midlands,Tier 3 (small town),1207


In [6]:
# Age bands and customer segments; assign_store_demographics creates one share
# column per entry, named "age_<band>" and "seg_<segment>".
age_bands = ["16-24", "25-34", "35-44", "45-54", "55+"]
segments = ["Students", "Young Professionals", "Working Class", "Arts/Media", "Commuters", "Tourists"]

def dirichlet_mix(k, concentration=10.0):
    """Draw a random k-way share vector from a symmetric Dirichlet.

    Every component gets the same parameter ``concentration / k``; the draw
    comes from the notebook-wide ``rng`` generator.
    """
    symmetric_alpha = np.full(k, concentration, dtype=float) / k
    return rng.dirichlet(symmetric_alpha)

def assign_store_demographics(stores_df):
    """Attach synthetic age-band and customer-segment shares to each store.

    For every store row:

    * the age mix starts from a profile chosen by ``city_tier``, gets Gaussian
      noise (sd 0.02) added, is clipped to at least 0.01 and renormalised;
    * the segment mix is a Dirichlet draw (via ``dirichlet_mix``) multiplied by
      biases that depend on tier, region and company, then renormalised.

    The shares are collected in arrays and written one column at a time, which
    avoids a per-cell ``.loc`` write and does not depend on index labels being
    unique. Random draws happen in the same order as a row-by-row
    implementation (age noise first, then the Dirichlet draw).

    Parameters
    ----------
    stores_df : pandas.DataFrame
        Must contain ``city_tier``, ``company`` and ``region`` columns.

    Returns
    -------
    pandas.DataFrame
        A copy of ``stores_df`` with ``age_<band>`` and ``seg_<segment>``
        columns appended; each group of shares sums to 1 per row.
    """
    tier_age_profiles = {
        "Tier 1 (major city)": np.array([0.18, 0.28, 0.22, 0.17, 0.15]),
        "Tier 2 (large town)": np.array([0.15, 0.22, 0.22, 0.20, 0.21]),
        "Tier 3 (small town)": np.array([0.12, 0.18, 0.20, 0.22, 0.28]),
    }
    northern_regions = ["North West", "North East", "Yorkshire and the Humber"]

    n_stores = len(stores_df)
    age_shares = np.empty((n_stores, len(age_bands)), dtype=float)
    seg_shares = np.empty((n_stores, len(segments)), dtype=float)

    for pos, (_, row) in enumerate(stores_df.iterrows()):
        tier = row["city_tier"]
        company = row["company"]

        age_mix = tier_age_profiles[tier] + rng.normal(0, 0.02, size=len(age_bands))
        # Noise can push a share to zero or below; keep every band strictly positive.
        age_mix = np.clip(age_mix, 0.01, None)
        age_shares[pos] = age_mix / age_mix.sum()

        seg_mix = dirichlet_mix(len(segments), concentration=10.0)

        seg_bias = np.ones(len(segments))
        seg_bias[segments.index("Students")] *= 1.15 if tier != "Tier 3 (small town)" else 0.95
        seg_bias[segments.index("Tourists")] *= 1.35 if (tier == "Tier 1 (major city)" and row["region"] == "London") else 0.85
        seg_bias[segments.index("Commuters")] *= 1.25 if company == "WHSmith" else 0.95
        seg_bias[segments.index("Arts/Media")] *= 1.20 if company == "Waterstones" else 0.90
        seg_bias[segments.index("Working Class")] *= 1.10 if row["region"] in northern_regions else 0.95

        seg_mix = seg_mix * seg_bias
        seg_shares[pos] = seg_mix / seg_mix.sum()

    out = stores_df.copy()
    # Array assignment is positional, so row alignment never depends on labels.
    for j, band in enumerate(age_bands):
        out[f"age_{band}"] = age_shares[:, j]
    for j, segment in enumerate(segments):
        out[f"seg_{segment}"] = seg_shares[:, j]

    return out

# Add the age_* / seg_* share columns to the roster and preview the result.
stores_demo = assign_store_demographics(stores)
stores_demo.head()


Unnamed: 0,store_id,company,region,city_tier,base_weekly_footfall,age_16-24,age_25-34,age_35-44,age_45-54,age_55+,seg_Students,seg_Young Professionals,seg_Working Class,seg_Arts/Media,seg_Commuters,seg_Tourists
0,WA-000,Waterstones,London,Tier 2 (large town),2956,0.155023,0.219931,0.226721,0.23074,0.167585,0.10999,0.026907,0.215677,0.117205,0.356784,0.173438
1,WA-001,Waterstones,North West,Tier 2 (large town),1947,0.132152,0.216099,0.216,0.206759,0.22899,0.037606,0.28415,0.047645,0.225454,0.245734,0.159411
2,WA-002,Waterstones,West Midlands,Tier 2 (large town),2130,0.191701,0.210567,0.175488,0.226356,0.195888,0.250915,0.030898,0.029399,0.341762,0.237996,0.10903
3,WA-003,Waterstones,Yorkshire and the Humber,Tier 1 (major city),3475,0.185295,0.270022,0.224603,0.166717,0.153364,0.105897,0.215023,0.139537,0.172049,0.329552,0.037941
4,WA-004,Waterstones,West Midlands,Tier 3 (small town),1207,0.076943,0.174641,0.225589,0.224869,0.297957,0.350333,0.152855,0.141529,0.210826,0.087701,0.056756


In [7]:
def demand_score(store_row):
    """Return a store's demand multiplier from its demographic shares.

    The score is the product of two weighted sums: one over the ``age_*``
    shares and one over the ``seg_*`` shares. ``store_row`` can be anything
    indexable by those column names (a DataFrame row, a dict, ...).
    """
    age_terms = (
        ("age_16-24", 0.90),
        ("age_25-34", 1.20),
        ("age_35-44", 1.15),
        ("age_45-54", 0.95),
        ("age_55+", 0.80),
    )
    seg_terms = (
        ("seg_Students", 1.15),
        ("seg_Working Class", 1.10),
        ("seg_Arts/Media", 1.10),
        ("seg_Young Professionals", 0.95),
        ("seg_Commuters", 1.00),
        ("seg_Tourists", 0.90),
    )

    def weighted_total(terms):
        # Accumulate left to right, starting from the first product.
        (first_key, first_coeff), *remaining = terms
        running = first_coeff * store_row[first_key]
        for key, coeff in remaining:
            running = running + coeff * store_row[key]
        return running

    age_weight = weighted_total(age_terms)
    seg_weight = weighted_total(seg_terms)
    return float(age_weight * seg_weight)

# Relative popularity weight per title. generate_weekly_sales multiplies it by
# the format weights and normalises, so only the ratios between values matter.
title_popularity = {
    "Trainspotting": 1.55,
    "Filth": 1.25,
    "Skagboys": 1.10,
    "Glue": 1.05,
    "Porno": 0.95,
    "The Acid House": 0.85,
    "Marabou Stork Nightmares": 0.75,
    "Crime": 0.90,
}

# Company-independent format preference weight; combined with each company's
# `format_mix_bias` when choosing which title/format an item is.
format_base_pref = {"Paperback": 1.20, "Hardback": 0.65, "eBook": 0.35, "Audiobook": 0.45}


In [8]:
def generate_weekly_sales(stores_df, weeks=26):
    """Simulate weekly sales lines (one per store, week, title and format).

    For each week and store:

    1. footfall = base footfall x company footfall multiplier x Gaussian noise
       (sd 0.08), floored at 100;
    2. expected buyers = footfall x ``base_interest`` x demand score x company
       conversion multiplier x a sinusoidal seasonality factor;
    3. buyers and total items are Poisson draws; store-weeks with zero buyers
       produce no rows;
    4. items are allocated to (title, format) combinations with probability
       proportional to title popularity x format preference x company bias;
    5. each resulting line gets its own jittered list price, discount (clamped
       to [0, 0.35]) and cost ratio, from which revenue, COGS and gross profit
       are derived.

    Work that does not depend on the week (the demand score per store, the
    title/format probability vector per company) is computed once up front.
    No random numbers are consumed there, so the draw order, and therefore the
    output for a given seed, matches a fully nested implementation.

    Parameters
    ----------
    stores_df : pandas.DataFrame
        Store roster including the ``age_*`` / ``seg_*`` share columns read by
        ``demand_score``, plus ``store_id``, ``company``, ``region``,
        ``city_tier`` and ``base_weekly_footfall``.
    weeks : int, default 26
        Number of weeks to simulate (numbered from 1).

    Returns
    -------
    pandas.DataFrame
        One row per sales line. The column set is fixed, so an empty
        simulation still returns a frame that downstream merges and group-bys
        can consume.
    """
    output_columns = [
        "week", "store_id", "company", "region", "city_tier", "title", "format",
        "units", "list_price_gbp", "discount_pct", "avg_sell_price_gbp",
        "unit_cost_gbp", "revenue_gbp", "cogs_gbp", "gross_profit_gbp",
    ]

    base_interest = 0.0022  # fraction of visitors buying any Irvine Welsh item (synthetic)
    avg_items_per_buyer = 1.08

    # Titles outer, formats inner: this fixes the index -> combo mapping used
    # by rng.choice below.
    combos = [(t, f) for t in titles for f in formats]

    # Week-invariant inputs, resolved once per store / company.
    mix_by_company = {}
    store_plans = []
    for st in stores_df.to_dict("records"):
        company = st["company"]
        logic = company_logic[company]
        if company not in mix_by_company:
            weights = np.array(
                [
                    title_popularity[t] * format_base_pref[f] * logic["format_mix_bias"][f]
                    for t, f in combos
                ],
                dtype=float,
            )
            mix_by_company[company] = weights / weights.sum()
        store_plans.append((st, logic, demand_score(st), mix_by_company[company]))

    rows = []
    for week in range(1, weeks + 1):
        seasonality = 1.00 + 0.08 * np.sin(2 * np.pi * (week / weeks))

        for st, logic, base_demand, probs in store_plans:
            footfall = st["base_weekly_footfall"] * logic["footfall_multiplier"] * rng.normal(1.0, 0.08)
            footfall = max(100, footfall)

            dscore = base_demand * logic["conversion_multiplier"] * seasonality

            expected_buyers = footfall * base_interest * dscore
            buyers = rng.poisson(lam=max(0.05, expected_buyers))

            if buyers == 0:
                continue

            total_items = rng.poisson(lam=max(1, buyers * avg_items_per_buyer))

            choices = rng.choice(len(combos), size=total_items, p=probs)

            # A plain dict keeps first-seen order, which determines the order
            # of the per-line price draws below.
            counts = {}
            for c in choices:
                combo = combos[c]
                counts[combo] = counts.get(combo, 0) + 1

            for (t, f), units in counts.items():
                list_price = base_list_price[f] * rng.normal(1.0, 0.02)
                discount = max(0.0, rng.normal(logic["avg_discount"], 0.03))
                discount = min(discount, 0.35)
                sell_price = list_price * (1 - discount)

                cost_ratio = base_cost_ratio[f] * rng.normal(1.0, 0.03)
                unit_cost = list_price * cost_ratio

                revenue = units * sell_price
                cogs = units * unit_cost
                gross_profit = revenue - cogs

                rows.append({
                    "week": week,
                    "store_id": st["store_id"],
                    "company": st["company"],
                    "region": st["region"],
                    "city_tier": st["city_tier"],
                    "title": t,
                    "format": f,
                    "units": units,
                    "list_price_gbp": round(list_price, 2),
                    "discount_pct": round(discount, 3),
                    "avg_sell_price_gbp": round(sell_price, 2),
                    "unit_cost_gbp": round(unit_cost, 2),
                    "revenue_gbp": round(revenue, 2),
                    "cogs_gbp": round(cogs, 2),
                    "gross_profit_gbp": round(gross_profit, 2),
                })

    return pd.DataFrame(rows, columns=output_columns)

# Run the 26-week simulation; the cell shows a (preview, shape) tuple.
sales = generate_weekly_sales(stores_demo, weeks=26)
sales.head(), sales.shape


(   week store_id      company      region            city_tier     title     format  units  list_price_gbp  discount_pct  avg_sell_price_gbp  unit_cost_gbp  \
 0     1   WA-000  Waterstones      London  Tier 2 (large town)      Glue  Audiobook      1           14.63         0.099               13.18           4.43   
 1     1   WA-000  Waterstones      London  Tier 2 (large town)     Porno      eBook      1            7.09         0.106                6.34           1.74   
 2     1   WA-000  Waterstones      London  Tier 2 (large town)  Skagboys  Paperback      1            9.67         0.064                9.05           4.47   
 3     1   WA-001  Waterstones  North West  Tier 2 (large town)     Porno  Paperback      2            9.98         0.063                9.35           4.43   
 4     1   WA-001  Waterstones  North West  Tier 2 (large town)  Skagboys  Paperback      1           10.29         0.077                9.50           4.52   
 
    revenue_gbp  cogs_gbp  gross_profi

In [9]:
# Demographic share columns produced by assign_store_demographics.
demo_cols = [c for c in stores_demo.columns if c.startswith(("age_", "seg_"))]

# Attach each store's demographics to its sales lines. `validate` makes pandas
# raise if `store_id` is not unique in `stores_demo`, instead of silently
# duplicating sales rows and inflating every downstream total.
sales_enriched = sales.merge(
    stores_demo[["store_id"] + demo_cols],
    on="store_id",
    how="left",
    validate="many_to_one",
)

sales_enriched.head()


Unnamed: 0,week,store_id,company,region,city_tier,title,format,units,list_price_gbp,discount_pct,avg_sell_price_gbp,unit_cost_gbp,revenue_gbp,cogs_gbp,gross_profit_gbp,age_16-24,age_25-34,age_35-44,age_45-54,age_55+,seg_Students,seg_Young Professionals,seg_Working Class,seg_Arts/Media,seg_Commuters,seg_Tourists
0,1,WA-000,Waterstones,London,Tier 2 (large town),Glue,Audiobook,1,14.63,0.099,13.18,4.43,13.18,4.43,8.75,0.155023,0.219931,0.226721,0.23074,0.167585,0.10999,0.026907,0.215677,0.117205,0.356784,0.173438
1,1,WA-000,Waterstones,London,Tier 2 (large town),Porno,eBook,1,7.09,0.106,6.34,1.74,6.34,1.74,4.6,0.155023,0.219931,0.226721,0.23074,0.167585,0.10999,0.026907,0.215677,0.117205,0.356784,0.173438
2,1,WA-000,Waterstones,London,Tier 2 (large town),Skagboys,Paperback,1,9.67,0.064,9.05,4.47,9.05,4.47,4.58,0.155023,0.219931,0.226721,0.23074,0.167585,0.10999,0.026907,0.215677,0.117205,0.356784,0.173438
3,1,WA-001,Waterstones,North West,Tier 2 (large town),Porno,Paperback,2,9.98,0.063,9.35,4.43,18.71,8.86,9.85,0.132152,0.216099,0.216,0.206759,0.22899,0.037606,0.28415,0.047645,0.225454,0.245734,0.159411
4,1,WA-001,Waterstones,North West,Tier 2 (large town),Skagboys,Paperback,1,10.29,0.077,9.5,4.52,9.5,4.52,4.98,0.132152,0.216099,0.216,0.206759,0.22899,0.037606,0.28415,0.047645,0.225454,0.245734,0.159411


In [10]:
# Company x week roll-up: summed units, revenue and gross profit, the mean
# line-level discount, and the gross margin derived from the two sums.
weekly_company = (
    sales_enriched
    .groupby(["company", "week"], as_index=False)
    .agg(
        units=("units", "sum"),
        revenue_gbp=("revenue_gbp", "sum"),
        gross_profit_gbp=("gross_profit_gbp", "sum"),
        avg_discount_pct=("discount_pct", "mean"),
    )
    .assign(gross_margin_pct=lambda wk: wk["gross_profit_gbp"] / wk["revenue_gbp"])
)
weekly_company.head()


Unnamed: 0,company,week,units,revenue_gbp,gross_profit_gbp,avg_discount_pct,gross_margin_pct
0,WHSmith,1,545,6009.47,3326.19,0.120291,0.553491
1,WHSmith,2,523,5868.86,3226.14,0.119278,0.549705
2,WHSmith,3,521,5698.71,3122.76,0.121019,0.547977
3,WHSmith,4,546,5903.86,3247.86,0.120059,0.550125
4,WHSmith,5,603,6659.63,3675.22,0.117875,0.551865


In [11]:
# Scale the simulated period up to 52 weeks. `sim_weeks` counts the distinct
# week numbers that actually appear in the sales data.
sim_weeks = sales_enriched["week"].nunique()
annual_factor = 52 / sim_weeks

# Per-company totals for the simulated period plus their annualised versions.
yearly_company = (
    sales_enriched
    .groupby("company", as_index=False)
    .agg(
        units=("units", "sum"),
        revenue_gbp=("revenue_gbp", "sum"),
        gross_profit_gbp=("gross_profit_gbp", "sum"),
    )
    .assign(
        annualized_units=lambda yr: yr["units"] * annual_factor,
        annualized_revenue_gbp=lambda yr: yr["revenue_gbp"] * annual_factor,
        annualized_gross_profit_gbp=lambda yr: yr["gross_profit_gbp"] * annual_factor,
        # Later keywords can read columns created by earlier ones in the same call.
        annualized_gross_margin_pct=lambda yr: (
            yr["annualized_gross_profit_gbp"] / yr["annualized_revenue_gbp"]
        ),
    )
)

yearly_company


Unnamed: 0,company,units,revenue_gbp,gross_profit_gbp,annualized_units,annualized_revenue_gbp,annualized_gross_profit_gbp,annualized_gross_margin_pct
0,WHSmith,14211,157208.28,86856.5,28422.0,314416.56,173713.0,0.552493
1,Waterstones,13907,159377.89,89118.29,27814.0,318755.78,178236.58,0.559163


In [12]:
# Store-level totals over the whole simulation, re-joined to the store
# attributes (demographic shares, region, city tier).
store_profit = (
    sales_enriched
    .groupby(["company", "store_id"], as_index=False)
    .agg(
        gross_profit_gbp=("gross_profit_gbp", "sum"),
        revenue_gbp=("revenue_gbp", "sum"),
        units=("units", "sum"),
    )
    .merge(
        stores_demo[["store_id", *demo_cols, "region", "city_tier"]],
        on="store_id",
        how="left",
    )
)

# The 15 most profitable stores of each company: sort by profit within
# company, then keep the first 15 rows of every company group.
top_stores = (
    store_profit
    .sort_values(["company", "gross_profit_gbp"], ascending=[True, False])
    .groupby("company")
    .head(15)
)

top_stores[
    ["company", "store_id", "region", "city_tier", "gross_profit_gbp", "units", "revenue_gbp"]
].head(10)


Unnamed: 0,company,store_id,region,city_tier,gross_profit_gbp,units,revenue_gbp
6,WHSmith,WH-006,London,Tier 1 (major city),2580.28,421,4701.02
41,WHSmith,WH-041,North East,Tier 1 (major city),2340.41,382,4239.9
44,WHSmith,WH-044,London,Tier 1 (major city),2294.96,379,4196.06
21,WHSmith,WH-021,London,Tier 1 (major city),2188.75,348,3912.05
37,WHSmith,WH-037,South East,Tier 1 (major city),2142.16,343,3901.98
62,WHSmith,WH-062,Yorkshire and the Humber,Tier 1 (major city),2066.81,321,3731.26
60,WHSmith,WH-060,South East,Tier 1 (major city),2050.21,347,3730.8
68,WHSmith,WH-068,West Midlands,Tier 1 (major city),2035.73,331,3690.23
11,WHSmith,WH-011,Yorkshire and the Humber,Tier 1 (major city),1973.85,334,3573.44
16,WHSmith,WH-016,North East,Tier 1 (major city),1969.25,315,3535.53


In [13]:
# Write the enriched sales lines and the store-level summary to the working
# directory, without the index column.
for _frame, _csv_name in (
    (sales_enriched, "irvine_welsh_england_bookstores_synthetic.csv"),
    (store_profit, "irvine_welsh_store_profit_synthetic.csv"),
):
    _frame.to_csv(_csv_name, index=False)

print("Saved CSV files.")


Saved CSV files.
