In [1]:
import numpy as np
import pandas as pd
from dataclasses import dataclass

np.random.seed(42)


In [2]:
@dataclass
class CompanyProfile:
    name: str
    base_price_multiplier: float         # affects retail price
    discount_aggressiveness: float       # 0..1; higher => more discounting
    footfall_multiplier: float           # affects demand volume
    online_share: float                  # fraction of sales online
    city_store_share: float              # fraction of stores in cities
    loyalty_share: float                 # fraction of customers who are loyalty members

company_A = CompanyProfile(
    name="High Street Books",
    base_price_multiplier=1.08,
    discount_aggressiveness=0.35,
    footfall_multiplier=1.12,
    online_share=0.28,
    city_store_share=0.70,
    loyalty_share=0.42
)

company_B = CompanyProfile(
    name="ValueReads",
    base_price_multiplier=0.98,
    discount_aggressiveness=0.60,
    footfall_multiplier=1.00,
    online_share=0.35,
    city_store_share=0.45,
    loyalty_share=0.35
)

companies = [company_A, company_B]


In [3]:
regions = [
    "London", "South East", "South West", "East of England", "West Midlands",
    "East Midlands", "North West", "North East", "Yorkshire and the Humber"
]

# We'll define store "types" to influence demand and customer mix
store_types = ["City Centre", "Suburban", "Market Town"]


In [4]:
erb_titles = [
    "Tarzan of the Apes",
    "The Return of Tarzan",
    "The Beasts of Tarzan",
    "A Princess of Mars",
    "The Gods of Mars",
    "The Warlord of Mars",
    "At the Earth's Core",
    "The Land That Time Forgot"
]


In [5]:
def make_stores(company: CompanyProfile, n_stores: int = 45) -> pd.DataFrame:
    # Choose store type distribution based on city_store_share
    # City Centre share maps strongly to "City Centre" stores.
    city_share = company.city_store_share
    suburban_share = min(0.45, 1 - city_share)  # cap for realism
    market_share = 1 - city_share - suburban_share

    probs = np.array([city_share, suburban_share, market_share])
    probs = probs / probs.sum()

    store_type = np.random.choice(store_types, size=n_stores, p=probs)
    region = np.random.choice(regions, size=n_stores)

    # A simple store size proxy:
    # City centre stores slightly larger on average
    size_map = {"City Centre": (220, 60), "Suburban": (170, 50), "Market Town": (140, 40)}
    floor_space = [max(60, int(np.random.normal(size_map[t][0], size_map[t][1]))) for t in store_type]

    return pd.DataFrame({
        "company": company.name,
        "store_id": [f"{company.name[:2].upper()}-{i:03d}" for i in range(1, n_stores+1)],
        "region": region,
        "store_type": store_type,
        "floor_space_m2": floor_space
    })


def expected_weekly_units(company: CompanyProfile, store_row, title, fmt, channel) -> float:
    # Baseline demand by title popularity
    title_popularity = {
        "Tarzan of the Apes": 1.00,
        "The Return of Tarzan": 0.75,
        "The Beasts of Tarzan": 0.55,
        "A Princess of Mars": 0.90,
        "The Gods of Mars": 0.60,
        "The Warlord of Mars": 0.50,
        "At the Earth's Core": 0.55,
        "The Land That Time Forgot": 0.65
    }
    base = 10.0 * title_popularity.get(title, 0.5)

    # Store effects
    store_type_multiplier = {"City Centre": 1.25, "Suburban": 1.00, "Market Town": 0.85}
    space_multiplier = np.clip(store_row["floor_space_m2"] / 180.0, 0.6, 1.6)

    # Format effects
    fmt_multiplier = {"Paperback": 1.00, "Hardback": 0.35, "eBook": 0.55}

    # Channel effects
    channel_multiplier = {"In-Store": 1.00, "Online": 0.85}

    # Company effect
    company_mult = company.footfall_multiplier

    lam = base * store_type_multiplier[store_row["store_type"]] * space_multiplier * fmt_multiplier[fmt] * channel_multiplier[channel] * company_mult

    # Keep it in a realistic range
    return float(np.clip(lam, 0.05, 120.0))


def retail_price(title, fmt, company: CompanyProfile) -> float:
    # Baseline UK-ish price points (synthetic)
    base_price = {
        "Paperback": np.random.normal(9.99, 1.25),
        "Hardback": np.random.normal(18.99, 2.00),
        "eBook": np.random.normal(5.99, 1.00)
    }[fmt]

    # Small title-based premium
    title_premium = 0.7 if ("Tarzan" in title or "Mars" in title) else 0.0

    price = (base_price + title_premium) * company.base_price_multiplier
    return float(max(1.99, round(price, 2)))


def discount_rate(company: CompanyProfile, channel: str, loyalty_member: bool) -> float:
    # Company-wide discount policy
    base = np.random.beta(2, 6) * company.discount_aggressiveness  # skew low, scaled up by aggressiveness

    # Online tends to have more discounting
    if channel == "Online":
        base += 0.03 + 0.05 * company.discount_aggressiveness

    # Loyalty members often get a bit more
    if loyalty_member:
        base += 0.02 + np.random.uniform(0.00, 0.03)

    return float(np.clip(base, 0.0, 0.55))


def unit_cost(retail_price: float, fmt: str) -> float:
    # Synthetic wholesale/COGS model:
    # - Paperback typically ~45-55% of RRP
    # - Hardback ~40-50%
    # - eBook royalties/platform costs lower but non-zero
    if fmt == "Paperback":
        c = retail_price * np.random.uniform(0.45, 0.58)
    elif fmt == "Hardback":
        c = retail_price * np.random.uniform(0.40, 0.52)
    else:
        c = retail_price * np.random.uniform(0.15, 0.30)

    return float(round(c, 2))


In [11]:
def expected_weekly_units(company, store_row, title, fmt, channel):

    base = 12

    store_type_multiplier = {
        "Flagship": 1.5,
        "Mall": 1.2,
        "High Street": 1.0,
        "Outlet": 0.7
    }

    fmt_multiplier = {
        "Physical": 0.8,
        "Digital": 1.2
    }

    channel_multiplier = {
        "Retail": 1.0,
        "Online": 1.3
    }

    space_multiplier = store_row["floor_space_m2"] / 1000

    company_mult = company.footfall_multiplier

    lam = (
        base
        * store_type_multiplier.get(store_row["store_type"], 1.0)
        * space_multiplier
        * fmt_multiplier.get(fmt, 1.0)
        * channel_multiplier.get(channel, 1.0)
        * company_mult
    )

    return float(np.clip(lam, 0.05, 120.0))


In [19]:
import pandas as pd

# Step 1: Load dataset
sales_df = pd.read_csv("sales.csv")

# Step 2: Clean column names (remove spaces, lowercase)
sales_df.columns = sales_df.columns.str.strip().str.lower()

# Step 3: Show available columns
print("Available columns:")
print(sales_df.columns.tolist())

# Step 4: Check if required columns exist
required_cols = ["company", "week", "units", "revenue", "cogs_total", "profit", "profit_margin", "discount_rate"]

missing_cols = [col for col in required_cols if col not in sales_df.columns]

if missing_cols:
    print("\nMissing columns:", missing_cols)
    print("Cannot group data without these columns.")
else:
    # Step 5: Group by company and week
    weekly_company = (
        sales_df.groupby(["company", "week"], as_index=False)
        .agg(
            units=("units", "sum"),
            revenue=("revenue", "sum"),
            cogs=("cogs_total", "sum"),
            profit=("profit", "sum"),
            avg_margin=("profit_margin", "mean"),
            avg_discount=("discount_rate", "mean")
        )
    )

    # Step 6: Show result
    print("\nWeekly company summary:")
    print(weekly_company.head())



Available columns:
['store', 'company', 'book', 'units_sold_weekly', 'revenue_weekly', 'profit']

Missing columns: ['week', 'units', 'revenue', 'cogs_total', 'profit_margin', 'discount_rate']
Cannot group data without these columns.


In [21]:
print(sales_df.columns.tolist())


['store', 'company', 'book', 'units_sold_weekly', 'revenue_weekly', 'profit']


In [33]:
import pandas as pd

# --- Step 1: Clean column names ---
sales_df.columns = sales_df.columns.str.strip().str.lower()

# --- Step 2: Print columns so you know their order ---
print("Columns in dataset:", sales_df.columns.tolist())

# --- Step 3: Assign columns based on your dataset ---
company_col = sales_df.columns[0]
age_band_col = sales_df.columns[1]
units_col = sales_df.columns[2]
revenue_col = sales_df.columns[3]
profit_col = sales_df.columns[4]

# --- Step 4: Convert numeric columns from string to number ---
sales_df[units_col] = pd.to_numeric(sales_df[units_col], errors="coerce")
sales_df[revenue_col] = pd.to_numeric(sales_df[revenue_col], errors="coerce")
sales_df[profit_col] = pd.to_numeric(sales_df[profit_col], errors="coerce")

# --- Step 5: Group and aggregate ---
age_mix = (
    sales_df.groupby([company_col, age_band_col], as_index=False)
    .agg(
        units=(units_col, "sum"),
        revenue=(revenue_col, "sum"),
        profit=(profit_col, "sum")
    )
)

# --- Step 6: Calculate unit share ---
age_mix["unit_share"] = (
    age_mix["units"] /
    age_mix.groupby(company_col)["units"].transform("sum")
)

# --- Step 7: Sort ---
age_mix = age_mix.sort_values(
    [company_col, "unit_share"],
    ascending=[True, False]
)

# --- Step 8: Display ---
print(age_mix.head(20))


Columns in dataset: ['store', 'company', 'book', 'units_sold_weekly', 'revenue_weekly', 'profit']
        store      company  units  revenue  profit  unit_share
0  Birmingham      WHSmith    0.0       48  422.52         NaN
1      London      WHSmith    0.0       60  551.90         NaN
2      London  Waterstones    0.0       70  645.80         NaN
3  Manchester  Waterstones    0.0       63  603.87         NaN


In [41]:
import pandas as pd

df = pd.read_csv("sales.csv")

print(df.columns)


Index(['store', 'company', 'book', 'units_sold_weekly', 'revenue_weekly',
       'profit'],
      dtype='object')


In [43]:
import pandas as pd

# Load dataset
df = pd.read_csv("sales.csv")

# Show columns
print("Columns in dataset:", df.columns.tolist())

# Group by company and calculate mean for ALL numeric columns
company_discount_margin = (
    df.groupby("company", as_index=False)
    .mean(numeric_only=True)
)

# Display result
company_discount_margin


Columns in dataset: ['store', 'company', 'book', 'units_sold_weekly', 'revenue_weekly', 'profit']


Unnamed: 0,company,units_sold_weekly,revenue_weekly,profit
0,WHSmith,10.8,97.442,7.79536
1,Waterstones,13.3,124.967,14.99604


In [55]:
import os
print(os.listdir())


['Sycho.ipynb', 'Hugin_Fm_Base_cleaned.csv', 'Untitled Folder', 'world-universities_cleaned.csv', 'pret_menu.csv', 'decision_tree_intro.ipynb', 'CHECKOUT.ipynb', 'stress.csv', 'Dailyprices.ipynb', 'DSS.ipynb', 'Predicito.ipynb', 'degrees-that-pay-back.csv', 'Patrick.ipynb', 'iso.csv', 'hr-employee-attrition.csv', 'all_pivoted.csv', 'Compare.ipynb', 'Cod.ipynb', 'chiquito_taco_bell_uk_dataset.ipynb', '.meteostat', 'Miscellaneous.ipynb', 'profit_dataset.csv', 'first_decision_tree.ipynb', 'wrestling_ppv_sales.ipynb', 'Machine.Learning.Task.ipynb', 'DFA.ipynb', 'presidents.csv', 'nuclear_waste_sites_on_american_campuses_cleaned.csv', 'Biden.ipynb', 'constituents.csv', 'amazon_data.ipynb', 'Shawn.ipynb', 'NASG.ipynb', 'salford_hospitality_analysis.csv', 'Attribute_DataSet_cleaned.csv', 'Used_Car_Price_Data_Cleaned.csv', 'parks_cleaned.csv', 'groups.csv', 'Vinces.ipynb', 'Ellsworth.ipynb', 'pizza_kb.sqlite', 'miscellaneous.csv', 'boysnames.csv', '500hits 3.csv', 'Twitter Stock Market Dataset