In [1]:
import pandas as pd
import numpy as np
import random



In [2]:
# Catalogue of Richard Matheson titles that simulated purchases are drawn from.
# NOTE(review): "The Shrinking Man" and "The Incredible Shrinking Man" look like
# two titles for the same work -- confirm that listing both is intended, since
# every entry here is an equally likely pick in the generation cell.
matheson_books = [
    "I Am Legend", "Hell House", "The Shrinking Man",
    "What Dreams May Come", "A Stir of Echoes", "Bid Time Return",
    "The Incredible Shrinking Man",
    "Nightmare at 20,000 Feet",
    "Button, Button",
    "Duel",
]


In [3]:
# Per-retailer parameters, keyed by company name.
#   avg_price     -- used as the price of every simulated sale for that company
#   profit_margin -- fraction of the price counted as profit per sale
# `stores` and `customer_income_avg` are not read by the simulation in the cells
# shown here; they only appear in the side-by-side comparison table.
# NOTE(review): currency/units are not stated anywhere in the notebook -- confirm.
companies = {
    "Waterstones": dict(
        stores=317,
        avg_price=12.99,
        profit_margin=0.35,
        customer_income_avg=42000,
    ),
    "WHSmith": dict(
        stores=580,
        avg_price=9.99,
        profit_margin=0.22,
        customer_income_avg=30000,
    ),
}


In [4]:
# Demographic categories assigned to each simulated customer.
# Both lists are sampled with random.choice in the generation cell, so every
# label within a list is equally likely.
age_groups = ["18-24", "25-34", "35-44", "45-54", "55-64", "65+"]

gender = ["Male", "Female"]


In [5]:
# Simulate individual book purchases for every company and week.
#
# For each company and each week 1..weeks, a random number of customers
# (500-2000 inclusive) is drawn; every customer contributes one row with a
# randomly chosen book, age group and gender. Price and profit are constant per
# company: price is the company's avg_price, profit is price * profit_margin.
#
# A dedicated, seeded generator is used so that Restart & Run All (or simply
# re-running this cell) always reproduces the same dataset, without touching
# the global `random` state.
SEED = 42
rng = random.Random(SEED)

weeks = 52

columns = [
    "Company",
    "Week",
    "Book",
    "Age Group",
    "Gender",
    "Price",
    "Profit"
]

dataset = []

for company, info in companies.items():

    # Both values depend only on the company, so compute them once per company
    # rather than once per simulated customer.
    price = info["avg_price"]
    profit = price * info["profit_margin"]

    for week in range(1, weeks + 1):

        customers = rng.randint(500, 2000)

        for _ in range(customers):
            # Draw order (book, age group, gender) matches the column order.
            dataset.append([
                company,
                week,
                rng.choice(matheson_books),
                rng.choice(age_groups),
                rng.choice(gender),
                price,
                profit
            ])

# Build the frame once from the accumulated rows (no row-by-row concat).
df = pd.DataFrame(dataset, columns=columns)

df.head()


Unnamed: 0,Company,Week,Book,Age Group,Gender,Price,Profit
0,Waterstones,1,What Dreams May Come,55-64,Female,12.99,4.5465
1,Waterstones,1,"Nightmare at 20,000 Feet",18-24,Female,12.99,4.5465
2,Waterstones,1,I Am Legend,45-54,Female,12.99,4.5465
3,Waterstones,1,The Shrinking Man,35-44,Male,12.99,4.5465
4,Waterstones,1,The Incredible Shrinking Man,35-44,Female,12.99,4.5465


In [6]:
# Total profit for each (company, week) pair; reset_index() turns the group
# keys back into ordinary "Company" and "Week" columns.
weekly_profit = (
    df.groupby(["Company", "Week"])["Profit"]
    .sum()
    .reset_index()
)

weekly_profit.head()


Unnamed: 0,Company,Week,Profit
0,WHSmith,1,2879.118
1,WHSmith,2,2237.3604
2,WHSmith,3,3153.843
3,WHSmith,4,4215.3804
4,WHSmith,5,3331.8648


In [7]:
# Profit per company, summed over every row (all simulated weeks) in df.
yearly_profit = (
    df.groupby("Company")["Profit"]
    .sum()
)

yearly_profit


Company
WHSmith        151602.0462
Waterstones    284710.9230
Name: Profit, dtype: float64

In [8]:
# Number of simulated purchases per title, best seller first.
# The counts pool both companies together.
popular_books = (
    df.groupby("Book")
    .size()
    .sort_values(ascending=False)
)

popular_books


Book
Hell House                      13470
Bid Time Return                 13393
The Shrinking Man               13242
Nightmare at 20,000 Feet        13222
I Am Legend                     13169
Duel                            13155
The Incredible Shrinking Man    13086
Button, Button                  12987
What Dreams May Come            12947
A Stir of Echoes                12930
dtype: int64

In [9]:
# Side-by-side view of the static company parameters: building the frame from
# the nested dict puts companies in columns, so transpose to get one row per
# company. Integer fields are displayed as floats (e.g. 317.0) in the result.
comparison = pd.DataFrame(companies).transpose()

comparison


Unnamed: 0,stores,avg_price,profit_margin,customer_income_avg
Waterstones,317.0,12.99,0.35,42000.0
WHSmith,580.0,9.99,0.22,30000.0


In [10]:
# Average profit per week for each company: total profit divided by the number
# of simulated weeks. Uses the `weeks` constant from the generation cell rather
# than a hard-coded 52, so the average stays correct if the simulated period
# is changed.
avg_weekly_profit = df.groupby("Company")["Profit"].sum() / weeks

avg_weekly_profit


Company
WHSmith        2915.423965
Waterstones    5475.210058
Name: Profit, dtype: float64

In [11]:
# Each row of df is a single sale, so the revenue of a row is simply its price.
# NOTE: this adds a "Revenue" column to df in place; the cells below (and the
# CSV export) rely on that column being present.
df["Revenue"] = df["Price"]

# Revenue per company, summed over every row in df.
yearly_revenue = (
    df.groupby("Company")["Revenue"]
    .sum()
)

yearly_revenue


Company
WHSmith        689100.21
Waterstones    813459.78
Name: Revenue, dtype: float64

In [12]:
# Total revenue attributed to each age group, pooled across both companies.
# Requires the "Revenue" column added to df in the revenue cell above.
spending_by_age = (
    df.groupby("Age Group")["Revenue"]
    .sum()
)

spending_by_age


Age Group
18-24    250880.43
25-34    250581.33
35-44    250266.96
45-54    249801.93
55-64    253742.04
65+      247287.30
Name: Revenue, dtype: float64

In [13]:
# Export the full row-level dataset to a CSV file (relative path, so it lands in
# the current working directory). index=False leaves the row index out of the
# file. df includes the "Revenue" column added earlier in the notebook.
df.to_csv("richard_matheson_uk_bookstores_dataset.csv", index=False)
