In [1]:
# Step 2: Import libraries
# Standard-library modules first, then third-party packages.

import random

import numpy as np
import pandas as pd

print("Libraries imported successfully")

Libraries imported successfully


In [2]:
# Step 3: Define dataset size
# Number of synthetic sale records to generate. The generation loop in
# Step 6 iterates over range(num_records), so this is also the number of
# rows in the resulting dataset.

num_records = 500

# Echo the value as the cell output.
num_records

500

In [3]:
# Step 4: HBR Books list
# Catalogue of book titles used by the generator. Step 6 picks one title per
# record with a uniform random choice, so each title is equally likely.

books = [
    "HBR Guide to Data Analytics",
    "HBR Emotional Intelligence",
    "HBR Guide to Leadership",
    "HBR Guide to Better Business Writing",
    "HBR Guide to Project Management",
    "HBR Guide to Finance Basics",
    "HBR Guide to Strategic Thinking",
    "HBR Guide to Remote Work",
    "HBR Guide to Managing People",
    "HBR Guide to Negotiation"
]

In [4]:
# Step 5: Demographics
# Categorical values for the buyer attributes attached to each sale.
# Step 6 draws one value from each list per record.

age_groups = [
    "18-25",
    "26-35",
    "36-45",
    "46-60",
    "60+",
]

genders = [
    "Male",
    "Female",
    "Non-Binary",
]

regions = [
    "North America",
    "Europe",
    "Asia",
    "South America",
    "Africa",
]

income_levels = [
    "Low",
    "Medium",
    "High",
]

education_levels = [
    "Undergraduate",
    "Graduate",
    "MBA",
    "PhD",
]

In [5]:
# Step 6: Generate synthetic dataset
#
# Builds `data`, a list holding one row per synthetic sale. Each row is a
# list whose positions are, in order: sale id, book title, units sold, unit
# price, unit cost, revenue, total cost, profit, profit margin (%), age
# group, gender, region, income level, education level.

# Fixed seed so that Restart Kernel -> Run All regenerates exactly the same
# dataset. A dedicated Random instance is used instead of random.seed() so
# the global `random` state seen by any other cell is left untouched.
SEED = 42
rng = random.Random(SEED)

data = []

for i in range(num_records):

    book = rng.choice(books)

    # Units per sale: integer in [50, 1000], both ends inclusive.
    units_sold = rng.randint(50, 1000)

    # Unit price drawn uniformly between 10 and 50, rounded to cents.
    price = round(rng.uniform(10, 50), 2)

    # Unit cost is 40%-70% of the unit price, rounded to cents.
    cost = round(price * rng.uniform(0.4, 0.7), 2)

    revenue = round(units_sold * price, 2)

    total_cost = round(units_sold * cost, 2)

    profit = round(revenue - total_cost, 2)

    # revenue is never zero here (units_sold >= 50 and price >= 10), so the
    # division is safe.
    profit_margin = round((profit / revenue) * 100, 2)

    # Each demographic attribute is drawn with its own uniform choice.
    age = rng.choice(age_groups)
    gender = rng.choice(genders)
    region = rng.choice(regions)
    income = rng.choice(income_levels)
    education = rng.choice(education_levels)

    data.append([
        i + 1,  # 1-based sale id
        book,
        units_sold,
        price,
        cost,
        revenue,
        total_cost,
        profit,
        profit_margin,
        age,
        gender,
        region,
        income,
        education
    ])

In [6]:
# Step 7: Create DataFrame

# Column labels, listed in the same order as the values appended to each
# row of `data`.
columns = [
    # identifiers
    "Sale_ID", "Book_Title",
    # sales figures
    "Units_Sold", "Price", "Cost",
    "Revenue", "Total_Cost", "Profit", "Profit_Margin (%)",
    # buyer demographics
    "Age_Group", "Gender", "Region", "Income_Level", "Education_Level",
]

df = pd.DataFrame(data=data, columns=columns)

# Preview the first five rows.
df.head(5)

Unnamed: 0,Sale_ID,Book_Title,Units_Sold,Price,Cost,Revenue,Total_Cost,Profit,Profit_Margin (%),Age_Group,Gender,Region,Income_Level,Education_Level
0,1,HBR Guide to Better Business Writing,205,43.08,19.36,8831.4,3968.8,4862.6,55.06,60+,Female,Europe,High,MBA
1,2,HBR Guide to Leadership,593,25.01,10.33,14830.93,6125.69,8705.24,58.7,18-25,Male,North America,Medium,Graduate
2,3,HBR Guide to Project Management,665,38.5,16.67,25602.5,11085.55,14516.95,56.7,46-60,Non-Binary,Asia,Medium,PhD
3,4,HBR Guide to Better Business Writing,198,23.3,14.93,4613.4,2956.14,1657.26,35.92,36-45,Male,Europe,High,Graduate
4,5,HBR Guide to Project Management,213,10.8,5.0,2300.4,1065.0,1235.4,53.7,46-60,Male,South America,Medium,Graduate


In [7]:
# Step 8: Dataset info
# Prints the row count, column names, non-null counts, dtypes and memory
# usage of df.

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Sale_ID            500 non-null    int64  
 1   Book_Title         500 non-null    object 
 2   Units_Sold         500 non-null    int64  
 3   Price              500 non-null    float64
 4   Cost               500 non-null    float64
 5   Revenue            500 non-null    float64
 6   Total_Cost         500 non-null    float64
 7   Profit             500 non-null    float64
 8   Profit_Margin (%)  500 non-null    float64
 9   Age_Group          500 non-null    object 
 10  Gender             500 non-null    object 
 11  Region             500 non-null    object 
 12  Income_Level       500 non-null    object 
 13  Education_Level    500 non-null    object 
dtypes: float64(6), int64(2), object(6)
memory usage: 54.8+ KB


In [8]:
# Step 9: Summary statistics
# Count, mean, std, min, quartiles and max for the numeric columns.
# Sale_ID appears in the table because it is numeric, but it is only a
# running row number.

df.describe()

Unnamed: 0,Sale_ID,Units_Sold,Price,Cost,Revenue,Total_Cost,Profit,Profit_Margin (%)
count,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0
mean,250.5,514.738,30.45686,16.84176,15441.86336,8557.19636,6884.667,44.45678
std,144.481833,281.881898,11.479635,6.748232,10853.5328,6128.730014,5184.540806,8.759794
min,1.0,51.0,10.16,4.47,711.2,376.6,254.8,30.0
25%,125.75,266.75,21.69,11.405,7016.4325,3655.4475,2959.17,36.96
50%,250.5,509.0,30.805,16.85,12428.955,6544.14,5499.84,44.23
75%,375.25,766.25,40.1025,21.9525,22109.5125,12892.59,9127.7175,51.435
max,500.0,999.0,49.99,32.68,47738.55,30392.28,25420.8,59.88


In [9]:
# Step 10: Save dataset

# Write the frame to a CSV file at a path relative to the current working
# directory; index=False leaves the row index out of the file.
output_file = "HBR_Books_Sales_Dataset.csv"
df.to_csv(output_file, index=False)

print("Dataset saved successfully")

Dataset saved successfully


In [10]:
# Step 11: Total Revenue

# Sum of the Revenue column over every sale in the dataset.
revenue_column = df["Revenue"]
total_revenue = revenue_column.sum()

print("Total Revenue:", total_revenue)

Total Revenue: 7720931.68


In [11]:
# Step 12: Average profit margin

# Unweighted mean of the per-sale margins: every row counts equally,
# regardless of how much revenue it represents.
margin_column = df["Profit_Margin (%)"]
avg_margin = margin_column.mean()

print("Average Profit Margin:", avg_margin)

Average Profit Margin: 44.45678


In [12]:
# Step 13: Sales by Region

# Group the sales by region, then total the revenue within each group.
sales_grouped_by_region = df.groupby("Region")
region_sales = sales_grouped_by_region["Revenue"].sum()

region_sales

Region
Africa           1740855.07
Asia             1660702.91
Europe           1470522.25
North America    1569146.15
South America    1279705.30
Name: Revenue, dtype: float64

In [13]:
# Step 14: Most profitable book

# Total profit per title.
sales_grouped_by_title = df.groupby("Book_Title")
profit_by_book = sales_grouped_by_title["Profit"].sum()

# Rank titles from highest to lowest total profit; the first row is the
# most profitable book.
profit_by_book.sort_values(ascending=False)

Book_Title
HBR Guide to Leadership                 442033.70
HBR Emotional Intelligence              426682.03
HBR Guide to Strategic Thinking         418782.04
HBR Guide to Project Management         388588.87
HBR Guide to Remote Work                366328.25
HBR Guide to Data Analytics             347022.33
HBR Guide to Finance Basics             308130.22
HBR Guide to Better Business Writing    284463.79
HBR Guide to Managing People            272606.72
HBR Guide to Negotiation                187695.55
Name: Profit, dtype: float64