In [1]:
# Step 2: Import libraries

import pandas as pd
import numpy as np
import random

# Set seed for reproducibility
np.random.seed(42)
random.seed(42)

print("Libraries imported successfully")

Libraries imported successfully


In [2]:
# Step 3: Create list of Algorithms books

books = [
    "Introduction to Algorithms",
    "Algorithms Unlocked",
    "The Algorithm Design Manual",
    "Grokking Algorithms",
    "Algorithms in Python",
    "Data Structures and Algorithms Made Easy",
    "Advanced Algorithms",
    "Algorithms Illuminated",
    "Competitive Programming Algorithms",
    "Machine Learning Algorithms Explained"
]

authors = [
    "Thomas H. Cormen",
    "Thomas H. Cormen",
    "Steven Skiena",
    "Aditya Bhargava",
    "Magnus Lie Hetland",
    "Narasimha Karumanchi",
    "Robert Sedgewick",
    "Tim Roughgarden",
    "Steven Halim",
    "Chris Albon"
]

In [3]:
# Step 4: Generate customer demographic data

num_records = 1000

customer_ids = range(1, num_records+1)

ages = np.random.randint(18, 65, num_records)

genders = np.random.choice(["Male", "Female"], num_records)

countries = np.random.choice([
    "UK", "USA", "Canada", "Germany", "India", "Australia"
], num_records)

income = np.random.randint(20000, 120000, num_records)

In [4]:
# Step 5: Generate sales data

book_names = np.random.choice(books, num_records)
book_authors = np.random.choice(authors, num_records)

units_sold = np.random.randint(1, 20, num_records)

price = np.random.uniform(20, 120, num_records).round(2)

cost = (price * np.random.uniform(0.4, 0.7, num_records)).round(2)

revenue = (units_sold * price).round(2)

total_cost = (units_sold * cost).round(2)

profit = (revenue - total_cost).round(2)

profit_margin = ((profit / revenue) * 100).round(2)

In [5]:
# Step 6: Create DataFrame

df = pd.DataFrame({

    "Customer_ID": customer_ids,

    "Age": ages,

    "Gender": genders,

    "Country": countries,

    "Income": income,

    "Book_Name": book_names,

    "Author": book_authors,

    "Units_Sold": units_sold,

    "Price_per_Unit": price,

    "Cost_per_Unit": cost,

    "Revenue": revenue,

    "Total_Cost": total_cost,

    "Profit": profit,

    "Profit_Margin (%)": profit_margin

})

df.head()

Unnamed: 0,Customer_ID,Age,Gender,Country,Income,Book_Name,Author,Units_Sold,Price_per_Unit,Cost_per_Unit,Revenue,Total_Cost,Profit,Profit_Margin (%)
0,1,56,Male,USA,63541,Competitive Programming Algorithms,Steven Skiena,13,20.09,11.28,261.17,146.64,114.53,43.85
1,2,46,Female,USA,43387,The Algorithm Design Manual,Steven Skiena,4,23.93,13.71,95.72,54.84,40.88,42.71
2,3,32,Female,USA,113980,Algorithms Unlocked,Narasimha Karumanchi,14,117.73,69.88,1648.22,978.32,669.9,40.64
3,4,60,Female,Australia,68908,Introduction to Algorithms,Steven Halim,16,44.24,17.92,707.84,286.72,421.12,59.49
4,5,25,Male,USA,105457,The Algorithm Design Manual,Robert Sedgewick,4,86.33,59.4,345.32,237.6,107.72,31.19


In [6]:
# Step 7: Dataset information

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Customer_ID        1000 non-null   int64  
 1   Age                1000 non-null   int64  
 2   Gender             1000 non-null   object 
 3   Country            1000 non-null   object 
 4   Income             1000 non-null   int64  
 5   Book_Name          1000 non-null   object 
 6   Author             1000 non-null   object 
 7   Units_Sold         1000 non-null   int64  
 8   Price_per_Unit     1000 non-null   float64
 9   Cost_per_Unit      1000 non-null   float64
 10  Revenue            1000 non-null   float64
 11  Total_Cost         1000 non-null   float64
 12  Profit             1000 non-null   float64
 13  Profit_Margin (%)  1000 non-null   float64
dtypes: float64(6), int64(4), object(4)
memory usage: 109.5+ KB


In [7]:
# Step 8: Summary statistics

df.describe()

Unnamed: 0,Customer_ID,Age,Income,Units_Sold,Price_per_Unit,Cost_per_Unit,Revenue,Total_Cost,Profit,Profit_Margin (%)
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,500.5,40.986,68405.429,10.381,70.00263,38.38863,728.38501,400.51881,327.8662,45.23106
std,288.819436,13.497852,29259.35765,5.529595,28.936415,17.292709,527.258975,302.14697,248.854493,8.686121
min,1.0,18.0,20138.0,1.0,20.09,9.14,24.03,11.78,8.9,30.02
25%,250.75,29.0,42053.5,6.0,45.165,23.9,304.8025,162.6825,126.935,37.3975
50%,500.5,42.0,68670.0,11.0,69.15,36.76,615.555,322.89,268.255,45.425
75%,750.25,52.0,94611.5,15.0,94.1875,51.0525,1075.3125,574.44,458.085,52.8025
max,1000.0,64.0,119932.0,19.0,119.94,82.74,2261.57,1439.06,1264.64,59.95


In [8]:
# Step 9: Save dataset

df.to_csv("algorithms_books_sales_dataset.csv", index=False)

print("Dataset saved successfully")

Dataset saved successfully


In [9]:
# Step 10: Total revenue and profit

total_revenue = df["Revenue"].sum()
total_profit = df["Profit"].sum()

print("Total Revenue:", total_revenue)
print("Total Profit:", total_profit)

Total Revenue: 728385.01
Total Profit: 327866.19999999995


In [10]:
# Step 11: Profit by book

profit_by_book = df.groupby("Book_Name")["Profit"].sum()

profit_by_book.sort_values(ascending=False)

Book_Name
Competitive Programming Algorithms          39756.50
Introduction to Algorithms                  36991.16
Data Structures and Algorithms Made Easy    36471.86
Advanced Algorithms                         33544.59
Machine Learning Algorithms Explained       32463.94
The Algorithm Design Manual                 31450.66
Algorithms in Python                        30534.82
Algorithms Unlocked                         29746.96
Grokking Algorithms                         28669.48
Algorithms Illuminated                      28236.23
Name: Profit, dtype: float64

In [11]:
# Step 12: Profit by country

profit_by_country = df.groupby("Country")["Profit"].sum()

profit_by_country

Country
Australia    56302.49
Canada       47605.81
Germany      52887.29
India        57729.96
UK           54306.62
USA          59034.03
Name: Profit, dtype: float64

In [12]:
# Step 13: Create age groups

df["Age_Group"] = pd.cut(
    df["Age"],
    bins=[18,25,35,45,55,65],
    labels=["18-25","26-35","36-45","46-55","56-65"]
)

df.groupby("Age_Group")["Profit"].sum()

  df.groupby("Age_Group")["Profit"].sum()


Age_Group
18-25    46162.75
26-35    61316.99
36-45    73060.36
46-55    78121.95
56-65    59398.18
Name: Profit, dtype: float64