In [1]:
import numpy as np
import duckdb

In [2]:
duckdb.sql("CREATE TABLE brands AS SELECT * FROM 'bike_store_sample_database/brands.csv'")
duckdb.sql("CREATE TABLE categories AS SELECT * FROM 'bike_store_sample_database/categories.csv'")
duckdb.sql("CREATE TABLE customers AS SELECT * FROM 'bike_store_sample_database/customers.csv'")
duckdb.sql("CREATE TABLE order_items AS SELECT * FROM 'bike_store_sample_database/order_items.csv'")
duckdb.sql("CREATE TABLE orders AS SELECT * FROM 'bike_store_sample_database/orders.csv'")
duckdb.sql("CREATE TABLE products AS SELECT * FROM 'bike_store_sample_database/products.csv'")
duckdb.sql("CREATE TABLE staffs AS SELECT * FROM 'bike_store_sample_database/staffs.csv'")
duckdb.sql("CREATE TABLE stocks AS SELECT * FROM 'bike_store_sample_database/stocks.csv'")
duckdb.sql("CREATE TABLE stores AS SELECT * FROM 'bike_store_sample_database/stores.csv'")

In [3]:
duckdb.sql("SHOW TABLES")

┌─────────────┐
│    name     │
│   varchar   │
├─────────────┤
│ brands      │
│ categories  │
│ customers   │
│ order_items │
│ orders      │
│ products    │
│ staffs      │
│ stocks      │
│ stores      │
└─────────────┘

### Which brands sold the highest total quantity of products?

In [4]:
query1 = '''
WITH product_quantities AS (
SELECT oi.product_id, 
	quantity, 
    p.brand_id, 
    brand_name
FROM order_items oi
JOIN products p
	ON oi.product_id = p.product_id
JOIN brands b
	ON p.brand_id = b.brand_id
)
SELECT brand_name, 
	SUM(quantity) as total_quantity_sales
FROM product_quantities
GROUP BY brand_name
ORDER BY SUM(quantity) DESC
;
'''

duckdb.sql(query1)

┌──────────────┬──────────────────────┐
│  brand_name  │ total_quantity_sales │
│   varchar    │        int128        │
├──────────────┼──────────────────────┤
│ Electra      │                 2612 │
│ Trek         │                 1839 │
│ Surly        │                  908 │
│ Sun Bicycles │                  731 │
│ Pure Cycles  │                  376 │
│ Haro         │                  331 │
│ Heller       │                  138 │
│ Ritchey      │                  118 │
│ Strider      │                   25 │
└──────────────┴──────────────────────┘

### Which brands generated the highest total revenue?

In [5]:
query2 = '''
WITH order_products_id AS (
SELECT oi.product_id, 
	quantity, 
    oi.list_price, 
    discount, 
    p.brand_id, 
    brand_name
FROM order_items oi
JOIN products p
	ON oi.product_id = p.product_id
JOIN brands b
	ON p.brand_id = b.brand_id
),
compute_revenue AS (
SELECT *,
	quantity * (list_price * (1 - discount)) AS revenue
FROM order_products_id
)
SELECT brand_name, 
	ROUND(SUM(revenue), 2) as total_revenue
FROM compute_revenue
GROUP BY brand_name
ORDER BY SUM(revenue) DESC
;
'''

duckdb.sql(query2)

┌──────────────┬───────────────┐
│  brand_name  │ total_revenue │
│   varchar    │    double     │
├──────────────┼───────────────┤
│ Trek         │    4602754.35 │
│ Electra      │    1205320.82 │
│ Surly        │     949507.06 │
│ Sun Bicycles │     341994.93 │
│ Haro         │     185384.55 │
│ Heller       │     171459.08 │
│ Pure Cycles  │     149476.34 │
│ Ritchey      │      78898.95 │
│ Strider      │       4320.48 │
└──────────────┴───────────────┘

### How much of the customers are retained (i.e. new vs. repeat customers)?

In [6]:
query3 = '''
WITH get_customer_status AS (
SELECT c.customer_id, 
	first_name, 
    last_name,
	CASE WHEN COUNT(DISTINCT o.order_id) > 1 THEN 'Repeat'
	ELSE 'New'
    END AS customer_status
FROM customers c
JOIN orders o
	ON o.customer_id = c.customer_id
GROUP BY c.customer_id, first_name, last_name
)
SELECT customer_status, 
	COUNT(customer_status) AS amount
FROM get_customer_status
GROUP BY customer_status
;
'''

duckdb.sql(query3)

┌─────────────────┬────────┐
│ customer_status │ amount │
│     varchar     │ int64  │
├─────────────────┼────────┤
│ Repeat          │    131 │
│ New             │   1314 │
└─────────────────┴────────┘

### For each store, how many customers are new vs. repeat, and what percentage are repeat customers?

In [7]:
query4 = '''
WITH get_store_customer_status AS (
SELECT c.customer_id, 
	s.store_name,
	CASE WHEN COUNT(DISTINCT o.order_id) > 1 THEN 'Repeat'
	ELSE 'New'
    END AS customer_status
FROM customers c
JOIN orders o
	ON o.customer_id = c.customer_id
JOIN stores s
	ON o.store_id = s.store_id
GROUP BY c.customer_id, s.store_name
)
SELECT store_name, 
	COUNT(CASE WHEN customer_status = 'Repeat' THEN 1 END) AS repeat_customers,
	COUNT(CASE WHEN customer_status = 'New' THEN 1 END) AS new_customers,
	COUNT(*) AS total_customers,
	ROUND(100.0 * COUNT(CASE WHEN customer_status = 'Repeat' THEN 1 END) / COUNT(*) , 2) AS percent_repeat_customers
FROM get_store_customer_status
GROUP BY store_name
ORDER BY 4 DESC
;
'''

duckdb.sql(query4)

┌──────────────────┬──────────────────┬───────────────┬─────────────────┬──────────────────────────┐
│    store_name    │ repeat_customers │ new_customers │ total_customers │ percent_repeat_customers │
│     varchar      │      int64       │     int64     │      int64      │          double          │
├──────────────────┼──────────────────┼───────────────┼─────────────────┼──────────────────────────┤
│ Baldwin Bikes    │               60 │           959 │            1019 │                     5.89 │
│ Santa Cruz Bikes │               52 │           232 │             284 │                    18.31 │
│ Rowlett Bikes    │               19 │           123 │             142 │                    13.38 │
└──────────────────┴──────────────────┴───────────────┴─────────────────┴──────────────────────────┘

### Which brands gave the highest total discounts, and how many units did they sell?

In [8]:
query5 = '''
WITH get_discount AS (
SELECT oi.product_id, 
	quantity, 
    oi.list_price, 
    discount, 
    p.brand_id, 
    brand_name,
	quantity * (oi.list_price * oi.discount) AS discount_in_usd
FROM order_items oi
JOIN products p
	ON oi.product_id = p.product_id
JOIN brands b
	ON p.brand_id = b.brand_id
)
SELECT brand_name, 
	SUM(quantity) AS total_quantity_sales, 
	ROUND(SUM(discount_in_usd), 2) AS total_discount
FROM get_discount
GROUP BY brand_name
ORDER BY SUM(discount_in_usd) DESC
;
'''

duckdb.sql(query5)

┌──────────────┬──────────────────────┬────────────────┐
│  brand_name  │ total_quantity_sales │ total_discount │
│   varchar    │        int128        │     double     │
├──────────────┼──────────────────────┼────────────────┤
│ Trek         │                 1839 │      526627.26 │
│ Electra      │                 2612 │      138822.97 │
│ Surly        │                  908 │      113628.76 │
│ Sun Bicycles │                  731 │       39924.76 │
│ Heller       │                  138 │       22339.63 │
│ Haro         │                  331 │       21712.14 │
│ Pure Cycles  │                  376 │       16687.66 │
│ Ritchey      │                  118 │        9599.87 │
│ Strider      │                   25 │         529.27 │
└──────────────┴──────────────────────┴────────────────┘

### What is the annual revenue per store?

In [9]:
query6 = '''
WITH get_year_sales AS (
SELECT o.order_id, 
	EXTRACT(YEAR FROM order_date) AS year, 
    quantity, 
    list_price, 
    discount, 
    store_name
FROM orders o
JOIN order_items oi
	ON o.order_id = oi.order_id
JOIN stores s
	ON o.store_id = s.store_id
)
SELECT year, 
	ROUND(SUM(CASE WHEN store_name = 'Baldwin Bikes' THEN quantity * (list_price * (1 - discount)) ELSE 0 END), 2) AS Baldwin_Bikes_revenue,
	ROUND(SUM(CASE WHEN store_name = 'Rowlett Bikes' THEN quantity * (list_price * (1 - discount)) ELSE 0 END), 2) AS Rowlett_Bikes_revenue,
	ROUND(SUM(CASE WHEN store_name = 'Santa Cruz Bikes' THEN quantity * (list_price * (1 - discount)) ELSE 0 END), 2) AS SantaCruz_Bikes_revenue
FROM get_year_sales
GROUP BY year
;
'''

duckdb.sql(query6)

┌───────┬───────────────────────┬───────────────────────┬─────────────────────────┐
│ year  │ Baldwin_Bikes_revenue │ Rowlett_Bikes_revenue │ SantaCruz_Bikes_revenue │
│ int64 │        double         │        double         │         double          │
├───────┼───────────────────────┼───────────────────────┼─────────────────────────┤
│  2016 │            1590598.88 │             271286.14 │               565493.51 │
│  2017 │             2479688.4 │             404611.61 │               562908.24 │
│  2018 │             1145464.0 │              191644.5 │               477421.29 │
└───────┴───────────────────────┴───────────────────────┴─────────────────────────┘

### What are the top 5 categories by revenue in each store?

In [10]:
query7 = '''
WITH get_category_sales AS (
SELECT oi.product_id, 
	oi.quantity, 
    oi.list_price, 
    discount, 
	oi.quantity * (oi.list_price * (1 - discount)) AS revenue,
	category_name, 
    store_name
FROM orders o
JOIN order_items oi 
	ON oi.order_id = o.order_id
JOIN products p 
	ON oi.product_id = p.product_id
JOIN categories cat
	ON p.category_id = cat.category_id
JOIN stores s
	ON o.store_id = s.store_id
),
category_total_revenue AS (
SELECT store_name, 
	category_name, 
	SUM(revenue) AS total_revenue
FROM get_category_sales
GROUP BY store_name, category_name
),
rank_categories AS (
SELECT store_name, 
	ROW_NUMBER() OVER(PARTITION BY store_name ORDER BY total_revenue DESC) AS category_rank,
	category_name, 
    ROUND(total_revenue, 2)
FROM category_total_revenue
)
SELECT *
FROM rank_categories
WHERE category_rank <= 5
ORDER BY store_name
;
'''

duckdb.sql(query7)

┌──────────────────┬───────────────┬─────────────────────┬─────────────────────────┐
│    store_name    │ category_rank │    category_name    │ round(total_revenue, 2) │
│     varchar      │     int64     │       varchar       │         double          │
├──────────────────┼───────────────┼─────────────────────┼─────────────────────────┤
│ Baldwin Bikes    │             5 │ Cyclocross Bicycles │               487774.56 │
│ Baldwin Bikes    │             1 │ Mountain Bikes      │              1836392.23 │
│ Baldwin Bikes    │             2 │ Road Bikes          │              1140477.21 │
│ Baldwin Bikes    │             3 │ Cruisers Bicycles   │               681795.77 │
│ Baldwin Bikes    │             4 │ Electric Bikes      │               602829.78 │
│ Rowlett Bikes    │             1 │ Mountain Bikes      │               316439.67 │
│ Rowlett Bikes    │             2 │ Road Bikes          │               217271.01 │
│ Rowlett Bikes    │             3 │ Cruisers Bicycles   │       

### Which customers made the most orders?

### Which customers spent the most?

### What is the average order value (AOV) per customer?

### What is the average discount per category?

### Which products have the highest revenue per unit sold (premium items)?

## Advanced

### Who are the top 10 customers by lifetime value (LTV)?

A basic calculation of LTV involves multiplying the average purchase value by the average number of purchases per year and then by the average customer lifespan.

### Which product categories are most common in repeat orders?