Task 4

In [0]:
%sql
----------------
---- Task 4 ----
----------------

-- SUM number of sales and revenue by state
SELECT 
    c.cust_address_state_province AS state,
    COUNT(f.order_id) AS total_sales,
    SUM(f.unit_price * f.quantity) AS total_revenue
FROM de_pyspark_training_catalog.buddy_group_1.amanolov_gold_fact_orders_exam f
JOIN de_pyspark_training_catalog.buddy_group_1.amanolov_gold_dim_customers_exam c
    ON f.cust_sk = c.cust_sk
GROUP BY c.cust_address_state_province
ORDER BY total_sales DESC;

state,total_sales,total_revenue
WI,44,9139.96
NY,41,9059.19
MN,16,3032.3400000000006
PA,8,1297.52
MI,7,1323.48
IA,4,1250.46
MD,3,872.6000000000001
IN,3,515.6800000000001


In [0]:
%sql

---------------
---- Task 4 ---
---------------

-- Top 3 customers for each state based on the spent amount and their most expensive item bought
WITH customer_sales AS (
    SELECT 
        c.cust_address_state_province AS state,
        c.cust_sk,
        c.cust_first_name,
        c.cust_last_name,
        SUM(f.unit_price * f.quantity) AS total_spent,
        MAX(f.unit_price) AS most_expensive_item
    FROM de_pyspark_training_catalog.buddy_group_1.amanolov_gold_fact_orders_exam f
    JOIN de_pyspark_training_catalog.buddy_group_1.amanolov_gold_dim_customers_exam c
        ON f.cust_sk = c.cust_sk
    GROUP BY c.cust_address_state_province, c.cust_sk, c.cust_first_name, c.cust_last_name
),

ranked_customers AS (
    SELECT 
        state,
        cust_first_name,
        cust_last_name,
        total_spent,
        most_expensive_item,
        ROW_NUMBER() OVER (PARTITION BY state ORDER BY total_spent DESC) AS rank
    FROM customer_sales
)

SELECT 
    state,
    cust_first_name,
    cust_last_name,
    total_spent,
    most_expensive_item
FROM ranked_customers
WHERE rank <= 3
ORDER BY state, rank;

state,cust_first_name,cust_last_name,total_spent,most_expensive_item
IA,Sivaji,Landis,1250.46,68.15
IN,Constantin,Welles,365.6,91.4
IN,Harrison,Pacino,150.07999999999998,35.0
MD,Prem,Walken,872.6000000000001,93.31
MI,Meg,Derek,816.26,45.44
MI,Prem,Garcia,317.84,79.46
MI,Kyle,Schneider,189.38,94.69
MN,Hema,Powell,1084.67,68.15
MN,Dheeraj,Alexander,582.0,25.91
MN,Harry Dean,Fonda,443.19,35.0


In [0]:
%sql

---------------
---- Task 4 ---
---------------

-- Sales by product category per month for all customers
-- who live in the state of NY for 3 different age group 0-30yo, 30-60yo and above 60 yo

WITH customers_ny AS (
    SELECT 
        cust_sk,
        cust_first_name,
        cust_last_name,
        date_of_birth,
        cust_address_state_province,
        FLOOR(DATEDIFF(CURRENT_DATE, date_of_birth) / 365.25) AS age
    FROM de_pyspark_training_catalog.buddy_group_1.amanolov_gold_dim_customers_exam
    WHERE cust_address_state_province = 'NY'
),

customers_with_age_group AS (
    SELECT *,
           CASE 
               WHEN age <= 30 THEN '0-30'
               WHEN age > 30 AND age <= 60 THEN '31-60'
               ELSE '61+'
           END AS age_group
    FROM customers_ny
),

sales_per_category AS (
    SELECT 
        DATE_FORMAT(f.order_date, 'yyyy-MM') AS order_month,
        p.category_name,
        c.age_group,
        SUM(CAST(f.unit_price AS DOUBLE) * CAST(f.quantity AS DOUBLE)) AS total_sales
    FROM de_pyspark_training_catalog.buddy_group_1.amanolov_gold_fact_orders_exam f
    JOIN de_pyspark_training_catalog.buddy_group_1.amanolov_gold_dim_products_exam p
        ON f.product_sk = p.product_sk
    JOIN customers_with_age_group c
        ON f.cust_sk = c.cust_sk
    WHERE f.order_date IS NOT NULL 
    GROUP BY order_month, p.category_name, c.age_group
)

SELECT * 
FROM sales_per_category
ORDER BY order_month, category_name, age_group;

