In [0]:
%sql

-- Make `samples`.`tpcds_sf1` the default catalog and schema for this session.
-- NOTE(review): the Python cells reference tables by unqualified name, so they
-- presumably rely on this default carrying over to spark.sql — confirm.
USE CATALOG samples;
USE tpcds_sf1; 

-- List the tables in the selected schema.
SHOW TABLES;

-- Preview five rows from each of three tables used by the benchmark queries.
-- The output captured with this cell contains only d_* columns, i.e. only the
-- result of the last statement (date_dim).
SELECT * FROM store_sales LIMIT 5;
SELECT * FROM item LIMIT 5;
SELECT * FROM date_dim LIMIT 5;


d_date_sk,d_date_id,d_date,d_month_seq,d_week_seq,d_quarter_seq,d_year,d_dow,d_moy,d_dom,d_qoy,d_fy_year,d_fy_quarter_seq,d_fy_week_seq,d_day_name,d_quarter_name,d_holiday,d_weekend,d_following_holiday,d_first_dom,d_last_dom,d_same_day_ly,d_same_day_lq,d_current_day,d_current_week,d_current_month,d_current_quarter,d_current_year
2415022,AAAAAAAAOKJNECAA,1900-01-02,0,1,1,1900,1,1,2,1,1900,1,1,Monday,1900Q1,N,N,Y,2415021,2415020,2414657,2414930,N,N,N,N,N
2415023,AAAAAAAAPKJNECAA,1900-01-03,0,1,1,1900,2,1,3,1,1900,1,1,Tuesday,1900Q1,N,N,N,2415021,2415020,2414658,2414931,N,N,N,N,N
2415024,AAAAAAAAALJNECAA,1900-01-04,0,1,1,1900,3,1,4,1,1900,1,1,Wednesday,1900Q1,N,N,N,2415021,2415020,2414659,2414932,N,N,N,N,N
2415025,AAAAAAAABLJNECAA,1900-01-05,0,1,1,1900,4,1,5,1,1900,1,1,Thursday,1900Q1,N,N,N,2415021,2415020,2414660,2414933,N,N,N,N,N
2415026,AAAAAAAACLJNECAA,1900-01-06,0,1,1,1900,5,1,6,1,1900,1,1,Friday,1900Q1,N,Y,N,2415021,2415020,2414661,2414934,N,N,N,N,N


In [0]:
import time

# Benchmark workload: maps a query label to the SQL text that gets executed.
# All table names are unqualified, so a default catalog/schema must already be
# selected in the session when these statements run.
queries = {
    # Q1: store revenue per item category and year (2000-2002), first 20 rows
    # ordered by year, then by revenue descending.
    "Q1_revenue_by_category_year": """
        SELECT
          i_category,
          d_year,
          SUM(ss_net_paid) AS total_revenue
        FROM store_sales   ss
        JOIN item          i ON ss.ss_item_sk = i.i_item_sk
        JOIN date_dim      d ON ss.ss_sold_date_sk = d.d_date_sk
        WHERE d.d_year BETWEEN 2000 AND 2002
        GROUP BY i_category, d_year
        ORDER BY d_year, total_revenue DESC
        LIMIT 20
    """,

    # Q2: the 20 customers with the highest total store spend.
    "Q2_top_customers": """
        SELECT
          c_customer_id,
          c_first_name,
          c_last_name,
          SUM(ss_net_paid) AS total_spend
        FROM store_sales ss
        JOIN customer    c ON ss.ss_customer_sk = c.c_customer_sk
        GROUP BY c_customer_id, c_first_name, c_last_name
        ORDER BY total_spend DESC
        LIMIT 20
    """,

    # Q3: store revenue per calendar date for 2001, in date order.
    "Q3_daily_revenue_2001": """
        SELECT
          d_date,
          SUM(ss_net_paid) AS daily_revenue
        FROM store_sales ss
        JOIN date_dim   d ON ss.ss_sold_date_sk = d.d_date_sk
        WHERE d.d_year = 2001
        GROUP BY d_date
        ORDER BY d_date
    """,

    # Q4: heavier query. Unions store, web and catalog sales, aggregates
    # revenue per channel/year/month/category/brand/customer, numbers those
    # aggregated rows by revenue within each channel/year/category
    # (`customer_rank`), then rolls up per channel/year/month/category/brand.
    #
    # The top-10 sum is wrapped in COALESCE because SUM over a CASE without
    # ELSE yields NULL when a group contains no row ranked <= 10; without it
    # both top10_revenue and (total - top10) come out NULL for such groups
    # instead of 0 and the group total.
    "Q4_hard_mixed_channel": """
    WITH all_sales AS (
    SELECT
        'store' AS channel,
        ss_sold_date_sk  AS sold_date_sk,
        ss_item_sk       AS item_sk,
        ss_customer_sk   AS customer_sk,
        ss_net_paid      AS net_paid
    FROM store_sales

    UNION ALL

    SELECT
        'web' AS channel,
        ws_sold_date_sk  AS sold_date_sk,
        ws_item_sk       AS item_sk,
        ws_bill_customer_sk AS customer_sk,
        ws_net_paid      AS net_paid
    FROM web_sales

    UNION ALL

    SELECT
        'catalog' AS channel,
        cs_sold_date_sk  AS sold_date_sk,
        cs_item_sk       AS item_sk,
        cs_bill_customer_sk AS customer_sk,
        cs_net_paid      AS net_paid
    FROM catalog_sales
    ),

    sales_enriched AS (
    SELECT
        a.channel,
        d.d_year,
        d.d_moy,
        i.i_category,
        i.i_brand,
        c.c_customer_id,
        SUM(a.net_paid) AS revenue
    FROM all_sales a
    JOIN date_dim d
        ON a.sold_date_sk = d.d_date_sk
    JOIN item i
        ON a.item_sk = i.i_item_sk
    JOIN customer c
        ON a.customer_sk = c.c_customer_sk
    WHERE d.d_year BETWEEN 1999 AND 2002
    GROUP BY
        a.channel,
        d.d_year,
        d.d_moy,
        i.i_category,
        i.i_brand,
        c.c_customer_id
    ),

    ranked AS (
    SELECT
        *,
        ROW_NUMBER() OVER (
        PARTITION BY channel, d_year, i_category
        ORDER BY revenue DESC
        ) AS customer_rank
    FROM sales_enriched
    )

    SELECT
    channel,
    d_year,
    d_moy,
    i_category,
    i_brand,
    COUNT(*) AS num_customer_buckets,
    COALESCE(SUM(CASE WHEN customer_rank <= 10 THEN revenue END), 0) AS top10_revenue,
    SUM(revenue) AS total_revenue,
    (SUM(revenue) - COALESCE(SUM(CASE WHEN customer_rank <= 10 THEN revenue END), 0)) AS non_top10_revenue
    FROM ranked
    GROUP BY
    channel,
    d_year,
    d_moy,
    i_category,
    i_brand
    ORDER BY
    channel,
    d_year,
    d_moy,
    i_category,
    i_brand
    """
}


In [0]:
def run_query(name, sql, repetitions=3, warmup=True):
    """Time a SQL query over several runs and print a short report.

    Relies on a ``spark`` object being available as a global (it is not
    defined in this function) and calls ``spark.sql(sql).collect()`` for
    each timed run, so the measured time covers everything ``collect()`` does.

    Args:
        name: Label printed as the heading of the report.
        sql: SQL text passed to ``spark.sql``.
        repetitions: Number of timed executions; must be at least 1.
        warmup: When true, execute the query once with ``limit(1)`` before
            timing starts. That run is not included in the reported numbers.

    Returns:
        The arithmetic mean of the timed runs, in seconds.

    Raises:
        ValueError: If ``repetitions`` is less than 1 (there would be nothing
            to average).
    """
    # Validate before touching the cluster so a bad argument costs nothing.
    if repetitions < 1:
        raise ValueError(f"repetitions must be >= 1, got {repetitions!r}")

    # Untimed warm-up run.
    if warmup:
        spark.sql(sql).limit(1).collect()

    times = []
    for _ in range(repetitions):
        # perf_counter is monotonic and high-resolution; time.time() can jump
        # if the system clock is adjusted mid-measurement.
        start = time.perf_counter()
        spark.sql(sql).collect()
        times.append(time.perf_counter() - start)

    avg_time = sum(times) / len(times)
    print(f"\n{name}")
    print(f"  Runs: {['%.2f' % t for t in times]}")
    print(f"  Avg:  {avg_time:.2f} s")
    return avg_time


# Benchmark every query in the workload, keeping the mean runtime per label.
results = {}
for query_name, query_text in queries.items():
    results[query_name] = run_query(query_name, query_text, repetitions=3)

# One summary line per query: label left-aligned in a 30-character column,
# followed by the mean runtime with two decimals.
print("\n=== Summary (seconds) ===")
for query_name, mean_seconds in results.items():
    print(f"{query_name:30s}  {mean_seconds:.2f}")



Q1_revenue_by_category_year
  Runs: ['1.95', '1.88', '1.82']
  Avg:  1.88 s

Q2_top_customers
  Runs: ['1.81', '2.17', '1.89']
  Avg:  1.96 s

Q3_daily_revenue_2001
  Runs: ['2.12', '1.87', '1.54']
  Avg:  1.84 s

Q4_hard_mixed_channel
  Runs: ['7.82', '6.55', '6.48']
  Avg:  6.95 s

=== Summary (seconds) ===
Q1_revenue_by_category_year     1.88
Q2_top_customers                1.96
Q3_daily_revenue_2001           1.84
Q4_hard_mixed_channel           6.95
