## Import libraries.

In [316]:
%load_ext pydough.jupyter_extensions

import pydough
import datetime
import pandas as pd
import dfcompare
import sqlite3 as sql

connection = sql.connect('tpch.db')
cursor = connection.cursor()

The pydough.jupyter_extensions extension is already loaded. To reload it, use:
  %reload_ext pydough.jupyter_extensions


## Load database.

In [317]:
pydough.active_session.load_metadata_graph("../metadata/tpch_demo_graph.json", "TPCH");
pydough.active_session.connect_database("sqlite", database="../../tpch.db");
pd.options.display.float_format = '{:.6f}'.format

In [165]:
graph = pydough.active_session.metadata

print(pydough.explain_structure(graph))

Structure of PyDough graph: TPCH

  customers
  ├── acctbal
  ├── address
  ├── comment
  ├── key
  ├── mktsegment
  ├── name
  ├── nation_key
  ├── phone
  ├── nation [one member of nations] (reverse of nations.customers)
  └── orders [multiple orders] (reverse of orders.customer)

  lines
  ├── comment
  ├── commit_date
  ├── discount
  ├── extended_price
  ├── line_number
  ├── order_key
  ├── part_key
  ├── quantity
  ├── receipt_date
  ├── return_flag
  ├── ship_date
  ├── ship_instruct
  ├── ship_mode
  ├── status
  ├── supplier_key
  ├── tax
  ├── order [one member of orders] (reverse of orders.lines)
  ├── part [one member of parts] (reverse of parts.lines)
  ├── part_and_supplier [one member of supply_records] (reverse of supply_records.lines)
  └── supplier [one member of suppliers] (reverse of suppliers.lines)

  nations
  ├── comment
  ├── key
  ├── name
  ├── region_key
  ├── customers [multiple customers] (reverse of customers.nation)
  ├── region [one member of regions] 

# TPCH QUERIES (BACK OVERHAUL)

## 1. Find All Customers in the Asia Region




In [183]:
query = '''
SELECT c_custkey, c_name, c_address 
FROM customer c
JOIN nation n ON c.c_nationkey = n.n_nationkey
JOIN region r ON n.n_regionkey = r.r_regionkey
WHERE r_name = 'ASIA';
'''

sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,C_CUSTKEY,C_NAME,C_ADDRESS
0,7,Customer#000000007,TcGe5gaZNgVePxU5kRrvXBfkasDTea
1,9,Customer#000000009,xKiAFTjUsCuxfeleNqefumTrjS
2,19,Customer#000000019,"uc,3bHIx84H,wdrmLOjVsiqXCq2tr"
3,21,Customer#000000021,XYmVpr9yAHDEn
4,25,Customer#000000025,Hp8GyFQgGHFYSilH5tBfe
...,...,...,...
30178,149981,Customer#000149981,NXsAQ7ptlZzRFp
30179,149984,Customer#000149984,ZBEyUfjRsVtUNSIv9dnnyoPYeQwi7czgCeeeM
30180,149987,Customer#000149987,P6z8nSIgW55cSydfa1bZ
30181,149989,Customer#000149989,"0uSL 8qBRsNylw6e,sUlSrqGy497GR0z"


In [83]:
%%pydough

filter_c = nations.WHERE(region.name == "ASIA").customers.CALCULATE(
    c_custkey=key,
    c_name=name,
    c_address=address,
)

output= pydough.to_df(filter_c)
output

Unnamed: 0,c_custkey,c_name,c_address
0,7,Customer#000000007,TcGe5gaZNgVePxU5kRrvXBfkasDTea
1,9,Customer#000000009,xKiAFTjUsCuxfeleNqefumTrjS
2,19,Customer#000000019,"uc,3bHIx84H,wdrmLOjVsiqXCq2tr"
3,21,Customer#000000021,XYmVpr9yAHDEn
4,25,Customer#000000025,Hp8GyFQgGHFYSilH5tBfe
...,...,...,...
30178,149981,Customer#000149981,NXsAQ7ptlZzRFp
30179,149984,Customer#000149984,ZBEyUfjRsVtUNSIv9dnnyoPYeQwi7czgCeeeM
30180,149987,Customer#000149987,P6z8nSIgW55cSydfa1bZ
30181,149989,Customer#000149989,"0uSL 8qBRsNylw6e,sUlSrqGy497GR0z"


In [84]:
dfcompare.compare_df(output, sql_output, query_category="", question="")

True

## 2. Find the total number of orders placed in a specific year




In [85]:
query = '''
SELECT COUNT(*) AS total_orders 
FROM orders o 
WHERE strftime('%Y', o.o_orderdate) = '1998';
'''

sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,total_orders
0,133623


In [86]:
%%pydough

orders_1998= TPCH.CALCULATE(total_orders=COUNT(orders.WHERE(YEAR(order_date) == 1998)))

output= pydough.to_df(orders_1998)
output

Unnamed: 0,total_orders
0,133623


In [87]:
dfcompare.compare_df(output, sql_output, query_category="", question="")

True

## 3.  Total revenue for each customer region.




In [88]:
query = '''
SELECT r_name, SUM(l_extendedprice * (1 - l_discount)) AS total_revenue
FROM region r
JOIN nation n ON r_regionkey = n_regionkey
JOIN customer c ON n_nationkey = c_nationkey
JOIN orders o ON c_custkey = o_custkey
JOIN lineitem l ON o_orderkey = l_orderkey
GROUP BY r_name
ORDER BY total_revenue DESC;
'''

sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,R_NAME,total_revenue
0,EUROPE,44032702326.2956
1,ASIA,43858010644.9379
2,AMERICA,43565312628.9458
3,AFRICA,43488870851.6861
4,MIDDLE EAST,43157327433.1347


In [None]:
%%pydough

line_info = lines.CALCULATE(
    region_name=order.customer.nation.region.name,
    revenue=extended_price * (1 - discount),
)

output = PARTITION(line_info, name="l", by=(region_name)).CALCULATE(
    REGION=region_name,
    REVENUE= SUM(l.revenue)
).ORDER_BY(
    REVENUE.DESC(),
)

output= pydough.to_df(output)
output


Unnamed: 0,REGION,REVENUE
0,EUROPE,44032702326.2956
1,ASIA,43858010644.9379
2,AMERICA,43565312628.9458
3,AFRICA,43488870851.6861
4,MIDDLE EAST,43157327433.1347


In [90]:
dfcompare.compare_df(output, sql_output, query_category="", question="")

True

## 4. Find the customer who placed the most orders in a specific year




In [91]:
query = '''
SELECT o.o_custkey, c.c_name, COUNT(o.o_orderkey) AS order_count
FROM orders o
JOIN customer c ON o.o_custkey = c.c_custkey
WHERE strftime('%Y', o.o_orderdate) = '1992'
GROUP BY o.o_custkey
ORDER BY order_count DESC
LIMIT 1;
'''

sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,O_CUSTKEY,C_NAME,order_count
0,64303,Customer#000064303,14


In [92]:
%%pydough

line_info = customers.CALCULATE(
    key,
    region_name=name,
    num_orders=COUNT(
        orders.WHERE(YEAR(order_date) == 1992)
    ),
)
output = PARTITION(line_info, name="l", by=(key,region_name, num_orders)).CALCULATE(
    key,
    REGION=region_name,
    NUM_ORDERS=num_orders
).TOP_K(1,
    NUM_ORDERS.DESC(),
)

output= pydough.to_df(output)
output

Unnamed: 0,key,REGION,NUM_ORDERS
0,64303,Customer#000064303,14


In [93]:
dfcompare.compare_df(output, sql_output, query_category="", question="")

True

## 5. Find the top 5 customers with the highest total order value:




In [94]:
query = '''
SELECT c_custkey, c_name, SUM(l_extendedprice * (1 - l_discount)) AS total_revenue
FROM customer c
JOIN orders o ON c.c_custkey = o.o_custkey
JOIN lineitem l ON o.o_orderkey = l.l_orderkey
GROUP BY c_custkey, c_name
ORDER BY total_revenue DESC
LIMIT 5;
'''

sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,C_CUSTKEY,C_NAME,total_revenue
0,143500,Customer#000143500,6757566.0218
1,95257,Customer#000095257,6294115.334
2,87115,Customer#000087115,6184649.5176
3,131113,Customer#000131113,6080943.8305
4,134380,Customer#000134380,6075141.9635


In [176]:
%%pydough

line_info = lines.CALCULATE(
    customer_key= order.customer.key,
    customer_name=order.customer.name,
    revenue=extended_price * (1 - discount),
)
output = PARTITION(line_info, name="l", by=(customer_key,customer_name)).CALCULATE(
    customer_name=customer_name,
    customer_key= customer_key,
    total_revenue=SUM(l.revenue)
).TOP_K(5, by=total_revenue.DESC())

output= pydough.to_df(output)
output

Unnamed: 0,customer_name,customer_key,total_revenue
0,Customer#000143500,143500,6757566.0218
1,Customer#000095257,95257,6294115.334
2,Customer#000087115,87115,6184649.5176
3,Customer#000131113,131113,6080943.8305
4,Customer#000134380,134380,6075141.9635


In [96]:
dfcompare.compare_df(output, sql_output, query_category="", question="")

np.True_

## 6. Number of Orders per Customer made in 1995




In [None]:
query = '''
SELECT
    c.c_custkey,
    c.c_name,
    COUNT(o.o_orderkey) AS num_orders
FROM
    customer c
JOIN orders o ON c.c_custkey = o.o_custkey
WHERE
    o.o_orderdate >= DATE('1995-01-01') 
    AND o.o_orderdate < DATE('1996-01-01') 
GROUP BY
    c.c_custkey, c.c_name
ORDER BY
    num_orders DESC;
'''

sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,C_CUSTKEY,C_NAME,num_orders
0,63733,Customer#000063733,12
1,107440,Customer#000107440,12
2,115471,Customer#000115471,12
3,120877,Customer#000120877,12
4,14920,Customer#000014920,11
...,...,...,...
86568,149981,Customer#000149981,1
86569,149984,Customer#000149984,1
86570,149986,Customer#000149986,1
86571,149987,Customer#000149987,1


In [252]:
%%pydough

selected_lines = orders.WHERE(
    (YEAR(order_date) == 1995)
).CALCULATE(key=key)

output = customers.CALCULATE(
    C_CUSTKEY=key,
    C_NAME=name,
    NUM_ORDERS=COUNT(selected_lines.key),
).WHERE(NUM_ORDERS > 0).ORDER_BY(NUM_ORDERS.DESC())

output= pydough.to_df(output)
output

Unnamed: 0,C_CUSTKEY,C_NAME,NUM_ORDERS
0,63733,Customer#000063733,12
1,107440,Customer#000107440,12
2,115471,Customer#000115471,12
3,120877,Customer#000120877,12
4,14920,Customer#000014920,11
...,...,...,...
86568,149981,Customer#000149981,1
86569,149984,Customer#000149984,1
86570,149986,Customer#000149986,1
86571,149987,Customer#000149987,1


In [253]:
dfcompare.compare_df(output, sql_output, query_category="", question="")

True

## 7(real). Identify suppliers who have never supplied any parts

In [319]:
query = '''
SELECT s.s_suppkey, s.s_name
FROM supplier s
LEFT JOIN partsupp ps ON s.s_suppkey = ps.ps_suppkey
WHERE ps.ps_suppkey IS NULL;
'''

sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,S_SUPPKEY,S_NAME


In [321]:
%%pydough

result = suppliers.WHERE(
    HASNOT(supply_records) 
).CALCULATE(
    suppkey=key,
    name=name
)

pydough.to_df(result)

Unnamed: 0,suppkey,name


## 7. Determine the number of orders placed in each month of a year. 




In [273]:
query = '''
SELECT
    strftime('%m', o_orderdate) AS order_month,
    COUNT(o_orderkey) AS num_orders            
FROM
    orders
WHERE
    o_orderdate >= '1998-01-01'  
    AND o_orderdate < '1999-01-01'
GROUP BY
    order_month
ORDER BY
    order_month;
'''

sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,order_month,num_orders
0,1,19380
1,2,17510
2,3,19462
3,4,18677
4,5,19432
5,6,18590
6,7,19373
7,8,1199


In [274]:
%%pydough

selected_lines = orders.CALCULATE(key=key, o_month=MONTH(order_date)).WHERE(
    (order_date >= datetime.date(1998, 1, 1))
    & (order_date < datetime.date(1999, 1, 1))
)

output = PARTITION(selected_lines, name="o", by=(o_month)).CALCULATE(
    o_month=o_month,
    num_orders= COUNT(o.key)
)

output= pydough.to_df(output)
output

Unnamed: 0,o_month,num_orders
0,1,19380
1,2,17510
2,3,19462
3,4,18677
4,5,19432
5,6,18590
6,7,19373
7,8,1199


In [275]:
dfcompare.compare_df(output, sql_output, query_category="", question="")

np.False_

## 8. Retrieve the names and comments of nations whose names start with the letter "A"

In [103]:
query = '''
SELECT N_NAME, N_COMMENT
FROM nation
WHERE N_NAME LIKE 'A%';
'''

sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,N_NAME,N_COMMENT
0,ALGERIA,haggle. carefully final deposits detect slyly...
1,ARGENTINA,al foxes promise slyly according to the regula...


In [104]:
%%pydough

nations_startwith= nations.CALCULATE(n_name=name, n_comment= comment).WHERE(STARTSWITH(name,'A'))

nations_like= nations.CALCULATE(n_name=name, n_comment= comment).WHERE(LIKE(name,'A%'))

output= pydough.to_df(nations_like)
output

Unnamed: 0,n_name,n_comment
0,ALGERIA,haggle. carefully final deposits detect slyly...
1,ARGENTINA,al foxes promise slyly according to the regula...


In [105]:
dfcompare.compare_df(output, sql_output, query_category="", question="")

True

## 9. Retrieve the names of customers who are from Peru. 




In [256]:
query = '''
SELECT C.C_NAME
FROM customer C
JOIN nation N
ON C.C_NATIONKEY = N.N_NATIONKEY
WHERE N.N_NAME = 'PERU';
'''

sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,C_NAME
0,Customer#000000008
1,Customer#000000033
2,Customer#000000035
3,Customer#000000061
4,Customer#000000077
...,...
5970,Customer#000149914
5971,Customer#000149928
5972,Customer#000149939
5973,Customer#000149948


In [254]:
%%pydough

customers_from_peru = customers.WHERE(nation.name == "PERU").CALCULATE(c_name=name)

output= pydough.to_df(customers_from_peru)
output

Unnamed: 0,c_name
0,Customer#000000008
1,Customer#000000033
2,Customer#000000035
3,Customer#000000061
4,Customer#000000077
...,...
5970,Customer#000149914
5971,Customer#000149928
5972,Customer#000149939
5973,Customer#000149948


In [257]:
dfcompare.compare_df(output, sql_output, query_category="", question="")

True

## 10.  Retrieve the customer IDs and names of customers who have a negative account balance, are not from Brazil, live in the Americas region, and have placed more than 5 orders. 




In [290]:
query = '''
SELECT c.c_custkey, c.c_name
FROM customer c
JOIN orders o ON c.c_custkey = o.o_custkey
JOIN nation n ON c.c_nationkey = n.n_nationkey
JOIN region r ON n.n_regionkey = r.r_regionkey
WHERE c.c_acctbal < 0
  AND n.n_name != 'BRAZIL'
  AND r.r_name = 'AMERICA'
GROUP BY c.c_custkey, c.c_name
HAVING COUNT(o.o_orderkey) > 5;
'''

sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,C_CUSTKEY,C_NAME
0,64,Customer#000000064
1,478,Customer#000000478
2,488,Customer#000000488
3,632,Customer#000000632
4,872,Customer#000000872
...,...,...
1412,149812,Customer#000149812
1413,149815,Customer#000149815
1414,149831,Customer#000149831
1415,149890,Customer#000149890


In [293]:
%%pydough 

customer_in_debt = customers.CALCULATE(
    c_id=key,
    c_name=name,
).WHERE(
    (acctbal < 0) &  
    (COUNT(orders.key) > 5) &  
    (nation.region.name == "AMERICA") & 
    (nation.name != "BRAZIL") 
)

output = pydough.to_df(customer_in_debt)
output


Unnamed: 0,c_id,c_name
0,64,Customer#000000064
1,478,Customer#000000478
2,488,Customer#000000488
3,632,Customer#000000632
4,872,Customer#000000872
...,...,...
1412,149812,Customer#000149812
1413,149815,Customer#000149815
1414,149831,Customer#000149831
1415,149890,Customer#000149890


In [294]:
dfcompare.compare_df(output, sql_output, query_category="", question="")

True

## 11. Find the total number of orders per customers placed in 1998




In [278]:
query = '''
SELECT
    c.c_custkey,
    c.c_name,
    COUNT(o.o_orderkey) AS num_orders
FROM
    customer c
JOIN orders o ON c.c_custkey = o.o_custkey
WHERE
    strftime('%Y', o.o_orderdate) = '1998'  
GROUP BY
    c.c_custkey, c.c_name
ORDER BY
    num_orders DESC;
'''

sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,C_CUSTKEY,C_NAME,num_orders
0,11719,Customer#000011719,9
1,93778,Customer#000093778,9
2,102295,Customer#000102295,9
3,111394,Customer#000111394,9
4,4789,Customer#000004789,8
...,...,...,...
71052,149971,Customer#000149971,1
71053,149977,Customer#000149977,1
71054,149981,Customer#000149981,1
71055,149990,Customer#000149990,1


In [281]:
%%pydough

selected_orders = orders.WHERE(
    (YEAR(order_date) == 1998)
).CALCULATE(key=key)

output = customers.CALCULATE(
    C_CUSTKEY=key,
    C_NAME=name,
    NUM_ORDERS=COUNT(selected_lines.key),
).WHERE(NUM_ORDERS > 0).ORDER_BY(NUM_ORDERS.DESC())

output= pydough.to_df(output)
output

Unnamed: 0,C_CUSTKEY,C_NAME,NUM_ORDERS
0,11719,Customer#000011719,9
1,93778,Customer#000093778,9
2,102295,Customer#000102295,9
3,111394,Customer#000111394,9
4,4789,Customer#000004789,8
...,...,...,...
71052,149971,Customer#000149971,1
71053,149977,Customer#000149977,1
71054,149981,Customer#000149981,1
71055,149990,Customer#000149990,1


In [282]:
dfcompare.compare_df(output, sql_output, query_category="", question="")

True

## 12.  List the names of nations and the count of orders placed by customers from each nation, ordered by the number of orders in descending order.




In [115]:
query = '''
SELECT n.n_name, COUNT(o.o_orderkey) AS order_count
FROM nation n
JOIN customer c ON n.n_nationkey = c.c_nationkey
JOIN orders o ON c.c_custkey = o.o_custkey
GROUP BY n.n_name
ORDER BY order_count DESC;
'''

sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,N_NAME,order_count
0,FRANCE,61600
1,RUSSIA,61495
2,INDONESIA,61377
3,MOZAMBIQUE,61267
4,ROMANIA,61012
5,CHINA,60784
6,JORDAN,60736
7,CANADA,60480
8,VIETNAM,60347
9,BRAZIL,60137


In [116]:
%%pydough

orders_by_nation = orders.CALCULATE(
    o_keys=key,
    region_name=customer.nation.name 
)

grouped_orders = PARTITION(
    orders_by_nation, name="o", by=region_name
).CALCULATE(
    region_name,
    orders_count=COUNT(o.o_keys)
).ORDER_BY(orders_count.DESC())

output= pydough.to_df(grouped_orders)
output


Unnamed: 0,region_name,orders_count
0,FRANCE,61600
1,RUSSIA,61495
2,INDONESIA,61377
3,MOZAMBIQUE,61267
4,ROMANIA,61012
5,CHINA,60784
6,JORDAN,60736
7,CANADA,60480
8,VIETNAM,60347
9,BRAZIL,60137


In [117]:
dfcompare.compare_df(output, sql_output, query_category="", question="")

True

## 13.  List the number of orders placed each month in the year 1998, ordered by month.

In [302]:
query = '''
SELECT
    strftime('%m', o_orderdate) AS order_month,
    COUNT(o_orderkey) AS num_orders            
FROM
    orders
WHERE
    o_orderdate >= '1998-01-01'  
    AND o_orderdate < '1999-01-01'
GROUP BY
    order_month
ORDER BY
    order_month;
'''

sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,order_month,num_orders
0,1,19380
1,2,17510
2,3,19462
3,4,18677
4,5,19432
5,6,18590
6,7,19373
7,8,1199


In [300]:
%%pydough

selected_lines = orders.CALCULATE(key=key,  o_month=MONTH(order_date)).WHERE(
    (YEAR(order_date) == 1998)
)

output = PARTITION(selected_lines, name="o", by=(o_month)).CALCULATE(
    order_month=o_month,
    num_orders= COUNT(o.key)
)

output=pydough.to_df(output)
output


Unnamed: 0,order_month,num_orders
0,1,19380
1,2,17510
2,3,19462
3,4,18677
4,5,19432
5,6,18590
6,7,19373
7,8,1199


In [301]:
dfcompare.compare_df(output, sql_output, query_category="", question="")

np.False_

## 14. Identify the customer IDs, names, and total spending of customers from the Asia region who have spent more than 1000 in total on orders.




In [121]:
query = '''
SELECT c.c_custkey, c.c_name, SUM(o.o_totalprice) AS total_spent
FROM customer c
JOIN orders o ON c.c_custkey = o.o_custkey
JOIN nation n ON c.c_nationkey = n.n_nationkey
JOIN region r ON n.n_regionkey = r.r_regionkey
WHERE r.r_name = 'ASIA'
GROUP BY c.c_custkey, c.c_name
HAVING SUM(o.o_totalprice) > 1000;
'''

sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,C_CUSTKEY,C_NAME,total_spent
0,7,Customer#000000007,2957861.160000
1,19,Customer#000000019,3611713.600000
2,25,Customer#000000025,3135039.320000
3,28,Customer#000000028,2429022.210000
4,37,Customer#000000037,2860377.420000
...,...,...,...
20019,149980,Customer#000149980,3115223.230000
20020,149981,Customer#000149981,1700503.960000
20021,149984,Customer#000149984,1153164.880000
20022,149987,Customer#000149987,472026.460000


In [122]:
%%pydough

filter_c= customers.CALCULATE(
        c_key= key,
        c_name=name,
        TOTAL_PRICE=SUM(orders.total_price)
       ).WHERE((TOTAL_PRICE > 1000 ) & (nation.region.name == "ASIA"))

output=pydough.to_df(filter_c)
output

Unnamed: 0,c_key,c_name,TOTAL_PRICE
0,7,Customer#000000007,2957861.160000
1,19,Customer#000000019,3611713.600000
2,25,Customer#000000025,3135039.320000
3,28,Customer#000000028,2429022.210000
4,37,Customer#000000037,2860377.420000
...,...,...,...
20019,149980,Customer#000149980,3115223.230000
20020,149981,Customer#000149981,1700503.960000
20021,149984,Customer#000149984,1153164.880000
20022,149987,Customer#000149987,472026.460000


In [123]:
dfcompare.compare_df(output, sql_output, query_category="", question="")

True

## 15. Calculate the average order value for each region.




In [124]:
query = '''
SELECT 
    r.r_name AS Region, 
    AVG(o.o_totalprice) AS AvgOrderValue 
FROM 
    orders o
JOIN 
    customer c ON o.o_custkey = c.c_custkey
JOIN 
    nation n ON c.c_nationkey = n.n_nationkey
JOIN 
    region r ON n.n_regionkey = r.r_regionkey
GROUP BY 
    r.r_name;
'''

sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,Region,AvgOrderValue
0,AFRICA,151274.687459
1,AMERICA,151476.057596
2,ASIA,151167.942741
3,EUROPE,150990.370343
4,MIDDLE EAST,151192.10578


In [125]:
%%pydough

selected_customers = customers.CALCULATE(
    customer_region_name=nation.region.name
).orders.CALCULATE(
    orders_price=total_price
)

output = PARTITION(selected_customers, "cust", by=customer_region_name).CALCULATE(
    REGION_NAME=customer_region_name,
    TOTALREVENUE= AVG(cust.orders_price)
)

output=pydough.to_df(output)
output


Unnamed: 0,REGION_NAME,TOTALREVENUE
0,AFRICA,151274.687459
1,AMERICA,151476.057596
2,ASIA,151167.942741
3,EUROPE,150990.370343
4,MIDDLE EAST,151192.10578


In [126]:
dfcompare.compare_df(output, sql_output, query_category="", question="")

True

## 16. Find the top 5 regions with the highest total revenue from orders. 




In [127]:
query = '''
SELECT 
    r.r_name AS RegionName, 
    SUM(o.o_totalprice) AS TotalRevenue
FROM 
    region r
JOIN nation n ON r.r_regionkey = n.n_regionkey
JOIN customer c ON n.n_nationkey = c.c_nationkey
JOIN orders o ON c.c_custkey = o.o_custkey
GROUP BY 
    r.r_name
ORDER BY 
    TotalRevenue DESC
LIMIT 5;
'''

sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,RegionName,TotalRevenue
0,EUROPE,45793265459.71
1,ASIA,45613415042.56
2,AMERICA,45306943255.21
3,AFRICA,45230223902.22
4,MIDDLE EAST,44885458787.76


In [128]:
%%pydough

selected_customers = customers.CALCULATE(
    customer_region_name=nation.region.name
).orders.CALCULATE(
    orders_price=total_price
)

output = PARTITION(selected_customers, "cust", by=customer_region_name).CALCULATE(
    REGION_NAME=customer_region_name,
    TOTALREVENUE= SUM(cust.orders_price)
).TOP_K(5, by=TOTALREVENUE.DESC())

output=pydough.to_df(output)
output

Unnamed: 0,REGION_NAME,TOTALREVENUE
0,EUROPE,45793265459.71
1,ASIA,45613415042.56
2,AMERICA,45306943255.21
3,AFRICA,45230223902.22
4,MIDDLE EAST,44885458787.76


In [129]:
dfcompare.compare_df(output, sql_output, query_category="", question="")

True

## 17. For each region and nation, calculate the maximum and minimum order values, the difference between them, and the total number of orders, ordered by the order value difference in descending order.




In [130]:
query = '''
SELECT 
    r.r_name AS region_name,
    n.n_name AS nation_name,
    MAX(o.o_totalprice) AS max_order_value,
    MIN(o.o_totalprice) AS min_order_value,
    MAX(o.o_totalprice) - MIN(o.o_totalprice) AS order_value_difference,
    COUNT(o.o_orderkey) AS total_orders
FROM region r
JOIN nation n ON r.r_regionkey = n.n_regionkey  
JOIN customer c ON c.c_nationkey = n.n_nationkey
JOIN orders o ON o.o_custkey = c.c_custkey
GROUP BY r.r_name, n.n_name
ORDER BY order_value_difference DESC;
'''

sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,region_name,nation_name,max_order_value,min_order_value,order_value_difference,total_orders
0,EUROPE,RUSSIA,555285.16,932.41,554352.75,61495
1,AMERICA,PERU,544089.09,891.74,543197.35,59018
2,AMERICA,ARGENTINA,530604.44,877.3,529727.14,59547
3,AMERICA,UNITED STATES,525590.57,913.45,524677.12,59921
4,MIDDLE EAST,IRAN,522644.48,924.51,521719.97,59675
5,AMERICA,CANADA,515531.82,908.18,514623.64,60480
6,EUROPE,FRANCE,508668.52,885.75,507782.77,61600
7,AFRICA,MOZAMBIQUE,508047.99,896.59,507151.4,61267
8,ASIA,VIETNAM,504509.06,911.67,503597.39,60347
9,ASIA,JAPAN,502742.76,857.71,501885.05,59405


In [131]:
%%pydough

selected_orders = customers.CALCULATE(
    region_name=nation.region.name,
    nation_name=nation.name
).orders.CALCULATE(
    total_price=total_price
)


output = PARTITION(selected_orders, name="o", by=(region_name, nation_name)).CALCULATE(
    region_name,
    nation_name,
    max_order_value=MAX(o.total_price),
    min_order_value=MIN(o.total_price),
    order_value_difference=MAX(o.total_price) - MIN(o.total_price),
    total_orders=COUNT(o.total_price)
).ORDER_BY(order_value_difference.DESC())

output=pydough.to_df(output)
output


Unnamed: 0,region_name,nation_name,max_order_value,min_order_value,order_value_difference,total_orders
0,EUROPE,RUSSIA,555285.16,932.41,554352.75,61495
1,AMERICA,PERU,544089.09,891.74,543197.35,59018
2,AMERICA,ARGENTINA,530604.44,877.3,529727.14,59547
3,AMERICA,UNITED STATES,525590.57,913.45,524677.12,59921
4,MIDDLE EAST,IRAN,522644.48,924.51,521719.97,59675
5,AMERICA,CANADA,515531.82,908.18,514623.64,60480
6,EUROPE,FRANCE,508668.52,885.75,507782.77,61600
7,AFRICA,MOZAMBIQUE,508047.99,896.59,507151.4,61267
8,ASIA,VIETNAM,504509.06,911.67,503597.39,60347
9,ASIA,JAPAN,502742.76,857.71,501885.05,59405


In [132]:
dfcompare.compare_df(output, sql_output, query_category="", question="")

True

## 18. List the nations and the count of customers in the "Machinery" and "Automobile" market segments, ordered by the number of customers in descending order.




In [133]:
query = '''
SELECT 
    n.n_name AS nation_name,
    COUNT(c.c_custkey) AS customer_count
FROM nation n
JOIN customer c ON c.c_nationkey = n.n_nationkey
WHERE c.c_mktsegment IN ('MACHINERY', 'AUTOMOBILE') 
GROUP BY n.n_name
ORDER BY customer_count DESC;
'''

sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,nation_name,customer_count
0,ROMANIA,2545
1,INDONESIA,2489
2,CHINA,2481
3,ETHIOPIA,2423
4,BRAZIL,2419
5,RUSSIA,2414
6,EGYPT,2414
7,GERMANY,2402
8,UNITED STATES,2399
9,JORDAN,2397


In [134]:
%%pydough

selected_customers = customers.WHERE(ISIN(mktsegment, ('MACHINERY', 'AUTOMOBILE'))).CALCULATE(
    nation_name=nation.name, 
    key=key)

output= PARTITION(selected_customers, name="cust", by=(nation_name)).CALCULATE(
    nation_name= nation_name,
    customer_count= COUNT(cust.key)
).ORDER_BY(customer_count.DESC())

output=pydough.to_df(output)
output


Unnamed: 0,nation_name,customer_count
0,ROMANIA,2545
1,INDONESIA,2489
2,CHINA,2481
3,ETHIOPIA,2423
4,BRAZIL,2419
5,EGYPT,2414
6,RUSSIA,2414
7,GERMANY,2402
8,UNITED STATES,2399
9,JORDAN,2397


In [135]:
dfcompare.compare_df(output, sql_output, query_category="", question="")

np.True_

## 19. Calculate the percentage of high-priority orders (e.g., '1-URGENT', '2-HIGH') for each region.
 




In [136]:
query = '''
SELECT r.r_name AS region_name, 
  ROUND(
    SUM(
      CASE 
        WHEN o.o_orderpriority IN ('1-URGENT', '2-HIGH') THEN 1 
        ELSE 0 
      END
    ) * 100.0 / COUNT(o.o_orderkey),
    2
  ) AS high_priority_percentage
  
FROM orders o
JOIN customer c ON o.o_custkey = c.c_custkey
JOIN nation n ON c.c_nationkey = n.n_nationkey
JOIN region r ON n.n_regionkey = r.r_regionkey
GROUP BY r.r_name
ORDER BY high_priority_percentage DESC;
'''

sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,region_name,high_priority_percentage
0,MIDDLE EAST,40.2
1,AMERICA,40.16
2,EUROPE,39.99
3,ASIA,39.91
4,AFRICA,39.89


In [137]:
%%pydough

selected_orders = customers.CALCULATE(
    region_name=nation.region.name
).orders.CALCULATE(
    key,
    is_prioritary=IFF(ISIN(order_priority, ('1-URGENT', '2-HIGH')), 1, 0)
)

output = PARTITION(selected_orders, name="o", by=region_name).CALCULATE(
    region_name,
    high_priority_percentage=ROUND((SUM(o.is_prioritary) * 100) / COUNT(o.key), 2)
).ORDER_BY(high_priority_percentage.DESC())

output=pydough.to_df(output)
output



Unnamed: 0,region_name,high_priority_percentage
0,MIDDLE EAST,40.2
1,AMERICA,40.16
2,EUROPE,39.99
3,ASIA,39.91
4,AFRICA,39.89


In [138]:
dfcompare.compare_df(output, sql_output, query_category="", question="")

True

## 20.  Customers Who Have Never Placed Orders




In [139]:
query = '''
SELECT c.c_custkey, c.c_name
FROM customer c
LEFT JOIN orders o ON c.c_custkey = o.o_custkey
WHERE o.o_orderkey IS NULL;
'''

sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,C_CUSTKEY,C_NAME
0,3,Customer#000000003
1,6,Customer#000000006
2,9,Customer#000000009
3,12,Customer#000000012
4,15,Customer#000000015
...,...,...
49999,149988,Customer#000149988
50000,149991,Customer#000149991
50001,149994,Customer#000149994
50002,149997,Customer#000149997


In [140]:
%%pydough

customers_without_orders= customers.WHERE(HASNOT(orders)==1).CALCULATE(key, name)

output=pydough.to_df(customers_without_orders)
output

Unnamed: 0,key,name
0,3,Customer#000000003
1,6,Customer#000000006
2,9,Customer#000000009
3,12,Customer#000000012
4,15,Customer#000000015
...,...,...
49999,149988,Customer#000149988
50000,149991,Customer#000149991
50001,149994,Customer#000149994
50002,149997,Customer#000149997


In [141]:
dfcompare.compare_df(output, sql_output, query_category="", question="")

True

## 21. How many total, active, and inactive customers are there in each nation, sorted by the total number of customers?




In [142]:
query = '''
SELECT
    n.n_name,
    COUNT(DISTINCT c.c_custkey) AS total_customers,
    COUNT(DISTINCT CASE WHEN o.o_orderkey IS NOT NULL THEN c.c_custkey END) AS active_customers,
    COUNT(DISTINCT CASE WHEN o.o_orderkey IS NULL THEN c.c_custkey END) AS inactive_customers
FROM
    nation n
JOIN customer c ON n.n_nationkey = c.c_nationkey
LEFT JOIN orders o ON c.c_custkey = o.o_custkey
GROUP BY n.n_name
ORDER BY total_customers DESC;
'''

sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,N_NAME,total_customers,active_customers,inactive_customers
0,INDONESIA,6161,4081,2080
1,ROMANIA,6100,4087,2013
2,FRANCE,6100,4149,1951
3,RUSSIA,6078,4089,1989
4,INDIA,6042,3958,2084
5,JORDAN,6033,4025,2008
6,CHINA,6024,4011,2013
7,CANADA,6020,4006,2014
8,UNITED KINGDOM,6011,3989,2022
9,IRAN,6009,4013,1996


In [143]:
%%pydough

selected_customers = customers.CALCULATE(
    customer_nation_name= nation.name, 
    active_customers=KEEP_IF(key,HAS(orders)),
    inactive_customers= KEEP_IF(key, HASNOT(orders))
)
output = PARTITION(selected_customers, "cust", by=customer_nation_name).CALCULATE(
    NATION_NAME=customer_nation_name,
    TOTAL_CUSTOMERS= COUNT(cust.key),
    ACTIVE_CUSTOMERS=NDISTINCT(cust.active_customers),
    INACTIVE_CUSTOMERS=NDISTINCT(cust.inactive_customers),
).ORDER_BY(TOTAL_CUSTOMERS.DESC())

output=pydough.to_df(output)
output

Unnamed: 0,NATION_NAME,TOTAL_CUSTOMERS,ACTIVE_CUSTOMERS,INACTIVE_CUSTOMERS
0,INDONESIA,6161,4081,2080
1,FRANCE,6100,4149,1951
2,ROMANIA,6100,4087,2013
3,RUSSIA,6078,4089,1989
4,INDIA,6042,3958,2084
5,JORDAN,6033,4025,2008
6,CHINA,6024,4011,2013
7,CANADA,6020,4006,2014
8,UNITED KINGDOM,6011,3989,2022
9,IRAN,6009,4013,1996


In [144]:
dfcompare.compare_df(output, sql_output, query_category="", question="")

np.False_

## 22. Retrieve customers who belong to the top 10% in account balance but rank in the bottom 25% in terms of order activity

In [307]:
query = '''
SELECT c_name, c_acctbal
FROM (
    SELECT 
        c.c_name,
        c.c_acctbal,
        PERCENT_RANK() OVER (ORDER BY c.c_acctbal DESC) AS balance_percentile,
        PERCENT_RANK() OVER (ORDER BY COUNT(o.o_orderkey)) AS order_activity_percentile
    FROM customer c
    LEFT JOIN orders o ON c.c_custkey = o.o_custkey
    GROUP BY c.c_custkey, c.c_name, c.c_acctbal
) sub
WHERE 
    balance_percentile <= 0.1  
    AND order_activity_percentile <= 0.25 
ORDER BY c_acctbal DESC;
'''

sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,c_name,c_acctbal
0,Customer#000069321,9999.960000
1,Customer#000002487,9999.720000
2,Customer#000043044,9999.490000
3,Customer#000076146,9999.230000
4,Customer#000034047,9998.970000
...,...,...
4941,Customer#000115446,8894.600000
4942,Customer#000082611,8894.490000
4943,Customer#000013560,8894.430000
4944,Customer#000078429,8894.390000


In [None]:
%%pydough

customer_orders = customers.CALCULATE(
    key,
    name,
    acctbal,
    num_orders=COUNT(orders.key)  
)

selected_customers = customer_orders.CALCULATE(
    key,
    name,
    acctbal,
    balance_percentile=PERCENTILE(by=acctbal.DESC()),  
    order_activity_percentile=PERCENTILE(by=num_orders.ASC())
).WHERE(
    (balance_percentile <= 10) & (order_activity_percentile <= 25)
).ORDER_BY(acctbal.DESC())

output = pydough.to_df(selected_customers)
output




Unnamed: 0,key,name,acctbal,balance_percentile,order_activity_percentile
0,69321,Customer#000069321,9999.960000,1,16
1,2487,Customer#000002487,9999.720000,1,1
2,43044,Customer#000043044,9999.490000,1,10
3,76146,Customer#000076146,9999.230000,1,17
4,34047,Customer#000034047,9998.970000,1,8
...,...,...,...,...,...
3708,62682,Customer#000062682,8894.780000,10,14
3709,82611,Customer#000082611,8894.490000,10,19
3710,13560,Customer#000013560,8894.430000,10,4
3711,78429,Customer#000078429,8894.390000,10,18


In [156]:
dfcompare.compare_df(output, sql_output, query_category="", question="")

False

## 23. Which region has the highest total supply cost, considering the supply cost and available quantity for suppliers across different nations?


In [148]:
query = '''
SELECT r_name AS region_name, SUM(ps_supplycost * ps_availqty) AS total_supply_cost
FROM region
JOIN nation ON r_regionkey = n_regionkey
JOIN supplier ON n_nationkey = s_nationkey
JOIN partsupp ON s_suppkey = ps_suppkey
GROUP BY r_name
ORDER BY total_supply_cost DESC;
'''

sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,region_name,total_supply_cost
0,AMERICA,407942718701.84
1,MIDDLE EAST,405058070978.87
2,ASIA,400599873546.96
3,EUROPE,397934639557.59
4,AFRICA,392074106221.66


In [149]:
%%pydough

supply= supply_records.CALCULATE(
    region_name= supplier.nation.region.name, 
    supply_cost= supplycost, 
    availqty= availqty
)

supply_cost_by_regions= PARTITION(supply, name="supp", by=region_name).CALCULATE(
    region_name,
    total_supply_cost= SUM(supp.supplycost * supp.availqty)
).ORDER_BY(total_supply_cost.DESC())


output=pydough.to_df(supply_cost_by_regions)
output

Unnamed: 0,region_name,total_supply_cost
0,AMERICA,407942718701.84
1,MIDDLE EAST,405058070978.87
2,ASIA,400599873546.96
3,EUROPE,397934639557.59
4,AFRICA,392074106221.66


In [150]:
dfcompare.compare_df(output, sql_output, query_category="", question="")

True

## 1. 




## 1. 


