## Import libraries.

In [1]:
%load_ext pydough.jupyter_extensions

import pydough
import datetime
import pandas as pd
import dfcompare
import sqlite3 as sql

connection = sql.connect('tpch.db')
cursor = connection.cursor()

## Load database.

In [2]:
pydough.active_session.load_metadata_graph("../metadata/tpch_demo_graph.json", "TPCH");
pydough.active_session.connect_database("sqlite", database="../../tpch.db");
pd.options.display.float_format = '{:.6f}'.format

In [9]:
graph = pydough.active_session.metadata

print(pydough.explain_structure(graph))

Structure of PyDough graph: TPCH

  customers
  ├── acctbal
  ├── address
  ├── comment
  ├── key
  ├── mktsegment
  ├── name
  ├── nation_key
  ├── phone
  ├── nation [one member of nations] (reverse of nations.customers)
  └── orders [multiple orders] (reverse of orders.customer)

  lines
  ├── comment
  ├── commit_date
  ├── discount
  ├── extended_price
  ├── line_number
  ├── order_key
  ├── part_key
  ├── quantity
  ├── receipt_date
  ├── return_flag
  ├── ship_date
  ├── ship_instruct
  ├── ship_mode
  ├── status
  ├── supplier_key
  ├── tax
  ├── order [one member of orders] (reverse of orders.lines)
  ├── part [one member of parts] (reverse of parts.lines)
  ├── part_and_supplier [one member of supply_records] (reverse of supply_records.lines)
  └── supplier [one member of suppliers] (reverse of suppliers.lines)

  nations
  ├── comment
  ├── key
  ├── name
  ├── region_key
  ├── customers [multiple customers] (reverse of customers.nation)
  ├── region [one member of regions] 

# TPCH QUERIES (BACK OVERHAUL)

## 1. Find All Customers in the Asia Region




In [10]:
query = '''
SELECT c_custkey, c_name, c_address 
FROM customer c
JOIN nation n ON c.c_nationkey = n.n_nationkey
JOIN region r ON n.n_regionkey = r.r_regionkey
WHERE r_name = 'ASIA';
'''

sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,C_CUSTKEY,C_NAME,C_ADDRESS
0,7,Customer#000000007,TcGe5gaZNgVePxU5kRrvXBfkasDTea
1,9,Customer#000000009,xKiAFTjUsCuxfeleNqefumTrjS
2,19,Customer#000000019,"uc,3bHIx84H,wdrmLOjVsiqXCq2tr"
3,21,Customer#000000021,XYmVpr9yAHDEn
4,25,Customer#000000025,Hp8GyFQgGHFYSilH5tBfe
...,...,...,...
30178,149981,Customer#000149981,NXsAQ7ptlZzRFp
30179,149984,Customer#000149984,ZBEyUfjRsVtUNSIv9dnnyoPYeQwi7czgCeeeM
30180,149987,Customer#000149987,P6z8nSIgW55cSydfa1bZ
30181,149989,Customer#000149989,"0uSL 8qBRsNylw6e,sUlSrqGy497GR0z"


In [11]:
%%pydough

filter_c = nations.WHERE(region.name == "ASIA").customers.CALCULATE(
    custkey=key,
    name=name,
    address=address,
)

output= pydough.to_df(filter_c)
output

Unnamed: 0,custkey,name,address
0,7,Customer#000000007,TcGe5gaZNgVePxU5kRrvXBfkasDTea
1,9,Customer#000000009,xKiAFTjUsCuxfeleNqefumTrjS
2,19,Customer#000000019,"uc,3bHIx84H,wdrmLOjVsiqXCq2tr"
3,21,Customer#000000021,XYmVpr9yAHDEn
4,25,Customer#000000025,Hp8GyFQgGHFYSilH5tBfe
...,...,...,...
30178,149981,Customer#000149981,NXsAQ7ptlZzRFp
30179,149984,Customer#000149984,ZBEyUfjRsVtUNSIv9dnnyoPYeQwi7czgCeeeM
30180,149987,Customer#000149987,P6z8nSIgW55cSydfa1bZ
30181,149989,Customer#000149989,"0uSL 8qBRsNylw6e,sUlSrqGy497GR0z"


In [12]:
dfcompare.compare_df(output, sql_output, query_category="", question="")

True

## 2. Find the total number of orders placed in a specific year




In [13]:
query = '''
SELECT COUNT(*) AS total_orders 
FROM orders o 
WHERE strftime('%Y', o.o_orderdate) = '1998';
'''

sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,total_orders
0,133623


In [14]:
%%pydough

orders_1998= TPCH.CALCULATE(total_orders=COUNT(orders.WHERE(YEAR(order_date) == 1998)))

output= pydough.to_df(orders_1998)
output

Unnamed: 0,total_orders
0,133623


In [15]:
dfcompare.compare_df(output, sql_output, query_category="", question="")

True

## 3.  Total revenue for each customer region.




In [16]:
query = '''
SELECT r_name, SUM(l_extendedprice * (1 - l_discount)) AS total_revenue
FROM region r
JOIN nation n ON r_regionkey = n_regionkey
JOIN customer c ON n_nationkey = c_nationkey
JOIN orders o ON c_custkey = o_custkey
JOIN lineitem l ON o_orderkey = l_orderkey
GROUP BY r_name
ORDER BY total_revenue DESC;
'''

sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,R_NAME,total_revenue
0,EUROPE,44032702326.2956
1,ASIA,43858010644.9379
2,AMERICA,43565312628.9458
3,AFRICA,43488870851.6861
4,MIDDLE EAST,43157327433.1347


In [17]:
%%pydough

line_info = lines.CALCULATE(
    region_name=order.customer.nation.region.name,
    revenue=extended_price * (1 - discount),
)

output = PARTITION(line_info, name="l", by=(region_name)).CALCULATE(
    REGION=region_name,
    REVENUE= SUM(l.revenue)
).ORDER_BY(
    REVENUE.DESC(),
)

output= pydough.to_df(output)
output


Unnamed: 0,REGION,REVENUE
0,EUROPE,44032702326.2956
1,ASIA,43858010644.9379
2,AMERICA,43565312628.9458
3,AFRICA,43488870851.6861
4,MIDDLE EAST,43157327433.1347


In [18]:
dfcompare.compare_df(output, sql_output, query_category="", question="")

True

## 4. Find the customer who placed the most orders in a specific year




In [19]:
query = '''
SELECT o.o_custkey, c.c_name, COUNT(o.o_orderkey) AS order_count
FROM orders o
JOIN customer c ON o.o_custkey = c.c_custkey
WHERE strftime('%Y', o.o_orderdate) = '1992'
GROUP BY o.o_custkey
ORDER BY order_count DESC
LIMIT 1;
'''

sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,O_CUSTKEY,C_NAME,order_count
0,64303,Customer#000064303,14


In [20]:
%%pydough

line_info = customers.CALCULATE(
    key,
    region_name=name,
    num_orders=COUNT(
        orders.WHERE(YEAR(order_date) == 1992)
    ),
)
output = PARTITION(line_info, name="l", by=(key,region_name, num_orders)).CALCULATE(
    key,
    REGION=region_name,
    NUM_ORDERS=num_orders
).TOP_K(1,
    NUM_ORDERS.DESC(),
)

output= pydough.to_df(output)
output

Unnamed: 0,key,REGION,NUM_ORDERS
0,64303,Customer#000064303,14


In [21]:
dfcompare.compare_df(output, sql_output, query_category="", question="")

True

## 5. Find the top 5 customers with the highest total order value:




In [22]:
query = '''
SELECT c_custkey, c_name, SUM(l_extendedprice * (1 - l_discount)) AS total_revenue
FROM customer c
JOIN orders o ON c.c_custkey = o.o_custkey
JOIN lineitem l ON o.o_orderkey = l.l_orderkey
GROUP BY c_custkey, c_name
ORDER BY total_revenue DESC
LIMIT 5;
'''

sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,C_CUSTKEY,C_NAME,total_revenue
0,143500,Customer#000143500,6757566.0218
1,95257,Customer#000095257,6294115.334
2,87115,Customer#000087115,6184649.5176
3,131113,Customer#000131113,6080943.8305
4,134380,Customer#000134380,6075141.9635


In [23]:
%%pydough

line_info = lines.CALCULATE(
    customer_key= order.customer.key,
    customer_name=order.customer.name,
    revenue=extended_price * (1 - discount),
)
output = PARTITION(line_info, name="l", by=(customer_key,customer_name)).CALCULATE(
    customer_name=customer_name,
    customer_key= customer_key,
    total_revenue=SUM(l.revenue)
).TOP_K(5, by=total_revenue.DESC())

output= pydough.to_df(output)
output

Unnamed: 0,customer_name,customer_key,total_revenue
0,Customer#000143500,143500,6757566.0218
1,Customer#000095257,95257,6294115.334
2,Customer#000087115,87115,6184649.5176
3,Customer#000131113,131113,6080943.8305
4,Customer#000134380,134380,6075141.9635


In [24]:
dfcompare.compare_df(output, sql_output, query_category="", question="")

np.True_

## 6. Number of Orders per Customer made in 1995




In [25]:
query = '''
SELECT
    c.c_custkey,
    c.c_name,
    COUNT(o.o_orderkey) AS num_orders
FROM
    customer c
JOIN orders o ON c.c_custkey = o.o_custkey
WHERE
    o.o_orderdate >= DATE('1995-01-01') 
    AND o.o_orderdate < DATE('1996-01-01') 
GROUP BY
    c.c_custkey, c.c_name
ORDER BY
    num_orders DESC;
'''

sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,C_CUSTKEY,C_NAME,num_orders
0,63733,Customer#000063733,12
1,107440,Customer#000107440,12
2,115471,Customer#000115471,12
3,120877,Customer#000120877,12
4,14920,Customer#000014920,11
...,...,...,...
86568,149981,Customer#000149981,1
86569,149984,Customer#000149984,1
86570,149986,Customer#000149986,1
86571,149987,Customer#000149987,1


In [26]:
%%pydough

selected_lines = orders.WHERE(
    (YEAR(order_date) == 1995)
).CALCULATE(key=key)

output = customers.CALCULATE(
    C_CUSTKEY=key,
    C_NAME=name,
    NUM_ORDERS=COUNT(selected_lines.key),
).WHERE(NUM_ORDERS > 0).ORDER_BY(NUM_ORDERS.DESC())

output= pydough.to_df(output)
output

Unnamed: 0,C_CUSTKEY,C_NAME,NUM_ORDERS
0,63733,Customer#000063733,12
1,107440,Customer#000107440,12
2,115471,Customer#000115471,12
3,120877,Customer#000120877,12
4,14920,Customer#000014920,11
...,...,...,...
86568,149981,Customer#000149981,1
86569,149984,Customer#000149984,1
86570,149986,Customer#000149986,1
86571,149987,Customer#000149987,1


In [27]:
dfcompare.compare_df(output, sql_output, query_category="", question="")

True

## 7(real). Identify suppliers who have never supplied any parts

In [10]:
query = '''
SELECT s.s_suppkey, s.s_name
FROM supplier s
LEFT JOIN partsupp ps ON s.s_suppkey = ps.ps_suppkey
WHERE ps.ps_suppkey IS NULL;
'''

sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,S_SUPPKEY,S_NAME


In [None]:
%%pydough

result = suppliers.WHERE(
    HASNOT(supply_records) 
).CALCULATE(
    suppkey=key,
    name=name
)

pydough.to_df(result)

Unnamed: 0,suppkey,name


## 7. Determine the number of orders placed in each month of a year. 




In [30]:
query = '''
SELECT
    strftime('%m', o_orderdate) AS order_month,
    COUNT(o_orderkey) AS num_orders            
FROM
    orders
WHERE
    o_orderdate >= '1998-01-01'  
    AND o_orderdate < '1999-01-01'
GROUP BY
    order_month
ORDER BY
    order_month;
'''

sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,order_month,num_orders
0,1,19380
1,2,17510
2,3,19462
3,4,18677
4,5,19432
5,6,18590
6,7,19373
7,8,1199


In [31]:
%%pydough

selected_lines = orders.CALCULATE(key=key, o_month=MONTH(order_date)).WHERE(
    (order_date >= datetime.date(1998, 1, 1))
    & (order_date < datetime.date(1999, 1, 1))
)

output = PARTITION(selected_lines, name="o", by=(o_month)).CALCULATE(
    o_month=o_month,
    num_orders= COUNT(o.key)
)

output= pydough.to_df(output)
output

Unnamed: 0,o_month,num_orders
0,1,19380
1,2,17510
2,3,19462
3,4,18677
4,5,19432
5,6,18590
6,7,19373
7,8,1199


In [32]:
dfcompare.compare_df(output, sql_output, query_category="", question="")

np.False_

## 8. Retrieve the names and comments of nations whose names start with the letter "A"

In [33]:
query = '''
SELECT N_NAME, N_COMMENT
FROM nation
WHERE N_NAME LIKE 'A%';
'''

sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,N_NAME,N_COMMENT
0,ALGERIA,haggle. carefully final deposits detect slyly...
1,ARGENTINA,al foxes promise slyly according to the regula...


In [34]:
%%pydough

nations_startwith= nations.CALCULATE(n_name=name, n_comment= comment).WHERE(STARTSWITH(name,'A'))

nations_like= nations.CALCULATE(n_name=name, n_comment= comment).WHERE(LIKE(name,'A%'))

output= pydough.to_df(nations_like)
output

Unnamed: 0,n_name,n_comment
0,ALGERIA,haggle. carefully final deposits detect slyly...
1,ARGENTINA,al foxes promise slyly according to the regula...


In [35]:
dfcompare.compare_df(output, sql_output, query_category="", question="")

True

## 9(real). Orders Shipped Late in 1998 - Customer Details

In [18]:
query = '''
SELECT *
FROM lineitem l
JOIN orders o ON l.l_orderkey = o.o_orderkey
JOIN customer c ON o.o_custkey = c.c_custkey
WHERE l.l_shipdate < l.l_commitdate
  AND l.l_commitdate < l.l_receiptdate
  AND l.l_receiptdate >= '1998-01-01'
  AND l.l_receiptdate < '1999-01-01';
'''

sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,L_ORDERKEY,L_PARTKEY,L_SUPPKEY,L_LINENUMBER,L_QUANTITY,L_EXTENDEDPRICE,L_DISCOUNT,L_TAX,L_RETURNFLAG,L_LINESTATUS,...,O_SHIPPRIORITY,O_COMMENT,C_CUSTKEY,C_NAME,C_ADDRESS,C_NATIONKEY,C_PHONE,C_ACCTBAL,C_MKTSEGMENT,C_COMMENT
0,68,82758,5267,5,27,47000.250000,0.030000,0.060000,N,O,...,0,pinto beans sleep carefully. blithely ironic ...,28547,Customer#000028547,"AeWmD3BLrsSkmRY7O,wbB75i6Ll",1,11-711-951-5798,2095.420000,MACHINERY,"y regular foxes nag quickly after the express,..."
1,68,139247,1761,7,41,52735.840000,0.090000,0.080000,N,O,...,0,pinto beans sleep carefully. blithely ironic ...,28547,Customer#000028547,"AeWmD3BLrsSkmRY7O,wbB75i6Ll",1,11-711-951-5798,2095.420000,MACHINERY,"y regular foxes nag quickly after the express,..."
2,71,34432,1942,3,45,61489.350000,0.000000,0.070000,N,O,...,0,express deposits along the blithely regul,3373,Customer#000003373,WUCr1BPpcY7u,24,34-132-612-5205,-546.880000,BUILDING,"counts are blithely. requests wake silent, bol..."
3,100,62029,2030,1,28,27748.560000,0.040000,0.050000,N,O,...,0,heodolites detect slyly alongside of the ent,147004,Customer#000147004,",5lO7OHiDTFNK6t1HuLmIyQalgXDNgVH tytO9h",6,16-416-345-9278,9658.210000,FURNITURE,blithely unusual instructions. blithely expres...
4,192,97017,2036,1,23,23322.230000,0.000000,0.000000,N,O,...,0,y unusual platelets among the final instructio...,82570,Customer#000082570,Xh K FGxw7,16,26-271-558-1374,6420.880000,HOUSEHOLD,ely above the carefully final deposits. quickl...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84969,5999907,131873,4387,4,33,62860.710000,0.030000,0.060000,N,O,...,0,en deposits. even d,131032,Customer#000131032,ciGI0b8M1rSPncIEIS4caXJ2jS9RPNwKmUjJug,24,34-402-606-1848,6161.300000,BUILDING,"bold, regular warthogs cajole furi"
84970,5999907,179207,6759,5,30,38586.000000,0.000000,0.060000,N,O,...,0,en deposits. even d,131032,Customer#000131032,ciGI0b8M1rSPncIEIS4caXJ2jS9RPNwKmUjJug,24,34-402-606-1848,6161.300000,BUILDING,"bold, regular warthogs cajole furi"
84971,5999939,12036,9540,2,22,20856.660000,0.050000,0.070000,N,O,...,0,", regular foxes nag along",79657,Customer#000079657,"ifonC8aiuHQiwN4xvHZF,C7mRBYkHF9nTPusT3Y",11,21-957-935-4695,1184.790000,MACHINERY,ss deposits sleep carefully according to the f...
84972,5999939,84779,2304,3,23,40566.710000,0.100000,0.010000,N,O,...,0,", regular foxes nag along",79657,Customer#000079657,"ifonC8aiuHQiwN4xvHZF,C7mRBYkHF9nTPusT3Y",11,21-957-935-4695,1184.790000,MACHINERY,ss deposits sleep carefully according to the f...


In [19]:
%%pydough


selected_lines = lines.WHERE(
    (ship_date < commit_date)
    & (commit_date < receipt_date)
    & (receipt_date >= datetime.date(1998, 1, 1))
    & (receipt_date < datetime.date(1999, 1, 1))
)

output = selected_lines.order.customer

output= pydough.to_df(output)
output

Unnamed: 0,key,name,address,nation_key,phone,acctbal,mktsegment,comment
0,28547,Customer#000028547,"AeWmD3BLrsSkmRY7O,wbB75i6Ll",1,11-711-951-5798,2095.420000,MACHINERY,"y regular foxes nag quickly after the express,..."
1,28547,Customer#000028547,"AeWmD3BLrsSkmRY7O,wbB75i6Ll",1,11-711-951-5798,2095.420000,MACHINERY,"y regular foxes nag quickly after the express,..."
2,3373,Customer#000003373,WUCr1BPpcY7u,24,34-132-612-5205,-546.880000,BUILDING,"counts are blithely. requests wake silent, bol..."
3,147004,Customer#000147004,",5lO7OHiDTFNK6t1HuLmIyQalgXDNgVH tytO9h",6,16-416-345-9278,9658.210000,FURNITURE,blithely unusual instructions. blithely expres...
4,82570,Customer#000082570,Xh K FGxw7,16,26-271-558-1374,6420.880000,HOUSEHOLD,ely above the carefully final deposits. quickl...
...,...,...,...,...,...,...,...,...
84969,131032,Customer#000131032,ciGI0b8M1rSPncIEIS4caXJ2jS9RPNwKmUjJug,24,34-402-606-1848,6161.300000,BUILDING,"bold, regular warthogs cajole furi"
84970,131032,Customer#000131032,ciGI0b8M1rSPncIEIS4caXJ2jS9RPNwKmUjJug,24,34-402-606-1848,6161.300000,BUILDING,"bold, regular warthogs cajole furi"
84971,79657,Customer#000079657,"ifonC8aiuHQiwN4xvHZF,C7mRBYkHF9nTPusT3Y",11,21-957-935-4695,1184.790000,MACHINERY,ss deposits sleep carefully according to the f...
84972,79657,Customer#000079657,"ifonC8aiuHQiwN4xvHZF,C7mRBYkHF9nTPusT3Y",11,21-957-935-4695,1184.790000,MACHINERY,ss deposits sleep carefully according to the f...


In [20]:
dfcompare.compare_df(output, sql_output, query_category="", question="")

False

## 9. Retrieve the names of customers who are from Peru. 




In [26]:
query = '''
SELECT C.C_NAME
FROM customer C
JOIN nation N
ON C.C_NATIONKEY = N.N_NATIONKEY
WHERE N.N_NAME = 'PERU';
'''

sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,C_NAME
0,Customer#000000008
1,Customer#000000033
2,Customer#000000035
3,Customer#000000061
4,Customer#000000077
...,...
5970,Customer#000149914
5971,Customer#000149928
5972,Customer#000149939
5973,Customer#000149948


In [37]:
%%pydough

customers_from_peru = customers.WHERE(nation.name == "PERU").CALCULATE(c_name=name)

output= pydough.to_df(customers_from_peru)
output

Unnamed: 0,c_name
0,Customer#000000008
1,Customer#000000033
2,Customer#000000035
3,Customer#000000061
4,Customer#000000077
...,...
5970,Customer#000149914
5971,Customer#000149928
5972,Customer#000149939
5973,Customer#000149948


In [38]:
dfcompare.compare_df(output, sql_output, query_category="", question="")

True

## 10.  Retrieve the customer IDs and names of customers who have a negative account balance, are not from Brazil, live in the Americas region, and have placed more than 5 orders. 




In [39]:
query = '''
SELECT c.c_custkey, c.c_name
FROM customer c
JOIN orders o ON c.c_custkey = o.o_custkey
JOIN nation n ON c.c_nationkey = n.n_nationkey
JOIN region r ON n.n_regionkey = r.r_regionkey
WHERE c.c_acctbal < 0
  AND n.n_name != 'BRAZIL'
  AND r.r_name = 'AMERICA'
GROUP BY c.c_custkey, c.c_name
HAVING COUNT(o.o_orderkey) > 5;
'''

sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,C_CUSTKEY,C_NAME
0,64,Customer#000000064
1,478,Customer#000000478
2,488,Customer#000000488
3,632,Customer#000000632
4,872,Customer#000000872
...,...,...
1412,149812,Customer#000149812
1413,149815,Customer#000149815
1414,149831,Customer#000149831
1415,149890,Customer#000149890


In [40]:
%%pydough 

customer_in_debt = customers.CALCULATE(
    c_id=key,
    c_name=name,
).WHERE(
    (acctbal < 0) &  
    (COUNT(orders.key) > 5) &  
    (nation.region.name == "AMERICA") & 
    (nation.name != "BRAZIL") 
)

output = pydough.to_df(customer_in_debt)
output


Unnamed: 0,c_id,c_name
0,64,Customer#000000064
1,478,Customer#000000478
2,488,Customer#000000488
3,632,Customer#000000632
4,872,Customer#000000872
...,...,...
1412,149812,Customer#000149812
1413,149815,Customer#000149815
1414,149831,Customer#000149831
1415,149890,Customer#000149890


In [41]:
dfcompare.compare_df(output, sql_output, query_category="", question="")

True

## 11. Find the total number of orders per customers placed in 1998




In [42]:
query = '''
SELECT
    c.c_custkey,
    c.c_name,
    COUNT(o.o_orderkey) AS num_orders
FROM
    customer c
JOIN orders o ON c.c_custkey = o.o_custkey
WHERE
    strftime('%Y', o.o_orderdate) = '1998'  
GROUP BY
    c.c_custkey, c.c_name
ORDER BY
    num_orders DESC;
'''

sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,C_CUSTKEY,C_NAME,num_orders
0,11719,Customer#000011719,9
1,93778,Customer#000093778,9
2,102295,Customer#000102295,9
3,111394,Customer#000111394,9
4,4789,Customer#000004789,8
...,...,...,...
71052,149971,Customer#000149971,1
71053,149977,Customer#000149977,1
71054,149981,Customer#000149981,1
71055,149990,Customer#000149990,1


In [43]:
%%pydough

selected_orders = orders.WHERE(
    (YEAR(order_date) == 1998)
).CALCULATE(key=key)

output = customers.CALCULATE(
    C_CUSTKEY=key,
    C_NAME=name,
    NUM_ORDERS=COUNT(selected_lines.key),
).WHERE(NUM_ORDERS > 0).ORDER_BY(NUM_ORDERS.DESC())

output= pydough.to_df(output)
output

Unnamed: 0,C_CUSTKEY,C_NAME,NUM_ORDERS
0,11719,Customer#000011719,9
1,93778,Customer#000093778,9
2,102295,Customer#000102295,9
3,111394,Customer#000111394,9
4,4789,Customer#000004789,8
...,...,...,...
71052,149971,Customer#000149971,1
71053,149977,Customer#000149977,1
71054,149981,Customer#000149981,1
71055,149990,Customer#000149990,1


In [44]:
dfcompare.compare_df(output, sql_output, query_category="", question="")

True

## 12.  List the names of nations and the count of orders placed by customers from each nation, ordered by the number of orders in descending order.




In [45]:
query = '''
SELECT n.n_name, COUNT(o.o_orderkey) AS order_count
FROM nation n
JOIN customer c ON n.n_nationkey = c.c_nationkey
JOIN orders o ON c.c_custkey = o.o_custkey
GROUP BY n.n_name
ORDER BY order_count DESC;
'''

sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,N_NAME,order_count
0,FRANCE,61600
1,RUSSIA,61495
2,INDONESIA,61377
3,MOZAMBIQUE,61267
4,ROMANIA,61012
5,CHINA,60784
6,JORDAN,60736
7,CANADA,60480
8,VIETNAM,60347
9,BRAZIL,60137


In [46]:
%%pydough

orders_by_nation = orders.CALCULATE(
    o_keys=key,
    region_name=customer.nation.name 
)

grouped_orders = PARTITION(
    orders_by_nation, name="o", by=region_name
).CALCULATE(
    region_name,
    orders_count=COUNT(o.o_keys)
).ORDER_BY(orders_count.DESC())

output= pydough.to_df(grouped_orders)
output


Unnamed: 0,region_name,orders_count
0,FRANCE,61600
1,RUSSIA,61495
2,INDONESIA,61377
3,MOZAMBIQUE,61267
4,ROMANIA,61012
5,CHINA,60784
6,JORDAN,60736
7,CANADA,60480
8,VIETNAM,60347
9,BRAZIL,60137


In [47]:
dfcompare.compare_df(output, sql_output, query_category="", question="")

True

## 13.  List the number of orders placed each month in the year 1998, ordered by month.

In [48]:
query = '''
SELECT
    strftime('%m', o_orderdate) AS order_month,
    COUNT(o_orderkey) AS num_orders            
FROM
    orders
WHERE
    o_orderdate >= '1998-01-01'  
    AND o_orderdate < '1999-01-01'
GROUP BY
    order_month
ORDER BY
    order_month;
'''

sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,order_month,num_orders
0,1,19380
1,2,17510
2,3,19462
3,4,18677
4,5,19432
5,6,18590
6,7,19373
7,8,1199


In [49]:
%%pydough

selected_lines = orders.CALCULATE(key=key,  o_month=MONTH(order_date)).WHERE(
    (YEAR(order_date) == 1998)
)

output = PARTITION(selected_lines, name="o", by=(o_month)).CALCULATE(
    order_month=o_month,
    num_orders= COUNT(o.key)
)

output=pydough.to_df(output)
output


Unnamed: 0,order_month,num_orders
0,1,19380
1,2,17510
2,3,19462
3,4,18677
4,5,19432
5,6,18590
6,7,19373
7,8,1199


In [50]:
dfcompare.compare_df(output, sql_output, query_category="", question="")

np.False_

## 14. Identify the customer IDs, names, and total spending of customers from the Asia region who have spent more than 1000 in total on orders.




In [51]:
query = '''
SELECT c.c_custkey, c.c_name, SUM(o.o_totalprice) AS total_spent
FROM customer c
JOIN orders o ON c.c_custkey = o.o_custkey
JOIN nation n ON c.c_nationkey = n.n_nationkey
JOIN region r ON n.n_regionkey = r.r_regionkey
WHERE r.r_name = 'ASIA'
GROUP BY c.c_custkey, c.c_name
HAVING SUM(o.o_totalprice) > 1000;
'''

sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,C_CUSTKEY,C_NAME,total_spent
0,7,Customer#000000007,2957861.160000
1,19,Customer#000000019,3611713.600000
2,25,Customer#000000025,3135039.320000
3,28,Customer#000000028,2429022.210000
4,37,Customer#000000037,2860377.420000
...,...,...,...
20019,149980,Customer#000149980,3115223.230000
20020,149981,Customer#000149981,1700503.960000
20021,149984,Customer#000149984,1153164.880000
20022,149987,Customer#000149987,472026.460000


In [52]:
%%pydough

filter_c= customers.CALCULATE(
        c_key= key,
        c_name=name,
        TOTAL_PRICE=SUM(orders.total_price)
       ).WHERE((TOTAL_PRICE > 1000 ) & (nation.region.name == "ASIA"))

output=pydough.to_df(filter_c)
output

Unnamed: 0,c_key,c_name,TOTAL_PRICE
0,7,Customer#000000007,2957861.160000
1,19,Customer#000000019,3611713.600000
2,25,Customer#000000025,3135039.320000
3,28,Customer#000000028,2429022.210000
4,37,Customer#000000037,2860377.420000
...,...,...,...
20019,149980,Customer#000149980,3115223.230000
20020,149981,Customer#000149981,1700503.960000
20021,149984,Customer#000149984,1153164.880000
20022,149987,Customer#000149987,472026.460000


In [53]:
dfcompare.compare_df(output, sql_output, query_category="", question="")

True

## 15. Calculate the average order value for each region.




In [54]:
query = '''
SELECT 
    r.r_name AS Region, 
    AVG(o.o_totalprice) AS AvgOrderValue 
FROM 
    orders o
JOIN 
    customer c ON o.o_custkey = c.c_custkey
JOIN 
    nation n ON c.c_nationkey = n.n_nationkey
JOIN 
    region r ON n.n_regionkey = r.r_regionkey
GROUP BY 
    r.r_name;
'''

sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,Region,AvgOrderValue
0,AFRICA,151274.687459
1,AMERICA,151476.057596
2,ASIA,151167.942741
3,EUROPE,150990.370343
4,MIDDLE EAST,151192.10578


In [55]:
%%pydough

selected_customers = customers.CALCULATE(
    customer_region_name=nation.region.name
).orders.CALCULATE(
    orders_price=total_price
)

output = PARTITION(selected_customers, "cust", by=customer_region_name).CALCULATE(
    REGION_NAME=customer_region_name,
    TOTALREVENUE= AVG(cust.orders_price)
)

output=pydough.to_df(output)
output


Unnamed: 0,REGION_NAME,TOTALREVENUE
0,AFRICA,151274.687459
1,AMERICA,151476.057596
2,ASIA,151167.942741
3,EUROPE,150990.370343
4,MIDDLE EAST,151192.10578


In [56]:
dfcompare.compare_df(output, sql_output, query_category="", question="")

True

## 16. Find the top 5 regions with the highest total revenue from orders. 




In [57]:
query = '''
SELECT 
    r.r_name AS RegionName, 
    SUM(o.o_totalprice) AS TotalRevenue
FROM 
    region r
JOIN nation n ON r.r_regionkey = n.n_regionkey
JOIN customer c ON n.n_nationkey = c.c_nationkey
JOIN orders o ON c.c_custkey = o.o_custkey
GROUP BY 
    r.r_name
ORDER BY 
    TotalRevenue DESC
LIMIT 5;
'''

sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,RegionName,TotalRevenue
0,EUROPE,45793265459.71
1,ASIA,45613415042.56
2,AMERICA,45306943255.21
3,AFRICA,45230223902.22
4,MIDDLE EAST,44885458787.76


In [58]:
%%pydough

selected_customers = customers.CALCULATE(
    customer_region_name=nation.region.name
).orders.CALCULATE(
    orders_price=total_price
)

output = PARTITION(selected_customers, "cust", by=customer_region_name).CALCULATE(
    REGION_NAME=customer_region_name,
    TOTALREVENUE= SUM(cust.orders_price)
).TOP_K(5, by=TOTALREVENUE.DESC())

output=pydough.to_df(output)
output

Unnamed: 0,REGION_NAME,TOTALREVENUE
0,EUROPE,45793265459.71
1,ASIA,45613415042.56
2,AMERICA,45306943255.21
3,AFRICA,45230223902.22
4,MIDDLE EAST,44885458787.76


In [59]:
dfcompare.compare_df(output, sql_output, query_category="", question="")

True

## 17. For each region and nation, calculate the maximum and minimum order values, the difference between them, and the total number of orders, ordered by the order value difference in descending order.




In [60]:
query = '''
SELECT 
    r.r_name AS region_name,
    n.n_name AS nation_name,
    MAX(o.o_totalprice) AS max_order_value,
    MIN(o.o_totalprice) AS min_order_value,
    MAX(o.o_totalprice) - MIN(o.o_totalprice) AS order_value_difference,
    COUNT(o.o_orderkey) AS total_orders
FROM region r
JOIN nation n ON r.r_regionkey = n.n_regionkey  
JOIN customer c ON c.c_nationkey = n.n_nationkey
JOIN orders o ON o.o_custkey = c.c_custkey
GROUP BY r.r_name, n.n_name
ORDER BY order_value_difference DESC;
'''

sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,region_name,nation_name,max_order_value,min_order_value,order_value_difference,total_orders
0,EUROPE,RUSSIA,555285.16,932.41,554352.75,61495
1,AMERICA,PERU,544089.09,891.74,543197.35,59018
2,AMERICA,ARGENTINA,530604.44,877.3,529727.14,59547
3,AMERICA,UNITED STATES,525590.57,913.45,524677.12,59921
4,MIDDLE EAST,IRAN,522644.48,924.51,521719.97,59675
5,AMERICA,CANADA,515531.82,908.18,514623.64,60480
6,EUROPE,FRANCE,508668.52,885.75,507782.77,61600
7,AFRICA,MOZAMBIQUE,508047.99,896.59,507151.4,61267
8,ASIA,VIETNAM,504509.06,911.67,503597.39,60347
9,ASIA,JAPAN,502742.76,857.71,501885.05,59405


In [61]:
%%pydough

selected_orders = customers.CALCULATE(
    region_name=nation.region.name,
    nation_name=nation.name
).orders.CALCULATE(
    total_price=total_price
)


output = PARTITION(selected_orders, name="o", by=(region_name, nation_name)).CALCULATE(
    region_name,
    nation_name,
    max_order_value=MAX(o.total_price),
    min_order_value=MIN(o.total_price),
    order_value_difference=MAX(o.total_price) - MIN(o.total_price),
    total_orders=COUNT(o.total_price)
).ORDER_BY(order_value_difference.DESC())

output=pydough.to_df(output)
output


Unnamed: 0,region_name,nation_name,max_order_value,min_order_value,order_value_difference,total_orders
0,EUROPE,RUSSIA,555285.16,932.41,554352.75,61495
1,AMERICA,PERU,544089.09,891.74,543197.35,59018
2,AMERICA,ARGENTINA,530604.44,877.3,529727.14,59547
3,AMERICA,UNITED STATES,525590.57,913.45,524677.12,59921
4,MIDDLE EAST,IRAN,522644.48,924.51,521719.97,59675
5,AMERICA,CANADA,515531.82,908.18,514623.64,60480
6,EUROPE,FRANCE,508668.52,885.75,507782.77,61600
7,AFRICA,MOZAMBIQUE,508047.99,896.59,507151.4,61267
8,ASIA,VIETNAM,504509.06,911.67,503597.39,60347
9,ASIA,JAPAN,502742.76,857.71,501885.05,59405


In [62]:
dfcompare.compare_df(output, sql_output, query_category="", question="")

True

## 18. List the nations and the count of customers in the "Machinery" and "Automobile" market segments, ordered by the number of customers in descending order.




In [63]:
query = '''
SELECT 
    n.n_name AS nation_name,
    COUNT(c.c_custkey) AS customer_count
FROM nation n
JOIN customer c ON c.c_nationkey = n.n_nationkey
WHERE c.c_mktsegment IN ('MACHINERY', 'AUTOMOBILE') 
GROUP BY n.n_name
ORDER BY customer_count DESC;
'''

sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,nation_name,customer_count
0,ROMANIA,2545
1,INDONESIA,2489
2,CHINA,2481
3,ETHIOPIA,2423
4,BRAZIL,2419
5,RUSSIA,2414
6,EGYPT,2414
7,GERMANY,2402
8,UNITED STATES,2399
9,JORDAN,2397


In [64]:
%%pydough

selected_customers = customers.WHERE(ISIN(mktsegment, ('MACHINERY', 'AUTOMOBILE'))).CALCULATE(
    nation_name=nation.name, 
    key=key)

output= PARTITION(selected_customers, name="cust", by=(nation_name)).CALCULATE(
    nation_name= nation_name,
    customer_count= COUNT(cust.key)
).ORDER_BY(customer_count.DESC())

output=pydough.to_df(output)
output


Unnamed: 0,nation_name,customer_count
0,ROMANIA,2545
1,INDONESIA,2489
2,CHINA,2481
3,ETHIOPIA,2423
4,BRAZIL,2419
5,EGYPT,2414
6,RUSSIA,2414
7,GERMANY,2402
8,UNITED STATES,2399
9,JORDAN,2397


In [65]:
dfcompare.compare_df(output, sql_output, query_category="", question="")

np.True_

## 19. Calculate the percentage of high-priority orders (e.g., '1-URGENT', '2-HIGH') for each region.
 




In [66]:
query = '''
SELECT r.r_name AS region_name, 
  ROUND(
    SUM(
      CASE 
        WHEN o.o_orderpriority IN ('1-URGENT', '2-HIGH') THEN 1 
        ELSE 0 
      END
    ) * 100.0 / COUNT(o.o_orderkey),
    2
  ) AS high_priority_percentage
  
FROM orders o
JOIN customer c ON o.o_custkey = c.c_custkey
JOIN nation n ON c.c_nationkey = n.n_nationkey
JOIN region r ON n.n_regionkey = r.r_regionkey
GROUP BY r.r_name
ORDER BY high_priority_percentage DESC;
'''

sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,region_name,high_priority_percentage
0,MIDDLE EAST,40.2
1,AMERICA,40.16
2,EUROPE,39.99
3,ASIA,39.91
4,AFRICA,39.89


In [67]:
%%pydough

selected_orders = customers.CALCULATE(
    region_name=nation.region.name
).orders.CALCULATE(
    key,
    is_prioritary=IFF(ISIN(order_priority, ('1-URGENT', '2-HIGH')), 1, 0)
)

output = PARTITION(selected_orders, name="o", by=region_name).CALCULATE(
    region_name,
    high_priority_percentage=ROUND((SUM(o.is_prioritary) * 100) / COUNT(o.key), 2)
).ORDER_BY(high_priority_percentage.DESC())

output=pydough.to_df(output)
output



Unnamed: 0,region_name,high_priority_percentage
0,MIDDLE EAST,40.2
1,AMERICA,40.16
2,EUROPE,39.99
3,ASIA,39.91
4,AFRICA,39.89


In [68]:
dfcompare.compare_df(output, sql_output, query_category="", question="")

True

## 20.  Customers Who Have Never Placed Orders




In [69]:
query = '''
SELECT c.c_custkey, c.c_name
FROM customer c
LEFT JOIN orders o ON c.c_custkey = o.o_custkey
WHERE o.o_orderkey IS NULL;
'''

sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,C_CUSTKEY,C_NAME
0,3,Customer#000000003
1,6,Customer#000000006
2,9,Customer#000000009
3,12,Customer#000000012
4,15,Customer#000000015
...,...,...
49999,149988,Customer#000149988
50000,149991,Customer#000149991
50001,149994,Customer#000149994
50002,149997,Customer#000149997


In [70]:
%%pydough

customers_without_orders= customers.WHERE(HASNOT(orders)==1).CALCULATE(key, name)

output=pydough.to_df(customers_without_orders)
output

Unnamed: 0,key,name
0,3,Customer#000000003
1,6,Customer#000000006
2,9,Customer#000000009
3,12,Customer#000000012
4,15,Customer#000000015
...,...,...
49999,149988,Customer#000149988
50000,149991,Customer#000149991
50001,149994,Customer#000149994
50002,149997,Customer#000149997


In [71]:
dfcompare.compare_df(output, sql_output, query_category="", question="")

True

## 21. How many total, active, and inactive customers are there in each nation, sorted by the total number of customers?




In [72]:
query = '''
SELECT
    n.n_name,
    COUNT(DISTINCT c.c_custkey) AS total_customers,
    COUNT(DISTINCT CASE WHEN o.o_orderkey IS NOT NULL THEN c.c_custkey END) AS active_customers,
    COUNT(DISTINCT CASE WHEN o.o_orderkey IS NULL THEN c.c_custkey END) AS inactive_customers
FROM
    nation n
JOIN customer c ON n.n_nationkey = c.c_nationkey
LEFT JOIN orders o ON c.c_custkey = o.o_custkey
GROUP BY n.n_name
ORDER BY total_customers DESC;
'''

sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,N_NAME,total_customers,active_customers,inactive_customers
0,INDONESIA,6161,4081,2080
1,ROMANIA,6100,4087,2013
2,FRANCE,6100,4149,1951
3,RUSSIA,6078,4089,1989
4,INDIA,6042,3958,2084
5,JORDAN,6033,4025,2008
6,CHINA,6024,4011,2013
7,CANADA,6020,4006,2014
8,UNITED KINGDOM,6011,3989,2022
9,IRAN,6009,4013,1996


In [73]:
%%pydough

selected_customers = customers.CALCULATE(
    customer_nation_name= nation.name, 
    active_customers=KEEP_IF(key,HAS(orders)),
    inactive_customers= KEEP_IF(key, HASNOT(orders))
)
output = PARTITION(selected_customers, "cust", by=customer_nation_name).CALCULATE(
    N_NAME=customer_nation_name,
    total_customers= COUNT(cust.key),
    active_customers=NDISTINCT(cust.active_customers),
    inactive_customers=NDISTINCT(cust.inactive_customers),
).ORDER_BY(total_customers.DESC())

output=pydough.to_df(output)
output

Unnamed: 0,N_NAME,total_customers,active_customers,inactive_customers
0,INDONESIA,6161,4081,2080
1,FRANCE,6100,4149,1951
2,ROMANIA,6100,4087,2013
3,RUSSIA,6078,4089,1989
4,INDIA,6042,3958,2084
5,JORDAN,6033,4025,2008
6,CHINA,6024,4011,2013
7,CANADA,6020,4006,2014
8,UNITED KINGDOM,6011,3989,2022
9,IRAN,6009,4013,1996


In [74]:
dfcompare.compare_df(output, sql_output, query_category="", question="")

np.True_

## 22. Retrieve customers who belong to the top 10% in account balance but rank in the bottom 25% in terms of order activity

In [75]:
query = '''
SELECT c_name, c_acctbal
FROM (
    SELECT 
        c.c_name,
        c.c_acctbal,
        PERCENT_RANK() OVER (ORDER BY c.c_acctbal DESC) AS balance_percentile,
        PERCENT_RANK() OVER (ORDER BY COUNT(o.o_orderkey)) AS order_activity_percentile
    FROM customer c
    LEFT JOIN orders o ON c.c_custkey = o.o_custkey
    GROUP BY c.c_custkey, c.c_name, c.c_acctbal
) sub
WHERE 
    balance_percentile <= 0.1  
    AND order_activity_percentile <= 0.25 
ORDER BY c_acctbal DESC;
'''

sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,c_name,c_acctbal
0,Customer#000069321,9999.960000
1,Customer#000002487,9999.720000
2,Customer#000043044,9999.490000
3,Customer#000076146,9999.230000
4,Customer#000034047,9998.970000
...,...,...
4941,Customer#000115446,8894.600000
4942,Customer#000082611,8894.490000
4943,Customer#000013560,8894.430000
4944,Customer#000078429,8894.390000


In [76]:
%%pydough

customer_orders = customers.CALCULATE(
    key,
    name,
    acctbal,
    num_orders=COUNT(orders.key)  
)

selected_customers = customer_orders.CALCULATE(
    key,
    name,
    acctbal,
    balance_percentile=PERCENTILE(by=acctbal.DESC()),  
    order_activity_percentile=PERCENTILE(by=num_orders.ASC())
).WHERE(
    (balance_percentile <= 10) & (order_activity_percentile <= 25)
).ORDER_BY(acctbal.DESC())

output = pydough.to_df(selected_customers)
output




Unnamed: 0,key,name,acctbal,balance_percentile,order_activity_percentile
0,69321,Customer#000069321,9999.960000,1,16
1,2487,Customer#000002487,9999.720000,1,1
2,43044,Customer#000043044,9999.490000,1,10
3,76146,Customer#000076146,9999.230000,1,17
4,34047,Customer#000034047,9998.970000,1,8
...,...,...,...,...,...
3708,62682,Customer#000062682,8894.780000,10,14
3709,82611,Customer#000082611,8894.490000,10,19
3710,13560,Customer#000013560,8894.430000,10,4
3711,78429,Customer#000078429,8894.390000,10,18


In [77]:
dfcompare.compare_df(output, sql_output, query_category="", question="")

False

## 23. Which region has the highest total supply cost, considering the supply cost and available quantity for suppliers across different nations?


In [78]:
query = '''
SELECT r_name AS region_name, SUM(ps_supplycost * ps_availqty) AS total_supply_cost
FROM region
JOIN nation ON r_regionkey = n_regionkey
JOIN supplier ON n_nationkey = s_nationkey
JOIN partsupp ON s_suppkey = ps_suppkey
GROUP BY r_name
ORDER BY total_supply_cost DESC;
'''

sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,region_name,total_supply_cost
0,AMERICA,407942718701.84
1,MIDDLE EAST,405058070978.87
2,ASIA,400599873546.96
3,EUROPE,397934639557.59
4,AFRICA,392074106221.66


In [79]:
%%pydough

supply= supply_records.CALCULATE(
    region_name= supplier.nation.region.name, 
    supply_cost= supplycost, 
    availqty= availqty
)

supply_cost_by_regions= PARTITION(supply, name="supp", by=region_name).CALCULATE(
    region_name,
    total_supply_cost= SUM(supp.supplycost * supp.availqty)
).ORDER_BY(total_supply_cost.DESC())


output=pydough.to_df(supply_cost_by_regions)
output

Unnamed: 0,region_name,total_supply_cost
0,AMERICA,407942718701.84
1,MIDDLE EAST,405058070978.87
2,ASIA,400599873546.96
3,EUROPE,397934639557.59
4,AFRICA,392074106221.66


In [80]:
dfcompare.compare_df(output, sql_output, query_category="", question="")

True

In [81]:
query = '''
SELECT n_name AS nation_name, COUNT(DISTINCT c_custkey) AS total_customers, COUNT(DISTINCT s_suppkey) AS total_suppliers 
FROM nation 
LEFT JOIN customer ON n_nationkey = c_nationkey 
LEFT JOIN supplier ON n_nationkey = s_nationkey 
GROUP BY n_name;
'''

sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,nation_name,total_customers,total_suppliers
0,ALGERIA,5925,420
1,ARGENTINA,5975,413
2,BRAZIL,5999,397
3,CANADA,6020,412
4,CHINA,6024,407
5,EGYPT,5995,415
6,ETHIOPIA,5952,380
7,FRANCE,6100,402
8,GERMANY,5908,396
9,INDIA,6042,415


In [82]:
%%pydough

customer_and_supplier_count_by_nation = nations.CALCULATE(
    nation_name=name,
    customer_count=COUNT(customers),
    supplier_count=COUNT(suppliers)
)

output=pydough.to_df(customer_and_supplier_count_by_nation)
output

Unnamed: 0,nation_name,customer_count,supplier_count
0,ALGERIA,5925,420
1,ARGENTINA,5975,413
2,BRAZIL,5999,397
3,CANADA,6020,412
4,EGYPT,5995,415
5,ETHIOPIA,5952,380
6,FRANCE,6100,402
7,GERMANY,5908,396
8,INDIA,6042,415
9,INDONESIA,6161,405


In [83]:
dfcompare.compare_df(output, sql_output, query_category="", question="")

np.False_

# Apple Queries

## 1. What is the latest month SPM for a specific part in China?

Original Query: What is the current SPM for Bluetooth on iPhone 15 Plus in China?

SPM (Selling Profit Margin) = (Total Amount from Sells - (Tax + Commission)) / Total Amount from Sells * 100

In [84]:
query = '''
SELECT 
    n.n_NAME AS n_name,
    p.p_NAME AS p_name,
    100 * (
        SUM(l.l_EXTENDEDPRICE) - SUM(l.l_TAX + (l.l_EXTENDEDPRICE * l.l_DISCOUNT))
    ) / SUM(l.l_EXTENDEDPRICE) AS spm
FROM 
    CUSTOMER c
JOIN 
    NATION n ON c.c_NATIONKEY = n.n_NATIONKEY
JOIN 
    ORDERS o ON c.c_CUSTKEY = o.o_CUSTKEY
JOIN 
    LINEITEM l ON o.o_ORDERKEY = l.l_ORDERKEY
JOIN 
    PART p ON l.l_PARTKEY = p.p_PARTKEY
WHERE 
    n.n_NAME = 'CHINA'
    AND p.p_NAME = 'almond antique blue royal burnished'
GROUP BY 
    p.p_NAME;


'''
sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,n_name,p_name,spm
0,CHINA,almond antique blue royal burnished,92.833131


In [5]:
%%pydough

customers_in_china = customers.WHERE(nation.name == "CHINA").CALCULATE(n_name=nation.name)

customer_orders_feb = customers_in_china.orders.WHERE(
    (YEAR(order_date) == 1996) &  
    (MONTH(order_date) == 2)      
).CALCULATE(n_name=n_name)

order_lineitems_feb = customer_orders_feb.lines.WHERE(
    part.name == "almond antique blue royal burnished"
).CALCULATE(
    n_name,
    part_name=part.name,
    extended_price=extended_price,
    tax=tax,
    discount=extended_price * discount,
)

spm_feb_1996 = PARTITION(order_lineitems_feb, name="li", by=part_name).CALCULATE(
    part_name,
    nation= MAX(li.n_name),
    spm_feb=100 * (SUM(li.extended_price) - SUM(li.tax + li.discount)) / SUM(li.extended_price)
)

pydough.to_df(spm_feb_1996) 


Unnamed: 0,part_name,nation,spm_feb
0,almond antique blue royal burnished,CHINA,91.999919


#### “Compare that to the last month with that order, have we seen an increase?” 

Original query: Compare that to last quarter—have we seen an increase?

In [6]:
%%pydough

customer_orders_nov = customers_in_china.orders.WHERE(
    (YEAR(order_date) == 1995) &  
    (MONTH(order_date) == 11)      
).CALCULATE(n_name=n_name)

order_lineitems_nov = customer_orders_nov.lines.WHERE(
    part.name == "almond antique blue royal burnished"
).CALCULATE(
    n_name,
    extended_price=extended_price,
    part_name=part.name,
    tax=tax,
    discount=extended_price * discount
)

spm_nov_1995 = PARTITION(order_lineitems_nov, name="li", by=part_name).CALCULATE(
    MAX(li.n_name),
    part_name,
    spm_nov=100 * (SUM(li.extended_price) - SUM(li.tax + li.discount)) / SUM(li.extended_price)
)

spm_comparison = TPCH.CALCULATE(
    part_name=MAX(spm_feb_1996.part_name),
    spm_feb=MAX(spm_feb_1996.spm_feb), 
    spm_nov=MAX(spm_nov_1995.spm_nov),
    spm_change=MAX(spm_feb_1996.spm_feb) - MAX(spm_nov_1995.spm_nov)
)

pydough.to_df(spm_comparison)


Unnamed: 0,part_name,spm_feb,spm_nov,spm_change
0,almond antique blue royal burnished,91.999919,96.99919,-4.999271


#### "Now exclude supplier Supplier#000001305 and focus only on supplier Supplier#000008802"

Original query: “Now exclude vendor B and focus only on vendor A."

In [7]:
%%pydough

order_lineitems_feb = customer_orders_feb.lines.WHERE(
    (MAX(part.supply_records.supplier.name) == "Supplier#000008802") &
    (MAX(part.supply_records.supplier.name) != "Supplier#000001305") &
    (part.name == "almond antique blue royal burnished")    
).CALCULATE(
    n_name,
    extended_price=extended_price,
    tax=tax,
    discount=extended_price * discount,
    part_name=part.name
)

spm_feb_1996 = PARTITION(order_lineitems_feb, name="li", by=part_name).CALCULATE(
    part_name,
    nation=MAX(li.n_name),
    spm_feb=100 * (SUM(li.extended_price) - SUM(li.tax + li.discount)) / SUM(li.extended_price)
)

order_lineitems_nov = customer_orders_nov.lines.WHERE(
    (MAX(part.supply_records.supplier.name) == "Supplier#000008802") &
    (MAX(part.supply_records.supplier.name) != "Supplier#000001305") &
    (part.name == "almond antique blue royal burnished")
).CALCULATE(
    n_name,
    extended_price=extended_price,
    tax=tax,
    discount=extended_price * discount,
    part_name=part.name
)

spm_nov_1995 = PARTITION(order_lineitems_nov, name="li", by=part_name).CALCULATE(
    part_name,
    nation=MAX(li.n_name),
    spm_nov=100 * (SUM(li.extended_price) - SUM(li.tax + li.discount)) / SUM(li.extended_price)
)

spm_comparison_by_supplier = TPCH.CALCULATE(
    part_name=MAX(spm_feb_1996.part_name),
    spm_feb=MAX(spm_feb_1996.spm_feb), 
    spm_nov=MAX(spm_nov_1995.spm_nov),
    spm_change=MAX(spm_feb_1996.spm_feb) - MAX(spm_nov_1995.spm_nov) 
)

pydough.to_df(spm_comparison_by_supplier)

Unnamed: 0,part_name,spm_feb,spm_nov,spm_change
0,almond antique blue royal burnished,91.999919,96.99919,-4.999271


# What is Pydough Queries

### List customers who ordered in 1996 but not in 1997, with name, email, last 2021 order date, total spent (over $200), and months since last order. Sort by total spent, highest first.

In [86]:
%%pydough


customer_orders = customers.CALCULATE(
    name=name,
    last_order_date=MAX(orders.WHERE(YEAR(order_date) == 1996).order_date),  
    total_spent_1996=SUM(orders.WHERE(YEAR(order_date) == 1996).total_price),  
).WHERE(
    (last_order_date != 'None') 
    & (total_spent_1996 > 200)
).CALCULATE(
    name,
    last_order_date,
    total_spent_1996,
    months_since_last_order=DATEDIFF("months", last_order_date, DATETIME("now")))


retained_customers = customer_orders.WHERE( 
    HASNOT(orders.WHERE(YEAR(order_date) == 1997))==1
).ORDER_BY(total_spent_1996.DESC())

pydough.to_df(retained_customers)


Unnamed: 0,name,last_order_date,total_spent_1996,months_since_last_order
0,Customer#000001948,1996-12-09,1785971.270000,338
1,Customer#000057892,1996-12-05,1668071.250000,338
2,Customer#000111028,1996-11-08,1626293.120000,339
3,Customer#000112711,1996-10-13,1623258.350000,340
4,Customer#000113131,1996-12-22,1597282.310000,338
...,...,...,...,...
10773,Customer#000071444,1996-05-14,1720.530000,345
10774,Customer#000007568,1996-11-20,1407.540000,339
10775,Customer#000034829,1996-02-04,1239.440000,348
10776,Customer#000096971,1996-12-07,1132.690000,338


### "List products with total stock below 20% of their 6-month average monthly sales. Include product ID, name, total stock, average monthly sales, months of stock (stock/avg sales), category, and supplier. Only include products in stock for at least 3 months. Sort by months of stock, lowest first.

## RTF Queries

### 1. For every year, identify how many customers made their only order ever in that year.

In [87]:
query = '''
SELECT year, COUNT(*) AS n_only_order
FROM (
    SELECT strftime('%Y', MIN(o_orderdate)) AS year
    FROM orders
    GROUP BY o_custkey
    HAVING COUNT(*) = 1
)
GROUP BY year
ORDER BY year ASC;
'''
sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,year,n_only_order
0,1992,1
1,1993,2
2,1994,2
3,1995,1
4,1996,6
5,1997,3
6,1998,2


In [None]:
%%pydough

single_order_customers = PARTITION(orders, name="o", by=customer_key).CALCULATE( 
    customer_key, 
    first_order_year=YEAR(MIN(o.order_date)),  
    order_count=COUNT(o.key)  
).WHERE(order_count == 1)  

only_orders_per_year = PARTITION(single_order_customers, name="s", by=first_order_year).CALCULATE(
    year=first_order_year,
    n_only_order=COUNT(s.customer_key)  
).ORDER_BY(year.ASC())

pydough.to_df(only_orders_per_year)

Unnamed: 0,year,n_only_order
0,1992,1
1,1993,2
2,1994,2
3,1995,1
4,1996,6
5,1997,3
6,1998,2


### 2. For every year, identify how many customers made only a single order in that year.

In [64]:
query = '''
SELECT year, COUNT(*) AS n_only_order
FROM (
    SELECT strftime('%Y', O_ORDERDATE) AS year
    FROM ORDERS
    GROUP BY O_CUSTKEY, year
    HAVING COUNT(*) = 1
) AS single_order_customers
GROUP BY year
ORDER BY year ASC;
'''
sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,year,n_only_order
0,1992,23981
1,1993,24006
2,1994,23876
3,1995,23571
4,1996,23682
5,1997,23823
6,1998,33226


In [None]:
%%pydough

orders_with_year = orders.CALCULATE(
    customer_key,
    order_year=YEAR(order_date) 
)

single_order_customers_per_year = PARTITION(orders_with_year, name="o", by=(customer_key, order_year)).CALCULATE(
    customer_key, 
    order_year,  
    order_count=COUNT(o.key)  
).WHERE(order_count == 1)  

only_orders_per_year = PARTITION(single_order_customers_per_year, name="s", by=order_year).CALCULATE(
    year=order_year,
    n_only_order=COUNT(s.customer_key)  
).ORDER_BY(year.ASC())


pydough.to_df(only_orders_per_year)

Unnamed: 0,year,n_only_order
0,1992,23981
1,1993,24006
2,1994,23876
3,1995,23571
4,1996,23682
5,1997,23823
6,1998,33226


### 3. For every year, identify what percentage of all orders were the only order made by that customer in that year.

In [6]:
query = '''
SELECT 
    year, 
    100.0 * SUM(CASE WHEN n_orders = 1 THEN 1 ELSE 0 END) / SUM(n_orders) AS pct_only_order
FROM (
    SELECT 
        strftime('%Y', O_ORDERDATE) AS year, 
        COUNT(*) AS n_orders
    FROM ORDERS
    GROUP BY O_CUSTKEY, year
)
GROUP BY year
ORDER BY year ASC;
'''
sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,year,pct_only_order
0,1992,10.560177
1,1993,10.591895
2,1994,10.490472
3,1995,10.309355
4,1996,10.358402
5,1997,10.458638
6,1998,24.86548


In [27]:
%%pydough


orders_with_year = orders.CALCULATE(
    customer_key,
    order_year=YEAR(order_date) 
)

customer_orders_per_year = PARTITION(orders_with_year, name="o", by=(customer_key, order_year)).CALCULATE(
    customer_key, 
    order_year,
    order_count=COUNT(o.key)  
)

single_order_customers_per_year = PARTITION(customer_orders_per_year, name="s", by=order_year).CALCULATE(
    year=order_year,
    total_orders=SUM(s.order_count),
    single_orders=SUM(IFF(s.order_count == 1, 1, 0)) 
)

only_order_percentage = single_order_customers_per_year.CALCULATE(
    year,
    pct_only_order=100.0 * single_orders / total_orders 
).ORDER_BY(year.ASC())


pydough.to_df(only_order_percentage)


Unnamed: 0,year,pct_only_order
0,1992,10.560177
1,1993,10.591895
2,1994,10.490472
3,1995,10.309355
4,1996,10.358402
5,1997,10.458638
6,1998,24.86548


### 4. For every year, identify how many customers made their first ever purchase in that year

In [None]:
query = '''
WITH FirstOrder AS (
    SELECT 
        strftime('%Y', O_ORDERDATE) AS year,
        ROW_NUMBER() OVER (PARTITION BY O_CUSTKEY ORDER BY O_ORDERDATE) AS rn
    FROM ORDERS
)
SELECT 
    year, 
    COUNT(*) AS pct_only_order
FROM FirstOrder
WHERE rn = 1
GROUP BY year
ORDER BY year ASC;
'''
sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,year,pct_only_order
0,1992,86577
1,1993,10870
2,1994,2009
3,1995,414
4,1996,100
5,1997,22
6,1998,4


In [37]:
%%pydough

first_order_per_customer = PARTITION(orders, name="o", by=customer_key).CALCULATE(
    customer_key,
    first_order_year=YEAR(MIN(o.order_date)) 
)

first_orders_per_year = PARTITION(first_order_per_customer, name="s", by=first_order_year).CALCULATE(
    year=first_order_year,
    n_first_orders=COUNT(s.customer_key)  
).ORDER_BY(year.ASC())

pydough.to_df(first_orders_per_year)


Unnamed: 0,year,n_first_orders
0,1992,86577
1,1993,10870
2,1994,2009
3,1995,414
4,1996,100
5,1997,22
6,1998,4


### 5. For every year, identify how many customers made a purchase that year but did not in the previous year

In [39]:
query = '''
WITH OrderYears AS (
    SELECT DISTINCT 
        O_CUSTKEY, 
        strftime('%Y', O_ORDERDATE) AS year
    FROM ORDERS
),
LaggedOrders AS (
    SELECT 
        O_CUSTKEY, 
        year,
        LAG(year, 1, '0') OVER (PARTITION BY O_CUSTKEY ORDER BY year) AS prev_year
    FROM OrderYears
)
SELECT 
    year, 
    COUNT(*) AS n_not_prev
FROM LaggedOrders
WHERE prev_year <> (year - 1)
GROUP BY year
ORDER BY year ASC;

'''
sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,year,n_not_prev
0,1992,86577
1,1993,86477
2,1994,86608
3,1995,86573
4,1996,86659
5,1997,86680
6,1998,71057


In [None]:
%%pydough

orders_with_year = orders.CALCULATE(
    customer_key,
    order_year=YEAR(order_date) 
)

# Step 2: Find the latest order year BEFORE the current order year (Replacing LAG())
previous_order_years = PARTITION(customer_order_years, name="p", by=customer_key).CALCULATE(
    customer_key,
    order_year,
    prev_year=MAX(IFF(p.order_year < order_year, p.order_year, None))  
)


pydough.to_df(previous_order_years)

### 6. Which 5 countries have the highest number of customers who have never ordered a package containing a red product.

In [9]:
query = '''
SELECT 
    n.n_name, 
    COUNT(*) AS n_never_ordered
FROM CUSTOMER C
JOIN NATION N ON C.C_NATIONKEY = N.N_NATIONKEY
LEFT JOIN (
    SELECT DISTINCT O_CUSTKEY
    FROM ORDERS
    JOIN LINEITEM ON O_ORDERKEY = L_ORDERKEY
    JOIN PART ON L_PARTKEY = P_PARTKEY
    WHERE P_NAME LIKE '%red%'
) AS HasOrdered ON C.C_CUSTKEY = HasOrdered.O_CUSTKEY
WHERE HasOrdered.O_CUSTKEY IS NULL
GROUP BY N.N_NAME
ORDER BY n_never_ordered DESC
LIMIT 5;
'''


sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,N_NAME,n_never_ordered
0,INDIA,2408
1,PERU,2380
2,UNITED KINGDOM,2379
3,INDONESIA,2379
4,IRAQ,2360


In [None]:
%%pydough

customers_without_red_orders = customers.WHERE(
    HASNOT(orders.lines.WHERE(LIKE(part.name, "%red%")))==1
).CALCULATE(
    n_name = nation.name
)

top_nations = PARTITION(customers_without_red_orders, name="c", by=n_name).CALCULATE(
    n_name,
    n_never_ordered=COUNT(c.key) 
).TOP_K(5, by=n_never_ordered.DESC())

pydough.to_df(top_nations)

Unnamed: 0,n_name,n_never_ordered
0,INDIA,2408
1,PERU,2380
2,INDONESIA,2379
3,UNITED KINGDOM,2379
4,IRAQ,2360


### 7.For every year & month, what nation had the highest percentage of its customers order a package with priority "1-URGENT"? Include the nation and the percentage.

In [18]:
query = '''
WITH T1 AS (
    SELECT DISTINCT
        CAST(strftime('%Y', O_ORDERDATE) AS INTEGER) AS year,
        CAST(strftime('%m', O_ORDERDATE) AS INTEGER) AS month,
        N.N_NATIONKEY AS nationkey,
        N.N_NAME AS name,
        C.C_CUSTKEY AS custkey
    FROM NATION N
    JOIN CUSTOMER C ON N.N_NATIONKEY = C.C_NATIONKEY
    JOIN ORDERS O ON C.C_CUSTKEY = O.O_CUSTKEY
    WHERE O.O_ORDERPRIORITY = '1-URGENT'
),
T2 AS (
    SELECT 
        C_NATIONKEY AS nationkey, 
        COUNT(*) AS n_cust
    FROM CUSTOMER
    GROUP BY C_NATIONKEY
),
Aggregated AS (
    SELECT 
        T1.year, 
        T1.month, 
        T1.name, 
        100.0 * COUNT(*) / (SELECT n_cust FROM T2 WHERE T2.nationkey = T1.nationkey) AS percentage,
        ROW_NUMBER() OVER (PARTITION BY T1.year, T1.month ORDER BY COUNT(*) DESC) AS rn
    FROM T1
    GROUP BY T1.year, T1.month, T1.name
)
SELECT year, month, name, percentage
FROM Aggregated
WHERE rn = 1
ORDER BY year, month;
'''
sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,year,month,name,percentage
0,1992,1,INDONESIA,2.791755
1,1992,2,ROMANIA,2.918033
2,1992,3,GERMANY,2.860528
3,1992,4,FRANCE,2.770492
4,1992,5,BRAZIL,3.150525
...,...,...,...,...
75,1998,4,MOZAMBIQUE,2.761969
76,1998,5,VIETNAM,2.912783
77,1998,6,JAPAN,2.874916
78,1998,7,ROMANIA,2.852459


In [49]:
%%pydough

urgent_orders = customers.CALCULATE(
    customer_id=key,
    nation_name=nation.name,
    nation_id=nation_key
).orders.WHERE(order_priority == "1-URGENT").CALCULATE(
    nation_id,
    nation_name,
    order_year=YEAR(order_date),
    order_month=MONTH(order_date)
)

urgent_orders_by = PARTITION(urgent_orders, name="u", by=(nation_id)).CALCULATE(
    nation_id
)

pydough.to_df(urgent_orders_by)


KeyError: $0.nation_id_0

### 8. Identify the 5 suppliers whose average discount rate has increased the most from 1994 to 1997, along with the change in their average discount percentage.

In [12]:
query = '''
WITH LineitemYears AS (
    SELECT 
        L_SUPPKEY,
        CASE WHEN CAST(strftime('%Y', L_SHIPDATE) AS INTEGER) = 1994 THEN L_DISCOUNT ELSE NULL END AS disc94,
        CASE WHEN CAST(strftime('%Y', L_SHIPDATE) AS INTEGER) = 1997 THEN L_DISCOUNT ELSE NULL END AS disc97
    FROM LINEITEM
    WHERE CAST(strftime('%Y', L_SHIPDATE) AS INTEGER) IN (1994, 1997)
)

SELECT 
    S_NAME, 
    100 * (AVG(disc97) - AVG(disc94)) AS pct_change
FROM SUPPLIER
INNER JOIN LineitemYears ON S_SUPPKEY = LineitemYears.L_SUPPKEY
GROUP BY S_NAME
ORDER BY pct_change DESC
LIMIT 5;
'''
sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,S_NAME,pct_change
0,Supplier#000002128,1.770384
1,Supplier#000004020,1.700461
2,Supplier#000004078,1.614691
3,Supplier#000008872,1.597724
4,Supplier#000005228,1.576671


In [51]:
%%pydough

supplier_discounts = lines.CALCULATE(
    discount,
    ship_date,
    supplier_id=supplier_key,
    supplier_name=supplier.name,
    disc94=KEEP_IF(discount, YEAR(ship_date) == 1994),
    disc97=KEEP_IF(discount, YEAR(ship_date) == 1997) 
)

avg_discount_per_supplier = PARTITION(supplier_discounts, name="s", by=supplier_name).CALCULATE(
    supplier_name=supplier_name,
    avg_disc94=AVG(s.disc94),
    avg_disc97=AVG(s.disc97)
)

top_suppliers = avg_discount_per_supplier.CALCULATE(
    supplier_name, 
    pct_change=100 * (avg_disc97 - avg_disc94)
).TOP_K(5, by=pct_change.DESC())

pydough.to_df(top_suppliers)


Unnamed: 0,supplier_name,pct_change
0,Supplier#000002128,1.770384
1,Supplier#000004020,1.700461
2,Supplier#000004078,1.614691
3,Supplier#000008872,1.597724
4,Supplier#000005228,1.576671


### 9. For every year, identify the percentage of all revenue generated that year was from repeat customers who have made a previous purchase from the same supplier.

In [40]:
query = '''
WITH OrderRevenue AS (
    SELECT
        CAST(strftime('%Y', O_ORDERDATE) AS INTEGER) AS year,
        L_SUPPKEY,
        O_CUSTKEY,
        L_EXTENDEDPRICE * (1 - L_DISCOUNT) AS revenue,
        ROW_NUMBER() OVER (PARTITION BY O_CUSTKEY, L_SUPPKEY ORDER BY O_ORDERDATE) AS rn
    FROM LINEITEM
    JOIN ORDERS ON L_ORDERKEY = O_ORDERKEY
)

SELECT 
    year, 
    100.0 * SUM(CASE WHEN rn > 1 THEN revenue ELSE 0 END) / SUM(revenue) AS pct_repeat_revenue
FROM OrderRevenue
GROUP BY year
ORDER BY year;
'''
sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,year,pct_repeat_revenue
0,1992,0.069894
1,1993,0.163859
2,1994,0.274796
3,1995,0.373852
4,1996,0.469688
5,1997,0.564944
6,1998,0.65058


In [60]:
%%pydough

lines_with_orders = lines.CALCULATE(
    order_year=YEAR(order.order_date),
    supplier_id=supplier_key,
    customer_id=order.customer_key,
    revenue=extended_price * (1 - discount)  
)

order_revenue = PARTITION(lines_with_orders, name="l", by=(customer_id, supplier_id)).CALCULATE(
    order_year=MAX(l.order_year),
    supplier_id=supplier_id,
    customer_id=customer_id,
    revenue=MAX(l.revenue),
    order_count=COUNT(l.order_key) 
)

revenue_data = PARTITION(order_revenue, name="r", by=order_year).CALCULATE(
    year=order_year,
    total_revenue=SUM(r.revenue),
    repeat_revenue=SUM(IFF(r.order_count > 1, r.revenue, 0))
)

pct_repeat_revenue = revenue_data.CALCULATE(
    year=year,
    pct_repeat_revenue=100.0 * repeat_revenue / total_revenue
).ORDER_BY(year.ASC())

pydough.to_df(pct_repeat_revenue)


Unnamed: 0,year,pct_repeat_revenue
0,1992,0.095209
1,1993,0.21997
2,1994,0.37521
3,1995,0.499009
4,1996,0.633717
5,1997,0.762335
6,1998,0.88152


### 10. Identify the 4 suppliers who have the highest total revenue generated by repeat customers who have already made a purchase from them. Include the suppliers' names, the repeat revenue, and the percentage of their total revenue that is from the repeat revenue.

In [None]:
%%pydough

supps = supply_records.CALCULATE(supp_name= supplier.name, part_brand = part.brand)

total_parts_bysupp = PARTITION(supply_records, name = 'total', by = supplier.name).CALCULATE(
    total_supp_name = supplier.name, 
    part_brand = part.brand, 
    total_amount= COUNT(total.part))


pydough.to_df(total_parts_bysupp)

PyDoughQDAGException: Unrecognized term of simple table collection 'supply_records' in graph 'TPCH': 'name'

In [None]:
query = '''
SELECT s_name, SUM(repeat_revenue) as repeat_revenue, 100.0 * SUM(repeat_revenue) / SUM(revenue) as pct_repeat_revenue
FROM supplier S
INNER JOIN (
    SELECT
        l_suppkey,
        l_extendedprice * (1-l_discount) AS revenue, 
        IFF(ROW_NUMBER() OVER (PARTITION BY o_custkey, l_suppkey ORDER BY o_orderdate) > 1, revenue, 0) AS repeat_revenue
    FROM lineitem, orders,
    WHERE l_orderkey = o_orderkey
) T
ON S.s_suppkey = T.l_suppkey
GROUP BY s_name
ORDER BY pct_repeat_revenue DESC
LIMIT 10
;
'''
sql_output = pd.read_sql_query(query, connection)
sql_output

In [44]:
%%pydough

# Step 1: Compute order revenue per line item before partitioning
order_revenue = lines.CALCULATE(
    supplier_id=supplier_key,
    customer_id=order.customer_key,
    order_year=YEAR(order.order_date),
    revenue=extended_price * (1 - discount)  # Compute revenue per order line
)


pydough.to_df(order_revenue)


Unnamed: 0,supplier_id,customer_id,order_year,revenue
0,7706,36901,1996,20321.500800
1,7311,36901,1996,41844.675600
2,3701,36901,1996,11978.640000
3,4633,36901,1996,26349.632400
4,1534,36901,1996,20542.032000
...,...,...,...,...
6001210,2273,113398,1993,35095.075200
6001211,1453,113398,1993,9128.784000
6001212,2138,113398,1993,18457.286400
6001213,2256,110063,1996,5698.800000


### .

In [None]:
query = '''

'''
sql_output = pd.read_sql_query(query, connection)
sql_output

### .

In [None]:
query = '''

'''
sql_output = pd.read_sql_query(query, connection)
sql_output

### .

In [None]:
query = '''

'''
sql_output = pd.read_sql_query(query, connection)
sql_output

### .

In [None]:
query = '''

'''
sql_output = pd.read_sql_query(query, connection)
sql_output

### .

In [None]:
query = '''

'''
sql_output = pd.read_sql_query(query, connection)
sql_output

### .

In [None]:
query = '''

'''
sql_output = pd.read_sql_query(query, connection)
sql_output

### .

In [None]:
query = '''

'''
sql_output = pd.read_sql_query(query, connection)
sql_output

## CHECK TPCH BECHMARK

### 1

In [46]:
query = '''
SELECT
    l_returnflag,
    l_linestatus,
    SUM(l_quantity) AS sum_qty,
    SUM(l_extendedprice) AS sum_base_price,
    SUM(l_extendedprice * (1 - l_discount)) AS sum_disc_price,
    SUM(l_extendedprice * (1 - l_discount) * (1 + l_tax)) AS sum_charge,
    AVG(l_quantity) AS avg_qty,
    AVG(l_extendedprice) AS avg_price,
    AVG(l_discount) AS avg_disc,
    COUNT(*) AS count_order
FROM
    lineitem
WHERE
    l_shipdate <= DATE('1998-12-01', '-90 days')
GROUP BY
    l_returnflag,
    l_linestatus
ORDER BY
    l_returnflag,
    l_linestatus;
'''
sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,L_RETURNFLAG,L_LINESTATUS,sum_qty,sum_base_price,sum_disc_price,sum_charge,avg_qty,avg_price,avg_disc,count_order
0,A,F,37734107,56586554400.73,53758257134.87,55909065222.82769,25.522006,38273.129735,0.049985,1478493
1,N,F,991417,1487504710.38,1413082168.0541,1469649223.194375,25.516472,38284.467761,0.050093,38854
2,N,O,74476040,111701729697.74,106118230307.6056,110367043872.497,25.502227,38249.117989,0.049997,2920374
3,R,F,37719753,56568041380.9,53741292684.604,55889619119.831924,25.505794,38250.854626,0.050009,1478870


In [60]:
%%pydough

disc_price = l.extended_price * (1 - l.discount)
charge = disc_price * (1 + l.tax)
selected_lines = lines.WHERE((ship_date <= datetime.date(1998, 9, 2)))
partitioned_lines = PARTITION(selected_lines, name="l", by=(return_flag, status))
output = partitioned_lines.CALCULATE(
    L_RETURNFLAG=return_flag,
    L_LINESTATUS=status,
    SUM_QTY=SUM(l.quantity),
    SUM_BASE_PRICE=SUM(l.extended_price),
    SUM_DISC_PRICE=SUM(disc_price),
    SUM_CHARGE=SUM(charge),
    AVG_QTY=AVG(l.quantity),
    AVG_PRICE=AVG(l.extended_price),
    AVG_DISC=AVG(l.discount),
    COUNT_ORDER=COUNT(l),
).ORDER_BY(return_flag.ASC(), status.ASC())
pydough.to_df(output)

Unnamed: 0,L_RETURNFLAG,L_LINESTATUS,SUM_QTY,SUM_BASE_PRICE,SUM_DISC_PRICE,SUM_CHARGE,AVG_QTY,AVG_PRICE,AVG_DISC,COUNT_ORDER
0,A,F,37734107,56586554400.73,53758257134.87,55909065222.82769,25.522006,38273.129735,0.049985,1478493
1,N,F,991417,1487504710.38,1413082168.0541,1469649223.194375,25.516472,38284.467761,0.050093,38854
2,N,O,74476040,111701729697.74,106118230307.6056,110367043872.497,25.502227,38249.117989,0.049997,2920374
3,R,F,37719753,56568041380.9,53741292684.604,55889619119.831924,25.505794,38250.854626,0.050009,1478870


### 2

In [47]:
query = '''
SELECT
    S_ACCTBAL,
    S_NAME,
    N_NAME,
    P_PARTKEY,
    P_MFGR,
    S_ADDRESS,
    S_PHONE,
    S_COMMENT
FROM
    PART,
    SUPPLIER,
    PARTSUPP,
    NATION,
    REGION
WHERE
    P_PARTKEY = PS_PARTKEY
    AND S_SUPPKEY = PS_SUPPKEY
    AND P_SIZE = 15
    AND P_TYPE LIKE '%BRASS'
    AND S_NATIONKEY = N_NATIONKEY
    AND N_REGIONKEY = R_REGIONKEY
    AND R_NAME = 'EUROPE'
    AND PS_SUPPLYCOST = (
        SELECT MIN(PS_SUPPLYCOST)
        FROM PARTSUPP, SUPPLIER, NATION, REGION
        WHERE P_PARTKEY = PS_PARTKEY
          AND S_SUPPKEY = PS_SUPPKEY
          AND S_NATIONKEY = N_NATIONKEY
          AND N_REGIONKEY = R_REGIONKEY
          AND R_NAME = 'EUROPE'
    )
ORDER BY
    S_ACCTBAL DESC,
    N_NAME,
    S_NAME,
    P_PARTKEY
LIMIT 100;
'''
sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,S_ACCTBAL,S_NAME,N_NAME,P_PARTKEY,P_MFGR,S_ADDRESS,S_PHONE,S_COMMENT
0,9938.530000,Supplier#000005359,UNITED KINGDOM,185358,Manufacturer#4,"QKuHYh,vZGiwu2FWEJoLDx04",33-429-790-6131,uriously regular requests hag
1,9937.840000,Supplier#000005969,ROMANIA,108438,Manufacturer#1,"ANDENSOSmk,miq23Xfb5RWt6dvUcvt6Qa",29-520-692-3537,efully express instructions. regular requests ...
2,9936.220000,Supplier#000005250,UNITED KINGDOM,249,Manufacturer#4,B3rqp0xbSEim4Mpy2RH J,33-320-228-2957,etect about the furiously final accounts. slyl...
3,9923.770000,Supplier#000002324,GERMANY,29821,Manufacturer#4,y3OD9UywSTOk,17-779-299-1839,ackages boost blithely. blithely regular depos...
4,9871.220000,Supplier#000006373,GERMANY,43868,Manufacturer#5,J8fcXWsTqM,17-813-485-8637,etect blithely bold asymptotes. fluffily ironi...
...,...,...,...,...,...,...,...,...
95,7887.080000,Supplier#000009792,GERMANY,164759,Manufacturer#3,Y28ITVeYriT3kIGdV2K8fSZ V2UqT5H1Otz,17-988-938-4296,ckly around the carefully fluffy theodolites. ...
96,7871.500000,Supplier#000007206,RUSSIA,104695,Manufacturer#1,3w fNCnrVmvJjE95sgWZzvW,32-432-452-7731,ironic requests. furiously final theodolites c...
97,7852.450000,Supplier#000005864,RUSSIA,8363,Manufacturer#4,"WCNfBPZeSXh3h,c",32-454-883-3821,usly unusual pinto beans. brave ideas sleep ca...
98,7850.660000,Supplier#000001518,UNITED KINGDOM,86501,Manufacturer#1,ONda3YJiHKJOC,33-730-383-3892,ifts haggle fluffily pending pai


In [61]:
%%pydough

selected_parts = (
        nations.CALCULATE(n_name=name)
        .WHERE(region.name == "EUROPE")
        .suppliers.CALCULATE(
            s_acctbal=account_balance,
            s_name=name,
            s_address=address,
            s_phone=phone,
            s_comment=comment,
        )
        .supply_records.CALCULATE(
            supplycost=supplycost,
        )
        .part.WHERE(ENDSWITH(part_type, "BRASS") & (size == 15))
    )
part_groups = PARTITION(selected_parts, name="p", by=key).CALCULATE(
    best_cost=MIN(p.supplycost)
)
output = part_groups.p.WHERE(
    (supplycost == best_cost)
    & ENDSWITH(part_type, "BRASS")
    & (size == 15)
).CALCULATE(
    S_ACCTBAL=s_acctbal,
    S_NAME=s_name,
    N_NAME=n_name,
    P_PARTKEY=key,
    P_MFGR=manufacturer,
    S_ADDRESS=s_address,
    S_PHONE=s_phone,
    S_COMMENT=s_comment,
).TOP_K(
    100,
    by=(S_ACCTBAL.DESC(), N_NAME.ASC(), S_NAME.ASC(), P_PARTKEY.ASC()),
)
pydough.to_df(output)

Unnamed: 0,S_ACCTBAL,S_NAME,N_NAME,P_PARTKEY,P_MFGR,S_ADDRESS,S_PHONE,S_COMMENT
0,9938.530000,Supplier#000005359,UNITED KINGDOM,185358,Manufacturer#4,"QKuHYh,vZGiwu2FWEJoLDx04",33-429-790-6131,uriously regular requests hag
1,9937.840000,Supplier#000005969,ROMANIA,108438,Manufacturer#1,"ANDENSOSmk,miq23Xfb5RWt6dvUcvt6Qa",29-520-692-3537,efully express instructions. regular requests ...
2,9936.220000,Supplier#000005250,UNITED KINGDOM,249,Manufacturer#4,B3rqp0xbSEim4Mpy2RH J,33-320-228-2957,etect about the furiously final accounts. slyl...
3,9923.770000,Supplier#000002324,GERMANY,29821,Manufacturer#4,y3OD9UywSTOk,17-779-299-1839,ackages boost blithely. blithely regular depos...
4,9871.220000,Supplier#000006373,GERMANY,43868,Manufacturer#5,J8fcXWsTqM,17-813-485-8637,etect blithely bold asymptotes. fluffily ironi...
...,...,...,...,...,...,...,...,...
95,7887.080000,Supplier#000009792,GERMANY,164759,Manufacturer#3,Y28ITVeYriT3kIGdV2K8fSZ V2UqT5H1Otz,17-988-938-4296,ckly around the carefully fluffy theodolites. ...
96,7871.500000,Supplier#000007206,RUSSIA,104695,Manufacturer#1,3w fNCnrVmvJjE95sgWZzvW,32-432-452-7731,ironic requests. furiously final theodolites c...
97,7852.450000,Supplier#000005864,RUSSIA,8363,Manufacturer#4,"WCNfBPZeSXh3h,c",32-454-883-3821,usly unusual pinto beans. brave ideas sleep ca...
98,7850.660000,Supplier#000001518,UNITED KINGDOM,86501,Manufacturer#1,ONda3YJiHKJOC,33-730-383-3892,ifts haggle fluffily pending pai


### 3

In [None]:
query = '''
SELECT
    L_ORDERKEY,
    SUM(L_EXTENDEDPRICE * (1 - L_DISCOUNT)) AS REVENUE,
    O_ORDERDATE,
    O_SHIPPRIORITY
FROM
    CUSTOMER
JOIN ORDERS ON CUSTOMER.C_CUSTKEY = ORDERS.O_CUSTKEY
JOIN LINEITEM ON ORDERS.O_ORDERKEY = LINEITEM.L_ORDERKEY
WHERE
    C_MKTSEGMENT = 'BUILDING'
    AND O_ORDERDATE < DATE('1995-03-15')
    AND L_SHIPDATE > DATE('1995-03-15')
GROUP BY
    L_ORDERKEY,
    O_ORDERDATE,
    O_SHIPPRIORITY
ORDER BY
    REVENUE DESC,
    O_ORDERDATE
LIMIT 10;
'''
sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,L_ORDERKEY,REVENUE,O_ORDERDATE,O_SHIPPRIORITY
0,2456423,406181.0111,1995-03-05,0
1,3459808,405838.6989,1995-03-04,0
2,492164,390324.061,1995-02-19,0
3,1188320,384537.9359,1995-03-09,0
4,2435712,378673.0558,1995-02-26,0
5,4878020,378376.7952,1995-03-12,0
6,5521732,375153.9215,1995-03-13,0
7,2628192,373133.3094,1995-02-22,0
8,993600,371407.4595,1995-03-05,0
9,2300070,367371.1452,1995-03-13,0


In [63]:
%%pydough

cutoff_date = datetime.date(1995, 3, 15)
selected_orders = orders.CALCULATE(
    order_date, ship_priority
).WHERE(
    (customer.mktsegment == "BUILDING") & (order_date < cutoff_date)
)
selected_lines = selected_orders.lines.WHERE(ship_date > cutoff_date)
output = PARTITION(
    selected_lines, name="l", by=(order_key, order_date, ship_priority)
).CALCULATE(
    L_ORDERKEY=order_key,
    REVENUE=SUM(l.extended_price * (1 - l.discount)),
    O_ORDERDATE=order_date,
    O_SHIPPRIORITY=ship_priority,
).TOP_K(10, by=(REVENUE.DESC(), O_ORDERDATE.ASC(), L_ORDERKEY.ASC()))
pydough.to_df(output)

Unnamed: 0,L_ORDERKEY,REVENUE,O_ORDERDATE,O_SHIPPRIORITY
0,2456423,406181.0111,1995-03-05,0
1,3459808,405838.6989,1995-03-04,0
2,492164,390324.061,1995-02-19,0
3,1188320,384537.9359,1995-03-09,0
4,2435712,378673.0558,1995-02-26,0
5,4878020,378376.7952,1995-03-12,0
6,5521732,375153.9215,1995-03-13,0
7,2628192,373133.3094,1995-02-22,0
8,993600,371407.4595,1995-03-05,0
9,2300070,367371.1452,1995-03-13,0


### 4

In [51]:
query = '''
SELECT
    O_ORDERPRIORITY,
    COUNT(*) AS ORDER_COUNT
FROM
    ORDERS
WHERE
    O_ORDERDATE >= DATE('1993-07-01')
    AND O_ORDERDATE < DATE('1993-10-01')
    AND EXISTS (
        SELECT 1
        FROM LINEITEM
        WHERE LINEITEM.L_ORDERKEY = ORDERS.O_ORDERKEY
          AND L_COMMITDATE < L_RECEIPTDATE
    )
GROUP BY
    O_ORDERPRIORITY
ORDER BY
    O_ORDERPRIORITY;

'''
sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,O_ORDERPRIORITY,ORDER_COUNT
0,1-URGENT,10594
1,2-HIGH,10476
2,3-MEDIUM,10410
3,4-NOT SPECIFIED,10556
4,5-LOW,10487


In [64]:
%%pydough

selected_lines = lines.WHERE(commit_date < receipt_date)
selected_orders = orders.WHERE(
    (order_date >= datetime.date(1993, 7, 1))
    & (order_date < datetime.date(1993, 10, 1))
    & HAS(selected_lines)
)
output = PARTITION(selected_orders, name="o", by=order_priority).CALCULATE(
    O_ORDERPRIORITY=order_priority,
    ORDER_COUNT=COUNT(o),
).ORDER_BY(O_ORDERPRIORITY.ASC())
pydough.to_df(output)

Unnamed: 0,O_ORDERPRIORITY,ORDER_COUNT
0,1-URGENT,10594
1,2-HIGH,10476
2,3-MEDIUM,10410
3,4-NOT SPECIFIED,10556
4,5-LOW,10487


### 5

In [53]:
query = '''
SELECT
    n_name,
    SUM(l_extendedprice * (1 - l_discount)) AS revenue
FROM
    customer,
    orders,
    lineitem,
    supplier,
    nation,
    region
WHERE
    c_custkey = o_custkey
    AND l_orderkey = o_orderkey
    AND l_suppkey = s_suppkey
    AND c_nationkey = s_nationkey
    AND s_nationkey = n_nationkey
    AND n_regionkey = r_regionkey
    AND r_name = 'AMERICA'
    AND o_orderdate >= DATE('1994-01-01')
    AND o_orderdate < DATE('1995-01-01')
GROUP BY
    n_name
ORDER BY
    revenue DESC
LIMIT 1;
'''
sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,N_NAME,revenue
0,PERU,56206762.5035


In [65]:
%%pydough

selected_lines = (
    customers.orders.WHERE(
        (order_date >= datetime.date(1994, 1, 1))
        & (order_date < datetime.date(1995, 1, 1))
    )
    .lines.WHERE(supplier.nation.name == nation_name)
    .CALCULATE(value=extended_price * (1 - discount))
)
output = (
    nations.CALCULATE(nation_name=name)
    .WHERE(region.name == "ASIA")
    .CALCULATE(N_NAME=name, REVENUE=SUM(selected_lines.value))
    .ORDER_BY(REVENUE.DESC())
)
pydough.to_df(output)

Unnamed: 0,N_NAME,REVENUE
0,INDONESIA,55502041.1697
1,VIETNAM,55295086.9967
2,CHINA,53724494.2566
3,INDIA,52035512.0002
4,JAPAN,45410175.6954


### 6

In [56]:
query = '''
SELECT
    SUM(L_EXTENDEDPRICE * L_DISCOUNT) AS REVENUE
FROM
    LINEITEM
WHERE
    L_SHIPDATE >= DATE('1994-01-01')
    AND L_SHIPDATE < DATE('1995-01-01')
    AND L_DISCOUNT BETWEEN 0.05 AND 0.07
    AND L_QUANTITY < 24;
'''
sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,REVENUE
0,123141078.2283


In [66]:
%%pydough

selected_lines = lines.WHERE(
    (ship_date >= datetime.date(1994, 1, 1))
    & (ship_date < datetime.date(1995, 1, 1))
    & (0.05 <= discount)
    & (discount <= 0.07)
    & (quantity < 24)
).CALCULATE(amt=extended_price * discount)
output = TPCH.CALCULATE(REVENUE=SUM(selected_lines.amt))
pydough.to_df(output)

Unnamed: 0,REVENUE
0,123141078.2283


### 7

In [None]:
query = '''
SELECT
    SUPP_NATION,
    CUST_NATION,
    L_YEAR,
    SUM(VOLUME) AS REVENUE
FROM (
    SELECT
        N1.N_NAME AS SUPP_NATION,
        N2.N_NAME AS CUST_NATION,
        CAST(strftime('%Y', L_SHIPDATE) AS INTEGER) AS L_YEAR,
        L_EXTENDEDPRICE * (1 - L_DISCOUNT) AS VOLUME
    FROM
        SUPPLIER
    JOIN LINEITEM ON SUPPLIER.S_SUPPKEY = LINEITEM.L_SUPPKEY
    JOIN ORDERS ON ORDERS.O_ORDERKEY = LINEITEM.L_ORDERKEY
    JOIN CUSTOMER ON CUSTOMER.C_CUSTKEY = ORDERS.O_CUSTKEY
    JOIN NATION N1 ON SUPPLIER.S_NATIONKEY = N1.N_NATIONKEY
    JOIN NATION N2 ON CUSTOMER.C_NATIONKEY = N2.N_NATIONKEY
    WHERE
        (
            (N1.N_NAME = 'FRANCE' AND N2.N_NAME = 'GERMANY')
            OR (N1.N_NAME = 'GERMANY' AND N2.N_NAME = 'FRANCE')
        )
        AND L_SHIPDATE BETWEEN DATE('1995-01-01') AND DATE('1996-12-31')
) SHIPPING
GROUP BY
    SUPP_NATION,
    CUST_NATION,
    L_YEAR
ORDER BY
    SUPP_NATION,
    CUST_NATION,
    L_YEAR;
'''
sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,SUPP_NATION,CUST_NATION,L_YEAR,REVENUE
0,FRANCE,GERMANY,1995,54639732.7336
1,FRANCE,GERMANY,1996,54633083.3076
2,GERMANY,FRANCE,1995,52531746.6697
3,GERMANY,FRANCE,1996,52520549.0224


In [67]:
%%pydough

line_info = lines.CALCULATE(
    supp_nation=supplier.nation.name,
    cust_nation=order.customer.nation.name,
    l_year=YEAR(ship_date),
    volume=extended_price * (1 - discount),
).WHERE(
    (ship_date >= datetime.date(1995, 1, 1))
    & (ship_date <= datetime.date(1996, 12, 31))
    & (
        ((supp_nation == "FRANCE") & (cust_nation == "GERMANY"))
        | ((supp_nation == "GERMANY") & (cust_nation == "FRANCE"))
    )
)

output = PARTITION(line_info, name="l", by=(supp_nation, cust_nation, l_year)).CALCULATE(
    SUPP_NATION=supp_nation,
    CUST_NATION=cust_nation,
    L_YEAR=l_year,
    REVENUE=SUM(l.volume),
).ORDER_BY(
    SUPP_NATION.ASC(),
    CUST_NATION.ASC(),
    L_YEAR.ASC(),
)
pydough.to_df(output)

Unnamed: 0,SUPP_NATION,CUST_NATION,L_YEAR,REVENUE
0,FRANCE,GERMANY,1995,54639732.7336
1,FRANCE,GERMANY,1996,54633083.3076
2,GERMANY,FRANCE,1995,52531746.6697
3,GERMANY,FRANCE,1996,52520549.0224


### 8

In [42]:
query = '''
SELECT
    O_YEAR,
    SUM(CASE
            WHEN NATION = 'BRAZIL' THEN VOLUME
            ELSE 0
        END) / SUM(VOLUME) AS MKT_SHARE
FROM ( 
    SELECT
        CAST(strftime('%Y', O.O_ORDERDATE) AS INTEGER) AS O_YEAR,
        L.L_EXTENDEDPRICE * (1 - L.L_DISCOUNT) AS VOLUME,
        N2.N_NAME AS NATION
    FROM
        PART P
    JOIN LINEITEM L ON P.P_PARTKEY = L.L_PARTKEY
    JOIN SUPPLIER S ON S.S_SUPPKEY = L.L_SUPPKEY
    JOIN ORDERS O ON O.O_ORDERKEY = L.L_ORDERKEY
    JOIN CUSTOMER C ON C.C_CUSTKEY = O.O_CUSTKEY
    JOIN NATION N1 ON C.C_NATIONKEY = N1.N_NATIONKEY
    JOIN REGION R ON N1.N_REGIONKEY = R.R_REGIONKEY
    JOIN NATION N2 ON S.S_NATIONKEY = N2.N_NATIONKEY
    WHERE
        R.R_NAME = 'AMERICA'
        AND O.O_ORDERDATE BETWEEN DATE('1995-01-01') AND DATE('1996-12-31')
        AND P.P_TYPE = 'ECONOMY ANODIZED STEEL'
) ALL_NATIONS
GROUP BY
    O_YEAR
ORDER BY
    O_YEAR;
'''
sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,O_YEAR,MKT_SHARE
0,1995,0.034436
1,1996,0.041486


In [44]:
%%pydough

volume_data = (
    nations.CALCULATE(nation_name=name)
    .suppliers.supply_records.WHERE(part.part_type == "ECONOMY ANODIZED STEEL")
    .lines.CALCULATE(volume=extended_price * (1 - discount))
    .order.CALCULATE(
        o_year=YEAR(order_date),
        brazil_volume=IFF(nation_name == "BRAZIL", volume, 0),
    )
    .WHERE(
        (order_date >= datetime.date(1995, 1, 1))
        & (order_date <= datetime.date(1996, 12, 31))
        & (customer.nation.region.name == "AMERICA")
    )
)

output = PARTITION(volume_data, name="v", by=o_year).CALCULATE(
    O_YEAR=o_year,
    MKT_SHARE=SUM(v.brazil_volume) / SUM(v.volume),
)

pydough.to_df(output)

Unnamed: 0,O_YEAR,MKT_SHARE
0,1995,0.034436
1,1996,0.041486


### 9

In [69]:
query = '''
SELECT
    NATION,
    O_YEAR,
    SUM(AMOUNT) AS SUM_PROFIT
FROM (
    SELECT
        N_NAME AS NATION,
        CAST(strftime('%Y', O_ORDERDATE) AS INTEGER) AS O_YEAR,
        L_EXTENDEDPRICE * (1 - L_DISCOUNT) - PS_SUPPLYCOST * L_QUANTITY AS AMOUNT
    FROM
        PART
    JOIN LINEITEM ON PART.P_PARTKEY = LINEITEM.L_PARTKEY
    JOIN PARTSUPP ON PARTSUPP.PS_PARTKEY = LINEITEM.L_PARTKEY 
                 AND PARTSUPP.PS_SUPPKEY = LINEITEM.L_SUPPKEY
    JOIN SUPPLIER ON SUPPLIER.S_SUPPKEY = LINEITEM.L_SUPPKEY
    JOIN ORDERS ON ORDERS.O_ORDERKEY = LINEITEM.L_ORDERKEY
    JOIN NATION ON SUPPLIER.S_NATIONKEY = NATION.N_NATIONKEY
    WHERE
        PART.P_NAME LIKE '%green%'
) PROFIT
GROUP BY
    NATION,
    O_YEAR
ORDER BY
    NATION,
    O_YEAR;

'''
sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,NATION,O_YEAR,SUM_PROFIT
0,ALGERIA,1992,45636849.488100
1,ALGERIA,1993,46044207.783800
2,ALGERIA,1994,48694008.066800
3,ALGERIA,1995,44402273.599900
4,ALGERIA,1996,48285482.678200
...,...,...,...
170,VIETNAM,1994,47729256.332400
171,VIETNAM,1995,48235135.801600
172,VIETNAM,1996,47824595.904000
173,VIETNAM,1997,48735914.179600


### 10

In [71]:
query = '''
SELECT
    C_CUSTKEY,
    C_NAME,
    SUM(L_EXTENDEDPRICE * (1 - L_DISCOUNT)) AS REVENUE,
    C_ACCTBAL,
    N_NAME,
    C_ADDRESS,
    C_PHONE,
    C_COMMENT
FROM
    CUSTOMER
JOIN ORDERS ON CUSTOMER.C_CUSTKEY = ORDERS.O_CUSTKEY
JOIN LINEITEM ON ORDERS.O_ORDERKEY = LINEITEM.L_ORDERKEY
JOIN NATION ON CUSTOMER.C_NATIONKEY = NATION.N_NATIONKEY
WHERE
    O_ORDERDATE >= DATE('1993-10-01')
    AND O_ORDERDATE < DATE('1994-01-01')
    AND L_RETURNFLAG = 'R'
GROUP BY
    C_CUSTKEY,
    C_NAME,
    C_ACCTBAL,
    C_PHONE,
    N_NAME,
    C_ADDRESS,
    C_COMMENT
ORDER BY
    REVENUE DESC
LIMIT 20;

'''
sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,C_CUSTKEY,C_NAME,REVENUE,C_ACCTBAL,N_NAME,C_ADDRESS,C_PHONE,C_COMMENT
0,57040,Customer#000057040,734235.2455,632.87,JAPAN,Eioyzjf4pp,22-895-641-3466,sits. slyly regular requests sleep alongside o...
1,143347,Customer#000143347,721002.6948,2557.47,EGYPT,"1aReFYv,Kw4",14-742-935-3718,ggle carefully enticing requests. final deposi...
2,60838,Customer#000060838,679127.3077,2454.77,BRAZIL,64EaJ5vMAHWJlBOxJklpNc2RJiWE,12-913-494-9813,need to boost against the slyly regular account
3,101998,Customer#000101998,637029.5667,3790.89,UNITED KINGDOM,01c9CILnNtfOQYmZj,33-593-865-6378,ress foxes wake slyly after the bold excuses. ...
4,125341,Customer#000125341,633508.086,4983.51,GERMANY,S29ODD6bceU8QSuuEJznkNaK,17-582-695-5962,arefully even depths. blithely even excuses sl...
5,25501,Customer#000025501,620269.7849,7725.04,ETHIOPIA,"W556MXuoiaYCCZamJI,Rn0B4ACUGdkQ8DZ",15-874-808-6793,he pending instructions wake carefully at the ...
6,115831,Customer#000115831,596423.8672,5098.1,FRANCE,rFeBbEEyk dl ne7zV5fDrmiq1oK09wV7pxqCgIc,16-715-386-3788,l somas sleep. furiously final deposits wake b...
7,84223,Customer#000084223,594998.0239,528.65,UNITED KINGDOM,nAVZCs6BaWap rrM27N 2qBnzc5WBauxbA,33-442-824-8191,"slyly final deposits haggle regular, pending ..."
8,54289,Customer#000054289,585603.3918,5583.02,IRAN,"vXCxoCsU0Bad5JQI ,oobkZ",20-834-292-4707,ely special foxes are quickly finally ironic p
9,39922,Customer#000039922,584878.1134,7321.11,GERMANY,Zgy4s50l2GKN4pLDPBU8m342gIw6R,17-147-757-8036,y final requests. furiously final foxes cajole...


### 11

In [72]:
query = '''
SELECT
    PS_PARTKEY,
    SUM(PS_SUPPLYCOST * PS_AVAILQTY) AS VALUE
FROM
    PARTSUPP,
    SUPPLIER,
    NATION
WHERE
    PS_SUPPKEY = S_SUPPKEY
    AND S_NATIONKEY = N_NATIONKEY
    AND N_NAME = 'GERMANY'
GROUP BY
    PS_PARTKEY
HAVING
    SUM(PS_SUPPLYCOST * PS_AVAILQTY) > (
        SELECT
            SUM(PS_SUPPLYCOST * PS_AVAILQTY) * 0.0001
        FROM
            PARTSUPP,
            SUPPLIER,
            NATION
        WHERE
            PS_SUPPKEY = S_SUPPKEY
            AND S_NATIONKEY = N_NATIONKEY
            AND N_NAME = 'GERMANY'
    )
ORDER BY
    VALUE DESC;
'''
sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,PS_PARTKEY,VALUE
0,129760,17538456.860000
1,166726,16503353.920000
2,191287,16474801.970000
3,161758,16101755.540000
4,34452,15983844.720000
...,...,...
1043,154731,7888301.330000
1044,101674,7879324.600000
1045,51968,7879102.210000
1046,72073,7877736.110000


### 12

In [74]:
query = '''
SELECT
    L_SHIPMODE,
    SUM(CASE
            WHEN O_ORDERPRIORITY = '1-URGENT'
              OR O_ORDERPRIORITY = '2-HIGH'
            THEN 1
            ELSE 0
        END) AS HIGH_LINE_COUNT,
    SUM(CASE
            WHEN O_ORDERPRIORITY <> '1-URGENT'
              AND O_ORDERPRIORITY <> '2-HIGH'
            THEN 1
            ELSE 0
        END) AS LOW_LINE_COUNT
FROM
    ORDERS
JOIN LINEITEM ON ORDERS.O_ORDERKEY = LINEITEM.L_ORDERKEY
WHERE
    L_SHIPMODE IN ('MAIL', 'SHIP')
    AND L_COMMITDATE < L_RECEIPTDATE
    AND L_SHIPDATE < L_COMMITDATE
    AND L_RECEIPTDATE >= DATE('1994-01-01')
    AND L_RECEIPTDATE < DATE('1995-01-01')
GROUP BY
    L_SHIPMODE
ORDER BY
    L_SHIPMODE;

'''
sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,L_SHIPMODE,HIGH_LINE_COUNT,LOW_LINE_COUNT
0,MAIL,6202,9324
1,SHIP,6200,9262


### 13

In [75]:
query = '''
SELECT
    C_COUNT,
    COUNT(*) AS CUSTDIST
FROM (
    SELECT
        C_CUSTKEY,
        COUNT(O_ORDERKEY) AS C_COUNT
    FROM
        CUSTOMER
        LEFT OUTER JOIN ORDERS ON C_CUSTKEY = O_CUSTKEY
        AND O_COMMENT NOT LIKE '%special%requests%'
    GROUP BY
        C_CUSTKEY
) C_ORDERS
GROUP BY
    C_COUNT
ORDER BY
    CUSTDIST DESC,
    C_COUNT DESC;
'''
sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,C_COUNT,CUSTDIST
0,0,50005
1,9,6641
2,10,6532
3,11,6014
4,8,5937
5,12,5639
6,13,5024
7,19,4793
8,7,4687
9,17,4587


### 14

In [78]:
query = '''
SELECT
    100.00 * SUM(CASE
                     WHEN P_TYPE LIKE 'PROMO%'
                     THEN L_EXTENDEDPRICE * (1 - L_DISCOUNT)
                     ELSE 0
                 END) / SUM(L_EXTENDEDPRICE * (1 - L_DISCOUNT)) AS PROMO_REVENUE
FROM
    LINEITEM
JOIN PART ON LINEITEM.L_PARTKEY = PART.P_PARTKEY
WHERE
    L_SHIPDATE >= DATE('1995-09-01')
    AND L_SHIPDATE < DATE('1995-10-01');

'''
sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,PROMO_REVENUE
0,16.380779


### 15

In [80]:
query = '''
WITH REVENUE AS (
    SELECT
        L_SUPPKEY AS SUPPLIER_NO,
        SUM(L_EXTENDEDPRICE * (1 - L_DISCOUNT)) AS TOTAL_REVENUE
    FROM
        LINEITEM
    WHERE
        L_SHIPDATE >= DATE('1996-01-01')
        AND L_SHIPDATE < DATE('1996-04-01')
    GROUP BY
        L_SUPPKEY
)

SELECT
    S_SUPPKEY,
    S_NAME,
    S_ADDRESS,
    S_PHONE,
    TOTAL_REVENUE
FROM
    SUPPLIER
JOIN REVENUE ON S_SUPPKEY = SUPPLIER_NO
WHERE
    TOTAL_REVENUE = (SELECT MAX(TOTAL_REVENUE) FROM REVENUE)
ORDER BY
    SUPPLIER_NO;

'''
sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,S_SUPPKEY,S_NAME,S_ADDRESS,S_PHONE,TOTAL_REVENUE
0,8449,Supplier#000008449,Wp34zim9qYFbVctdW,20-469-856-8873,1772627.2087


### 16

In [4]:
query = '''
select
    p_brand,
    p_type,
    p_size,
    count(distinct ps_suppkey) as supplier_cnt
from
    partsupp,
    part
where
    p_partkey = ps_partkey
    and p_brand <> 'BRAND#45'
    and p_type not like 'MEDIUM POLISHED%'
    and p_size in (49, 14, 23, 45, 19, 3, 36, 9)
    and ps_suppkey not in (
        select
            s_suppkey
        from
            supplier
        where
            s_comment like '%Customer%Complaints%'
    )
group by
    p_brand,
    p_type,
    p_size
order by
    supplier_cnt desc,
    p_brand,
    p_type,
    p_size;
'''
sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,P_BRAND,P_TYPE,P_SIZE,supplier_cnt
0,Brand#41,MEDIUM BRUSHED TIN,3,28
1,Brand#54,STANDARD BRUSHED COPPER,14,27
2,Brand#11,STANDARD BRUSHED TIN,23,24
3,Brand#11,STANDARD BURNISHED BRASS,36,24
4,Brand#15,MEDIUM ANODIZED NICKEL,3,24
...,...,...,...,...
19081,Brand#52,MEDIUM BRUSHED BRASS,45,3
19082,Brand#53,MEDIUM BRUSHED TIN,45,3
19083,Brand#54,ECONOMY POLISHED BRASS,9,3
19084,Brand#55,PROMO PLATED BRASS,19,3


### 17

In [5]:
query = '''
WITH AvgQuantity AS (
    SELECT 
        L_PARTKEY, 
        0.2 * AVG(L_QUANTITY) AS Threshold
    FROM LINEITEM
    GROUP BY L_PARTKEY
)

SELECT
    SUM(L_EXTENDEDPRICE) / 7.0 AS AVG_YEARLY
FROM
    LINEITEM
JOIN PART ON PART.P_PARTKEY = LINEITEM.L_PARTKEY
JOIN AvgQuantity AQ ON LINEITEM.L_PARTKEY = AQ.L_PARTKEY
WHERE
    P_BRAND = 'Brand#23'
    AND P_CONTAINER = 'MED BOX'
    AND L_QUANTITY < AQ.Threshold;
'''
sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,AVG_YEARLY
0,348406.054286


In [6]:
%%pydough

selected_lines = parts.WHERE(
    (brand == "Brand#23") & (container == "MED BOX")
).CALCULATE(
    avg_quantity=AVG(lines.quantity)
).lines.WHERE(quantity < 0.2 * avg_quantity)
output = TPCH.CALCULATE(AVG_YEARLY=SUM(selected_lines.extended_price) / 7.0)
pydough.to_df(output)

Unnamed: 0,AVG_YEARLY
0,348406.054286


### 18

In [5]:
query = '''
select
    c_name,
    c_custkey,
    o_orderkey,
    o_orderdate,
    o_totalprice,
    sum(l_quantity)
from
    customer,
    orders,
    lineitem
where
    o_orderkey in (
        select
            l_orderkey
        from
            lineitem
        group by
            l_orderkey 
        having
            sum(l_quantity) > 300
        )
    and c_custkey = o_custkey
    and o_orderkey = l_orderkey
group by
    c_name,
    c_custkey,
    o_orderkey,
    o_orderdate,
    o_totalprice
order by
    o_totalprice desc,
    o_orderdate;
'''
sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,C_NAME,C_CUSTKEY,O_ORDERKEY,O_ORDERDATE,O_TOTALPRICE,sum(l_quantity)
0,Customer#000128120,128120,4722021,1994-04-07,544089.09,323
1,Customer#000144617,144617,3043270,1997-02-12,530604.44,317
2,Customer#000013940,13940,2232932,1997-04-13,522720.61,304
3,Customer#000066790,66790,2199712,1996-09-30,515531.82,327
4,Customer#000046435,46435,4745607,1997-07-03,508047.99,309
5,Customer#000015272,15272,3883783,1993-07-28,500241.33,302
6,Customer#000146608,146608,3342468,1994-06-12,499794.58,303
7,Customer#000096103,96103,5984582,1992-03-16,494398.79,312
8,Customer#000024341,24341,1474818,1992-11-15,491348.26,302
9,Customer#000137446,137446,5489475,1997-05-23,487763.25,311


### 19

In [6]:
query = '''
select
    sum(l_extendedprice * (1 - l_discount) ) as revenue
from
    lineitem,
    part
where
    (
        p_partkey = l_partkey
        and p_brand = 'Brand#12'
        and p_container in ( 'SM CASE', 'SM BOX', 'SM PACK', 'SM PKG')
        and l_quantity >= 1 and l_quantity <= 1 + 10
        and p_size between 1 and 5
        and l_shipmode in ('AIR', 'AIR REG')
        and l_shipinstruct = 'DELIVER IN PERSON'
    )
    or
    (
        p_partkey = l_partkey
        and p_brand = 'Brand#23'
        and p_container in ('MED BAG', 'MED BOX', 'MED PKG', 'MED PACK')
        and l_quantity >= 10 and l_quantity <= 10 + 10
        and p_size between 1 and 10
        and l_shipmode in ('AIR', 'AIR REG')
        and l_shipinstruct = 'DELIVER IN PERSON'
    )
    or
    (
        p_partkey = l_partkey
        and p_brand = 'Brand#34'
        and p_container in ('LG CASE', 'LG BOX', 'LG PACK', 'LG PKG')
        and l_quantity >= 20 and l_quantity <= 20 + 10
        and p_size between 1 and 15
        and l_shipmode in ('AIR', 'AIR REG')
        and l_shipinstruct = 'DELIVER IN PERSON'
    )
'''
sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,revenue
0,3083843.0578


### 20

In [4]:
query = '''
WITH LineitemAgg AS (
    SELECT
        L_PARTKEY,
        L_SUPPKEY,
        0.5 * SUM(L_QUANTITY) AS Threshold
    FROM LINEITEM
    WHERE
        L_SHIPDATE >= DATE('1994-01-01')
        AND L_SHIPDATE < DATE('1995-01-01')
    GROUP BY
        L_PARTKEY,
        L_SUPPKEY
)

SELECT
    S_NAME,
    S_ADDRESS
FROM
    SUPPLIER
JOIN NATION ON SUPPLIER.S_NATIONKEY = NATION.N_NATIONKEY
JOIN PARTSUPP ON SUPPLIER.S_SUPPKEY = PARTSUPP.PS_SUPPKEY
JOIN LineitemAgg LIA ON PARTSUPP.PS_PARTKEY = LIA.L_PARTKEY 
                     AND PARTSUPP.PS_SUPPKEY = LIA.L_SUPPKEY
WHERE
    PARTSUPP.PS_AVAILQTY > LIA.Threshold
    AND N_NAME = 'CANADA'
    AND PARTSUPP.PS_PARTKEY IN (
        SELECT P_PARTKEY FROM PART WHERE P_NAME LIKE 'forest%'
    )
ORDER BY
    S_NAME;
'''
sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,S_NAME,S_ADDRESS
0,Supplier#000000020,"iybAE,RmTymrZVYaFZva2SH,j"
1,Supplier#000000091,YV45D7TkfdQanOOZ7q9QxkyGUapU1oOWU6q3
2,Supplier#000000205,rF uV8d0JNEk
3,Supplier#000000285,Br7e1nnt1yxrw6ImgpJ7YdhFDjuBf
4,Supplier#000000287,7a9SP7qW5Yku5PvSg
...,...,...
258,Supplier#000009812,APFRMy3lCbgFga53n5t9DxzFPQPgnjrGt32
259,Supplier#000009846,"57sNwJJ3PtBDu,hMPP5QvpcOcSNRXn3PypJJrh"
260,Supplier#000009846,"57sNwJJ3PtBDu,hMPP5QvpcOcSNRXn3PypJJrh"
261,Supplier#000009899,"7XdpAHrzr1t,UQFZE"
