## Import libraries.

In [3]:
%load_ext pydough.jupyter_extensions

import pydough

%load_ext pydough.jupyter_extensions

#Necessary for comparison
import pandas as pd
from pandas.testing import assert_frame_equal, assert_series_equal
import re
import dfcompare

import collections
import numpy as np
import sqlite3 as sql
import os

pd.options.display.float_format = '{:.6f}'.format

The pydough.jupyter_extensions extension is already loaded. To reload it, use:
  %reload_ext pydough.jupyter_extensions
The pydough.jupyter_extensions extension is already loaded. To reload it, use:
  %reload_ext pydough.jupyter_extensions


## Load database.

In [4]:
# Path to the .sql script that creates the database (copy it into this folder).
SQL_filename = 'car_dealership.sql'

# Path to the PyDough metadata graph .json file.
metadata_path = "../metadata/car_dealership_graphs.json"

# Name of the graph to load from the metadata file.
graph_name = "Dealership"

# File name of the SQLite database that is (re)built below.
DB_name = "DATABASE.db"

with open(SQL_filename, 'r') as sql_file:
    sql_script = sql_file.read()

# Rebuild the database from scratch. On a fresh checkout there is no previous
# file, so only delete it when it actually exists.
if os.path.exists(DB_name):
    os.remove(DB_name)

connection = sql.connect(DB_name)
cursor = connection.cursor()
cursor.executescript(sql_script)
# Make sure the script's changes are flushed before PyDough opens its own connection.
connection.commit()

pydough.active_session.load_metadata_graph(metadata_path, graph_name)
pydough.active_session.connect_database("sqlite", database=DB_name)

DatabaseContext(connection=<pydough.database_connectors.database_connector.DatabaseConnection object at 0x7fdc64c830e0>, dialect=<DatabaseDialect.SQLITE: 'sqlite'>)

## Info

In [366]:
# Grab the metadata graph loaded into the active session and print an
# overview of its collections and their properties.
graph = pydough.active_session.metadata
structure_text = pydough.explain_structure(graph)
print(structure_text)

# For details on a single collection:
# print(pydough.explain(graph["Cars"], verbose=True))

Structure of PyDough graph: Dealership

  Cars
  ├── _id
  ├── color
  ├── cost
  ├── crtd_ts
  ├── engine_type
  ├── make
  ├── model
  ├── transmission
  ├── vin_number
  ├── year
  ├── inventory_snapshots [multiple InventorySnapshots] (reverse of InventorySnapshots.car)
  └── sale_records [multiple Sales] (reverse of Sales.car)

  Customers
  ├── _id
  ├── address
  ├── city
  ├── crtd_ts
  ├── email
  ├── first_name
  ├── last_name
  ├── phone
  ├── state
  ├── zip_code
  └── car_purchases [multiple Sales] (reverse of Sales.customer)

  InventorySnapshots
  ├── _id
  ├── car_id
  ├── crtd_ts
  ├── is_in_inventory
  ├── snapshot_date
  └── car [one member of Cars] (reverse of Cars.inventory_snapshots)

  PaymentsReceived
  ├── _id
  ├── crtd_ts
  ├── payment_amount
  ├── payment_date
  ├── payment_method
  ├── sale_id
  └── sale_record [one member of Sales] (reverse of Sales.payment)

  Sales
  ├── _id
  ├── car_id
  ├── crtd_ts
  ├── customer_id
  ├── sale_date
  ├── sale_price
  ├

### Notes: 

Queries to check: 5 and 6 (the PyDough and SQL DataFrames are not the same); 15, 16 and 25 (haven't been able to solve them); 23 (records with 0 or no matches are not shown).
 
Query 13: depends on `start of week`, which is not implemented yet.

Query 12: the `PaymentsMade` table is missing from the metadata, so the PyDough version cannot run.

# Pydough: Car Dealership Queries.

Below, we demonstrate a series of queries typically executed on the car dealership database.
Each query has been converted from SQL to PyDough manually.

### 1. SQLite BasicQuery 1

Return the car ID, make, model and year for cars that have no sales records, by doing a left join from the cars table to the sales table.

```SQL
SELECT c.id AS car_id, c.make, c.model, c.year 
FROM cars AS c 
LEFT JOIN sales AS s 
ON c.id = s.car_id 
WHERE s.car_id IS NULL;
```



In [7]:
# Reference SQL for query 1, run directly against the SQLite connection.
# Its result is what the PyDough version is compared against.
query = """
SELECT c._id AS car_id, c.make, c.model, c.year 
FROM cars AS c 
LEFT JOIN sales AS s 
ON c._id = s.car_id 
WHERE s.car_id IS NULL;
"""

sql_output = pd.read_sql_query(query, connection)
# Display the SQL result.
sql_output

Unnamed: 0,car_id,make,model,year
0,11,Mazda,CX-5,2022
1,12,Hyundai,Tucson,2023
2,13,Kia,Sorento,2021
3,14,Jeep,Wrangler,2022
4,15,GMC,Sierra 1500,2023
5,16,Ram,1500,2022
6,17,Mercedes-Benz,E-Class,2021
7,18,Volkswagen,Tiguan,2022
8,19,Volvo,XC90,2023
9,20,Porsche,911,2022


In [12]:
%%pydough
result = Cars.WHERE(HASNOT(sale_records)).CALCULATE(_id, make, model, year)
pydough.to_df(result)

Unnamed: 0,_id,make,model,year
0,11,Mazda,CX-5,2022
1,12,Hyundai,Tucson,2023
2,13,Kia,Sorento,2021
3,14,Jeep,Wrangler,2022
4,15,GMC,Sierra 1500,2023
5,16,Ram,1500,2022
6,17,Mercedes-Benz,E-Class,2021
7,18,Volkswagen,Tiguan,2022
8,19,Volvo,XC90,2023
9,20,Porsche,911,2022


In [13]:
# Materialize the PyDough result as a DataFrame, then compare it with the SQL
# result produced by the reference query cell; the comparison's return value is
# shown as the cell output.
pydough_output = pydough.to_df(result)

dfcompare.compare_df(pydough_output, sql_output, query_category="basic_left_join", question="Return the car ID, make, model and year for cars that have no sales records. By doing a left join from the cars to sales table.")

True

### 2. SQLite BasicQuery 2

Return the distinct list of customer IDs that have made a purchase, based on joining the customers and sales tables.

```SQL
SELECT DISTINCT c.id AS customer_id 
FROM customers  AS c 
JOIN sales  AS s 
ON c.id = s.customer_id;
```



In [36]:
# Reference SQL for query 2, run directly against the SQLite connection.
# Its result is what the PyDough version is compared against.
query = """
SELECT DISTINCT c._id AS customer_id 
FROM customers  AS c 
JOIN sales  AS s 
ON c._id = s.customer_id;
"""
sql_output = pd.read_sql_query(query, connection)
# Display the SQL result.
sql_output

Unnamed: 0,customer_id
0,3
1,5
2,2
3,9
4,7
5,1
6,6
7,10
8,8
9,4


In [35]:
%%pydough
result = Customers.WHERE(HAS(car_purchases)).CALCULATE(_id)

pydough.to_df(result)

Unnamed: 0,_id
0,1
1,2
2,3
3,4
4,5
5,6
6,7
7,8
8,9
9,10


In [19]:
# Materialize the PyDough result as a DataFrame, then compare it with the SQL
# result produced by the reference query cell; the comparison's return value is
# shown as the cell output.
pydough_output = pydough.to_df(result)

dfcompare.compare_df(pydough_output, sql_output, query_category="basic_join_distinct", question="Return the distinct list of customer IDs that have made a purchase, based on joining the customers and sales tables.")

np.True_

### 3. SQLite BasicQuery 3

Return the distinct list of salesperson IDs that have received a cash payment, based on joining the salespersons, sales and payments_received tables.

```SQL
SELECT DISTINCT s.id AS salesperson_id 
FROM salespersons AS s 
JOIN sales AS sa 
ON s.id = sa.salesperson_id 
JOIN payments_received AS p 
ON sa.id = p.sale_id 
WHERE p.payment_method = 'cash';
```



In [26]:
# Reference SQL for query 3, run directly against the SQLite connection.
# Its result is what the PyDough version is compared against.
query = """
SELECT DISTINCT s._id AS salesperson_id 
FROM salespersons AS s 
JOIN sales AS sa 
ON s._id = sa.salesperson_id 
JOIN payments_received AS p 
ON sa._id = p.sale_id 
WHERE p.payment_method = 'cash';
"""

sql_output = pd.read_sql_query(query, connection)
# Display the SQL result.
sql_output

Unnamed: 0,salesperson_id
0,4
1,7
2,1


In [30]:
%%pydough
result = Salespersons.WHERE(
    HAS(
        sales_made.WHERE(
            HAS(payment.WHERE(payment_method == "cash"))
        )
    )
).CALCULATE(salesperson_id=_id)


pydough.to_df(result)

Unnamed: 0,salesperson_id
0,1
1,4
2,7


In [27]:
# Materialize the PyDough result as a DataFrame, then compare it with the SQL
# result produced by the reference query cell; the comparison's return value is
# shown as the cell output.
pydough_output = pydough.to_df(result)

dfcompare.compare_df(pydough_output, sql_output, query_category="basic_join_distinct", question="Return the distinct list of salesperson IDs that have received a cash payment, based on joining the salespersons, sales and payments_received tables.")

np.True_

### 4. SQLite BasicQuery 4

Return the salesperson ID, first name and last name for salespersons that have no sales records, by doing a left join from the salespersons to sales table.

```SQL
SELECT s.id AS salesperson_id, s.first_name, s.last_name 
FROM salespersons AS s 
LEFT JOIN sales AS sa 
ON s.id = sa.salesperson_id 
WHERE sa.salesperson_id IS NULL;
```



In [33]:
# Reference SQL for query 4, run directly against the SQLite connection.
# Its result is what the PyDough version is compared against.
query = """
SELECT s._id AS salesperson_id, s.first_name, s.last_name 
FROM salespersons AS s 
LEFT JOIN sales AS sa 
ON s._id = sa.salesperson_id 
WHERE sa.salesperson_id IS NULL;
"""

sql_output = pd.read_sql_query(query, connection)
# Display the SQL result.
sql_output

Unnamed: 0,salesperson_id,first_name,last_name
0,5,David,Wilson
1,8,Olivia,Thomas
2,9,James,Jackson
3,10,Sophia,White
4,11,Robert,Johnson
5,12,Jennifer,Davis
6,13,Jessica,Rodriguez


In [32]:
%%pydough
result = Salespersons.WHERE(HASNOT(sales_made)).CALCULATE(_id, first_name, last_name)

pydough.to_df(result)



Unnamed: 0,_id,first_name,last_name
0,5,David,Wilson
1,8,Olivia,Thomas
2,9,James,Jackson
3,10,Sophia,White
4,11,Robert,Johnson
5,12,Jennifer,Davis
6,13,Jessica,Rodriguez


In [34]:
# Materialize the PyDough result as a DataFrame, then compare it with the SQL
# result produced by the reference query cell; the comparison's return value is
# shown as the cell output.
pydough_output = pydough.to_df(result)

dfcompare.compare_df(pydough_output, sql_output, query_category="basic_left_join", question="Return the salesperson ID, first name and last name for salespersons that have no sales records, by doing a left join from the salespersons to sales table.")

True

### 5. SQLite BasicQuery 5 (Date)

Return the top 5 salespersons by number of sales in the past 30 days? Return their first and last name, total sales count and total revenue amount.

```SQL
SELECT sp.first_name, sp.last_name, COUNT(s.id) AS total_sales, SUM(s.sale_price) AS total_revenue 
FROM sales AS s 
JOIN salespersons AS sp 
ON s.salesperson_id = sp.id 
WHERE s.sale_date >= DATE('now', '-30 days') 
GROUP BY sp.first_name, sp.last_name, sp.id 
ORDER BY total_sales DESC LIMIT 5;
```


In [39]:
# Reference SQL for query 5, run directly against the SQLite connection.
# Its result is what the PyDough version is compared against.
# Note: the filter uses DATE('now', ...), so the result depends on the day the cell is run.
query = """
SELECT sp.first_name, sp.last_name, COUNT(s._id) AS total_sales, SUM(s.sale_price) AS total_revenue 
FROM sales AS s 
JOIN salespersons AS sp 
ON s.salesperson_id = sp._id 
WHERE s.sale_date >= DATE('now', '-30 days') 
GROUP BY sp.first_name, sp.last_name, sp._id 
ORDER BY total_sales DESC LIMIT 5;
"""

sql_output = pd.read_sql_query(query, connection)
# Display the SQL result.
sql_output

Unnamed: 0,first_name,last_name,total_sales,total_revenue
0,Jane,Smith,3,140000.0
1,Michael,Johnson,2,69700.0
2,Emily,Brown,1,29500.0
3,John,Doe,1,26000.0
4,Sarah,Taylor,1,43500.0


In [432]:
%%pydough 

sales_person_last_month = Salespersons.WHERE(
    HAS(sales_made.WHERE(sale_date >= DATETIME("current_date", "-30 days")))
)

result = sales_person_last_month.CALCULATE(
    first_name,
    last_name,
    total_sales=COUNT(sales_made),
    total_revenue=SUM(sales_made.sale_price)
).TOP_K(5, by=total_sales.DESC())

pydough.to_df(result)



Unnamed: 0,first_name,last_name,total_sales,total_revenue
0,Jane,Smith,6,278000.0
1,John,Doe,5,215000.0
2,Sarah,Taylor,3,116000.0
3,Michael,Johnson,3,96500.0
4,Emily,Brown,3,79500.0


In [48]:
# Materialize the PyDough result as a DataFrame, then compare it with the SQL
# result produced by the reference query cell; the comparison's return value is
# shown as the cell output.
pydough_output = pydough.to_df(result)

dfcompare.compare_df(pydough_output, sql_output, query_category="basic_join_date_group_order_limit", question="Return the top 5 salespersons by number of sales in the past 30 days? Return their first and last name, total sales count and total revenue amount.")

np.False_

### 6. SQLite BasicQuery 6

Return the top 5 states by total revenue, showing the number of unique customers and total revenue (based on sale price) for each state.

```SQL
SELECT c.state, COUNT(DISTINCT s.customer_id) AS unique_customers, SUM(s.sale_price) AS total_revenue 
FROM sales AS s 
JOIN customers AS c 
ON s.customer_id = c.id 
GROUP BY c.state 
ORDER BY CASE WHEN total_revenue IS NULL THEN 1 ELSE 0 END DESC, total_revenue DESC LIMIT 5;
```



In [70]:
# Reference SQL for query 6, run directly against the SQLite connection.
# Its result is what the PyDough version is compared against.
query = """
SELECT c.state, COUNT(DISTINCT s.customer_id) AS unique_customers, SUM(s.sale_price) AS total_revenue 
FROM sales AS s 
JOIN customers AS c 
ON s.customer_id = c._id 
GROUP BY c.state 
ORDER BY CASE WHEN total_revenue IS NULL THEN 1 ELSE 0 END DESC, total_revenue DESC LIMIT 5;
"""

sql_output = pd.read_sql_query(query, connection)
# Display the SQL result.
sql_output

Unnamed: 0,state,unique_customers,total_revenue
0,CA,3,257000.0
1,TX,3,198500.0
2,IL,1,124500.0
3,WA,1,90000.0
4,FL,1,54900.0


In [71]:
%%pydough

customers_by_state = PARTITION(Customers, name="custs", by=state)

result = customers_by_state.CALCULATE(state, 
                            unique_customers=COUNT(custs),
                            total_revenue=SUM(custs.car_purchases.sale_price)
).TOP_K(5, by=total_revenue.DESC())

pydough.to_df(result)


Unnamed: 0,state,unique_customers,total_revenue
0,CA,4,257000.0
1,TX,3,198500.0
2,IL,1,124500.0
3,WA,1,90000.0
4,FL,1,54900.0


In [73]:
# Materialize the PyDough result as a DataFrame, then compare it with the SQL
# result produced by the reference query cell; the comparison's return value is
# shown as the cell output.
pydough_output = pydough.to_df(result)

dfcompare.compare_df(pydough_output, sql_output, query_category="basic_join_group_order_limit", question="Return the top 5 states by total revenue, showing the number of unique customers and total revenue (based on sale price) for each state.")

np.False_

### 7. SQLite BasicQuery 7

What are the top 3 payment methods by total payment amount received? Return the payment method, total number of payments and total amount.

```SQL
SELECT payment_method, COUNT(*) AS total_payments, 
SUM(payment_amount) AS total_amount 
FROM payments_received 
GROUP BY payment_method 
ORDER BY CASE WHEN total_amount IS NULL THEN 1 ELSE 0 END DESC, total_amount DESC LIMIT 3;
```



In [24]:
# Reference SQL for query 7, run directly against the SQLite connection.
# Its result is what the PyDough version is compared against.
query = """
SELECT payment_method, COUNT(*) AS total_payments, 
SUM(payment_amount) AS total_amount 
FROM payments_received 
GROUP BY payment_method 
ORDER BY CASE WHEN total_amount IS NULL THEN 1 ELSE 0 END DESC, total_amount DESC LIMIT 3;
"""

sql_output = pd.read_sql_query(query, connection)
# Display the SQL result.
sql_output

Unnamed: 0,payment_method,total_payments,total_amount
0,credit_card,7,426500.0
1,financing,5,252700.0
2,debit_card,5,216000.0


In [75]:
%%pydough

result = PARTITION(PaymentsReceived, name="p", by=payment_method).CALCULATE(
    payment_method,
    total_payments=COUNT(p._id),
    total_amount=SUM(p.payment_amount), 
).ORDER_BY(
    total_amount.DESC()
).TOP_K(3, by=total_amount.DESC())

pydough.to_df(result)

Unnamed: 0,payment_method,total_payments,total_amount
0,credit_card,7,426500.0
1,financing,5,252700.0
2,debit_card,5,216000.0


In [None]:
# Materialize the PyDough result as a DataFrame, then compare it with the SQL
# result produced by the reference query cell; the comparison's return value is
# shown as the cell output.
pydough_output = pydough.to_df(result)

# The category follows the naming used by the other basic queries instead of a placeholder.
dfcompare.compare_df(pydough_output, sql_output, query_category="basic_group_order_limit", question="What are the top 3 payment methods by total payment amount received? Return the payment method, total number of payments and total amount.")

True

### 8. SQLite BasicQuery 8 

What are the top 5 best selling car models by total revenue? Return the make, model, total number of sales and total revenue.

```SQL
SELECT c.make, c.model, COUNT(s.id) AS total_sales, 
SUM(s.sale_price) AS total_revenue 
FROM sales AS s 
JOIN cars AS c 
ON s.car_id = c.id 
GROUP BY c.make, c.model 
ORDER BY CASE WHEN total_revenue IS NULL THEN 1 ELSE 0 END DESC, total_revenue DESC LIMIT 5;

```



In [31]:
# Reference SQL for query 8, run directly against the SQLite connection.
# Its result is what the PyDough version is compared against.
query = """
SELECT c.make, c.model, COUNT(s._id) AS total_sales, 
SUM(s.sale_price) AS total_revenue 
FROM sales AS s 
JOIN cars AS c 
ON s.car_id = c._id 
GROUP BY c.make, c.model 
ORDER BY CASE WHEN total_revenue IS NULL THEN 1 ELSE 0 END DESC, total_revenue DESC LIMIT 5;
"""

sql_output = pd.read_sql_query(query, connection)
# Display the SQL result.
sql_output

Unnamed: 0,make,model,total_sales,total_revenue
0,Ford,Mustang,5,233500.0
1,Tesla,Model 3,4,184500.0
2,Audi,A4,2,81500.0
3,BMW,X5,1,63000.0
4,Subaru,Outback,2,59500.0


In [76]:
%%pydough
result = Cars.CALCULATE(
    make,
    model,
    total_sales=COUNT(sale_records._id),
    total_revenue=SUM(sale_records.sale_price)
).TOP_K(5, by=total_revenue.DESC())

pydough.to_df(result)


Unnamed: 0,make,model,total_sales,total_revenue
0,Ford,Mustang,5,233500.0
1,Tesla,Model 3,4,184500.0
2,Audi,A4,2,81500.0
3,BMW,X5,1,63000.0
4,Subaru,Outback,2,59500.0


In [33]:
# Materialize the PyDough result as a DataFrame, then compare it with the SQL
# result produced by the reference query cell; the comparison's return value is
# shown as the cell output.
pydough_output = pydough.to_df(result)

# Category and question are spelled out like in the other queries instead of placeholders.
dfcompare.compare_df(pydough_output, sql_output, query_category="basic_join_group_order_limit", question="What are the top 5 best selling car models by total revenue? Return the make, model, total number of sales and total revenue.")

True

### 9. SQLite BasicQuery 9

What are the total number of customer signups for the top 2 states? Return the state and total signups, starting from the top.

```SQL
SELECT state, COUNT(*) AS total_signups 
FROM customers 
GROUP BY state 
ORDER BY CASE WHEN total_signups IS NULL THEN 1 ELSE 0 END DESC, total_signups DESC LIMIT 2;
```



In [83]:
# Reference SQL for query 9, run directly against the SQLite connection.
# Its result is what the PyDough version is compared against.
query = """
SELECT state, COUNT(*) AS total_signups 
FROM customers 
GROUP BY state 
ORDER BY CASE WHEN total_signups IS NULL THEN 1 ELSE 0 END DESC, total_signups DESC LIMIT 2;
"""

sql_output = pd.read_sql_query(query, connection)
# Display the SQL result.
sql_output

Unnamed: 0,state,total_signups
0,CA,4
1,TX,3


In [84]:
%%pydough
result = PARTITION(Customers, name="grouped", by=state).CALCULATE(
    state,
    total_signups=COUNT(grouped._id) 
).TOP_K(2, by=total_signups.DESC())

pydough.to_df(result)



Unnamed: 0,state,total_signups
0,CA,4
1,TX,3


In [85]:
# Materialize the PyDough result as a DataFrame, then compare it with the SQL
# result produced by the reference query cell; the comparison's return value is
# shown as the cell output.
pydough_output = pydough.to_df(result)

# Category and question are spelled out like in the other queries instead of placeholders.
dfcompare.compare_df(pydough_output, sql_output, query_category="basic_group_order_limit", question="What are the total number of customer signups for the top 2 states? Return the state and total signups, starting from the top.")

True

### 10. SQLite BasicQuery 10 (Date)

Who were the top 3 sales representatives by total revenue in the past 3 months, inclusive of today's date? Return their first name, last name, total number of sales and total revenue. Note that revenue refers to the sum of sale_price in the sales table.

```SQL
SELECT c.first_name, c.last_name, COUNT(s.id) AS total_sales, 
SUM(s.sale_price) AS total_revenue 
FROM sales AS s 
JOIN salespersons AS c ON s.salesperson_id = c.id 
WHERE s.sale_date >= DATE('now', '-3 months') 
GROUP BY c.first_name, c.last_name 
ORDER BY total_revenue DESC LIMIT 3;
```



In [28]:
# Reference SQL for query 10, run directly against the SQLite connection.
# Its result is what the PyDough version is compared against.
# Note: the filter uses DATE('now', ...), so the result depends on the day the cell is run.
query = """
SELECT c.first_name, c.last_name, COUNT(s._id) AS total_sales, 
SUM(s.sale_price) AS total_revenue 
FROM sales AS s 
JOIN salespersons AS c ON s.salesperson_id = c._id 
WHERE s.sale_date >= DATE('now', '-3 months') 
GROUP BY c.first_name, c.last_name 
ORDER BY total_revenue DESC LIMIT 3;
"""

sql_output = pd.read_sql_query(query, connection)
# Display the SQL result.
sql_output

Unnamed: 0,first_name,last_name,total_sales,total_revenue
0,John,Doe,4,168000.0
1,Jane,Smith,3,140000.0
2,Michael,Johnson,2,69700.0


In [None]:
%%pydough

three_months_ago = DATETIME("now", "-3 months", "start of month")  

result = Salespersons.CALCULATE(
    first_name,
    last_name,
    total_sales=COUNT(sales_made.WHERE(sale_date >= three_months_ago)._id),  
    total_revenue=SUM(sales_made.WHERE(sale_date >= three_months_ago).sale_price)  
).TOP_K(3, by=total_revenue.DESC())

pydough.to_df(result)


Unnamed: 0,first_name,last_name,total_sales,total_revenue
0,John,Doe,4,168000.0
1,Jane,Smith,3,140000.0
2,Michael,Johnson,2,69700.0


In [30]:
# Materialize the PyDough result as a DataFrame, then compare it with the SQL
# result produced by the reference query cell; the comparison's return value is
# shown as the cell output.
pydough_output = pydough.to_df(result)

# The category follows the naming used by the other basic queries instead of a placeholder.
dfcompare.compare_df(pydough_output, sql_output, query_category="basic_join_date_group_order_limit", question="Who were the top 3 sales representatives by total revenue in the past 3 months, inclusive of today's date? Return their first name, last name, total number of sales and total revenue. Note that revenue refers to the sum of sale_price in the sales table.")

True

### 11. SQLite Generated Query 1 (Date)

Return the name and phone number of the salesperson with the shortest time from being hired to getting fired. Return the number of days he/she was employed for.

```SQL
SELECT s.first_name, s.last_name, s.phone, julianday(s.termination_date) - julianday(s.hire_date) AS days_employed 
FROM salespersons AS s 
ORDER BY CASE WHEN days_employed IS NULL THEN 1 ELSE 0 END, days_employed ASC LIMIT 1;
```



In [89]:
# Reference SQL for query 11, run directly against the SQLite connection.
# Its result is what the PyDough version is compared against.
query = """
SELECT s.first_name, s.last_name, s.phone, julianday(s.termination_date) - julianday(s.hire_date) AS days_employed 
FROM salespersons AS s 
ORDER BY CASE WHEN days_employed IS NULL THEN 1 ELSE 0 END, days_employed ASC LIMIT 1;
"""

sql_output = pd.read_sql_query(query, connection)
# Display the SQL result.
sql_output

Unnamed: 0,first_name,last_name,phone,days_employed
0,Olivia,Thomas,(333)-415-0000,181.0


In [87]:
%%pydough

result = Salespersons.WHERE(PRESENT(termination_date)).CALCULATE(
    first_name,
    last_name,
    phone,
    days_employed=DATEDIFF("days", hire_date, termination_date)
).TOP_K(1, by=days_employed.ASC())

pydough.to_df(result)


DATEDIFF unsupported for 'DAYS'.
DATEDIFF unsupported for 'DAYS'.


Unnamed: 0,first_name,last_name,phone,days_employed
0,Olivia,Thomas,(333)-415-0000,181


In [90]:
# Materialize the PyDough result as a DataFrame, then compare it with the SQL
# result produced by the reference query cell; the comparison's return value is
# shown as the cell output.
pydough_output = pydough.to_df(result)

dfcompare.compare_df(pydough_output, sql_output, query_category="date_functions", question="Return the name and phone number of the salesperson with the shortest time from being hired to getting fired. Return the number of days he/she was employed for.")

DATEDIFF unsupported for 'DAYS'.
DATEDIFF unsupported for 'DAYS'.


True

### 12. SQLite Generated Query 2 (PaymentsMade)

Return the number of payments made on weekends to the vendor named 'Utility Company'

```SQL
SELECT COUNT(*) AS weekend_payments 
FROM payments_made 
WHERE vendor_name = 'Utility Company' 
AND strftime('%w', payment_date) IN ('0', '6');
```



In [38]:
# Reference SQL for query 12, run directly against the SQLite connection.
# Its result is what the PyDough version is compared against.
# strftime('%w', ...) yields the weekday as '0'..'6'; the filter keeps '0' and '6'.
query = """
SELECT COUNT(*) AS weekend_payments 
FROM payments_made 
WHERE vendor_name = 'Utility Company' 
AND strftime('%w', payment_date) IN ('0', '6');
"""

sql_output = pd.read_sql_query(query, connection)
# Display the SQL result.
sql_output

Unnamed: 0,weekend_payments
0,1


In [442]:
%%pydough

payment_days_since_start = PaymentsMade.CALCULATE(
    payment_date,
    days_since_start=DATEDIFF("days", DATETIME("now", "start of month"), payment_date)
)

result = payment_days_since_start.WHERE(
    (vendor_name == "Utility Company") & ((DATEDIFF("days", DATETIME("now", "start of week"), payment_date) >= 5) & (DATEDIFF("days", DATETIME("now", "start of week"), payment_date) >= 6)    
    )
).CALCULATE(
    weekend_payments=COUNT(_id)
)

pydough.to_df(result)


PyDoughQDAGException: Unrecognized term of graph 'Dealership': 'PaymentsMade'

In [None]:
# Materialize the PyDough result as a DataFrame, then compare it with the SQL
# result produced by the reference query cell; the comparison's return value is
# shown as the cell output.
pydough_output = pydough.to_df(result)

dfcompare.compare_df(pydough_output, sql_output, query_category="date_functions", question="Return the number of payments made on weekends to the vendor named 'Utility Company'")

### 13. SQLite Generated Query 3 (Date, start of week)

Show me the daily total amount of payments received in the whole of the previous ISO week not including the current week, split by the payment_method.


```SQL
SELECT payment_date, payment_method, SUM(payment_amount) AS total_amount 
FROM payments_received 
WHERE payment_date >= DATE('now',  '-' || ((strftime('%w', 'now') + 6) % 7) || ' days', '-7 days') 
AND payment_date < DATE('now',  '-' || ((strftime('%w', 'now') + 6) % 7) || ' days') 
GROUP BY payment_date, payment_method ORDER BY payment_date DESC, payment_method ASC;
```
**Note:** `start of week` is not implemented yet.


In [220]:
# Reference SQL for query 13, run directly against the SQLite connection.
# Its result is what the PyDough version is compared against.
# ((strftime('%w', 'now') + 6) % 7) is the number of days since the most recent
# Monday, so the window is the seven days before the current week's Monday.
query = """
SELECT payment_date, payment_method, SUM(payment_amount) AS total_amount 
FROM payments_received 
WHERE payment_date >= DATE('now',  '-' || ((strftime('%w', 'now') + 6) % 7) || ' days', '-7 days') 
AND payment_date < DATE('now',  '-' || ((strftime('%w', 'now') + 6) % 7) || ' days') 
GROUP BY payment_date, payment_method ORDER BY payment_date DESC, payment_method ASC;
"""

sql_output = pd.read_sql_query(query, connection)
# Display the SQL result.
sql_output

Unnamed: 0,payment_date,payment_method,total_amount
0,2025-02-15,debit_card,26500.0
1,2025-02-14,debit_card,24000.0
2,2025-02-14,financing,115000.0
3,2025-02-13,credit_card,115000.0
4,2025-02-12,debit_card,115000.0


In [None]:
%%pydough

# Author's note: start of week is not implemented yet.
# NOTE(review): the reference SQL anchors weeks on Monday; confirm that
# "start of week" uses the same convention before trusting this result.
# Keep payments dated between 7 days and 1 day before the start of the current week.
payments = PaymentsReceived.WHERE(
    (DATEDIFF("days", DATETIME("now", "start of week", "-7 D"), payment_date) >= 0) &
    (DATEDIFF("days", payment_date, DATETIME("now", "start of week", "-1 D")) >= 0)
)

# One row per (payment_date, payment_method) with the summed amount,
# newest date first and methods in alphabetical order within a date.
result = PARTITION(payments, name="grp", by=(payment_date, payment_method)).CALCULATE(
    payment_date,
    payment_method,
    total_amount=SUM(grp.payment_amount)
).ORDER_BY(payment_date.DESC(), payment_method.ASC())

pydough.to_df(result)


DATEDIFF unsupported for 'DAYS'.
DATEDIFF unsupported for 'DAYS'.


Unnamed: 0,payment_date,payment_method,total_amount
0,2025-02-15,debit_card,26500.0
1,2025-02-14,debit_card,24000.0
2,2025-02-14,financing,115000.0
3,2025-02-13,credit_card,115000.0
4,2025-02-12,debit_card,115000.0


In [None]:
# Materialize the PyDough result as a DataFrame, then compare it with the SQL
# result produced by the reference query cell; the comparison's return value is
# shown as the cell output.
pydough_output = pydough.to_df(result)

dfcompare.compare_df(pydough_output, sql_output, query_category="date_functions", question="Show me the daily total amount of payments received in the whole of the previous ISO week not including the current week, split by the payment_method.")

True

### 14. SQLite Generated Query 3

What were the total quarterly sales in 2023 grouped by customer's state? Represent each quarter as the first date in the quarter.

```SQL
SELECT CASE WHEN strftime('%m', s.sale_date) BETWEEN '01' AND '03' THEN '2023-01-01' 
WHEN strftime('%m', s.sale_date) BETWEEN '04' AND '06' THEN '2023-04-01' 
WHEN strftime('%m', s.sale_date) BETWEEN '07' AND '09' THEN '2023-07-01' ELSE '2023-10-01' END AS quarter, 
c.state, SUM(s.sale_price) AS total_sales 
FROM sales AS s 
JOIN customers AS c 
ON s.customer_id = c.id 
WHERE strftime('%Y', s.sale_date) = '2023' 
GROUP BY c.state, quarter 
HAVING SUM(s.sale_price) > 0 
ORDER BY quarter, c.state;
```



In [150]:
# Reference SQL for query 14, run directly against the SQLite connection.
# Its result is what the PyDough version is compared against.
# The CASE expression maps each sale month to the first day of its 2023 quarter.
query = """
SELECT CASE WHEN strftime('%m', s.sale_date) BETWEEN '01' AND '03' THEN '2023-01-01' 
WHEN strftime('%m', s.sale_date) BETWEEN '04' AND '06' THEN '2023-04-01' 
WHEN strftime('%m', s.sale_date) BETWEEN '07' AND '09' THEN '2023-07-01' ELSE '2023-10-01' END AS quarter, 
c.state, SUM(s.sale_price) AS total_sales 
FROM sales AS s 
JOIN customers AS c 
ON s.customer_id = c._id 
WHERE strftime('%Y', s.sale_date) = '2023' 
GROUP BY c.state, quarter 
HAVING SUM(s.sale_price) > 0 
ORDER BY quarter, c.state;
"""

sql_output = pd.read_sql_query(query, connection)
# Display the SQL result.
sql_output

Unnamed: 0,quarter,state,total_sales
0,2023-01-01,AZ,47000.0
1,2023-01-01,CA,26500.0
2,2023-01-01,IL,30500.0
3,2023-01-01,TX,61500.0
4,2023-04-01,CA,105500.0
5,2023-04-01,NY,30000.0
6,2023-04-01,PA,26800.0
7,2023-04-01,TX,44500.0


In [None]:
%%pydough


filtered_sales = Sales.WHERE(YEAR(sale_date) == 2023).CALCULATE(
    sale_price,
    quarter=IFF(
        MONTH(sale_date) <= 3, "2023-01-01",
        IFF(MONTH(sale_date) <= 6, "2023-04-01",
        IFF(MONTH(sale_date) <= 9, "2023-07-01", "2023-10-01"))
    ),
    customer_state=customer.state,
)

result = PARTITION(filtered_sales, name="s", by=(quarter, customer_state)).CALCULATE(
    quarter,
    customer_state,
    total_sales=SUM(s.sale_price)
).WHERE(total_sales > 0).ORDER_BY(quarter.ASC(), customer_state.ASC())

pydough.to_df(result)


Unnamed: 0,quarter,customer_state,total_sales
0,2023-01-01,AZ,47000.0
1,2023-01-01,CA,26500.0
2,2023-01-01,IL,30500.0
3,2023-01-01,TX,61500.0
4,2023-04-01,CA,105500.0
5,2023-04-01,NY,30000.0
6,2023-04-01,PA,26800.0
7,2023-04-01,TX,44500.0


In [154]:
# Materialize the PyDough result as a DataFrame, then compare it with the SQL
# result produced by the reference query cell; the comparison's return value is
# shown as the cell output.
pydough_output = pydough.to_df(result)

# The question passed here is the one this query answers (quarterly sales by state).
dfcompare.compare_df(pydough_output, sql_output, query_category="date_functions", question="What were the total quarterly sales in 2023 grouped by customer's state? Represent each quarter as the first date in the quarter.")

True

### 15. SQLite Generated Query 4

Which cars were in inventory in the latest snapshot for March 2023? Return the car id, make, model, and year. Cars are considered to be "in inventory" if is_in_inventory is True.

```SQL
WITH latest_snapshot AS (SELECT MAX(snapshot_date) AS snapshot_date 
FROM inventory_snapshots 
WHERE snapshot_date BETWEEN '2023-03-01' AND '2023-03-31'), latest_snapshot_data AS 
(SELECT inv.car_id 
FROM inventory_snapshots AS inv 
JOIN latest_snapshot AS ls 
ON inv.snapshot_date = ls.snapshot_date WHERE inv.is_in_inventory = TRUE) 
SELECT c.id, c.make, c.model, c.year 
FROM cars AS c 
JOIN latest_snapshot_data AS lsd 
ON c.id = lsd.car_id;
```



In [58]:
# Reference SQL for query 15, run directly against the SQLite connection.
# latest_snapshot holds the most recent snapshot date in March 2023;
# latest_snapshot_data holds the cars flagged as in inventory on that date.
query = """
WITH latest_snapshot AS (SELECT MAX(snapshot_date) AS snapshot_date 
FROM inventory_snapshots 
WHERE snapshot_date BETWEEN '2023-03-01' AND '2023-03-31'), latest_snapshot_data AS 
(SELECT inv.car_id 
FROM inventory_snapshots AS inv 
JOIN latest_snapshot AS ls 
ON inv.snapshot_date = ls.snapshot_date WHERE inv.is_in_inventory = TRUE) 
SELECT c._id, c.make, c.model, c.year 
FROM cars AS c 
JOIN latest_snapshot_data AS lsd 
ON c._id = lsd.car_id;
"""

sql_output = pd.read_sql_query(query, connection)
# Display the SQL result.
sql_output

Unnamed: 0,_id,make,model,year
0,1,Toyota,Camry,2022
1,3,Ford,Mustang,2023


In [None]:
%%pydough

# Step 1: Partition InventorySnapshots by a constant key to compute the latest snapshot date
latest_snapshot = PARTITION(
    InventorySnapshots.WHERE(
        (snapshot_date >= "2023-03-01") & 
        (snapshot_date <= "2023-03-31")
    ),
    name="inv", by=(snapshot_date) 
).CALCULATE(
    snapshot_date,
    latest_snapshot_date=MAX(inv.snapshot_date)
)

# Step 2: Get car IDs from inventory in that latest snapshot
latest_snapshot_data = InventorySnapshots.WHERE( 
    (is_in_inventory == True)
).CALCULATE(
    car_id
)


result = Cars.WHERE(
      HAS(inventory_snapshots.WHERE(
        (is_in_inventory == 1)
)))

pydough.to_df(latest_snapshot)



Unnamed: 0,_id,make,model,year,color,vin_number,engine_type,transmission,cost,crtd_ts
0,1,Toyota,Camry,2022,Silver,4T1BF1FK3CU510984,V6,Automatic,28500.0,2025-02-20 20:11:44
1,2,Honda,Civic,2021,platinum/grey,2HGFC2F53MH522780,Inline 4,CVT,22000.0,2025-02-20 20:11:44
2,3,Ford,Mustang,2023,blue,1FA6P8TH4M5100001,V8,Manual,45000.0,2025-02-20 20:11:44
3,4,Tesla,Model 3,2022,fuschia,5YJ3E1EB7MF123456,Electric,Automatic,41000.0,2025-02-20 20:11:44
4,5,Chevrolet,Equinox,2021,midnight blue,2GNAXUEV1M6290124,Inline 4,Automatic,26500.0,2025-02-20 20:11:44
5,6,Nissan,Altima,2022,Jet black,1N4BL4BV4NN123456,V6,CVT,25000.0,2025-02-20 20:11:44
6,7,BMW,X5,2023,Titan Silver,5UXCR6C56M9A12345,V8,Automatic,62000.0,2025-02-20 20:11:44
7,8,Audi,A4,2022,Blue,WAUBNAF47MA098765,Inline 4,Automatic,39000.0,2025-02-20 20:11:44
8,9,Lexus,RX350,2021,Fiery red,2T2BZMCA7MC143210,V6,Automatic,45500.0,2025-02-20 20:11:44
9,10,Subaru,Outback,2022,Jade,4S4BSANC2N3246801,Boxer 4,CVT,28000.0,2025-02-20 20:11:44


### 16. SQLite Advanced Query 1

For sales with sale price over $30,000, how many payments were received in total and on weekends in each of the last 8 calendar weeks (excluding the current week)? Return the week (as a date), total payments received, and weekend payments received in ascending order.

Weekend days are Saturday (6) and Sunday (0). Truncate date to week for aggregation. A week begins on 'weekday 1'

To calculate the average days between sale date and payment received date, join the sales and payments received tables. Weekend days are Saturday (6) and Sunday (0). Truncate date to week for aggregation. When using car makes, model names, engine_type and vin_number, match case-insensitively and allow partial matches using LIKE with wildcards. To get the total sales amount per salesperson, join the salespersons and sales tables, group by salesperson, and sum the sale_price

```SQL
SELECT date(p.payment_date,  '-' || ((strftime('%w', p.payment_date) + 6) % 7) || ' days') AS week, 
COUNT(p.id) AS total_payments, COUNT(CASE WHEN strftime('%w', p.payment_date) IN ('0', '6') THEN 1 END) AS weekend_payments 
FROM payments_received AS p 
JOIN sales AS s ON p.sale_id = s.id 
WHERE s.sale_price > 30000 
AND p.payment_date >= date('now',  '-' || ((strftime('%w', 'now') + 6) % 7) || ' days', '-56 days') 
AND p.payment_date < date('now',  '-' || ((strftime('%w', 'now') + 6) % 7) || ' days') 
GROUP BY week ORDER BY week ASC;
```



In [236]:
%%pydough
# Step 1: Filter sales with sale price > 30,000
high_value_sales = Sales.CALCULATE(
    _id,
    sale_price
).WHERE(sale_price > 30000)

# Step 2: Join payments to these sales
valid_payments = PaymentsReceived.WHERE(HAS(high_value_sales)).CALCULATE(
    payment_date,
    _id
)


pydough.to_df(valid_payments)


PyDoughQDAGException: Unrecognized term of simple table collection 'PaymentsReceived' in graph 'Dealership': 'Sales'

### 17. SQLite Advanced Query 2 (DATE)

How many sales did each salesperson make in the past 30 days, inclusive of today's date? Return their ID, first name, last name and number of sales made, ordered from most to least sales.

To get the number of sales made by each salesperson in the past 30 days, join the salespersons and sales tables and filter for sales in the last 30 days.

"When using car makes, model names, engine_type, and vin_number, ensure matching is case-insensitive and allows for partial matches using LIKE with wildcards.
To get the number of sales made by each salesperson in the past 30 days, join the salespersons and sales tables and filter for sales in the last 30 days.
ASP = Calculate the average sale price without specifying the period
GPM = Define gross profit margin as a ratio without specifying how to calculate total revenue or total cost"

```SQL
WITH recent_sales AS (
    SELECT sp._id, sp.first_name, sp.last_name, COUNT(s._id) AS num_sales
    FROM salespersons AS sp
    LEFT JOIN sales AS s ON sp._id = s.salesperson_id
    WHERE s.sale_date >= DATE('now', '-30 days')
    GROUP BY sp._id
) 
SELECT _id, first_name, last_name, num_sales FROM recent_sales
ORDER BY num_sales DESC;
```


In [241]:
# Reference SQL for query 17: sales per salesperson dated on or after 30 days ago.
query = """
WITH recent_sales AS (
    SELECT sp._id, sp.first_name, sp.last_name, COUNT(s._id) AS num_sales
    FROM salespersons AS sp
    LEFT JOIN sales AS s ON sp._id = s.salesperson_id
    WHERE s.sale_date >= DATE('now', '-30 days')
    GROUP BY sp._id
) 
SELECT _id, first_name, last_name, num_sales FROM recent_sales
ORDER BY num_sales DESC;
"""

# Run the reference query on the notebook's SQLite connection and display it.
sql_output = pd.read_sql_query(sql=query, con=connection)
sql_output

Unnamed: 0,_id,first_name,last_name,num_sales
0,2,Jane,Smith,3
1,3,Michael,Johnson,2
2,1,John,Doe,1
3,4,Emily,Brown,1
4,6,Sarah,Taylor,1


In [244]:
%%pydough

date_threshold = DATETIME("now", "-30 days")

result = Salespersons.WHERE(
    HAS(sales_made.WHERE(sale_date >= date_threshold))
).CALCULATE(
    _id,
    first_name,
    last_name,
    num_sales=COUNT(sales_made.WHERE(sale_date >= date_threshold)._id)
).ORDER_BY(num_sales.DESC())

pydough.to_df(result)



Unnamed: 0,_id,first_name,last_name,num_sales
0,2,Jane,Smith,3
1,3,Michael,Johnson,2
2,1,John,Doe,1
3,4,Emily,Brown,1
4,6,Sarah,Taylor,1


In [90]:
# Materialise the PyDough result and compare it with the SQL reference output.
pydough_output = pydough.to_df(result)

dfcompare.compare_df(
    pydough_output,
    sql_output,
    query_category="instructions_cte_join",
    question="How many sales did each salesperson make in the past 30 days, inclusive of today's date? Return their ID, first name, last name and number of sales made, ordered from most to least sales.",
)

True

### 18. SQLite Advanced Query 3

How many sales were made for each car model that has 'M5' in its VIN number? Return the make, model and number of sales.

When using car makes, model names, engine_type and vin_number, match case-insensitively and allow partial matches using LIKE with wildcards.

To determine the total sales amount for each salesperson, combine data from the salespersons and sales tables, grouping by salesperson and summing the sale_price
When using car makes, model names, engine_type and vin_number, match case-insensitively and allow partial matches using LIKE with wildcards.
To calculate the average selling price, join the sales and cars tables, and divide the total sales amount by the number of sales
For understanding the number of sales achieved by each salesperson within a specified period, merge the salespersons and sales tables and apply a filter based on the given time frame.

```SQL
SELECT c.make, c.model, COUNT(s.id) AS num_sales 
FROM cars AS c 
LEFT JOIN sales AS s ON c.id = s.car_id 
WHERE LOWER(c.vin_number) 
LIKE '%m5%' 
GROUP BY c.make, c.model;
```



In [246]:
# Reference SQL for query 18: sales per make/model for cars whose VIN contains "m5".
query = """
SELECT c.make, c.model, COUNT(s._id) AS num_sales 
FROM cars AS c 
LEFT JOIN sales AS s ON c._id = s.car_id 
WHERE LOWER(c.vin_number) 
LIKE '%m5%' 
GROUP BY c.make, c.model;
"""

# Run the reference query on the notebook's SQLite connection and display it.
sql_output = pd.read_sql_query(sql=query, con=connection)
sql_output

Unnamed: 0,make,model,num_sales
0,Ford,Mustang,5


In [245]:
%%pydough

result = Cars.CALCULATE(
    make, 
    model,
    num_sales=COUNT(sale_records._id)
    ).WHERE(
        LIKE(vin_number, "%m5%") 
).ORDER_BY(num_sales.DESC())

pydough.to_df(result)


Unnamed: 0,make,model,num_sales
0,Ford,Mustang,5


In [247]:
# Materialise the PyDough result and compare it with the SQL reference output.
pydough_output = pydough.to_df(result)

dfcompare.compare_df(
    pydough_output,
    sql_output,
    query_category="instructions_string_matching",
    question="How many sales were made for each car model that has 'M5' in its VIN number? Return the make, model and number of sales.",
)

True

### 19. SQLite Advanced Query 4 (Date)

How many Toyota cars were sold in the last 30 days inclusive of today? Return the number of sales and total revenue.

To calculate the average days between sale date and payment received date, join the sales and payments_received tables
To get the list of cars that were sold and their sale price, join the cars and sales tables
Last 30 days = DATE('now', '-30 days') to DATE('now'). Always join sales with cars before using the sales table.
When using car makes, model names, engine_type, and vin_number, match case-insensitively and allow partial matches using LIKE with wildcards.

```SQL
SELECT COUNT(s.id) AS num_sales, SUM(s.sale_price) AS total_revenue FROM sales AS s 
JOIN cars AS c 
ON s.car_id = c.id
WHERE c.make = 'Toyota' AND s.sale_date BETWEEN DATE('now', '-30 days') AND DATE('now');
```



In [249]:
# Reference SQL for query 19: Toyota sales count and revenue over the last 30 days.
#
# The make is matched case-insensitively with a partial LIKE, as the question's
# instructions require. The previous `c.make = 'toyota'` was a case-sensitive
# exact comparison, so it could never match a make stored as 'Toyota'.
query = """
SELECT COUNT(s._id) AS num_sales, SUM(s.sale_price) AS total_revenue FROM sales AS s 
JOIN cars AS c 
ON s.car_id = c._id
WHERE LOWER(c.make) LIKE '%toyota%' AND s.sale_date BETWEEN DATE('now', '-30 days') AND DATE('now');
"""

# Run the reference query on the notebook's SQLite connection and display it.
sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,num_sales,total_revenue
0,0,


In [None]:
%%pydough

date_threshold = DATETIME("now", "-30 days")

result = Cars.WHERE(
    LIKE(LOWER(make), "%toyota%") 
).CALCULATE(
    num_sales=COUNT(sale_records.WHERE(sale_date >= date_threshold)._id),
    total_revenue=SUM(sale_records.WHERE(sale_date >= date_threshold).sale_price)
)


pydough.to_df(result)

Unnamed: 0,num_sales,total_revenue
0,0,0


In [103]:
# Materialise the PyDough result and compare it with the SQL reference output.
pydough_output = pydough.to_df(result)

dfcompare.compare_df(
    pydough_output,
    sql_output,
    query_category="instructions_date_join",
    question="How many Toyota cars were sold in the last 30 days inclusive of today? Return the number of sales and total revenue.",
)

  df_gen.fillna(-99999, inplace=True)


np.False_

### 20. SQLite Advanced Query 5

Return the first name, last name, total sales amount, number of sales, and SR for each salesperson.

SR = sales rank of each salesperson, ordered by their total sales amount descending. To determine the sales performance per territory, sum the sales amount and count the sales, grouping by territory. To calculate the average sale price, join the sales table with itself on the salesperson_id and find the ratio of total sales amount to number of sales. To assess inventory turnover, compare inventory snapshots with sales on matching days, focusing on the quantity of items sold.

```SQL
WITH salesperson_sales AS (
    SELECT 
        salesperson_id, 
        SUM(sale_price) AS total_sales, 
        COUNT(*) AS num_sales 
    FROM sales 
    GROUP BY salesperson_id
) 
SELECT 
    s.first_name, 
    s.last_name, 
    ss.total_sales, 
    ss.num_sales, 
    RANK() OVER (
        ORDER BY 
            CASE WHEN ss.total_sales IS NULL THEN 1 ELSE 0 END DESC, 
            ss.total_sales DESC
    ) AS sales_rank 
FROM salesperson_sales AS ss 
JOIN salespersons AS s ON ss.salesperson_id = s._id;

```



In [254]:
# Reference SQL for query 20: per-salesperson total sales, sale count and sales rank.
query = """
WITH salesperson_sales AS (
    SELECT 
        salesperson_id, 
        SUM(sale_price) AS total_sales, 
        COUNT(*) AS num_sales 
    FROM sales 
    GROUP BY salesperson_id
) 
SELECT 
    s.first_name, 
    s.last_name, 
    ss.total_sales, 
    ss.num_sales, 
    RANK() OVER (
        ORDER BY 
            CASE WHEN ss.total_sales IS NULL THEN 1 ELSE 0 END DESC, 
            ss.total_sales DESC
    ) AS sales_rank 
FROM salesperson_sales AS ss 
JOIN salespersons AS s ON ss.salesperson_id = s._id;
"""

# Run the reference query on the notebook's SQLite connection and display it.
sql_output = pd.read_sql_query(sql=query, con=connection)
sql_output

Unnamed: 0,first_name,last_name,total_sales,num_sales,sales_rank
0,Jane,Smith,278000.0,6,1
1,John,Doe,215000.0,5,2
2,Sarah,Taylor,116000.0,3,3
3,Michael,Johnson,96500.0,3,4
4,Emily,Brown,79500.0,3,5
5,Daniel,Anderson,66900.0,2,6


In [443]:
%%pydough

total_sales= SUM(sales_made.sale_price)

result = Salespersons.WHERE(HAS(sales_made)).CALCULATE(
    first_name, 
    last_name,
    total_sales=total_sales,  
    num_sales=COUNT(sales_made._id),
    sales_rank=RANKING(by=total_sales.DESC())  
).ORDER_BY(total_sales.DESC())

pydough.to_df(result)



Unnamed: 0,first_name,last_name,total_sales,num_sales,sales_rank
0,Jane,Smith,278000.0,6,1
1,John,Doe,215000.0,5,2
2,Sarah,Taylor,116000.0,3,3
3,Michael,Johnson,96500.0,3,4
4,Emily,Brown,79500.0,3,5
5,Daniel,Anderson,66900.0,2,6


In [255]:
# Materialise the PyDough result and compare it with the SQL reference output.
pydough_output = pydough.to_df(result)

dfcompare.compare_df(
    pydough_output,
    sql_output,
    query_category="instructions_cte_window",
    question="Return the first name, last name, total sales amount, number of sales, and SR for each salesperson",
)



True

### 21. SQLite Advanced Query 6

Return the highest sale price for each make and model of cars that have been sold and are no longer in inventory, ordered by the sale price from highest to lowest. Use the most recent date in the inventory_snapshots table to determine that car's inventory status.

"Recall that a car can have multiple entries in the inventory_snapshot table. 
TSC = Count of sales within a specified period
MoM = Change in total receivable amounts from one month to the next, comparing with the immediately preceding month.
ASP = Mean sale price for a designated start period
When getting a car's inventory status, always take the latest status from the inventory_snapshots table"

```SQL
WITH latest_inventory_status AS (
    SELECT 
        car_id, 
        is_in_inventory, 
        ROW_NUMBER() OVER (
            PARTITION BY car_id 
            ORDER BY 
                CASE WHEN snapshot_date IS NULL THEN 1 ELSE 0 END DESC, 
                snapshot_date DESC
        ) AS rn
    FROM inventory_snapshots
) 
SELECT 
    c.make, 
    c.model, 
    MAX(s.sale_price) AS highest_sale_price 
FROM cars AS c 
JOIN sales AS s ON c.id = s.car_id 
JOIN latest_inventory_status AS lis ON c.id = lis.car_id 
WHERE lis.is_in_inventory = FALSE 
AND lis.rn = 1 
GROUP BY c.make, c.model 
ORDER BY 
    CASE WHEN highest_sale_price IS NULL THEN 1 ELSE 0 END DESC, 
    highest_sale_price DESC;

```



In [444]:
%%pydough

result= Cars.WHERE(BEST(inventory_snapshots, by=snapshot_date.DESC()).is_in_inventory).CALCULATE(
 make,
 model,
 MAX(sales.sale_price)
)

pydough.to_df(result)


PyDoughUnqualifiedException: PyDough nodes BEST is not callable. Did you mean to use a function?

### 22. SQLite Advanced Query 7 

What are the details and average sale price for cars that have 'Ford' in their make name or 'Mustang' in the model name? Return make, model, year, color, vin_number and avg_sale_price.

To calculate the gross profit margin, use the formula where GPM is the quotient of total revenue minus total cost over total cost, multiplied by 100. When using car makes, model names, engine_type and vin_number, match case-insensitively and allow partial matches using LIKE with wildcards. To compute the average sales price, divide the total sales amount by the number of sales. To determine the sales rank of each salesperson, order by their total sales amount in descending order.

```SQL
SELECT c.make, c.model, c.year, c.color, c.vin_number, AVG(s.sale_price) AS avg_sale_price 
FROM cars AS c 
JOIN sales AS s 
ON c.id = s.car_id
 WHERE LOWER(c.make) LIKE '%ford%' OR LOWER(c.model) LIKE '%mustang%' 
 GROUP BY c.make, c.model, c.year, c.color, c.vin_number;
```



In [266]:
# Reference SQL for query 22: details and average sale price of Ford / Mustang cars.
query = """
SELECT c.make, c.model, c.year, c.color, c.vin_number, AVG(s.sale_price) AS avg_sale_price 
FROM cars AS c 
JOIN sales AS s 
ON c._id = s.car_id
 WHERE LOWER(c.make) LIKE '%ford%' OR LOWER(c.model) LIKE '%mustang%' 
 GROUP BY c.make, c.model, c.year, c.color, c.vin_number;
"""

# Run the reference query on the notebook's SQLite connection and display it.
sql_output = pd.read_sql_query(sql=query, con=connection)
sql_output

Unnamed: 0,make,model,year,color,vin_number,avg_sale_price
0,Ford,Mustang,2023,blue,1FA6P8TH4M5100001,46700.0


In [None]:
%%pydough

result = Cars.WHERE(
    LIKE(LOWER(make), "%ford%") | LIKE(LOWER(model), "%mustang%") 
).CALCULATE(
    make,
    model,
    year,
    color,
    vin_number,
    avg_sale_price=AVG(sale_records.sale_price)  
) 

pydough.to_df(result)

Unnamed: 0,make,model,year,color,vin_number,avg_sale_price
0,Ford,Mustang,2023,blue,1FA6P8TH4M5100001,46700.0


### 23. SQLite Advanced Query 8

What are the PMSPS and PMSR in the last 6 months excluding the current month, for salespersons hired between 2022 and 2023 (both inclusive)? Return all months in your answer, including those where metrics are 0. Order by month ascending.

"PMSPS = per month salesperson sales count. PMSR = per month sales revenue in dollars. Truncate date to month for aggregation.
ASP = Average Sale Price during a specific timeframe
To calculate the average days between a sale date and when the payment was received, join the relevant tables.
TSC = Total Sales Count for a given period"

```SQL
WITH RECURSIVE date_range(month_start) AS (
    SELECT DATE('now', '-6 months', 'start of month') AS month_start
    UNION ALL
    SELECT DATE(month_start, '+1 month')
    FROM date_range
    WHERE month_start < DATE('now', '-1 month', 'start of month')
),
sales_metrics AS (
    SELECT 
        strftime('%Y-%m', s.sale_date) AS sale_month,
        COUNT(s.id) AS PMSPS,
        SUM(s.sale_price) AS PMSR
    FROM sales AS s
    JOIN salespersons AS sp ON s.salesperson_id = sp.id
    WHERE 
        strftime('%Y', sp.hire_date) BETWEEN '2022' AND '2023'
        AND s.sale_date >= DATE('now', '-6 months', 'start of month')
        AND s.sale_date < DATE('now', 'start of month')
    GROUP BY sale_month
)
SELECT 
    dr.month_start,
    COALESCE(sm.PMSPS, 0) AS PMSPS,
    COALESCE(sm.PMSR, 0) AS PMSR
FROM date_range AS dr
LEFT JOIN sales_metrics AS sm 
    ON strftime('%Y-%m', dr.month_start) = sm.sale_month
ORDER BY dr.month_start ASC;
```
Notes:

Missing Months with Zero Sales:

SQL ensures all months appear using a recursive CTE (WITH RECURSIVE date_range).
PyDough does not have a built-in way to generate an explicit list of months.
As a result, PyDough only includes months that have at least one sale.

Difference in Data Aggregation

The SQL query buckets sales by a year-month string (strftime('%Y-%m', s.sale_date)) and reports each month as its first day (month_start, e.g. 2024-12-01).
The PyDough query constructs sale_month = DATETIME(sale_date, "start of month"), a timestamp such as 2024-12-01 00:00:00, which may cause formatting differences.
Total Revenue (PMSR) Difference

In SQL, SUM(s.sale_price) is grouped by all months, even those with 0 sales.
In PyDough, the sum is computed only for months that exist in sales data.

In [306]:
# Reference SQL for query 23: monthly sales count (PMSPS) and revenue (PMSR)
# over the six months before the current one, for salespersons hired in
# 2022-2023, including months without sales.
#
# 'start of month' is applied BEFORE the month offset. SQLite normalises an
# overflowing day after month arithmetic (e.g. "Feb 31" becomes early March),
# so the previous order ('-6 months', 'start of month') could land on the wrong
# month when run on the 29th-31st.
query = """
WITH RECURSIVE date_range(month_start) AS (
    SELECT DATE('now', 'start of month', '-6 months') AS month_start
    UNION ALL
    SELECT DATE(month_start, '+1 month')
    FROM date_range
    WHERE month_start < DATE('now', 'start of month', '-1 month')
),
sales_metrics AS (
    SELECT 
        strftime('%Y-%m', s.sale_date) AS sale_month,
        COUNT(s._id) AS PMSPS,
        SUM(s.sale_price) AS PMSR
    FROM sales AS s
    JOIN salespersons AS sp ON s.salesperson_id = sp._id
    WHERE 
        strftime('%Y', sp.hire_date) BETWEEN '2022' AND '2023'
        AND s.sale_date >= DATE('now', 'start of month', '-6 months')
        AND s.sale_date < DATE('now', 'start of month')
    GROUP BY sale_month
)
SELECT 
    dr.month_start,
    COALESCE(sm.PMSPS, 0) AS PMSPS,
    COALESCE(sm.PMSR, 0) AS PMSR
FROM date_range AS dr
LEFT JOIN sales_metrics AS sm 
    ON strftime('%Y-%m', dr.month_start) = sm.sale_month
ORDER BY dr.month_start ASC;
"""

# Run the reference query on the notebook's SQLite connection and display it.
sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,month_start,PMSPS,PMSR
0,2024-08-01,0,0.0
1,2024-09-01,0,0.0
2,2024-10-01,0,0.0
3,2024-11-01,0,0.0
4,2024-12-01,3,142000.0
5,2025-01-01,0,0.0


In [430]:
%%pydough

eligible_salespersons = Salespersons.WHERE(
    (YEAR(hire_date) >= 2022) & (YEAR(hire_date) <= 2023) 
)


filtered_sales = Sales.WHERE(
    (DATEDIFF("months", sale_date, DATETIME("now", "start of month")) >= 1) &  
    (DATEDIFF("months", sale_date, DATETIME("now", "start of month")) <= 6) &
    (HAS(salesperson.WHERE((YEAR(hire_date) >= 2022) & (YEAR(hire_date) <= 2023))))
).CALCULATE(
    sale_price,
    sale_month=DATETIME(sale_date, "start of month")
)


sales_metrics = PARTITION(filtered_sales, name="s", by=sale_month).CALCULATE(
    sale_month,
    PMSPS=COUNT(s._id),
    PMSR=SUM(s.sale_price) 
)

result = sales_metrics.ORDER_BY(sale_month.ASC())

pydough.to_df(result)



Unnamed: 0,sale_month,PMSPS,PMSR
0,2024-12-01 00:00:00,3,142000.0


### 24. SQLite Advanced Query 9

What is the ASP for sales made in the first quarter of 2023?

```SQL
SELECT AVG(sale_price) AS ASP 
FROM sales 
WHERE sale_date >= '2023-01-01' AND sale_date <= '2023-03-31';
```

In [308]:
# Reference SQL for query 24: average sale price (ASP) for Q1 2023.
query = """
SELECT AVG(sale_price) AS ASP 
FROM sales 
WHERE sale_date >= '2023-01-01' AND sale_date <= '2023-03-31';
"""

# Run the reference query on the notebook's SQLite connection and display it.
sql_output = pd.read_sql_query(sql=query, con=connection)
sql_output

Unnamed: 0,ASP
0,33100.0


In [307]:
%%pydough


result = Dealership.CALCULATE(
    ASP=AVG(
        Sales.WHERE((sale_date >= "2023-01-01") & (sale_date <= "2023-03-31")).sale_price
    )
)


pydough.to_df(result)


Unnamed: 0,ASP
0,33100.0


### 25. SQLite Advanced Query 10

What is the average number of days between the sale date and payment received date, rounded to 2 decimal places?

```SQL
WITH sale_payments AS (SELECT s.id AS sale_id, s.sale_date, MAX(p.payment_date) AS latest_payment_date 
FROM sales AS s 
JOIN payments_received AS p 
ON s.id = p.sale_id 
GROUP BY s.id, s.sale_date) 
SELECT ROUND(AVG(julianday(latest_payment_date) - julianday(sale_date)), 2) AS avg_days_to_payment 
FROM sale_payments;
```



In [446]:
# Reference SQL for query 25: average days from sale date to latest payment date.
query = """
WITH sale_payments AS (SELECT s._id AS sale_id, s.sale_date, MAX(p.payment_date) AS latest_payment_date 
FROM sales AS s 
JOIN payments_received AS p 
ON s._id = p.sale_id 
GROUP BY s._id, s.sale_date) 
SELECT ROUND(AVG(julianday(latest_payment_date) - julianday(sale_date)), 2) AS avg_days_to_payment
FROM sale_payments;
"""

# Run the reference query on the notebook's SQLite connection and display it.
sql_output = pd.read_sql_query(sql=query, con=connection)
sql_output

Unnamed: 0,avg_days_to_payment
0,2.05


In [None]:
%%pydough

sale_payments = PARTITION(Sales.payment, name="p", by=sale_id).CALCULATE(
    sale_id,
    latest_payment_date=MAX(p.payment_date)  
)

result = PARTITION(Sales, name="s", by=_id).CALCULATE(
    days_to_payment=DATEDIFF("days", s.sale_date, sale_payments.latest_payment_date)
)

pydough.to_df(result)



PyDoughQDAGException: Unrecognized term: 'sale_date'

### 26. SQLite Advanced Query 11

What is the GPM for all car sales in 2023?

GPM (gross profit margin) = (total revenue - total cost) / total cost * 100

```SQL
SELECT (SUM(sale_price) - SUM(cars.cost)) / SUM(cars.cost) * 100 AS gpm 
FROM sales JOIN cars 
ON sales.car_id = cars.id 
WHERE strftime('%Y', sale_date) = '2023';
```

In [353]:
# Reference SQL for query 26: gross profit margin (GPM) of all 2023 car sales.
query = """
SELECT (SUM(sale_price) - SUM(cars.cost)) / SUM(cars.cost) * 100 AS gpm 
FROM sales JOIN cars 
ON sales.car_id = cars._id 
WHERE strftime('%Y', sale_date) = '2023';
"""

# Run the reference query on the notebook's SQLite connection and display it.
sql_output = pd.read_sql_query(sql=query, con=connection)
sql_output

Unnamed: 0,gpm
0,2.703448


In [352]:
%%pydough

result = Dealership.CALCULATE(
    GPM=((SUM(Sales.WHERE(YEAR(sale_date) == 2023).sale_price) - SUM(Sales.WHERE(YEAR(sale_date) == 2023).car.cost)) 
        / SUM(Sales.WHERE(YEAR(sale_date) == 2023).car.cost)) * 100
)

pydough.to_df(result)


Unnamed: 0,GPM
0,2.703448


### 27. SQLite Advanced Query 12

What is the make, model and sale price of the car with the highest sale price that was sold on the same day it went out of inventory?

```SQL
SELECT c.make, c.model, s.sale_price FROM cars AS c 
JOIN sales AS s ON c.id = s.car_id 
JOIN inventory_snapshots AS i ON c.id = i.car_id 
AND DATE(s.sale_date) = DATE(i.snapshot_date) 
WHERE i.is_in_inventory = 0 
ORDER BY s.sale_price DESC LIMIT 1;
```



In [164]:
%%pydough

# FIXME: this cell does not run. It is an unfinished attempt at "car with the
# highest sale price that was sold on the day it left inventory".

# Step 1: Filter for sales where sale_date matches the snapshot_date
# Fails with "Unrecognized term ... 'inventory_snapshots'": that sub-collection
# belongs to Cars, not Sales. The filter also never compares sale_date with
# snapshot_date, so it does not implement the step described above.
selected_sales = Sales.WHERE(
    HAS(inventory_snapshots)  
)

# Step 2: Filter inventory snapshots for cars that were removed from inventory
# NOTE(review): `.car(...)` uses call syntax on a collection; elsewhere in this
# notebook that raises "is not callable", so this presumably needs
# `.car.CALCULATE(...)` - confirm against the installed PyDough version.
result = InventorySnapshots.WHERE(is_in_inventory == False).car(
    make,
    model,
    best_sale_price=MAX(sale_records.WHERE(HAS(selected_sales)).sale_price)  # Get highest sale price
).TOP_K(1, by=best_sale_price.DESC())  # Ensure correct field name


# Note: this converts `selected_sales`, not `result`.
pydough.to_df(selected_sales)


PyDoughQDAGException: Unrecognized term of simple table collection 'Sales' in graph 'Dealership': 'inventory_snapshots'

### 28. SQLite Advanced Query 13

What is the total payments received per month? Also calculate the MoM change for each month.

MoM change = (current month value - prev month value). Return all months in your answer, including those where there were no payments.

```SQL
WITH monthly_totals AS (
    SELECT 
        strftime('%Y-%m-01 00:00:00', payment_date) AS dt,
        SUM(payment_amount) AS total_payments
    FROM payments_received
    GROUP BY dt
),
monthly_totals_with_zero AS (
    SELECT dt, total_payments FROM monthly_totals
    UNION ALL
    SELECT 
        strftime('%Y-%m-01 00:00:00', date(payment_date, 'start of month', '+' || (n || ' month'))) AS dt,
        0 AS total_payments
    FROM payments_received, 
    (
        SELECT 0 AS n UNION ALL SELECT 1 UNION ALL SELECT 2 UNION ALL 
        SELECT 3 UNION ALL SELECT 4 UNION ALL SELECT 5 UNION ALL 
        SELECT 6 UNION ALL SELECT 7 UNION ALL SELECT 8 UNION ALL 
        SELECT 9 UNION ALL SELECT 10 UNION ALL SELECT 11
    )
    WHERE strftime('%Y-%m-01 00:00:00', date(payment_date, 'start of month', '+' || (n || ' month'))) 
          <= strftime('%Y-%m-01 00:00:00', 'now')
    GROUP BY dt
)
SELECT 
    dt AS MONTH, 
    SUM(total_payments) AS total_payments,
    SUM(total_payments) - LAG(SUM(total_payments), 1) OVER (ORDER BY dt) AS mom_change
FROM monthly_totals_with_zero
GROUP BY dt
ORDER BY dt;
```
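PyDough's date manipulation is limited: it cannot generate the months that have no payments, so those rows are missing from its result.

PyDough doesn't have LAG(), so the MoM change is not computed.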


In [169]:
# Reference SQL for query 28: total payments per month (zero-filled) and the
# month-over-month change.
query = """
WITH monthly_totals AS (
    SELECT 
        strftime('%Y-%m-01 00:00:00', payment_date) AS dt,
        SUM(payment_amount) AS total_payments
    FROM payments_received
    GROUP BY dt
),
monthly_totals_with_zero AS (
    SELECT dt, total_payments FROM monthly_totals
    UNION ALL
    SELECT 
        strftime('%Y-%m-01 00:00:00', date(payment_date, 'start of month', '+' || (n || ' month'))) AS dt,
        0 AS total_payments
    FROM payments_received, 
    (
        SELECT 0 AS n UNION ALL SELECT 1 UNION ALL SELECT 2 UNION ALL 
        SELECT 3 UNION ALL SELECT 4 UNION ALL SELECT 5 UNION ALL 
        SELECT 6 UNION ALL SELECT 7 UNION ALL SELECT 8 UNION ALL 
        SELECT 9 UNION ALL SELECT 10 UNION ALL SELECT 11
    )
    WHERE strftime('%Y-%m-01 00:00:00', date(payment_date, 'start of month', '+' || (n || ' month'))) 
          <= strftime('%Y-%m-01 00:00:00', 'now')
    GROUP BY dt
)
SELECT 
    dt AS MONTH, 
    SUM(total_payments) AS total_payments,
    SUM(total_payments) - LAG(SUM(total_payments), 1) OVER (ORDER BY dt) AS mom_change
FROM monthly_totals_with_zero
GROUP BY dt
ORDER BY dt;
"""

# Run the reference query on the notebook's SQLite connection and display it.
sql_output = pd.read_sql_query(sql=query, con=connection)
sql_output

Unnamed: 0,MONTH,total_payments,mom_change
0,2023-03-01 00:00:00,155500.0,
1,2023-04-01 00:00:00,197500.0,42000.0
2,2023-05-01 00:00:00,0.0,-197500.0
3,2023-06-01 00:00:00,0.0,0.0
4,2023-07-01 00:00:00,0.0,0.0
5,2023-08-01 00:00:00,0.0,0.0
6,2023-09-01 00:00:00,0.0,0.0
7,2023-10-01 00:00:00,0.0,0.0
8,2023-11-01 00:00:00,0.0,0.0
9,2023-12-01 00:00:00,0.0,0.0


In [484]:
%%pydough

#PyDough doesn't support date manipulation.
#PyDough doesn't have LAG(), that step is not possible.

# Step 1: Filter payments received within the desired range
filtered_payments = PaymentsReceived.WHERE(
    (payment_date >= "2024-01-01") & (payment_date <= "2024-12-31")  
)(
    payment_amount,
    payment_month=YEAR(payment_date) * 100 + MONTH(payment_date),
)

# Step 2: Compute total payments per month
monthly_totals = PARTITION(filtered_payments, name="p", by=payment_month)(
    payment_month,
    total_payments=SUM(p.payment_amount)  # Total payments received per month
)

# Step 3: Ensure months with no payments appear (MoM workaround)
# PyDough has no way to generate missing months, we assume data already covers all months.
monthly_totals_complete = PARTITION(monthly_totals, name="m", by=payment_month)(
    payment_month,
    total_payments=IFF(SUM(m.total_payments) == None, 0, SUM(m.total_payments))
)

# Step 4: Compute MoM Change (Manual Workaround)
# PyDough does not have LAG(), so this step is not possible and omitted.
result = monthly_totals_complete.ORDER_BY(payment_month.ASC())(
    payment_month,
    total_payments
)

pydough.to_df(result)


PyDoughUnqualifiedException: PyDough nodes PaymentsReceived.WHERE(((payment_date >= '2024-01-01') & (payment_date <= '2024-12-31'))) is not callable. Did you mean to use a function?

### 29. SQLite Advanced Query 14

What is the TSC in the past 7 days, inclusive of today?

```SQL
SELECT COUNT(id) AS TSC FROM sales WHERE sale_date >= DATE('now', '-7 days');
```



In [None]:
%%pydough

#PyDough doesn't support date manipulation.
result = Dealership.CALCULATE(
    TSC=COUNT(Sales.WHERE(sale_date >= "2024-04-01")._id) 
)

pydough.to_df(result)


Unnamed: 0,TSC
0,0


### 30. SQLite Advanced Query 15

Who are the top 3 salespersons by ASP? Return their first name, last name and ASP.

```SQL
SELECT salespersons.first_name, salespersons.last_name, AVG(sales.sale_price) AS ASP 
FROM sales JOIN salespersons ON sales.salesperson_id = salespersons.id 
GROUP BY salespersons.first_name, salespersons.last_name 
ORDER BY ASP DESC LIMIT 3;
```



In [174]:
# Reference SQL for query 30: top 3 salespersons by average sale price (ASP).
query = """
SELECT salespersons.first_name, salespersons.last_name, AVG(sales.sale_price) AS ASP 
FROM sales JOIN salespersons ON sales.salesperson_id = salespersons._id 
GROUP BY salespersons.first_name, salespersons.last_name 
ORDER BY ASP DESC LIMIT 3;
"""

# Run the reference query on the notebook's SQLite connection and display it.
sql_output = pd.read_sql_query(sql=query, con=connection)
sql_output

Unnamed: 0,first_name,last_name,ASP
0,Jane,Smith,46333.333333
1,John,Doe,43000.0
2,Sarah,Taylor,38666.666667


In [None]:
%%pydough

result = PARTITION(Salespersons, name="sp", by=(_id, first_name, last_name)).CALCULATE(
    first_name,
    last_name,
    ASP=AVG(sp.sales_made.sale_price)  
).TOP_K(3, by=ASP.DESC())  

pydough.to_df(result)


Unnamed: 0,first_name,last_name,ASP
0,Jane,Smith,278000.0
1,John,Doe,215000.0
2,Sarah,Taylor,116000.0


### 31. SQLite Advanced Query 16

Who are the top 5 salespersons by total sales amount? Return their ID, first name, last name and total sales amount.

```SQL
WITH salesperson_sales AS (SELECT s.id, s.first_name, s.last_name, SUM(sa.sale_price) AS total_sales 
FROM salespersons AS s 
LEFT JOIN sales AS sa 
ON s.id = sa.salesperson_id 
GROUP BY s.id) 
SELECT id, first_name, last_name, total_sales 
FROM salesperson_sales
 ORDER BY total_sales DESC LIMIT 5;
```



In [190]:
# Reference SQL for query 31: top 5 salespersons by total sales amount.
query = """
WITH salesperson_sales AS (SELECT s._id, s.first_name, s.last_name, SUM(sa.sale_price) AS total_sales 
FROM salespersons AS s 
LEFT JOIN sales AS sa 
ON s._id = sa.salesperson_id 
GROUP BY s._id) 
SELECT _id, first_name, last_name, total_sales 
FROM salesperson_sales
 ORDER BY total_sales DESC LIMIT 5;
"""

# Run the reference query on the notebook's SQLite connection and display it.
sql_output = pd.read_sql_query(sql=query, con=connection)
sql_output

Unnamed: 0,_id,first_name,last_name,total_sales
0,2,Jane,Smith,278000.0
1,1,John,Doe,215000.0
2,6,Sarah,Taylor,116000.0
3,3,Michael,Johnson,96500.0
4,4,Emily,Brown,79500.0


In [485]:
%%pydough

result = PARTITION(Salespersons, name="sp", by=(_id, first_name, last_name)).CALCULATE(
    _id,
    first_name,
    last_name,
    total=SUM(sp.sales_made.sale_price)  
).TOP_K(5, by=total.DESC())

pydough.to_df(result)


Unnamed: 0,_id,first_name,last_name,total
0,2,Jane,Smith,278000.0
1,1,John,Doe,215000.0
2,6,Sarah,Taylor,116000.0
3,3,Michael,Johnson,96500.0
4,4,Emily,Brown,79500.0
