#### SQL-PYDOUGH CODE TESTING NOTEBOOK

Setup for the PyDough package is done in the next cell; run it to import the necessary packages.

In [1]:
import pydough

%load_ext pydough.jupyter_extensions
#%reload_ext pydough.jupyter_extensions

#Necessary for comparison
import pandas as pd
from pandas.testing import assert_frame_equal, assert_series_equal
import re
import dfcompare

import collections
import numpy as np
import sqlite3 as sql
import os


### Now we can set the SQLite database and connect it to PyDough. Please change the next strings to match: 
1. .sql filename to initialize the database
2. Metadata path for the graphs
3. Graph name of the graph you want to use

In [2]:
#YOUR .SQL FILE TO CREATE THE DATABASE, COPY IT TO THIS FOLDER.
SQL_filename = 'broker_sqlite.sql'

#METADATA FOR THE GRAPH .JSON
metadata_path = "../../tests/test_metadata/defog_graphs.json"

#GRAPH NAME
graph_name = "Broker"

#DESIRED DATABASE NAME
DB_name = "DATABASE.db"

# Read the whole SQL script that creates and populates the database.
with open(SQL_filename, 'r') as sql_file:
    sql_script = sql_file.read()

# Rebuild the database file from scratch on every run. On a first run the file
# does not exist yet, so only delete it when it is actually there.
if os.path.exists(DB_name):
    os.remove(DB_name)

# This connection stays open: the SQL cells below query through it.
connection = sql.connect(DB_name)
cursor = connection.cursor()
cursor.executescript(sql_script)

# Point the active PyDough session at the same metadata graph and database file.
pydough.active_session.load_metadata_graph(metadata_path, graph_name)
pydough.active_session.connect_database("sqlite", database=DB_name)

DatabaseContext(connection=<pydough.database_connectors.database_connector.DatabaseConnection object at 0x7f70f4d83110>, dialect=<DatabaseDialect.SQLITE: 'sqlite'>)

### Graph Structure
In case you want to have the structure of the graph to understand the relations and names, you can run this next cell and select "View as a scrollable element" at the bottom of the result to be able to see the full structure in case the result does not display the complete list

In [19]:
# Metadata graph currently loaded in the active PyDough session.
graph = pydough.active_session.metadata
# Print the textual description of the graph (collections, properties, relationships).
print(pydough.explain_structure(graph))

Structure of PyDough graph: Broker

  Customers
  ├── _id
  ├── address1
  ├── address2
  ├── city
  ├── country
  ├── email
  ├── join_date
  ├── name
  ├── phone
  ├── postal_code
  ├── state
  ├── status
  └── transactions_made [multiple Transactions] (reverse of Transactions.customer)

  DailyPrices
  ├── close
  ├── date
  ├── epoch_ms
  ├── high
  ├── low
  ├── open
  ├── source
  ├── ticker_id
  ├── volume
  └── ticker [one member of Tickers] (reverse of Tickers.historical_prices)

  Tickers
  ├── _id
  ├── currency
  ├── db2x
  ├── exchange
  ├── is_active
  ├── name
  ├── symbol
  ├── ticker_type
  ├── historical_prices [multiple DailyPrices] (reverse of DailyPrices.ticker)
  └── transactions_of [multiple Transactions] (reverse of Transactions.ticker)

  Transactions
  ├── amount
  ├── commission
  ├── currency
  ├── customer_id
  ├── date_time
  ├── kpx
  ├── price
  ├── settlement_date_str
  ├── shares
  ├── status
  ├── tax
  ├── ticker_id
  ├── transaction_id
  ├── transac

### SQL Test template
You can use this template to run your SQL code and visually compare the results to those of the PyDough code.
Just paste your SQL code inside the ''' '''. You can also copy this template and paste it wherever you need to.
Remember to use the column and table names from the original .sql file

In [99]:
# Template SQL cell: selects every column of every row of sbCustomer.
query = '''
 SELECT
    *
 FROM
    sbCustomer
'''
# Run the query through the open sqlite connection and keep the result as a
# DataFrame named sql_output (the name the comparison cell reads).
sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,sbCustId,sbCustName,sbCustEmail,sbCustPhone,sbCustAddress1,sbCustAddress2,sbCustCity,sbCustState,sbCustCountry,sbCustPostalCode,sbCustJoinDate,sbCustStatus
0,C001,john doe,john.doe@email.com,555-123-4567,123 Main St,,Anytown,CA,USA,90001,2020-01-01,active
1,C002,Jane Smith,jane.smith@email.com,555-987-6543,456 Oak Rd,,Someville,NY,USA,10002,2019-03-15,active
2,C003,Bob Johnson,bob.johnson@email.com,555-246-8135,789 Pine Ave,,Mytown,TX,USA,75000,2022-06-01,inactive
3,C004,Samantha Lee,samantha.lee@email.com,555-135-7902,246 Elm St,,Yourtown,CA,USA,92101,2018-09-22,suspended
4,C005,Michael Chen,michael.chen@email.com,555-864-2319,159 Cedar Ln,,Anothertown,FL,USA,33101,2021-02-28,active
5,C006,Emily Davis,emily.davis@email.com,555-753-1904,753 Maple Dr,,Mytown,TX,USA,75000,2020-07-15,active
6,C007,David Kim,david.kim@email.com,555-370-2648,864 Oak St,,Anothertown,FL,USA,33101,2022-11-05,active
7,C008,Sarah Nguyen,sarah.nguyen@email.com,555-623-7419,951 Pine Rd,,Yourtown,CA,USA,92101,2019-04-01,closed
8,C009,William Garcia,william.garcia@email.com,555-148-5326,258 Elm Ave,,Anytown,CA,USA,90001,2021-08-22,active
9,C010,Jessica Hernandez,jessica.hernandez@email.com,555-963-8520,147 Cedar Blvd,,Someville,NY,USA,10002,2020-03-10,inactive


### Pydough template
The important part about this template is to run the PyDough code and store it in a variable called pydough_output for future comparison.

In [111]:
%%pydough

#Setting up the tables that we will need information from in the context
tables = Customers

#Placeholder for the filtering step; this template applies no condition and
#simply uses the whole Customers collection
filter = Customers

#The information we want to receive in the resulting table
output = filter

#Execute the PyDough code and keep the DataFrame as pydough_output
#(the name the comparison cell reads)
pydough_output = pydough.to_df(output)
pydough_output

Unnamed: 0,_id,name,email,phone,address1,address2,city,state,country,postal_code,join_date,status
0,C001,john doe,john.doe@email.com,555-123-4567,123 Main St,,Anytown,CA,USA,90001,2020-01-01,active
1,C002,Jane Smith,jane.smith@email.com,555-987-6543,456 Oak Rd,,Someville,NY,USA,10002,2019-03-15,active
2,C003,Bob Johnson,bob.johnson@email.com,555-246-8135,789 Pine Ave,,Mytown,TX,USA,75000,2022-06-01,inactive
3,C004,Samantha Lee,samantha.lee@email.com,555-135-7902,246 Elm St,,Yourtown,CA,USA,92101,2018-09-22,suspended
4,C005,Michael Chen,michael.chen@email.com,555-864-2319,159 Cedar Ln,,Anothertown,FL,USA,33101,2021-02-28,active
5,C006,Emily Davis,emily.davis@email.com,555-753-1904,753 Maple Dr,,Mytown,TX,USA,75000,2020-07-15,active
6,C007,David Kim,david.kim@email.com,555-370-2648,864 Oak St,,Anothertown,FL,USA,33101,2022-11-05,active
7,C008,Sarah Nguyen,sarah.nguyen@email.com,555-623-7419,951 Pine Rd,,Yourtown,CA,USA,92101,2019-04-01,closed
8,C009,William Garcia,william.garcia@email.com,555-148-5326,258 Elm Ave,,Anytown,CA,USA,90001,2021-08-22,active
9,C010,Jessica Hernandez,jessica.hernandez@email.com,555-963-8520,147 Cedar Blvd,,Someville,NY,USA,10002,2020-03-10,inactive


### Comparison template 
Run this to compare the two data frames you have obtained as a result of the queries

In [55]:
# Compare the PyDough result with the SQL result. query_category and question
# are placeholder values ("a") in this template; replace them for a real question.
dfcompare.compare_df(pydough_output, sql_output, query_category="a", question="a")

True

SELECT t.sbTxType, COUNT(DISTINCT t.sbTxCustId) AS num_customers, AVG(t.sbTxShares) AS avg_shares FROM sbTransaction AS t WHERE t.sbTxDateTime BETWEEN '2023-01-01' AND '2023-03-31 23:59:59' GROUP BY t.sbTxType ORDER BY CASE WHEN num_customers IS NULL THEN 1 ELSE 0 END DESC, num_customers DESC LIMIT 3;

SELECT c.sbCustId, c.sbCustName FROM sbCustomer AS c LEFT JOIN sbTransaction AS t ON c.sbCustId = t.sbTxCustId WHERE t.sbTxCustId IS NULL;

SELECT DISTINCT c.sbCustId FROM sbCustomer AS c JOIN sbTransaction AS t ON c.sbCustId = t.sbTxCustId WHERE t.sbTxType = 'buy';

SELECT DISTINCT tk.sbTickerId FROM sbTicker AS tk JOIN sbDailyPrice AS dp ON tk.sbTickerId = dp.sbDpTickerId WHERE dp.sbDpDate >= '2023-04-01';

SELECT tk.sbTickerId, tk.sbTickerSymbol FROM sbTicker AS tk LEFT JOIN sbDailyPrice AS dp ON tk.sbTickerId = dp.sbDpTickerId WHERE dp.sbDpTickerId IS NULL;

SELECT tk.sbTickerSymbol, COUNT(tx.sbTxId) AS num_transactions, SUM(tx.sbTxAmount) AS total_amount FROM sbTicker AS tk JOIN sbTransaction AS tx ON tk.sbTickerId = tx.sbTxTickerId GROUP BY tk.sbTickerSymbol ORDER BY CASE WHEN total_amount IS NULL THEN 1 ELSE 0 END DESC, total_amount DESC LIMIT 10;

SELECT sbTxStatus, COUNT(*) AS num_transactions FROM sbTransaction GROUP BY sbTxStatus ORDER BY CASE WHEN num_transactions IS NULL THEN 1 ELSE 0 END DESC, num_transactions DESC LIMIT 3;

SELECT c.sbCustState, t.sbTickerType, COUNT(*) AS num_transactions FROM sbTransaction AS tx JOIN sbCustomer AS c ON tx.sbTxCustId = c.sbCustId JOIN sbTicker AS t ON tx.sbTxTickerId = t.sbTickerId GROUP BY c.sbCustState, t.sbTickerType ORDER BY CASE WHEN num_transactions IS NULL THEN 1 ELSE 0 END DESC, num_transactions DESC LIMIT 5;

SELECT sbCustCountry, COUNT(*) AS num_customers FROM sbCustomer GROUP BY sbCustCountry ORDER BY CASE WHEN num_customers IS NULL THEN 1 ELSE 0 END DESC, num_customers DESC LIMIT 5;

SELECT c.sbCustCountry, COUNT(t.sbTxId) AS num_transactions, SUM(t.sbTxAmount) AS total_amount FROM sbCustomer AS c JOIN sbTransaction AS t ON c.sbCustId = t.sbTxCustId WHERE t.sbTxDateTime >= DATE('now', '-30 days') GROUP BY c.sbCustCountry ORDER BY total_amount DESC LIMIT 5;

# 1.

In [48]:
# Reference SQL for question 1: per transaction type, the number of distinct
# customers and the average shares for transactions dated in Q1 2023, keeping
# the 3 types with the most distinct customers. The CASE term only decides
# where NULL counts would sort.
query = '''
SELECT
    t.sbTxType,
    COUNT(DISTINCT t.sbTxCustId) AS num_customers,
    AVG(t.sbTxShares) AS avg_shares
FROM
    sbTransaction AS t
WHERE
    t.sbTxDateTime BETWEEN '2023-01-01' AND '2023-03-31 23:59:59'
GROUP BY
    t.sbTxType
ORDER BY
    CASE
        WHEN num_customers IS NULL THEN 1
        ELSE 0
    END DESC,
    num_customers DESC
LIMIT 3;
'''
sql_output = pd.read_sql_query(query, connection)
sql_output


Unnamed: 0,sbTxType,num_customers,avg_shares
0,buy,3,41.75
1,sell,3,43.333333


In [54]:
%%pydough

tables = Transactions.WHERE((date_time >= '2023-01-01') & (date_time <= '2023-03-31 23:59:59'))

grouped = PARTITION(tables, name='t', by=transaction_type)(
                        transaction_type, num_customers = NDISTINCT(t.customer_id),
                        avg_shares = AVG(t.shares))

pydough_output = pydough.to_df(grouped)
pydough_output

Unnamed: 0,transaction_type,num_customers,avg_shares
0,buy,3,41.75
1,sell,3,43.333333


# 2.

In [146]:
# Reference SQL for question 2: id and name of customers with no transactions
# (LEFT JOIN to sbTransaction, keeping only the rows without a match).
query = '''
SELECT
    c.sbCustId,
    c.sbCustName
FROM
    sbCustomer AS c
LEFT JOIN
    sbTransaction AS t ON c.sbCustId = t.sbTxCustId
WHERE
    t.sbTxCustId IS NULL;
'''
sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,sbCustId,sbCustName
0,C011,Alex Rodriguez
1,C020,Maurice Lee


In [150]:
%%pydough

# Customers whose transactions_made relationship is empty.
cust_no_trans = Customers.WHERE(HASNOT(transactions_made))

# Keep only the customer id and name.
output = cust_no_trans(_id, name)

# Execute and keep the DataFrame for the comparison cell.
pydough_output = pydough.to_df(output)
pydough_output

Unnamed: 0,_id,name
0,C011,Alex Rodriguez
1,C020,Maurice Lee


In [151]:
# Compare the PyDough and SQL results for question 2 with dfcompare.compare_df.
dfcompare.compare_df(pydough_output, sql_output, query_category="basic_left_join", question="Return the customer ID and name of customers who have not made any transactions.")

True

# 3.

In [155]:
# Reference SQL for question 3: distinct ids of customers that have at least
# one transaction of type 'buy'.
query = '''
SELECT DISTINCT
    c.sbCustId
FROM
    sbCustomer AS c
JOIN
    sbTransaction AS t ON c.sbCustId = t.sbTxCustId
WHERE
    t.sbTxType = 'buy';
'''
sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,sbCustId
0,C001
1,C003
2,C005
3,C009
4,C002
5,C004
6,C006
7,C008
8,C010
9,C007


In [154]:
%%pydough

tables = Customers.transactions_made

filter = tables.WHERE(transaction_type == 'buy')

output = filter(Cust_Id = BACK(1)._id)

pydough_output = pydough.to_df(output)
pydough_output

Unnamed: 0,Cust_Id
0,C001
1,C003
2,C005
3,C003
4,C009
5,C002
6,C004
7,C006
8,C008
9,C010


In [157]:
# Compare the PyDough and SQL results for question 3 with dfcompare.compare_df.
dfcompare.compare_df(pydough_output, sql_output, query_category="basic_join_distinct", question="Return the distinct list of customer IDs who have made a 'buy' transaction.")

np.True_

# 4.

In [158]:
# Reference SQL for question 4: distinct ids of tickers that have at least one
# daily price record dated on or after 2023-04-01.
query = '''
SELECT DISTINCT
    tk.sbTickerId
FROM
    sbTicker AS tk
JOIN
    sbDailyPrice AS dp ON tk.sbTickerId = dp.sbDpTickerId
WHERE
    dp.sbDpDate >= '2023-04-01';
'''
sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,sbTickerId
0,T001
1,T002
2,T003
3,T004
4,T005
5,T006
6,T007
7,T008
8,T009
9,T010


# 5.

In [159]:
# Reference SQL for question 5: id and symbol of tickers without any daily
# price record (LEFT JOIN, keeping only the rows without a match).
query = '''
SELECT
    tk.sbTickerId,
    tk.sbTickerSymbol
FROM
    sbTicker AS tk
LEFT JOIN
    sbDailyPrice AS dp ON tk.sbTickerId = dp.sbDpTickerId
WHERE
    dp.sbDpTickerId IS NULL;
'''
sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,sbTickerId,sbTickerSymbol
0,T011,SPY
1,T012,QQQ
2,T013,VTI
3,T014,VXUS
4,T015,VFINX
5,T016,VTSAX
6,T017,VIGAX
7,T018,GOOG


In [162]:
%%pydough

tables = Tickers

# Tickers whose historical_prices relationship is empty.
filter = tables.WHERE(HASNOT(historical_prices))

# Keep only the ticker id and symbol.
output = filter(_id, symbol)

pydough_output = pydough.to_df(output)
pydough_output

Unnamed: 0,_id,symbol
0,T011,SPY
1,T012,QQQ
2,T013,VTI
3,T014,VXUS
4,T015,VFINX
5,T016,VTSAX
6,T017,VIGAX
7,T018,GOOG


In [165]:
# Compare the PyDough and SQL results for question 5 with dfcompare.compare_df.
dfcompare.compare_df(pydough_output, sql_output, query_category="basic_left_join", question="Return the ticker ID and symbol of tickers that do not have any daily price records.")

True

# 6.

In [211]:
# Reference SQL for question 6: per ticker symbol, the number of transactions
# and the summed transaction amount; top 10 symbols by total amount. The CASE
# term only decides where NULL totals would sort.
query = '''
SELECT
    tk.sbTickerSymbol,
    COUNT(tx.sbTxId) AS num_transactions,
    SUM(tx.sbTxAmount) AS total_amount
FROM
    sbTicker AS tk
JOIN
    sbTransaction AS tx ON tk.sbTickerId = tx.sbTxTickerId
GROUP BY
    tk.sbTickerSymbol
ORDER BY
    CASE
        WHEN total_amount IS NULL THEN 1
        ELSE 0
    END DESC,
    total_amount DESC
LIMIT 10;
'''
sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,sbTickerSymbol,num_transactions,total_amount
0,BRK.B,5,1246600.0
1,AMZN,5,138126.0
2,AAPL,9,132293.75
3,MSFT,6,76057.5
4,JPM,4,69791.25
5,FB,6,68780.0
6,GOOGL,5,55190.0
7,TSLA,6,52792.5
8,PG,3,29680.0
9,V,3,25427.5


In [210]:
%%pydough

# Every transaction reached through a ticker, carrying its amount, its id and
# the symbol of the ticker it belongs to (BACK(1) refers to the parent Tickers row).
tables = Tickers.transactions_of(single_amount = amount, transactions  = transaction_id, symbols = BACK(1).symbol)

# Group by symbol: count the transactions and sum their amounts.
filter = PARTITION(tables, name="t", by=(symbols))(symbols, num_transactions = COUNT(t.transactions), total_amount = SUM(t.single_amount))

# Ten symbols with the largest total amount.
output = filter.TOP_K(10, by = total_amount.DESC())

pydough_output = pydough.to_df(output)
pydough_output

Unnamed: 0,symbols,num_transactions,total_amount
0,BRK.B,5,1246600.0
1,AMZN,5,138126.0
2,AAPL,9,132293.75
3,MSFT,6,76057.5
4,JPM,4,69791.25
5,FB,6,68780.0
6,GOOGL,5,55190.0
7,TSLA,6,52792.5
8,PG,3,29680.0
9,V,3,25427.5


In [212]:
# Compare the PyDough and SQL results for question 6. The category and question
# describe this query (top 10 symbols by total amount); they were previously
# copied from question 5.
# NOTE(review): wording reconstructed from the SQL above - confirm against the
# original question list.
dfcompare.compare_df(pydough_output, sql_output, query_category="basic_group_order_limit", question="What are the top 10 ticker symbols by total transaction amount? Return the ticker symbol, number of transactions and total transaction amount.")

True

# 7.

In [11]:
# Reference SQL for question 7: number of transactions per status, keeping the
# 3 most frequent statuses.
query = '''
SELECT
    sbTxStatus,
    COUNT(*) AS num_transactions
FROM
    sbTransaction
GROUP BY
    sbTxStatus
ORDER BY
    CASE
        WHEN num_transactions IS NULL THEN 1
        ELSE 0
    END DESC,
    num_transactions DESC
LIMIT 3;
'''
sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,sbTxStatus,num_transactions
0,success,50
1,fail,4
2,pending,2


In [10]:
%%pydough

tables = Transactions

filter = PARTITION(tables, name="t", by=(status))(status, num_transactions = COUNT(t))

output = filter.TOP_K(10, by = num_transactions.DESC())

pydough_output = pydough.to_df(output)
pydough_output

Unnamed: 0,status,num_transactions
0,success,50
1,fail,4
2,pending,2


# 8.

In [17]:
# Reference SQL for question 8: number of transactions per (customer state,
# ticker type) combination, keeping the 5 combinations with the most transactions.
query = '''
SELECT
    c.sbCustState,
    t.sbTickerType,
    COUNT(*) AS num_transactions
FROM
    sbTransaction AS tx
JOIN
    sbCustomer AS c ON tx.sbTxCustId = c.sbCustId
JOIN
    sbTicker AS t ON tx.sbTxTickerId = t.sbTickerId
GROUP BY
    c.sbCustState,
    t.sbTickerType
ORDER BY
    CASE
        WHEN num_transactions IS NULL THEN 1
        ELSE 0
    END DESC,
    num_transactions DESC
LIMIT 5;
'''
sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,sbCustState,sbTickerType,num_transactions
0,CA,stock,18
1,NY,stock,12
2,TX,stock,10
3,FL,stock,8
4,NJ,stock,4


In [18]:
%%pydough

# Each transaction tagged with its ticker's type and its customer's state.
tables = Transactions(t_type = ticker.ticker_type, cust_state = customer.state)

# Count transactions per (ticker type, customer state) pair.
filter = PARTITION(tables, name="t", by=(t_type, cust_state))(cust_state, t_type, num_transactions = COUNT(t))

# Five pairs with the most transactions.
output = filter.TOP_K(5, by = num_transactions.DESC())

pydough_output = pydough.to_df(output)
pydough_output

Unnamed: 0,cust_state,t_type,num_transactions
0,CA,stock,18
1,NY,stock,12
2,TX,stock,10
3,FL,stock,8
4,NJ,stock,4


# 9.

In [19]:
# Reference SQL for question 9: number of customers per country, keeping the
# 5 countries with the most customers.
query = '''
SELECT
    sbCustCountry,
    COUNT(*) AS num_customers
FROM
    sbCustomer
GROUP BY
    sbCustCountry
ORDER BY
    CASE
        WHEN num_customers IS NULL THEN 1
        ELSE 0
    END DESC,
    num_customers DESC
LIMIT 5;
'''
sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,sbCustCountry,num_customers
0,USA,20


In [None]:
%%pydough

tables = Customers

# One row per country with the number of customers in it.
filter = PARTITION(tables, name="t", by=(country))(country, num_customers = COUNT(t))

# Five countries with the most customers.
output = filter.TOP_K(5, by = num_customers.DESC())

pydough_output = pydough.to_df(output)
pydough_output

Unnamed: 0,country,num_customers
0,USA,20


# 10. 
    Time is involved, will test again when time functions get added

In [3]:
# Reference SQL for question 10: per customer country, the number and total
# amount of transactions from the last 30 days; top 5 countries by total amount.
# NOTE: the window is relative to 'now', so the result depends on the run date.
query = '''
SELECT
    c.sbCustCountry,
    COUNT(t.sbTxId) AS num_transactions,
    SUM(t.sbTxAmount) AS total_amount
FROM
    sbCustomer AS c
JOIN
    sbTransaction AS t ON c.sbCustId = t.sbTxCustId
WHERE
    t.sbTxDateTime >= DATE('now', '-30 days')
GROUP BY
    c.sbCustCountry
ORDER BY
    total_amount DESC
LIMIT 5;
'''
sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,sbCustCountry,num_transactions,total_amount
0,USA,15,293600.0


In [None]:
%%pydough

tables = Customers.WHERE(transactions.date_time (TimeOperation))(trans_id = transactions.transaction_id, total_amount = transactions.amount)

filter = PARTITION(tables, name="t", by=(country))(country, num_transactions = COUNT(t.trans_id), total_amount = SUM(ticker_amount))

output = filter.TOP_K(5, by = num_customers.DESC())

pydough_output = pydough.to_df(output)
pydough_output

# 11. Advanced 1

For customers with at least 5 total transactions, what is their transaction success rate? Return the customer name and success rate, ordered from lowest to highest success rate.

In [12]:
# Reference SQL for question 11: success rate (percentage of transactions with
# status 'success') for customers with at least 5 transactions, ordered from
# lowest to highest rate. The CTE computes the total and successful counts per customer.
query = '''
WITH cust_tx_stats AS (
    SELECT
        c.sbCustId,
        c.sbCustName,
        COUNT(t.sbTxId) AS total_tx,
        SUM(CASE WHEN t.sbTxStatus = 'success' THEN 1 ELSE 0 END) AS success_tx
    FROM
        sbCustomer AS c
    JOIN
        sbTransaction AS t ON c.sbCustId = t.sbTxCustId
    GROUP BY
        c.sbCustId,
        c.sbCustName
)
SELECT
    sbCustName,
    CAST(success_tx AS FLOAT) / total_tx * 100 AS success_rate
FROM
    cust_tx_stats
WHERE
    total_tx >= 5
ORDER BY
    CASE
        WHEN success_rate IS NULL THEN 1
        ELSE 0
    END,
    success_rate;
'''
sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,sbCustName,success_rate
0,john doe,60.0
1,Jane Smith,87.5
2,Bob Johnson,100.0


In [28]:
%%pydough

tables = Transactions(cust_id = customer._id, cust_name = customer.name)

tx_success = PARTITION(tables, name="t", by=(cust_id, cust_name))(cust_name, total_tx = COUNT(t.transaction_id), success_tx = COUNT(t.WHERE(status == "success")))

tx_rate = tx_success(success_rate = success_tx / total_tx * 100).WHERE(total_tx >= 5)

output = tx_rate(cust_name, success_rate)

pydough_output = pydough.to_df(output)
pydough_output

Unnamed: 0,cust_name,success_rate
0,Bob Johnson,100.0
1,Jane Smith,87.5
2,john doe,60.0


In [29]:
# Compare the PyDough and SQL results for question 11 with dfcompare.compare_df.
dfcompare.compare_df(pydough_output, sql_output, query_category="instructions_cte_join", question="For customers with at least 5 total transactions, what is their transaction success rate? Return the customer name and success rate, ordered from lowest to highest success rate.")

np.True_

# 12.

In [71]:
# Reference SQL for question 12: number of distinct customers whose email ends
# in '.com' and who have a transaction on one of five ticker symbols
# (AMZN, AAPL, GOOGL, META, NFLX). The email condition applies to all five symbols.
query = '''
SELECT
    COUNT(DISTINCT t.sbTxCustId)
FROM
    sbTransaction AS t
JOIN
    sbCustomer AS c ON t.sbTxCustId = c.sbCustId
JOIN
    sbTicker AS tk ON t.sbTxTickerId = tk.sbTickerId
WHERE
    c.sbCustEmail LIKE '%.com'
    AND (
        tk.sbTickerSymbol LIKE 'AMZN'
        OR tk.sbTickerSymbol LIKE 'AAPL'
        OR tk.sbTickerSymbol LIKE 'GOOGL'
        OR tk.sbTickerSymbol LIKE 'META'
        OR tk.sbTickerSymbol LIKE 'NFLX'
    );
'''
sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,COUNT(DISTINCT t.sbTxCustId)
0,9


In [75]:
%%pydough

tables = Transactions(cust_email = customer.email, ticker_symbol = ticker.symbol, cust_id = customer._id)

filter = tables.WHERE(CONTAINS(cust_email, '%.com%') & LIKE(ticker_symbol, 'AAPL') 
                      | LIKE(ticker_symbol, 'GOOGL')| LIKE(ticker_symbol, 'META')
                      | LIKE(ticker_symbol, 'NFLX') | LIKE(ticker_symbol, 'AMZN'))

output = Broker(distinct_ids = NDISTINCT(filter.customer._id))

pydough_output = pydough.to_df(output)
pydough_output

Unnamed: 0,distinct_ids
0,9


# 13.

In [80]:
# Reference SQL for question 13: per customer, the number of transactions, the
# total amount, and the customer's rank by total amount (highest first).
# RANK() gives tied totals the same rank.
query = '''
WITH cust_tx_counts AS (
    SELECT
        sbTxCustId,
        COUNT(*) AS num_tx,
        SUM(sbTxAmount) AS total_amount
    FROM
        sbTransaction
    GROUP BY
        sbTxCustId
)
SELECT
    c.sbCustName,
    ct.num_tx,
    ct.total_amount,
    RANK() OVER (ORDER BY CASE WHEN ct.total_amount IS NULL THEN 1 ELSE 0 END DESC, ct.total_amount DESC) AS cust_rank
FROM
    cust_tx_counts AS ct
JOIN
    sbCustomer AS c ON ct.sbTxCustId = c.sbCustId;
'''
sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,sbCustName,num_tx,total_amount,cust_rank
0,David Kim,3,828200.0,1
1,Bob Johnson,8,529776.0,2
2,Jane Smith,8,87557.5,3
3,john doe,5,68993.75,4
4,Sarah Nguyen,3,56791.25,5
5,Michael Chen,4,52090.0,6
6,Samantha Lee,4,48492.5,7
7,Jacob Taylor,1,48000.0,8
8,William Garcia,4,43827.5,9
9,Emily Davis,3,43480.0,10


In [104]:
%%pydough

tables = Transactions(cust_name = customer.name)

id_grouped_tx = PARTITION(tables, name="t", by=(customer_id, cust_name))(num_tx = COUNT(t.customer_id), total_amount = SUM(t.amount))

output = id_grouped_tx(cust_name, total_amount, cust_rank = RANKING(by=total_amount.DESC(), levels=1))

pydough_output = pydough.to_df(output)
pydough_output



Unnamed: 0,cust_name,total_amount,cust_rank
0,David Kim,828200.0,1
1,Bob Johnson,529776.0,2
2,Jane Smith,87557.5,3
3,john doe,68993.75,4
4,Sarah Nguyen,56791.25,5
5,Michael Chen,52090.0,6
6,Samantha Lee,48492.5,7
7,Jacob Taylor,48000.0,8
8,William Garcia,43827.5,9
9,Emily Davis,43480.0,10


# 14. 

In [7]:
# Reference SQL for question 14: number of customers whose lower-cased name
# starts with 'j' or ends with 'ez', and whose lower-cased state ends with 'a'.
query = '''
SELECT
    COUNT(sbCustId)
FROM
    sbCustomer
WHERE
    (LOWER(sbCustName) LIKE 'j%' OR LOWER(sbCustName) LIKE '%ez')
    AND LOWER(sbCustState) LIKE '%a';
'''
sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,COUNT(sbCustId)
0,2


In [None]:
%%pydough

tables = Customers

filter = tables.WHERE((STARTSWITH(LOWER(name), 'j%') | ENDSWITH(LOWER(name), '%ez')) & LIKE(state, '%a'))

output = Broker(COUNT(filter))

pydough_output = pydough.to_df(output)
pydough_output

Unnamed: 0,_expr0
0,2


# 15.

In [4]:
# Reference SQL for question 15: per ticker symbol, SPM =
# (total amount - total (tax + commission)) / total amount * 100 over 'sell'
# transactions from the last month; NULL when the total amount is 0.
# NOTE: the window is relative to 'now', so the result depends on the run date.
query = '''
SELECT
    sbTickerSymbol,
    CASE
        WHEN SUM(sbTxAmount) = 0 THEN NULL
        ELSE (SUM(sbTxAmount) - SUM(sbTxTax + sbTxCommission)) / SUM(sbTxAmount) * 100
    END AS SPM
FROM
    sbTransaction
JOIN
    sbTicker ON sbTransaction.sbTxTickerId = sbTicker.sbTickerId
WHERE
    sbTxType = 'sell'
    AND sbTxDateTime >= DATE('now', '-1 month')
GROUP BY
    sbTickerSymbol;
'''
sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,sbTickerSymbol,SPM
0,FB,99.4
1,JPM,99.428571
2,MSFT,99.428571
3,QQQ,99.34375
4,TSLA,99.458333
5,VXUS,99.409091


In [17]:
%%pydough

tables = Transactions

this_month = DATEDIFF("months", date_time, 'now')
filter = tables.WHERE(LIKE(transaction_type, 'sell') & (DATEDIFF("months", date_time, 'now') < 1))(ticker_symbol = ticker.symbol)
output = PARTITION(filter, name = 't', by= ticker_symbol)(ticker_symbol, SPM = (SUM(t.amount) - SUM(t.tax + t.commission))/SUM(t.amount) * 100)

pydough_output = pydough.to_df(output)
pydough_output

Unnamed: 0,ticker_symbol,SPM,avgam
0,FB,99.4,12500.0
1,JPM,99.428571,35000.0
2,MSFT,99.428571,14000.0
3,QQQ,99.34375,9600.0
4,TSLA,99.458333,28800.0
5,VXUS,99.409091,13200.0


# 16.

In [20]:
# Reference SQL for question 16: per country, the number of customers (TAC)
# whose join date is on or after 2023-01-01.
query = '''
SELECT
    sbCustCountry,
    COUNT(sbCustId) AS TAC
FROM
    sbCustomer
WHERE
    sbCustJoinDate >= '2023-01-01'
GROUP BY
    sbCustCountry;
'''
sql_output = pd.read_sql_query(query, connection)
sql_output


Unnamed: 0,sbCustCountry,TAC
0,USA,10


In [25]:
%%pydough

tables = Customers

# Customers that joined on or after 2023-01-01.
filter = tables.WHERE(join_date >= '2023-01-01')

# Group by country and count the customers in each group.
group = PARTITION(filter, name='f',by= country)
output = group(cust_country = country, TAC = COUNT(f._id))

pydough_output = pydough.to_df(output)
pydough_output

Unnamed: 0,cust_country,TAC
0,USA,10


# 17.

In [59]:
# Reference SQL for question 17: number of distinct transactions and their
# total amount for customers whose country is 'usa' (case-insensitive), within
# a 7-day window that ends at the date computed from strftime('%w', 'now')
# (the window is aligned to the weekday, not a rolling offset from today).
# NOTE: the window is relative to 'now', so the result depends on the run date.
query = '''
SELECT
    COUNT(DISTINCT sb.sbTxId) AS num_transactions,
    SUM(sb.sbTxAmount) AS total_transaction_amount
FROM
    sbTransaction AS sb
JOIN
    sbCustomer AS sc ON sb.sbTxCustId = sc.sbCustId
WHERE
    LOWER(sc.sbCustCountry) = 'usa'
    AND sb.sbTxDateTime >= DATE('now', '-' || ((strftime('%w', 'now') + 6) % 7) || ' days', '-7 days')
    AND sb.sbTxDateTime < DATE('now', '-' || ((strftime('%w', 'now') + 6) % 7) || ' days');
'''
sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,num_transactions,total_transaction_amount
0,9,201600.0


In [57]:
%%pydough

tables = Transactions

filter = tables.WHERE(DATEDIFF("days", date_time, 'now')>= 7).WHERE( DATEDIFF("days", date_time, 'now')<= 14)

output = Broker(num_transactions=COUNT(filter), total_transaction_amount=SUM(filter.amount))
pydough_output = pydough.to_df(output)
pydough_output

DATEDIFF unsupported for 'DAYS'.
DATEDIFF unsupported for 'DAYS'.


Unnamed: 0,num_transactions,total_transaction_amount
0,8,139000.0


# 18.

In [60]:
# Reference SQL for question 18: distinct ids of tickers that have at least one
# daily price record dated on or after 2023-04-01 (same SQL as question 4).
query = '''
SELECT DISTINCT
    tk.sbTickerId
FROM
    sbTicker AS tk
JOIN
    sbDailyPrice AS dp ON tk.sbTickerId = dp.sbDpTickerId
WHERE
    dp.sbDpDate >= '2023-04-01';
'''
sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,sbTickerId
0,T001
1,T002
2,T003
3,T004
4,T005
5,T006
6,T007
7,T008
8,T009
9,T010


In [5]:
%%pydough

tables = DailyPrices

# Daily prices dated on or after 2023-04-01, tagged with their ticker's id.
filter = tables.WHERE(date >= '2023-04-01')(tickers_id = ticker._id)

# Partitioning by the ticker id yields one row per distinct id.
output = PARTITION(filter, name="o", by=tickers_id)
pydough_output = pydough.to_df(output)
pydough_output

Unnamed: 0,tickers_id
0,T001
1,T002
2,T003
3,T004
4,T005
5,T006
6,T007
7,T008
8,T009
9,T010


# 19.

In [110]:
# Reference SQL for question 19: the 2 ticker symbols with the most 'buy'
# transactions in the last 10 days, with their transaction counts.
# NOTE: the window is relative to 'now', so the result depends on the run date.
query = '''
WITH popular_stocks AS (
    SELECT
        t.sbTickerSymbol,
        COUNT(*) AS tx_count
    FROM
        sbTransaction AS tx
    JOIN
        sbTicker AS t ON tx.sbTxTickerId = t.sbTickerId
    WHERE
        tx.sbTxType = 'buy'
        AND tx.sbTxDateTime >= DATE('now', '-10 days')
    GROUP BY
        t.sbTickerSymbol
)
SELECT
    sbTickerSymbol,
    tx_count
FROM
    popular_stocks
ORDER BY
    tx_count DESC
LIMIT 2;
'''
sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,sbTickerSymbol,tx_count
0,AAPL,3
1,BRK.B,2


In [4]:
%%pydough

tables = Transactions

# 'buy' transactions at most 10 days old, tagged with their ticker's symbol.
# NOTE(review): the recorded run printed "DATEDIFF unsupported for 'DAYS'." -
# confirm the day filter is actually applied.
filter = tables.WHERE(LIKE(transaction_type, 'buy') & (DATEDIFF('days', date_time, 'now') <= 10))(ticker_symbol = ticker.symbol)

# Count the matching transactions per symbol.
grouped = PARTITION(filter, name='g', by=ticker_symbol)(ticker_symbol, tx_count = COUNT(g))

# Two symbols with the most transactions.
output = grouped.TOP_K(2, by = tx_count.DESC())
pydough_output = pydough.to_df(output)
pydough_output

DATEDIFF unsupported for 'DAYS'.


Unnamed: 0,ticker_symbol,tx_count
0,AAPL,3
1,BRK.B,2


# 20.

In [111]:

# Reference SQL for question 20: for customers who joined in the 6 full months
# before the current month, per join month: the signup count and the average
# amount of the transactions made in that same month. The LEFT JOIN keeps
# customers that have no transaction in their join month.
# NOTE: the window is relative to 'now', so the result depends on the run date.
query = '''
SELECT
    strftime('%Y-%m-01 %H:%M:%S', sbCustJoinDate) AS MONTH,
    COUNT(sbCustId) AS customer_signups,
    AVG(t.sbTxAmount) AS avg_tx_amount
FROM
    sbCustomer AS c
LEFT JOIN
    sbTransaction AS t ON c.sbCustId = t.sbTxCustId
    AND strftime('%Y-%m', t.sbTxDateTime) = strftime('%Y-%m', c.sbCustJoinDate)
WHERE
    sbCustJoinDate >= date('now', '-6 months', 'start of month')
    AND sbCustJoinDate < date('now', 'start of month')
GROUP BY
    MONTH;
'''
sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,MONTH,customer_signups,avg_tx_amount
0,2024-09-01 00:00:00,1,7500.0
1,2024-10-01 00:00:00,1,11200.0
2,2024-11-01 00:00:00,1,48000.0
3,2024-12-01 00:00:00,1,5400.0
4,2025-01-01 00:00:00,1,25000.0


In [7]:
%%pydough

tables = Transactions

# Transactions made in the same year and month in which their customer joined,
# for customers that joined between 1 and 5 months ago (by DATEDIFF in months).
# NOTE(review): this starts from Transactions, so customers without a
# transaction in their join month are not counted, whereas the SQL LEFT JOIN
# keeps them - confirm this is acceptable.
filter = tables.WHERE((YEAR(customer.join_date) == YEAR(date_time)) 
                      & (MONTH(customer.join_date) == MONTH(date_time)) 
                      & (DATEDIFF("months", customer.join_date, 'now') < 6)
                      & (DATEDIFF("months", customer.join_date, 'now') >= 1))

# Tag each transaction with the customer id and the join month encoded as YYYYMM.
month = filter(cust_id = customer._id, month_joined = (YEAR(customer.join_date) * 100 + MONTH(customer.join_date)))

grouped = PARTITION(month, name='m', by=month_joined)

# Per join month: number of rows and average transaction amount.
output = grouped(month_joined, customer_signups = COUNT(m.cust_id), avg_tx_amount = AVG(m.amount))
pydough_output = pydough.to_df(output)
pydough_output

Unnamed: 0,month_joined,customer_signups,avg_tx_amount
0,202409,1,7500.0
1,202410,1,11200.0
2,202411,1,48000.0
3,202412,1,5400.0
4,202501,1,25000.0


# 21.

In [16]:
# Reference SQL for question 21: average closing price (ACP) per ticker type
# over the daily prices of the last 7 days.
# NOTE: the window is relative to 'now', so the result depends on the run date.
query = '''
SELECT
    sbTicker.sbTickerType,
    AVG(sbDailyPrice.sbDpClose) AS ACP
FROM
    sbDailyPrice
JOIN
    sbTicker ON sbDailyPrice.sbDpTickerId = sbTicker.sbTickerId
WHERE
    sbDpDate >= DATE('now', '-7 days')
GROUP BY
    sbTicker.sbTickerType;
'''
sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,sbTickerType,ACP
0,etf,209.75
1,mutualfund,84.3
2,stock,311.5


In [5]:
%%pydough

tables = DailyPrices

# Daily prices less than 7 days old, tagged with their ticker's type.
# NOTE(review): the recorded run printed "DATEDIFF unsupported for 'DAYS'." -
# confirm the day filter is actually applied.
filter = tables.WHERE(DATEDIFF("days", date, 'now') <7)(ticker_type = ticker.ticker_type)

grouped = PARTITION(filter, name='g', by = ticker_type)

# Average closing price per ticker type.
output = grouped(ticker_type, ACP = AVG(g.close))
pydough_output = pydough.to_df(output)
pydough_output

DATEDIFF unsupported for 'DAYS'.


Unnamed: 0,ticker_type,ACP
0,etf,209.75
1,mutualfund,84.3
2,stock,311.5


# 22.

In [20]:
# Reference SQL for question 22: per country, AR = 100 * (distinct customers
# with status 'active') / (distinct customers with transactions), for customers
# who joined during 2022. NULLIF/COALESCE turn a zero denominator into AR = 0.
query = '''
SELECT
    c.sbCustCountry,
    COALESCE(
        100.0 * COUNT(DISTINCT CASE WHEN c.sbCustStatus = 'active' THEN c.sbCustId END) / NULLIF(COUNT(DISTINCT t.sbTxCustId), 0),
        0
    ) AS AR
FROM
    sbCustomer AS c
JOIN
    sbTransaction AS t ON c.sbCustId = t.sbTxCustId
WHERE
    c.sbCustJoinDate BETWEEN '2022-01-01' AND '2022-12-31'
GROUP BY
    c.sbCustCountry;
'''
sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,sbCustCountry,AR
0,USA,50.0


In [None]:
# TODO: the PyDough version of the query above is still pending ("Will ask tomorrow").
# Kept as a comment so that this code cell does not raise a SyntaxError on Run All.

# 22.

In [4]:
# Reference SQL: per ticker and month, the average close, highest high and
# lowest low, plus mom_change = (avg_close - previous month's avg_close) /
# previous month's avg_close for the same ticker. LAG returns NULL for a
# ticker's first month, so mom_change is NULL there.
query = '''
WITH monthly_price_stats AS (
    SELECT
        strftime('%Y-%m-01 %H:%M:%S', sbDpDate) AS month,
        sbDpTickerId,
        AVG(sbDpClose) AS avg_close,
        MAX(sbDpHigh) AS max_high,
        MIN(sbDpLow) AS min_low
    FROM
        sbDailyPrice
    GROUP BY
        month,
        sbDpTickerId
)
SELECT
    t.sbTickerSymbol,
    mps.month,
    mps.avg_close,
    mps.max_high,
    mps.min_low,
    (mps.avg_close - LAG(mps.avg_close) OVER (PARTITION BY mps.sbDpTickerId ORDER BY mps.month)) / LAG(mps.avg_close) OVER (PARTITION BY mps.sbDpTickerId ORDER BY mps.month) AS mom_change
FROM
    monthly_price_stats AS mps
JOIN
    sbTicker AS t ON mps.sbDpTickerId = t.sbTickerId;
'''
sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,sbTickerSymbol,month,avg_close,max_high,min_low,mom_change
0,AAPL,2023-04-01 00:00:00,152.25,154.0,148.75,
1,MSFT,2023-04-01 00:00:00,282.583333,285.0,279.5,
2,AMZN,2023-04-01 00:00:00,3223.333333,3240.0,3180.0,
3,TSLA,2023-04-01 00:00:00,186.0,188.5,178.5,
4,GOOGL,2023-04-01 00:00:00,2521.666667,2540.0,2475.0,
5,FB,2023-04-01 00:00:00,204.833333,208.0,198.0,
6,BRK.B,2023-04-01 00:00:00,402166.666667,404000.0,398000.0,
7,JPM,2023-04-01 00:00:00,132.416667,134.5,128.75,
8,V,2023-04-01 00:00:00,222.166667,224.0,218.0,
9,PG,2023-04-01 00:00:00,142.333333,144.0,139.0,


In [13]:
%%pydough

# Each daily price tagged with its month encoded as YYYYMM and its ticker's symbol.
tables = DailyPrices(_month = (YEAR(date) * 100 + MONTH(date)), ticker_symbol = ticker.symbol)

# Per (ticker, month): average close, highest high and lowest low.
grouped = PARTITION(tables, name='g', by = (ticker_id, ticker_symbol, _month))(
            ticker_symbol, _month, ticker_id, avg_close = AVG(g.close), 
             max_high = MAX(g.high), min_low = MIN(g.low))

#TODO: mom_change is not computed yet. In the SQL it is the month-over-month
#relative change of avg_close for the same ticker:
#(avg_close - previous month's avg_close) / previous month's avg_close,
#and it is NULL for a ticker's first month.

output = grouped
pydough_output = pydough.to_df(output)
pydough_output

Unnamed: 0,ticker_symbol,_month,ticker_id,avg_close,max_high,min_low
0,AAPL,202304,T001,152.25,154.0,148.75
1,AMZN,202304,T003,3223.333333,3240.0,3180.0
2,BRK.B,202304,T007,402166.666667,404000.0,398000.0
3,FB,202304,T006,204.833333,208.0,198.0
4,GOOGL,202304,T005,2521.666667,2540.0,2475.0
5,JPM,202304,T008,132.416667,134.5,128.75
6,MSFT,202304,T002,282.583333,285.0,279.5
7,NFLX,202502,T021,311.5,323.0,297.5
8,PG,202304,T010,142.333333,144.0,139.0
9,TSLA,202304,T004,186.0,188.5,178.5


# 23.

In [15]:
# Query 23 (SQL reference): the three tickers with the widest price range
# between 2023-04-01 and 2023-04-04.
# The CTE joins daily prices to their ticker and, per symbol, takes the lowest
# low and the highest high inside the date window. The outer SELECT reports
# max_price - min_price as price_change. The CASE in ORDER BY sorts rows whose
# price_change is NULL ahead of the others, then by price_change descending.
query = '''
WITH stock_stats AS (
    SELECT
        t.sbTickerSymbol,
        MIN(d.sbDpLow) AS min_price,
        MAX(d.sbDpHigh) AS max_price
    FROM
        sbDailyPrice AS d
    JOIN
        sbTicker AS t ON d.sbDpTickerId = t.sbTickerId
    WHERE
        d.sbDpDate BETWEEN '2023-04-01' AND '2023-04-04'
    GROUP BY
        t.sbTickerSymbol
)
SELECT
    sbTickerSymbol,
    max_price - min_price AS price_change
FROM
    stock_stats
ORDER BY
    CASE
        WHEN price_change IS NULL THEN 1
        ELSE 0
    END DESC,
    price_change DESC
LIMIT 3;
'''
# Run the reference query on the SQLite connection and display the frame.
sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,sbTickerSymbol,price_change
0,BRK.B,6000.0
1,GOOGL,65.0
2,AMZN,60.0


In [32]:
%%pydough

tables = DailyPrices.WHERE((date >= '2023-04-01') & (date <= '2023-04-04'))(ticker_symbol = ticker.symbol)

grouped = PARTITION(tables, name='g', by = ticker_symbol)(
            ticker_symbol, max_price = MAX(g.high), min_price = MIN(g.low))

price_changed = grouped(ticker_symbol, price_change = (max_price - min_price))

output = price_changed.TOP_K(3, price_change.DESC())

pydough_output = pydough.to_df(output)
pydough_output

Unnamed: 0,ticker_symbol,price_change
0,BRK.B,6000.0
1,GOOGL,65.0
2,AMZN,60.0


# 24.

In [33]:
# Query 24 (SQL reference): the customer with the most transactions made in
# the same calendar month (year and month) in which they joined.
# The CTE joins each customer to their transactions, keeping only those whose
# '%Y-%m' matches the join date's '%Y-%m', and counts them per customer. The
# outer SELECT adds the customer name and keeps the single highest count; no
# tie-breaker is specified for equal counts.
query = '''
WITH active_customers AS (
    SELECT
        c.sbCustId,
        COUNT(t.sbTxId) AS num_transactions
    FROM
        sbCustomer AS c
    JOIN
        sbTransaction AS t ON c.sbCustId = t.sbTxCustId
        AND strftime('%Y-%m', c.sbCustJoinDate) = strftime('%Y-%m', t.sbTxDateTime)
    GROUP BY
        c.sbCustId
)
SELECT
    ac.sbCustId,
    c.sbCustName,
    ac.num_transactions
FROM
    active_customers AS ac
JOIN
    sbCustomer AS c ON ac.sbCustId = c.sbCustId
ORDER BY
    ac.num_transactions DESC
LIMIT 1;
'''
# Run the reference query on the SQLite connection and display the frame.
sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,sbCustId,sbCustName,num_transactions
0,C012,Olivia Johnson,3


In [42]:
%%pydough

tables = Transactions.WHERE((YEAR(customer.join_date) == YEAR(date_time)) 
                      & (MONTH(customer.join_date) == MONTH(date_time)))(
                        cust_id = customer._id, cust_name = customer.name)

grouped = PARTITION(tables, name = 'g', by = (cust_id, cust_name))(cust_id, cust_name, num_transactions = COUNT(g.transaction_id))

output = grouped.TOP_K(1, num_transactions.DESC())

pydough_output = pydough.to_df(output)
pydough_output

Unnamed: 0,cust_id,cust_name,num_transactions
0,C012,Olivia Johnson,3


# 25.

In [45]:

# Query 25 (SQL reference): the five customers with the largest total
# transaction amount.
# The CTE sums sbTxAmount per (customer id, customer name). The outer SELECT
# returns name and total; the CASE in ORDER BY sorts rows whose total_amount is
# NULL ahead of the others, then by total_amount descending.
query = '''
WITH cust_tx AS (
    SELECT
        c.sbCustId,
        c.sbCustName,
        SUM(t.sbTxAmount) AS total_amount
    FROM
        sbCustomer AS c
    JOIN
        sbTransaction AS t ON c.sbCustId = t.sbTxCustId
    GROUP BY
        c.sbCustId,
        c.sbCustName
)
SELECT
    sbCustName,
    total_amount
FROM
    cust_tx
ORDER BY
    CASE
        WHEN total_amount IS NULL THEN 1
        ELSE 0
    END DESC,
    total_amount DESC
LIMIT 5;
'''
# Run the reference query on the SQLite connection and display the frame.
sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,sbCustName,total_amount
0,David Kim,828200.0
1,Bob Johnson,529776.0
2,Jane Smith,87557.5
3,john doe,68993.75
4,Sarah Nguyen,56791.25


In [44]:
%%pydough

tables = Transactions(cust_id = customer._id, cust_name = customer.name)

grouped = PARTITION(tables, name = 'g', by = (cust_id, cust_name))(cust_name, total_amount = SUM(g.amount))

output = grouped.TOP_K(5, total_amount.DESC())

pydough_output = pydough.to_df(output)
pydough_output

Unnamed: 0,cust_name,total_amount
0,David Kim,828200.0
1,Bob Johnson,529776.0
2,Jane Smith,87557.5
3,john doe,68993.75
4,Sarah Nguyen,56791.25


# 26.

In [56]:
# Query 26 (SQL reference): per customer country, the number and total amount
# of transactions dated on or after the date 30 days before today; top 5
# countries by total amount.
# The filter uses DATE('now', '-30 days'), so the result depends on the day the
# cell is executed.
query = '''
SELECT
    c.sbCustCountry,
    COUNT(t.sbTxId) AS num_transactions,
    SUM(t.sbTxAmount) AS total_amount
FROM
    sbCustomer AS c
JOIN
    sbTransaction AS t ON c.sbCustId = t.sbTxCustId
WHERE
    t.sbTxDateTime >= DATE('now', '-30 days')
GROUP BY
    c.sbCustCountry
ORDER BY
    total_amount DESC
LIMIT 5;
'''
# Run the reference query on the SQLite connection and display the frame.
sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,sbCustCountry,num_transactions,total_amount
0,USA,15,293600.0


In [61]:
%%pydough

tables = Transactions.WHERE(DATEDIFF("days", date_time, 'now') < 30)(cust_country = customer.country)

grouped = PARTITION(tables, name = 'g', by = (cust_country))(
                    cust_country, num_transactions = COUNT(g.transaction_id),
                    total_amount = SUM(g.amount))

output = grouped.TOP_K(5, total_amount.DESC())

pydough_output = pydough.to_df(output)
pydough_output

DATEDIFF unsupported for 'DAYS'.


Unnamed: 0,cust_country,num_transactions,total_amount
0,USA,15,293600.0


# 27.

In [64]:
# Query 27 (SQL reference): lowest closing price of ticker 'VTI' among the
# daily prices dated on or after the date 7 days before today.
# The filter uses date('now', '-7 days'), so the result depends on the day the
# cell is executed.
query = '''
SELECT
    MIN(sdp.sbDpClose) AS lowest_price
FROM
    sbDailyPrice AS sdp
JOIN
    sbTicker AS st ON sdp.sbDpTickerId = st.sbTickerId
WHERE
    st.sbTickerSymbol = 'VTI'
    AND sdp.sbDpDate >= date('now', '-7 days');
'''
# Run the reference query on the SQLite connection and display the frame.
sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,lowest_price
0,206.25


In [70]:
%%pydough

tables = DailyPrices.WHERE((ticker.symbol == 'VTI') & (DATEDIFF("days", date, 'now') <= 7))

output = Broker(lowest_price = MIN(tables.close))

pydough_output = pydough.to_df(output)
pydough_output

DATEDIFF unsupported for 'DAYS'.


Unnamed: 0,lowest_price
0,206.25


# 28.

In [71]:
# Query 28 (SQL reference): number of transactions made by customers whose
# join date is on or after the date 70 days before today.
# The filter uses date('now', '-70 days'), so the result depends on the day the
# cell is executed.
query = '''
SELECT
    COUNT(t.sbTxCustId) AS transaction_count
FROM
    sbTransaction AS t
JOIN
    sbCustomer AS c ON t.sbTxCustId = c.sbCustId
WHERE
    c.sbCustJoinDate >= date('now', '-70 days');
'''
# Run the reference query on the SQLite connection and display the frame.
sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,transaction_count
0,2


In [72]:
%%pydough

tables = Transactions.WHERE((DATEDIFF("days",customer.join_date, 'now') <= 70))

output = Broker(transaction_count = COUNT(tables.customer_id))

pydough_output = pydough.to_df(output)
pydough_output

DATEDIFF unsupported for 'DAYS'.


Unnamed: 0,transaction_count
0,2


# 29.

In [76]:
# Query 29 (SQL reference): per customer, the time between joining and the
# first transaction.
# julianday() differences are expressed in days and keep the fractional part
# (the saved output shows values such as 1186.395833). Only customers with at
# least one transaction appear because of the inner JOIN.
query = '''
SELECT
    c.sbCustId,
    MIN(julianday(t.sbTxDateTime)) - julianday(c.sbCustJoinDate) AS DaysFromJoinToFirstTransaction
FROM
    sbCustomer AS c
JOIN
    sbTransaction AS t ON c.sbCustId = t.sbTxCustId
GROUP BY
    c.sbCustId;
'''
# Run the reference query on the SQLite connection and display the frame.
sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,sbCustId,DaysFromJoinToFirstTransaction
0,C001,1186.395833
1,C002,1478.427083
2,C003,304.458333
3,C004,1653.489583
4,C005,762.520833
5,C006,991.552083
6,C007,148.583333
7,C008,1462.614583
8,C009,587.645833
9,C010,1118.677083


In [80]:
%%pydough

tables = Transactions(cust_id = customer._id, cust_joins = customer.join_date)

grouped = PARTITION(tables, name = 'g', by=(cust_id, cust_joins))(first_tx = MIN(g.date_time))

output = grouped(cust_id, DaysFromJoinToFirstTransaction = DATEDIFF("days", cust_joins, first_tx))

pydough_output = pydough.to_df(output)
pydough_output

DATEDIFF unsupported for 'DAYS'.


Unnamed: 0,cust_id,DaysFromJoinToFirstTransaction
0,C004,1653
1,C002,1478
2,C008,1462
3,C001,1186
4,C010,1118
5,C006,991
6,C005,762
7,C009,587
8,C003,304
9,C007,148


# 30.

In [83]:
# Query 30 (SQL reference): the customer with the most 'sell' transactions on
# 2023-04-01.
# The CTE keeps transactions whose DATE(sbTxDateTime) is 2023-04-01 and whose
# type is 'sell', counting them per customer id. The outer SELECT adds the
# customer name and keeps the top row by num_tx descending, NULLs first; no
# tie-breaker is specified for equal counts.
query = '''
WITH SellTransactions AS (
    SELECT
        sbTxCustId,
        COUNT(*) AS num_tx
    FROM
        sbTransaction
    WHERE
        DATE(sbTxDateTime) = '2023-04-01'
        AND sbTxType = 'sell'
    GROUP BY
        sbTxCustId
)
SELECT
    c.sbCustId,
    c.sbCustName,
    st.num_tx
FROM
    sbCustomer AS c
JOIN
    SellTransactions AS st ON c.sbCustId = st.sbTxCustId
ORDER BY
    st.num_tx DESC NULLS FIRST
LIMIT 1;
'''
# Run the reference query on the SQLite connection and display the frame.
sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,sbCustId,sbCustName,num_tx
0,C002,Jane Smith,3


In [89]:
%%pydough

tables = Transactions.WHERE((DATEDIFF("days", date_time, '2023-04-01') == 0)
                            & (transaction_type == 'sell'))

cust_info = tables(cust_id = customer._id, cust_name = customer.name)

grouped = PARTITION(cust_info, name = 'c', by=(cust_id, cust_name))(cust_id, cust_name, num_tx = COUNT(c))

output = grouped.TOP_K(1, by=num_tx.DESC(na_pos="first"))

pydough_output = pydough.to_df(output)
pydough_output

DATEDIFF unsupported for 'DAYS'.


Unnamed: 0,cust_id,cust_name,num_tx
0,C002,Jane Smith,3


# 31.

In [90]:
# Query 31 (SQL reference): average transaction price per month for
# successful transactions in the first quarter of 2023, ordered by month.
# The month bucket is the 'YYYY-MM' string produced by strftime('%Y-%m', ...).
# NOTE(review): the saved output under this cell has a `datetime` column with
# full timestamps, which this query (alias `month`, '%Y-%m') cannot produce --
# the output looks stale; re-run the cell.
# NOTE(review): if sbTxDateTime is stored as text with a time part, values on
# 2023-03-31 after midnight compare greater than '2023-03-31' and are excluded
# by BETWEEN -- confirm whether that is intended.
query = '''
SELECT
    strftime('%Y-%m', sbTxDateTime) AS month,
    AVG(sbTxPrice) AS avg_price
FROM
    sbTransaction
WHERE
    sbTxStatus = 'success'
    AND sbTxDateTime BETWEEN '2023-01-01' AND '2023-03-31'
GROUP BY
    month
ORDER BY
    month;
'''
# Run the reference query on the SQLite connection and display the frame.
sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,datetime,avg_price
0,2023-01-01 00:00:00,166.666667
1,2023-02-01 00:00:00,1417.5
2,2023-03-01 00:00:00,1730.0


In [None]:
%%pydough

tables = Transactions.WHERE((DATEDIFF("days", '2023-01-01', date_time) >= 0 )
                            & (DATEDIFF("days", date_time, '2023-03-31') >= 0) 
                            & (status == 'success'))

month = tables(_month = (YEAR(date_time) * 100 + MONTH(date_time)))

grouped = PARTITION(month, name = 'm', basasdy=(_month))(_month, avg_price = AVG(m.price))

output = grouped.ORDER_BY(_month.DESC())

pydough_output = pydough.to_df(grouped)
pydough_output

DATEDIFF unsupported for 'DAYS'.
DATEDIFF unsupported for 'DAYS'.


Unnamed: 0,_month,avg_price
0,202301,166.666667
1,202302,1417.5
2,202303,1730.0
