#### SQL-PYDOUGH CODE TESTING NOTEBOOK

Setup for the PyDough package is done in the next cell; run it to import the necessary packages.

In [None]:
import pydough

%load_ext pydough.jupyter_extensions
#%reload_ext pydough.jupyter_extensions

#Necessary for comparison
import pandas as pd
from pandas.testing import assert_frame_equal, assert_series_equal
import re
import dfcompare

import collections
import numpy as np
import sqlite3 as sql
import os


### Now we can set up the SQLite database and connect it to PyDough. Please change the strings in the next cell to match:
1. .sql filename to initialize the database
2. Metadata path for the graphs
3. Graph name of the graph you want to use

In [97]:
# Configuration and database bootstrap for the notebook.
# Builds a fresh SQLite database from the .sql script, then points the active
# PyDough session at the metadata graph and at that database.

#YOUR .SQL FILE TO CREATE THE DATABASE, COPY IT TO THIS FOLDER.
SQL_filename = 'broker_sqlite.sql'

#METADATA FOR THE GRAPH .JSON
metadata_path = "../../tests/test_metadata/defog_graphs.json"

#GRAPH NAME
graph_name = "Broker"

#DESIRED DATABASE NAME
DB_name = "DATABASE.db"

# Read the whole initialization script before touching the database file, so a
# missing/unreadable script fails before any existing database is deleted.
with open(SQL_filename, 'r') as sql_file:
    sql_script = sql_file.read()

# Always start from a clean database file. On the very first run the file does
# not exist yet, which is fine: there is simply nothing to delete.
try:
    os.remove(DB_name)
except FileNotFoundError:
    pass

# Create the database and populate it by running the full script.
connection = sql.connect(DB_name)
cursor = connection.cursor()
cursor.executescript(sql_script)

# Load the metadata graph and connect PyDough to the database just created.
# The connect call stays last so its return value is displayed by the cell.
pydough.active_session.load_metadata_graph(metadata_path, graph_name)
pydough.active_session.connect_database("sqlite", database=DB_name)

DatabaseContext(connection=<pydough.database_connectors.database_connector.DatabaseConnection object at 0x7f8707cb4830>, dialect=<DatabaseDialect.SQLITE: 'sqlite'>)

### Graph Structure
If you want to inspect the structure of the graph to understand its relations and names, run the next cell. If the result does not display the complete list, select "View as a scrollable element" at the bottom of the output to see the full structure.

In [98]:
# Grab the metadata graph loaded into the active PyDough session and print
# the textual description of its structure.
graph = pydough.active_session.metadata
structure_description = pydough.explain_structure(graph)
print(structure_description)

Structure of PyDough graph: Broker

  Customers
  ├── _id
  ├── address1
  ├── address2
  ├── city
  ├── country
  ├── email
  ├── join_date
  ├── name
  ├── phone
  ├── postal_code
  ├── state
  ├── status
  └── transactions_made [multiple Transactions] (reverse of Transactions.customer)

  DailyPrices
  ├── close
  ├── date
  ├── epoch_ms
  ├── high
  ├── low
  ├── open
  ├── source
  ├── ticker_id
  ├── volume
  └── ticker [one member of Tickers] (reverse of Tickers.historical_prices)

  Tickers
  ├── _id
  ├── currency
  ├── db2x
  ├── exchange
  ├── is_active
  ├── name
  ├── symbol
  ├── ticker_type
  ├── historical_prices [multiple DailyPrices] (reverse of DailyPrices.ticker)
  └── transactions_of [multiple Transactions] (reverse of Transactions.ticker)

  Transactions
  ├── amount
  ├── commission
  ├── currency
  ├── customer_id
  ├── date_time
  ├── kpx
  ├── price
  ├── settlement_date_str
  ├── shares
  ├── status
  ├── tax
  ├── ticker_id
  ├── transaction_id
  ├── transac

### SQL Test template
You can use this template to run your SQL code and visually compare the results to those of the PyDough code.
Just paste your SQL code between the triple quotes (''' '''). You can also copy this template and paste it wherever you need to.
Remember to use the column and table names from the original .sql file

In [99]:
# SQL template: replace the text between the triple quotes with your own query.
# This example selects every column of every row in the sbCustomer table.
query = '''
 SELECT
    *
 FROM
    sbCustomer
'''
# Run the query on the SQLite connection opened in the setup cell and keep the
# resulting DataFrame in sql_output, the name the comparison cells read.
sql_output = pd.read_sql_query(query, connection)
# Bare last expression so the notebook renders the DataFrame.
sql_output

Unnamed: 0,sbCustId,sbCustName,sbCustEmail,sbCustPhone,sbCustAddress1,sbCustAddress2,sbCustCity,sbCustState,sbCustCountry,sbCustPostalCode,sbCustJoinDate,sbCustStatus
0,C001,john doe,john.doe@email.com,555-123-4567,123 Main St,,Anytown,CA,USA,90001,2020-01-01,active
1,C002,Jane Smith,jane.smith@email.com,555-987-6543,456 Oak Rd,,Someville,NY,USA,10002,2019-03-15,active
2,C003,Bob Johnson,bob.johnson@email.com,555-246-8135,789 Pine Ave,,Mytown,TX,USA,75000,2022-06-01,inactive
3,C004,Samantha Lee,samantha.lee@email.com,555-135-7902,246 Elm St,,Yourtown,CA,USA,92101,2018-09-22,suspended
4,C005,Michael Chen,michael.chen@email.com,555-864-2319,159 Cedar Ln,,Anothertown,FL,USA,33101,2021-02-28,active
5,C006,Emily Davis,emily.davis@email.com,555-753-1904,753 Maple Dr,,Mytown,TX,USA,75000,2020-07-15,active
6,C007,David Kim,david.kim@email.com,555-370-2648,864 Oak St,,Anothertown,FL,USA,33101,2022-11-05,active
7,C008,Sarah Nguyen,sarah.nguyen@email.com,555-623-7419,951 Pine Rd,,Yourtown,CA,USA,92101,2019-04-01,closed
8,C009,William Garcia,william.garcia@email.com,555-148-5326,258 Elm Ave,,Anytown,CA,USA,90001,2021-08-22,active
9,C010,Jessica Hernandez,jessica.hernandez@email.com,555-963-8520,147 Cedar Blvd,,Someville,NY,USA,10002,2020-03-10,inactive


### PyDough template
The important part about this template is to run the PyDough code and store it in a variable called pydough_output for future comparison.

In [111]:
%%pydough

#Setting up the tables that we will need information from in the context
tables = Customers

#The condition we would like the content to fulfill
filter = Customers

#The information we want to receive in the resulting table
output = filter

#Execute the PyDough code
pydough_output = pydough.to_df(output)
pydough_output

Unnamed: 0,_id,name,email,phone,address1,address2,city,state,country,postal_code,join_date,status
0,C001,john doe,john.doe@email.com,555-123-4567,123 Main St,,Anytown,CA,USA,90001,2020-01-01,active
1,C002,Jane Smith,jane.smith@email.com,555-987-6543,456 Oak Rd,,Someville,NY,USA,10002,2019-03-15,active
2,C003,Bob Johnson,bob.johnson@email.com,555-246-8135,789 Pine Ave,,Mytown,TX,USA,75000,2022-06-01,inactive
3,C004,Samantha Lee,samantha.lee@email.com,555-135-7902,246 Elm St,,Yourtown,CA,USA,92101,2018-09-22,suspended
4,C005,Michael Chen,michael.chen@email.com,555-864-2319,159 Cedar Ln,,Anothertown,FL,USA,33101,2021-02-28,active
5,C006,Emily Davis,emily.davis@email.com,555-753-1904,753 Maple Dr,,Mytown,TX,USA,75000,2020-07-15,active
6,C007,David Kim,david.kim@email.com,555-370-2648,864 Oak St,,Anothertown,FL,USA,33101,2022-11-05,active
7,C008,Sarah Nguyen,sarah.nguyen@email.com,555-623-7419,951 Pine Rd,,Yourtown,CA,USA,92101,2019-04-01,closed
8,C009,William Garcia,william.garcia@email.com,555-148-5326,258 Elm Ave,,Anytown,CA,USA,90001,2021-08-22,active
9,C010,Jessica Hernandez,jessica.hernandez@email.com,555-963-8520,147 Cedar Blvd,,Someville,NY,USA,10002,2020-03-10,inactive


### Comparison template 
Run this to compare the two data frames you have obtained as a result of the queries

In [None]:
# Compare the PyDough result against the SQL result.
# The category and question are placeholders in this template.
dfcompare.compare_df(
    pydough_output,
    sql_output,
    query_category="a",
    question="a",
)

True

SELECT t.sbTxType, COUNT(DISTINCT t.sbTxCustId) AS num_customers, AVG(t.sbTxShares) AS avg_shares FROM sbTransaction AS t WHERE t.sbTxDateTime BETWEEN '2023-01-01' AND '2023-03-31 23:59:59' GROUP BY t.sbTxType ORDER BY CASE WHEN num_customers IS NULL THEN 1 ELSE 0 END DESC, num_customers DESC LIMIT 3;

SELECT c.sbCustId, c.sbCustName FROM sbCustomer AS c LEFT JOIN sbTransaction AS t ON c.sbCustId = t.sbTxCustId WHERE t.sbTxCustId IS NULL;

SELECT DISTINCT c.sbCustId FROM sbCustomer AS c JOIN sbTransaction AS t ON c.sbCustId = t.sbTxCustId WHERE t.sbTxType = 'buy';

SELECT DISTINCT tk.sbTickerId FROM sbTicker AS tk JOIN sbDailyPrice AS dp ON tk.sbTickerId = dp.sbDpTickerId WHERE dp.sbDpDate >= '2023-04-01';

SELECT tk.sbTickerId, tk.sbTickerSymbol FROM sbTicker AS tk LEFT JOIN sbDailyPrice AS dp ON tk.sbTickerId = dp.sbDpTickerId WHERE dp.sbDpTickerId IS NULL;

SELECT tk.sbTickerSymbol, COUNT(tx.sbTxId) AS num_transactions, SUM(tx.sbTxAmount) AS total_amount FROM sbTicker AS tk JOIN sbTransaction AS tx ON tk.sbTickerId = tx.sbTxTickerId GROUP BY tk.sbTickerSymbol ORDER BY CASE WHEN total_amount IS NULL THEN 1 ELSE 0 END DESC, total_amount DESC LIMIT 10;

SELECT sbTxStatus, COUNT(*) AS num_transactions FROM sbTransaction GROUP BY sbTxStatus ORDER BY CASE WHEN num_transactions IS NULL THEN 1 ELSE 0 END DESC, num_transactions DESC LIMIT 3;

SELECT c.sbCustState, t.sbTickerType, COUNT(*) AS num_transactions FROM sbTransaction AS tx JOIN sbCustomer AS c ON tx.sbTxCustId = c.sbCustId JOIN sbTicker AS t ON tx.sbTxTickerId = t.sbTickerId GROUP BY c.sbCustState, t.sbTickerType ORDER BY CASE WHEN num_transactions IS NULL THEN 1 ELSE 0 END DESC, num_transactions DESC LIMIT 5;

SELECT sbCustCountry, COUNT(*) AS num_customers FROM sbCustomer GROUP BY sbCustCountry ORDER BY CASE WHEN num_customers IS NULL THEN 1 ELSE 0 END DESC, num_customers DESC LIMIT 5;

SELECT c.sbCustCountry, COUNT(t.sbTxId) AS num_transactions, SUM(t.sbTxAmount) AS total_amount FROM sbCustomer AS c JOIN sbTransaction AS t ON c.sbCustId = t.sbTxCustId WHERE t.sbTxDateTime >= DATE('now', '-30 days') GROUP BY c.sbCustCountry ORDER BY total_amount DESC LIMIT 5;

# 1.

In [102]:
# Query 1: for each transaction type, count the distinct customers and average
# the shares over transactions dated from 2023-01-01 through
# 2023-03-31 23:59:59. The ORDER BY puts rows with a NULL num_customers first,
# then sorts by num_customers descending; only the first 3 rows are kept.
query = '''
SELECT
    t.sbTxType,
    COUNT(DISTINCT t.sbTxCustId) AS num_customers,
    AVG(t.sbTxShares) AS avg_shares
FROM
    sbTransaction AS t
WHERE
    t.sbTxDateTime BETWEEN '2023-01-01' AND '2023-03-31 23:59:59'
GROUP BY
    t.sbTxType
ORDER BY
    CASE
        WHEN num_customers IS NULL THEN 1
        ELSE 0
    END DESC,
    num_customers DESC
LIMIT 3;
'''
# Execute against the SQLite connection and keep the DataFrame for comparison.
sql_output = pd.read_sql_query(query, connection)
# Bare last expression so the notebook renders the DataFrame.
sql_output


Unnamed: 0,sbTxType,num_customers,avg_shares
0,buy,3,41.75
1,sell,3,43.333333


# 2.

In [146]:
# Query 2: customers without any transaction. The LEFT JOIN keeps every
# customer, and the IS NULL filter retains only those with no matching
# sbTransaction row.
query = '''
SELECT
    c.sbCustId,
    c.sbCustName
FROM
    sbCustomer AS c
LEFT JOIN
    sbTransaction AS t ON c.sbCustId = t.sbTxCustId
WHERE
    t.sbTxCustId IS NULL;
'''
# Execute against the SQLite connection and keep the DataFrame for comparison.
sql_output = pd.read_sql_query(query, connection)
# Bare last expression so the notebook renders the DataFrame.
sql_output

Unnamed: 0,sbCustId,sbCustName
0,C011,Alex Rodriguez
1,C020,Maurice Lee


In [150]:
%%pydough

cust_no_trans = Customers.WHERE(HASNOT(transactions_made))

output = cust_no_trans(_id, name)

pydough_output = pydough.to_df(output)
pydough_output

Unnamed: 0,_id,name
0,C011,Alex Rodriguez
1,C020,Maurice Lee


In [151]:
# Compare the PyDough result with the SQL result for question 2.
dfcompare.compare_df(
    pydough_output,
    sql_output,
    query_category="basic_left_join",
    question="Return the customer ID and name of customers who have not made any transactions.",
)

True

# 3.

In [155]:
# Query 3: distinct IDs of customers that have at least one transaction whose
# type is 'buy'. DISTINCT collapses customers with several buy transactions
# into a single row.
query = '''
SELECT DISTINCT
    c.sbCustId
FROM
    sbCustomer AS c
JOIN
    sbTransaction AS t ON c.sbCustId = t.sbTxCustId
WHERE
    t.sbTxType = 'buy';
'''
# Execute against the SQLite connection and keep the DataFrame for comparison.
sql_output = pd.read_sql_query(query, connection)
# Bare last expression so the notebook renders the DataFrame.
sql_output

Unnamed: 0,sbCustId
0,C001
1,C003
2,C005
3,C009
4,C002
5,C004
6,C006
7,C008
8,C010
9,C007


In [154]:
%%pydough

tables = Customers.transactions_made

filter = tables.WHERE(transaction_type == 'buy')

output = filter(Cust_Id = BACK(1)._id)

pydough_output = pydough.to_df(output)
pydough_output

Unnamed: 0,Cust_Id
0,C001
1,C003
2,C005
3,C003
4,C009
5,C002
6,C004
7,C006
8,C008
9,C010


In [157]:
# Compare the PyDough result with the SQL result for question 3.
dfcompare.compare_df(
    pydough_output,
    sql_output,
    query_category="basic_join_distinct",
    question="Return the distinct list of customer IDs who have made a 'buy' transaction.",
)

np.True_

# 4.

In [158]:
# Query 4: distinct IDs of tickers that have at least one daily price record
# dated on or after 2023-04-01.
# NOTE(review): no PyDough counterpart or comparison cell follows this query
# in the visible part of the notebook.
query = '''
SELECT DISTINCT
    tk.sbTickerId
FROM
    sbTicker AS tk
JOIN
    sbDailyPrice AS dp ON tk.sbTickerId = dp.sbDpTickerId
WHERE
    dp.sbDpDate >= '2023-04-01';
'''
# Execute against the SQLite connection and keep the DataFrame for comparison.
sql_output = pd.read_sql_query(query, connection)
# Bare last expression so the notebook renders the DataFrame.
sql_output

Unnamed: 0,sbTickerId
0,T001
1,T002
2,T003
3,T004
4,T005
5,T006
6,T007
7,T008
8,T009
9,T010


# 5.

In [159]:
# Query 5: tickers without any daily price record. The LEFT JOIN keeps every
# ticker, and the IS NULL filter retains only those with no matching
# sbDailyPrice row.
query = '''
SELECT
    tk.sbTickerId,
    tk.sbTickerSymbol
FROM
    sbTicker AS tk
LEFT JOIN
    sbDailyPrice AS dp ON tk.sbTickerId = dp.sbDpTickerId
WHERE
    dp.sbDpTickerId IS NULL;
'''
# Execute against the SQLite connection and keep the DataFrame for comparison.
sql_output = pd.read_sql_query(query, connection)
# Bare last expression so the notebook renders the DataFrame.
sql_output

Unnamed: 0,sbTickerId,sbTickerSymbol
0,T011,SPY
1,T012,QQQ
2,T013,VTI
3,T014,VXUS
4,T015,VFINX
5,T016,VTSAX
6,T017,VIGAX
7,T018,GOOG


In [162]:
%%pydough

tables = Tickers

filter = tables.WHERE(HASNOT(historical_prices))

output = filter(_id, symbol)

pydough_output = pydough.to_df(output)
pydough_output

Unnamed: 0,_id,symbol
0,T011,SPY
1,T012,QQQ
2,T013,VTI
3,T014,VXUS
4,T015,VFINX
5,T016,VTSAX
6,T017,VIGAX
7,T018,GOOG


In [165]:
# Compare the PyDough result with the SQL result for question 5.
dfcompare.compare_df(
    pydough_output,
    sql_output,
    query_category="basic_left_join",
    question="Return the ticker ID and symbol of tickers that do not have any daily price records.",
)

True

# 6.

In [211]:
# Query 6: per ticker symbol, count the transactions and sum their amounts.
# The ORDER BY puts rows with a NULL total_amount first, then sorts by
# total_amount descending; only the first 10 rows are kept.
query = '''
SELECT
    tk.sbTickerSymbol,
    COUNT(tx.sbTxId) AS num_transactions,
    SUM(tx.sbTxAmount) AS total_amount
FROM
    sbTicker AS tk
JOIN
    sbTransaction AS tx ON tk.sbTickerId = tx.sbTxTickerId
GROUP BY
    tk.sbTickerSymbol
ORDER BY
    CASE
        WHEN total_amount IS NULL THEN 1
        ELSE 0
    END DESC,
    total_amount DESC
LIMIT 10;
'''
# Execute against the SQLite connection and keep the DataFrame for comparison.
sql_output = pd.read_sql_query(query, connection)
# Bare last expression so the notebook renders the DataFrame.
sql_output

Unnamed: 0,sbTickerSymbol,num_transactions,total_amount
0,BRK.B,5,1246600.0
1,AMZN,5,138126.0
2,AAPL,9,132293.75
3,MSFT,6,76057.5
4,JPM,4,69791.25
5,FB,6,68780.0
6,GOOGL,5,55190.0
7,TSLA,6,52792.5
8,PG,3,29680.0
9,V,3,25427.5


In [210]:
%%pydough

tables = Tickers.transactions_of(single_amount = amount, transactions  = transaction_id, symbols = BACK(1).symbol)

filter = PARTITION(tables, name="t", by=(symbols))(symbols, num_transactions = COUNT(t.transactions), total_amount = SUM(t.single_amount))

output = filter.TOP_K(10, by = total_amount.DESC())

pydough_output = pydough.to_df(output)
pydough_output

Unnamed: 0,symbols,num_transactions,total_amount
0,BRK.B,5,1246600.0
1,AMZN,5,138126.0
2,AAPL,9,132293.75
3,MSFT,6,76057.5
4,JPM,4,69791.25
5,FB,6,68780.0
6,GOOGL,5,55190.0
7,TSLA,6,52792.5
8,PG,3,29680.0
9,V,3,25427.5


In [212]:
# Compare the PyDough result with the SQL result for question 6.
# The category and question describe query 6 (join + group + order + limit over
# ticker symbols), not the "tickers without daily prices" text of question 5.
# NOTE(review): the category label follows the naming pattern of the other
# cells ("basic_left_join", "basic_join_distinct"); confirm it against the
# labels dfcompare expects.
dfcompare.compare_df(
    pydough_output,
    sql_output,
    query_category="basic_join_group_order_limit",
    question="What are the top 10 ticker symbols by total transaction amount? Return the ticker symbol, number of transactions and total transaction amount.",
)

True

# 7.

In [None]:
# Query 7: number of transactions per transaction status. The ORDER BY puts
# rows with a NULL num_transactions first, then sorts by num_transactions
# descending; only the first 3 rows are kept.
query = '''
SELECT
    sbTxStatus,
    COUNT(*) AS num_transactions
FROM
    sbTransaction
GROUP BY
    sbTxStatus
ORDER BY
    CASE
        WHEN num_transactions IS NULL THEN 1
        ELSE 0
    END DESC,
    num_transactions DESC
LIMIT 3;
'''
# Execute against the SQLite connection and keep the DataFrame for comparison.
sql_output = pd.read_sql_query(query, connection)
# Bare last expression so the notebook renders the DataFrame.
sql_output

In [None]:
%%pydough

tables = Transactions

filter = PARTITION(tables, name="t", by=(status))(symbols, num_transactions = COUNT(t.transactions), total_amount = SUM(t.single_amount))

output = filter.TOP_K(10, by = total_amount.DESC())

pydough_output = pydough.to_df(output)
pydough_output