In [1]:
%load_ext pydough.jupyter_extensions

In [2]:
import pydough
import datetime

import pandas as pd
from pandas.testing import assert_frame_equal, assert_series_equal
import re
import dfcompare

import collections
import numpy as np
import sqlite3 as sql
import os

# Setup demo metadata and database connections.
# The database path is defined once so that the PyDough session and the raw
# sqlite3 connection are guaranteed to use the same file.
db_path = "../../derm_treat.db"  # also available to pass to Python helper functions

pydough.active_session.load_metadata_graph("../../tests/test_metadata/defog_graphs.json", "DermTreatment")
pydough.active_session.connect_database("sqlite", database=db_path)

# Direct sqlite3 connection used by the reference SQL cells (pd.read_sql_query).
connection = sql.connect(db_path)

# Avoid scientific notation
pd.options.display.float_format = '{:.6f}'.format


## Query 1

Calculate the average DDD for each drug. Return the drug name and average DDD value.

DDD (defined daily dose) = total drug amount consumed during one treatment / total days of treatment (end - start date in days). To find the average weight of patients treated with a specific drug, first join patients with treatments on patient_id, then filter by the drug name. To identify doctors who have prescribed a certain drug type and their respective locations, first join doctors with treatments on doc_id, then filter by the drug type. To calculate the total number of adverse events reported for treatments involving certain drug types, first join treatments with adverse_events on treatment_id, then filter by the drug type.

SQL:

In [3]:
query = '''
SELECT 
    d.drug_name, 
    AVG(t.tot_drug_amt / NULLIF((JULIANDAY(t.end_dt) - JULIANDAY(t.start_dt)), 0)) AS ddd
FROM 
    treatments AS t
JOIN 
    drugs AS d 
    ON t.drug_id = d.drug_id
WHERE 
    NOT t.end_dt IS NULL
GROUP BY 
    d.drug_name;
'''

sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,drug_name,ddd
0,Biologic-X,0.44843
1,Drugalin,1.383211
2,Medicol,1.240407
3,Smallazine,0.991803
4,Topicort,3.966972
5,Topizol,1.978142


This query cannot be expressed in PyDough yet: PyDough has no date-arithmetic function, so the difference between the end date and the start date cannot be computed.

## Query 2

How many distinct patients had treatments in each of the last 3 months, not including the current month? Out of these, how many had treatments with biologic drugs? Return the month, patient count, and biologic treatment count.

Biologic drugs have drug_type = 'biologic'. Truncate start_dt to month for aggregation.

SQL:

In [72]:
# Pin "now" so the result is reproducible and agrees with the reference month
# (2025-02) that the PyDough version of this query hard-codes.
reference_date = "2025-02-10"

query = '''
SELECT
    strftime('%Y-%m', t.start_dt) AS MONTH,
    COUNT(DISTINCT t.patient_id) AS patient_count,
    COUNT(DISTINCT CASE WHEN d.drug_type = 'biologic' THEN t.treatment_id END) AS biologic_treatment_count
FROM
    treatments AS t
JOIN
    drugs AS d
    ON t.drug_id = d.drug_id
WHERE
    t.start_dt >= date(:reference_date, '-3 months', 'start of month')
    AND t.start_dt < date(:reference_date, 'start of month')
GROUP BY
    MONTH;
'''

# The reference date is bound as a named SQL parameter instead of being spliced into the string.
sql_output = pd.read_sql_query(query, connection, params={"reference_date": reference_date})
sql_output

Unnamed: 0,MONTH,patient_count,biologic_treatment_count
0,2024-11,1,1
1,2024-12,1,0
2,2025-01,1,0


PyDough has no function that returns the current date, so all the calculations use "2025-02" as the reference month and year.

PyDough:

In [76]:
%%pydough

# In Python we do not have a method to take the current date, then all the calculations will be use "2025-02" as month and year of reference.
selected_treatments = Treatments(month=JOIN_STRINGS("-", YEAR(start_dt), MONTH(start_dt)), biologic_treatment_count=IFF(drug.drug_type == "biologic", 1, 0)).WHERE(
    (start_dt >= "2024-11-01") & (start_dt <= "2025-01-31")
    )

output = PARTITION(selected_treatments, name="part", by=(month, biologic_treatment_count))(
    MONTH=month,
    patient_count=NDISTINCT(part.patient.patient_id),
    biologic_treatment_count=biologic_treatment_count
).ORDER_BY(month.ASC())

pydough_output= pydough.to_df(output)
pydough_output

Unnamed: 0,MONTH,patient_count,biologic_treatment_count
0,2024-11,1,1
1,2024-12,1,0
2,2025-1,1,0


Now we are going to compare the result in SQL and PyDough:

In [77]:
dfcompare.compare_df(pydough_output, sql_output, query_category="instructions_date_join", 
                     question="How many distinct patients had treatments in each of the last 3 months, not including the current month? Out of these, how many had treatments with biologic drugs? Return the month, patient count, and biologic treatment count.")

np.False_

A “False” result here comes from a difference in how the month is formatted: SQL's strftime('%Y-%m') zero-pads the month (“2025-01”), whereas joining YEAR and MONTH with JOIN_STRINGS does not (“2025-1”), so the leading 0 is lost and the values do not compare equal.

## Query 3

How many patients have a Gmail or Yahoo email address?

SQL:

In [44]:
query = '''
SELECT COUNT(*) 
FROM patients 
WHERE email 
    LIKE '%@gmail.com' OR email LIKE '%@yahoo.com';
'''

sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,COUNT(*)
0,0


PyDough:

In [35]:
%%pydough

output = DermTreatment(COUNT=COUNT(Patients.WHERE(ENDSWITH(email, "@gmail.com") | ENDSWITH(email, "@yahoo.com"))))

pydough_output = pydough.to_df(output)
pydough_output

Unnamed: 0,COUNT
0,0


**Important Note:** In this case we get a count of 0 because none of the patients registered in the DermTreatment database has a Gmail or Yahoo email address. If you replace one of the domains in the SQL and PyDough queries with “@email.com”, you will get results.

Now we are going to compare the results in SQL and PyDough:

In [101]:
dfcompare.compare_df(pydough_output, sql_output, query_category="instructions_string_matching", 
                     question="How many patients have a Gmail or Yahoo email address?")

True

## Query 4

How many patients have been diagnosed with 'Psoriasis vulgaris' and treated with a biologic drug? Return the distinct count of patients.

To find the number of patients who have been diagnosed with a specific type of psoriasis and treated with a biologic drug, first join patients with treatments on patient_id, then join with diagnoses on diag_id, filtering by diagnosis and drug type.

SQL:

In [133]:
# Test sql query in python
query = '''
WITH patient_diagnosis_treatment AS (
    SELECT 
        p.patient_id
    FROM 
        patients AS p
    JOIN 
        treatments AS t 
        ON p.patient_id = t.patient_id
    JOIN 
        diagnoses AS d 
        ON t.diag_id = d.diag_id
    JOIN 
        drugs AS dr 
        ON t.drug_id = dr.drug_id
    WHERE 
        d.diag_name = 'Psoriasis vulgaris' 
        AND dr.drug_type = 'biologic'
)
SELECT 
    COUNT(DISTINCT patient_id)
FROM 
    patient_diagnosis_treatment;
'''

sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,COUNT(DISTINCT patient_id)
0,3


PyDough:

In [129]:
%%pydough

patients = Treatments.WHERE(LIKE(diagnosis.diag_name, "Psoriasis vulgaris") & LIKE(drug.drug_type, "biologic"))

output = DermTreatment(COUNT=NDISTINCT(patients.patient_id))

pydough_output = pydough.to_df(output)
pydough_output

Unnamed: 0,COUNT
0,3


Now we are going to compare the results in SQL and PyDough:

In [134]:
dfcompare.compare_df(pydough_output, sql_output, query_category="instructions_cte_join", 
                     question="How many patients have been diagnosed with 'Psoriasis vulgaris' and treated with a biologic drug? Return the distinct count of patients.")

True

## Query 5

How many treatments did the patient Alice have in the last 6 months, not including the current month?

Last 6 months = DATE('now', 'start of month', '-6 months') to DATE('now', 'start of month', '-1 day'). Always join treatments with patients before using the treatments table. If not mentioned, the patient name provided is the first name.

SQL:

In [32]:
# Pin "now" so the result is reproducible and agrees with the reference date
# (2025-02-10) that the PyDough version of this query hard-codes.
reference_date = "2025-02-10"

query = '''
SELECT
    COUNT(t.treatment_id)
FROM
    treatments AS t
JOIN
    patients AS p
    ON t.patient_id = p.patient_id
WHERE
    p.first_name = 'Alice'
    AND t.start_dt BETWEEN
        date(:reference_date, 'start of month', '-6 months')
        AND date(:reference_date, 'start of month', '-1 day');
'''

# The reference date is bound as a named SQL parameter instead of being spliced into the string.
sql_output = pd.read_sql_query(query, connection, params={"reference_date": reference_date})
sql_output

Unnamed: 0,COUNT(t.treatment_id)
0,2


PyDough has no "now" function and cannot compute the current date, so we simulate it: "now" is fixed to "2025-02-10", and the dates must fall between "2024-08-01" and "2025-01-31".

PyDough:

In [33]:
%%pydough

# In PyDough we do not have a method "now" and can not calculate the current date. Then we are simulate, and "now" will be the date of "2025-02-10", 
# and dates should be between "2024-08-01 and "2025-01-31". 

treatments = Treatments.WHERE(
    (start_dt >= "2024-08-01") & (start_dt <= "2025-01-31") & LIKE(patient.first_name, "Alice"))

output = DermTreatment(COUNT=COUNT(treatments.treatment_id))

pydough_output = pydough.to_df(output)
pydough_output

Unnamed: 0,COUNT
0,2


Now we are going to compare the results in SQL and PyDough:

In [34]:
dfcompare.compare_df(pydough_output, sql_output, query_category="instructions_date_join", 
                     question="How many treatments did the patient Alice have in the last 6 months, not including the current month?")

True

## Query 6 

I want the adverse events that have been reported for treatments involving topical drugs. Give me the description, treatment id, drug id and name.

To get adverse events reported for treatments involving certain drugs, first join treatments with adverse_events on treatment_id, then join with drugs on drug_id to filter on the specific drug(s).

SQL:

In [93]:
query = '''
SELECT 
    a.description, 
    a.treatment_id, 
    d.drug_id, 
    d.drug_name
FROM 
    adverse_events AS a
JOIN 
    treatments AS t 
    ON a.treatment_id = t.treatment_id
JOIN 
    drugs AS d 
    ON t.drug_id = d.drug_id
WHERE 
    d.drug_type = 'topical';
'''

sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,description,treatment_id,drug_id,drug_name
0,Diarrhea,9,3,Topizol


PyDough:

In [92]:
%%pydough

output = AdverseEvents.treatment.WHERE(
    LIKE(drug.drug_type, "topical")
)(BACK(1).description, treatment_id, drug.drug_id, drug.drug_name)

pydough_output = pydough.to_df(output)
pydough_output

Unnamed: 0,description,treatment_id,drug_id,drug_name
0,Diarrhea,9,3,Topizol


Now we are going to compare the results in SQL and PyDough:

In [94]:
dfcompare.compare_df(pydough_output, sql_output, query_category="instructions_cte_join", 
                     question="I want the adverse events that have been reported for treatments involving topical drugs. Give me the description, treatment id, drug id and name.")

True

## Query 7

Return each doctor's doc_id, specialty, number of distinct drugs prescribed, and SDR

SDR = a doctor's rank within their specialty by number of distinct drugs prescribed. Doctors prescribing more drugs will have a higher rank.

SQL:

In [35]:
query = '''
WITH doc_drug_counts AS (
    SELECT 
        d.doc_id, 
        d.specialty, 
        COUNT(DISTINCT t.drug_id) AS num_drugs_prescribed
    FROM 
        doctors AS d
    JOIN 
        treatments AS t 
        ON d.doc_id = t.doc_id
    GROUP BY 
        d.doc_id
)
SELECT 
    doc_id, 
    specialty, 
    num_drugs_prescribed, 
    DENSE_RANK() OVER (
        PARTITION BY specialty 
        ORDER BY 
            CASE 
                WHEN num_drugs_prescribed IS NULL THEN 1 
                ELSE 0 
            END DESC, 
            num_drugs_prescribed DESC
    ) AS specialty_drug_rank
FROM 
    doc_drug_counts;
'''

sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,doc_id,specialty,num_drugs_prescribed,specialty_drug_rank
0,1,dermatology,4,1
1,4,dermatology,3,2
2,7,dermatology,3,2
3,3,general,3,1
4,2,immunology,4,1
5,5,immunology,4,1
6,6,oncology,3,1


PyDough:

In [None]:
%%pydough
# WORK IN PROGRESS: THE SOLUTION HAS NOT BEEN IMPLEMENTED YET
# This draft counts distinct drugs per specialty only. The SQL reference counts them
# per doctor and adds a DENSE_RANK within each specialty (the SDR); both are still missing here.

# Attach the prescribing doctor's id and specialty to every treatment
# (`doc` is not used by the grouping below; the metadata spells the property `speciality`).
treatments = Treatments(doc=doctor.doc_id, specialty=doctor.speciality)

# One row per specialty with the number of distinct drugs prescribed within it.
partition = PARTITION(treatments, name="part", by=specialty)(
    specialty=specialty,
    num_drugs_prescribed=NDISTINCT(part.drug.drug_id),
)

pydough_output = pydough.to_df(partition)
pydough_output

Unnamed: 0,specialty,num_drugs_prescribed
0,dermatology,5
1,general,3
2,immunology,6
3,oncology,3


## Query 8

Return the first name, last name and specialty of doctors whose first name starts with 'J' or last name contains 'son', case-insensitive.

Doctors whose first name starts with 'J' or last name contains 'son', case-insensitive

SQL:

In [23]:
query = '''
SELECT 
    first_name, 
    last_name, 
    specialty
FROM 
    doctors
WHERE 
    LOWER(first_name) LIKE 'j%' 
    OR LOWER(last_name) LIKE '%son%';
'''

sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,first_name,last_name,specialty
0,John,Doe,dermatology
1,Jane,Smith,immunology
2,David,Johnson,general
3,Sarah,Wilson,oncology
4,Olivia,Anderson,dermatology


PyDough:

In [29]:
%%pydough
output = Doctors(first_name, last_name, specialty=speciality).WHERE(
    STARTSWITH(first_name, "j") | LIKE(last_name, "%son%")
)

pydough_output = pydough.to_df(output)
pydough_output

Unnamed: 0,first_name,last_name,specialty
0,John,Doe,dermatology
1,Jane,Smith,immunology
2,David,Johnson,general
3,Sarah,Wilson,oncology
4,Olivia,Anderson,dermatology


Now we are going to compare the results in SQL and PyDough:

In [30]:
dfcompare.compare_df(pydough_output, sql_output, query_category="instructions_string_matching", 
                     question="Return the first name, last name and specialty of doctors whose first name starts with 'J' or last name contains 'son', case-insensitive.")

True

## Query 9

What are the PMPD and PMTC for each of the last 12 months, not including the current month.

PMPD = per month patient diagnoses. PMTC = per month treatment count. Truncate start_dt to month for aggregation.

SQL:

In [31]:
# Pin "now" so the result is reproducible and agrees with the reference date
# (2025-02-11) that the PyDough version of this query hard-codes.
reference_date = "2025-02-11"

query = '''
SELECT
    strftime('%Y-%m', t.start_dt) AS month,
    COUNT(DISTINCT t.patient_id) AS patient_count,
    COUNT(DISTINCT t.treatment_id) AS treatment_count
FROM
    treatments AS t
JOIN
    diagnoses AS d
    ON t.diag_id = d.diag_id
WHERE
    t.start_dt >= date(:reference_date, '-12 months', 'start of month')
    AND t.start_dt < date(:reference_date, 'start of month')
GROUP BY
    month;
'''

# The reference date is bound as a named SQL parameter instead of being spliced into the string.
sql_output = pd.read_sql_query(query, connection, params={"reference_date": reference_date})
sql_output

Unnamed: 0,month,patient_count,treatment_count
0,2024-02,1,1
1,2024-08,1,1
2,2024-09,2,2
3,2024-10,1,1
4,2024-11,1,1
5,2024-12,1,1
6,2025-01,1,1


Since PyDough has no date functions to obtain the current date and compute from it, we simulate it: "now" is fixed to "2025-02-11", so "-12 months" (truncated to the start of the month) is "2024-02-01".

PyDough:

In [37]:
%%pydough
# Since we cannot use date functions in PyDough to take the current date and do calculations based on it, 
# we will simulate and “now” will be the date: “2025-02-11” and not including the current month will be: "2025-01-31". And “-12 months” will be “2024-02-01”.
treatments = Treatments(month=JOIN_STRINGS("-", YEAR(start_dt), MONTH(start_dt))).WHERE(
    (start_dt >= "2024-02-01") & (start_dt <= "2025-01-31")
)

output = PARTITION(treatments, name="part", by=month)(
    month=month,
    patient_count=NDISTINCT(part.patient_id),
    treatment_count=NDISTINCT(part.treatment_id)
)

pydough_output = pydough.to_df(output)
pydough_output

Unnamed: 0,month,patient_count,treatment_count
0,2024-10,1,1
1,2024-11,1,1
2,2024-12,1,1
3,2024-2,1,1
4,2024-8,1,1
5,2024-9,2,2
6,2025-1,1,1


Now we are going to compare the results in SQL and PyDough:

In [38]:
dfcompare.compare_df(pydough_output, sql_output, query_category="instructions_date_join", 
                     question="What are the PMPD and PMTC for each of the last 12 months, not including the current month")

np.False_

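A “False” result here comes from a difference in how the month is formatted: SQL's strftime('%Y-%m') zero-pads the month (“2025-01”), whereas joining YEAR and MONTH with JOIN_STRINGS does not (“2025-1”), so the leading 0 is lost. Without the padding the rows also come back in a different order, because “2024-10” sorts before “2024-2” as a string.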

## Query 10

What is the average weight in kg of patients treated with the drug named 'Drugalin'? Return the average weight.

To find the average weight of patients treated with a specific drug, first join patients with treatments on patient_id, then filter by the drug name.

SQL:

In [85]:
query = '''
WITH patient_treatment AS (
    SELECT 
        p.patient_id, 
        p.weight_kg
    FROM 
        patients AS p
    JOIN 
        treatments AS t 
        ON p.patient_id = t.patient_id
    WHERE 
        t.drug_id = (
            SELECT 
                drug_id 
            FROM 
                drugs 
            WHERE 
                drug_name = 'Drugalin'
        )
)
SELECT 
    AVG(weight_kg)
FROM 
    patient_treatment;
'''

sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,AVG(weight_kg)
0,73.333333


PyDough:

In [86]:
%%pydough
patients = Treatments(patient.weight_kg).WHERE(LIKE(drug.drug_name, "Drugalin"))

output = DermTreatment(avg = AVG(patients.weight_kg))

pydough_output = pydough.to_df(output)
pydough_output

Unnamed: 0,avg
0,73.333333


Now we are going to compare the results in SQL and PyDough:

In [87]:
dfcompare.compare_df(pydough_output, sql_output, query_category="instructions_cte_join", 
                     question="What is the average weight in kg of patients treated with the drug named 'Drugalin'? Return the average weight.")

True

## Query 11

What is the CAW for male patients.

CAW = cohort average weight in kilograms

SQL:

In [57]:
query = '''
SELECT 
    AVG(weight_kg) AS caw
FROM 
    patients
WHERE 
    gender = 'Male';
'''

sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,caw
0,83.0


PyDough:

In [60]:
%%pydough
patients = Patients(weight_kg).WHERE(LIKE(gender, "Male"))

output = DermTreatment(caw = AVG(patients.weight_kg))

pydough_output = pydough.to_df(output)
pydough_output

Unnamed: 0,caw
0,83.0


Now we are going to compare the results in SQL and PyDough:

In [64]:
dfcompare.compare_df(pydough_output, sql_output, query_category="keywords_aggregate", 
                     question="What is the CAW for male patients")

True

## Query 12

What is the NPI for each year? Return the year, number of new patients, and NPI.

NPI (new patients increase) = the increase in number of new patients compared to the previous year. New patients are defined as patients starting their first treatment and require joining the patients table with the earliest record of each patient from the treatment table on patient_id

SQL:

In [65]:
query = '''
WITH FirstTreatment AS (
    SELECT 
        p.patient_id, 
        MIN(t.start_dt) AS first_treatment_date
    FROM 
        patients AS p
    JOIN 
        treatments AS t 
        ON p.patient_id = t.patient_id
    GROUP BY 
        p.patient_id
),
NewPatientsPerYear AS (
    SELECT 
        strftime('%Y', first_treatment_date) AS year, 
        COUNT(patient_id) AS new_patients
    FROM 
        FirstTreatment
    GROUP BY 
        strftime('%Y', first_treatment_date)
),
NPI AS (
    SELECT 
        year, 
        new_patients, 
        new_patients - LAG(new_patients, 1) OVER (ORDER BY year) AS npi
    FROM 
        NewPatientsPerYear
)
SELECT 
    year, 
    new_patients, 
    npi
FROM 
    NPI
ORDER BY 
    year;
'''

sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,year,new_patients,npi
0,2022,6,
1,2023,1,-5.0
2,2024,2,1.0


To solve this problem we need PyDough to implement NEXT / PREV (access to the previous row, like SQL's LAG).

## Query 13

What is the overall D7D100PIR across all treatments? Return the percentage value.

D7D100PIR (day 7 to day 100 PASI improvement rate) = (avg PASI score on day 100 - avg PASI score on day 7) / avg PASI score on day 7 * 100. This should only include patients who have non-null PASI scores for both timepoints.

SQL:

In [88]:
query = '''
SELECT 
    (AVG(day100_pasi_score) - AVG(day7_pasi_score)) / AVG(day7_pasi_score) * 100 AS d7d100pir
FROM 
    outcomes
WHERE 
    NOT day7_pasi_score IS NULL 
    AND NOT day100_pasi_score IS NULL;
'''

sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,d7d100pir
0,-74.150439


PyDough:

In [92]:
%%pydough
outcomes = Outcomes(day100_pasi_score, day7_pasi_score, day7_pasi_score).WHERE(
    PRESENT(day7_pasi_score) & PRESENT(day100_pasi_score)
)

output = DermTreatment(d7d100pir=((AVG(outcomes.day100_pasi_score) - AVG(outcomes.day7_pasi_score)) / AVG(outcomes.day7_pasi_score) * 100))

pydough_output = pydough.to_df(output)
pydough_output

Unnamed: 0,d7d100pir
0,-74.150439


Now we are going to compare results in SQL and PyDough:

In [93]:
dfcompare.compare_df(pydough_output, sql_output, query_category="keywords_ratio", 
                     question="What is the overall D7D100PIR across all treatments? Return the percentage value.")

True

## Query 14

What is the PIC for female patients?

PIC = private insured patient count

SQL:

In [98]:
query = '''
SELECT 
    COUNT(patient_id) AS pic
FROM 
    patients
WHERE 
    gender = 'Female' 
    AND ins_type = 'private';
'''

sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,pic
0,3


PyDough:

In [99]:
%%pydough

patients = Patients(patient_id).WHERE(LIKE(gender, "female") & LIKE(ins_type, "private"))

output = DermTreatment(pic=COUNT(patients.patient_id))

pydough_output = pydough.to_df(output)
pydough_output

Unnamed: 0,pic
0,3


Now we are going to compare the results in SQL and PyDough:

In [100]:
dfcompare.compare_df(pydough_output, sql_output, query_category="keywords_aggregate", 
                     question="What is the PIC for female patients?")

True

## Query 15

Which drug had the highest number of adverse events reported within the same month as the treatment start date (adverse event or treatment can be earlier than the other)? Return the number of adverse events along with the drug's id and name.

If events from 2 different tables from the same interval are to be joined, join on the respective truncated date fields eg `FROM t1 JOIN t2 ON DATE(t1.date, '<interval>') = DATE(t2.date, '<interval>').

SQL:

In [101]:
query = '''
WITH adverse_events_per_drug AS (
    SELECT 
        d.drug_id, 
        COUNT(ae.id) AS num_events
    FROM 
        adverse_events AS ae
    JOIN 
        treatments AS t 
        ON ae.treatment_id = t.treatment_id 
        AND strftime('%Y-%m', ae.reported_dt) = strftime('%Y-%m', t.start_dt)
    JOIN 
        drugs AS d 
        ON t.drug_id = d.drug_id
    GROUP BY 
        d.drug_id
)
SELECT 
    ae.drug_id, 
    d.drug_name, 
    ae.num_events
FROM 
    adverse_events_per_drug AS ae
JOIN 
    drugs AS d 
    USING (drug_id)
ORDER BY 
    ae.num_events DESC
LIMIT 1;
'''

sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,drug_id,drug_name,num_events
0,1,Drugalin,2


PyDough:

In [128]:
%%pydough
output = PARTITION(Drugs, name="part", by=(drug_id, drug_name))(
    drug_id,
    drug_name,
    num_events=COUNT(part.treatments_used_in.adverse_events.reported_dt)
).TOP_K(1, by=num_events.DESC())

pydough_output = pydough.to_df(output)
pydough_output

Unnamed: 0,drug_id,drug_name,num_events
0,1,Drugalin,2


Now we are going to compare the results in SQL and PyDough:

In [130]:
dfcompare.compare_df(pydough_output, sql_output, query_category="instructions_date_join", 
                     question="Which drug had the highest number of adverse events reported within the same month as the treatment start date (adverse event or treatment can be earlier than the other)? Return the number of adverse events along with the drug's id and name.")

True

## Query 16

Which states do doctors who have prescribed biologic drugs reside in? Return the distinct states.

To identify doctors who have prescribed a certain drug type and their respective states, first join doctors with treatments on doc_id, then filter by the drug type.

SQL:

In [140]:
query = '''
WITH doctor_treatment AS (
    SELECT 
        d.doc_id, 
        d.loc_state
    FROM 
        doctors AS d
    JOIN 
        treatments AS t 
        ON d.doc_id = t.doc_id
    JOIN 
        drugs AS dr 
        ON t.drug_id = dr.drug_id
    WHERE 
        dr.drug_type = 'biologic'
)
SELECT 
    DISTINCT loc_state
FROM 
    doctor_treatment;
'''

sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,loc_state
0,MD
1,CA
2,CT
3,PA
4,MA
5,NC


PyDough:

In [141]:
%%pydough

doctors = Treatments(doctor.doc_id, doctor.loc_state).WHERE(LIKE(drug.drug_type, "biologic"))

output = PARTITION(doctors, name="part", by=loc_state)(
    loc_state
)

pydough_output = pydough.to_df(output)
pydough_output

Unnamed: 0,loc_state
0,CA
1,CT
2,MA
3,MD
4,NC
5,PA


Now we are going to compare the results in SQL and PyDough:

In [142]:
dfcompare.compare_df(pydough_output, sql_output, query_category="instructions_cte_join", 
                     question="Which states do doctors who have prescribed biologic drugs reside in? Return the distinct states.")

np.True_

## Query 17

For treatments that ended in the year 2022 (from Jan 1st to Dec 31st inclusive), what is the average PASI score at day 100 and number of distinct patients per insurance type? Return the top 5 insurance types sorted by lowest average PASI score first.

Note: This query will not return 5 rows, because there are only 3 insurance types, not five.

SQL:

In [97]:
# Test sql query in python
query = '''
SELECT 
    p.ins_type, 
    COUNT(DISTINCT t.patient_id) AS num_patients, 
    AVG(o.day100_pasi_score) AS avg_pasi_score
FROM 
    treatments AS t
JOIN 
    patients AS p 
    ON t.patient_id = p.patient_id
JOIN 
    outcomes AS o 
    ON t.treatment_id = o.treatment_id
WHERE 
    t.end_dt BETWEEN '2022-01-01' AND '2022-12-31'
GROUP BY 
    p.ins_type
ORDER BY 
    CASE 
        WHEN avg_pasi_score IS NULL THEN 1 
        ELSE 0 
    END, 
    avg_pasi_score
LIMIT 5;
'''

sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,ins_type,num_patients,avg_pasi_score
0,private,3,2.275
1,uninsured,1,2.6
2,medicare,1,3.5


In [96]:
%%pydough

selected_treatments = Treatments(ins_type=patient.ins_type).WHERE((end_dt >= "2022-01-01") & (end_dt <= "2022-12-31"))

output = PARTITION(selected_treatments, name="treatment", by=ins_type)(
    ins_type=ins_type,
    num_patients= NDISTINCT(treatment.patient_id),
    avg_pasi_score= AVG(treatment.outcome_records.day100_pasi_score)
).TOP_K(5, by=avg_pasi_score.ASC())

pydough_output = pydough.to_df(output)
pydough_output

Unnamed: 0,ins_type,num_patients,avg_pasi_score
0,private,3,2.275
1,uninsured,1,2.6
2,medicare,1,3.5


Now we are going to compare the results in SQL and PyDough:

In [98]:
dfcompare.compare_df(pydough_output, sql_output, query_category="basic_join_date_group_order_limit", 
                     question="For treatments that ended in the year 2022 (from Jan 1st to Dec 31st inclusive), what is the average PASI score at day 100 and number of distinct patients per insurance type? Return the top 5 insurance types sorted by lowest average PASI score first.")

True

## Query 18

Return the distinct list of doctor IDs, first names and last names that have prescribed treatments.

SQL:

In [100]:
query = """
SELECT DISTINCT d.doc_id, d.first_name, d.last_name 
FROM treatments AS t 
JOIN doctors AS d ON t.doc_id = d.doc_id;
"""

sql_output = pd.read_sql_query(query, connection)

sql_output

Unnamed: 0,doc_id,first_name,last_name
0,1,John,Doe
1,2,Jane,Smith
2,3,David,Johnson
3,4,Emily,Brown
4,5,Michael,Davis
5,6,Sarah,Wilson
6,7,Robert,Taylor


Query in PyDough:

In [101]:
%%pydough

output = Doctors(doc_id, first_name, last_name).WHERE(HAS(prescribed_treatments))

pydough_output = pydough.to_df(output)
pydough_output

Unnamed: 0,doc_id,first_name,last_name
0,1,John,Doe
1,2,Jane,Smith
2,3,David,Johnson
3,4,Emily,Brown
4,5,Michael,Davis
5,6,Sarah,Wilson
6,7,Robert,Taylor


Now, we are going to compare the results in SQL and PyDough:

In [102]:
# Category and question describe Query 18 itself. The previous question text came from an
# unrelated "customers without transactions" query, and the category now matches Query 19.
dfcompare.compare_df(pydough_output, sql_output, query_category="basic_join_distinct", 
                     question="Return the distinct list of doctor IDs, first names and last names that have prescribed treatments.")

True

## Query 19
Return the distinct list of patient IDs, first names and last names that have outcome assessments.

SQL:

In [103]:
query = """
SELECT DISTINCT p.patient_id, p.first_name, p.last_name 
FROM outcomes AS o 
JOIN treatments AS t ON o.treatment_id = t.treatment_id 
JOIN patients AS p ON t.patient_id = p.patient_id;
"""

sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,patient_id,first_name,last_name
0,1,Alice,Johnson
1,2,Bob,Smith
2,3,Carol,Davis
3,4,David,Wilson
4,5,Eve,Brown
5,6,Frank,Taylor
6,7,Grace,Anderson
7,9,Isaac,Martinez
8,10,John,Richter


PyDough:

In [105]:
%%pydough

output = Patients(patient_id, first_name, last_name).WHERE(HAS(treatments_received.WHERE(HAS(outcome_records))))

pydough_output = pydough.to_df(output)
pydough_output


Unnamed: 0,patient_id,first_name,last_name
0,1,Alice,Johnson
1,2,Bob,Smith
2,3,Carol,Davis
3,4,David,Wilson
4,5,Eve,Brown
5,6,Frank,Taylor
6,7,Grace,Anderson
7,9,Isaac,Martinez
8,10,John,Richter


Now we are going to compare the results in SQL and PyDough:

In [106]:
dfcompare.compare_df(pydough_output, sql_output, query_category="basic_join_distinct", 
                     question="Return the distinct list of patient IDs, first names and last names that have outcome assessments.")

True

## Query 20

Return the drug IDs and names of drugs that have not been used in any treatments.

SQL:

In [107]:
query = """
SELECT d.drug_id, d.drug_name 
FROM drugs AS d 
LEFT JOIN treatments AS t ON d.drug_id = t.drug_id 
WHERE t.drug_id IS NULL;
"""

sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,drug_id,drug_name
0,7,Biologic-Y
1,8,Smallitol
2,9,Topicalin
3,10,Biologic-Z


PyDough:

In [111]:
%%pydough

output = Drugs(drug_id, drug_name).WHERE(HASNOT(treatments_used_in))

pydough_output = pydough.to_df(output)
pydough_output

Unnamed: 0,drug_id,drug_name
0,7,Biologic-Y
1,8,Smallitol
2,9,Topicalin
3,10,Biologic-Z


Now we are going to compare the results in SQL and PyDough:

In [110]:
dfcompare.compare_df(pydough_output, sql_output, query_category="basic_left_join", 
                     question="Return the drug IDs and names of drugs that have not been used in any treatments.")

True

## Query 21

Return the patient IDs, first names and last names of patients who have not received any treatments.

SQL:

In [112]:
query = """
SELECT p.patient_id, p.first_name, p.last_name 
FROM patients AS p 
LEFT JOIN treatments AS t ON p.patient_id = t.patient_id 
WHERE t.patient_id IS NULL;
"""

sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,patient_id,first_name,last_name
0,8,Hannah,Garcia
1,11,Kelly,Smith


PyDough:

In [116]:
%%pydough
output = Patients(patient_id, first_name, last_name).WHERE(HASNOT(treatments_received))

pydough_output = pydough.to_df(output)
pydough_output

Unnamed: 0,patient_id,first_name,last_name
0,8,Hannah,Garcia
1,11,Kelly,Smith


Now we are going to compare the results in SQL and PyDough:

In [115]:
dfcompare.compare_df(pydough_output, sql_output, query_category="basic_left_join", 
                     question="Return the patient IDs, first names and last names of patients who have not received any treatments.")

True

## Query 22

What are the top 2 specialties by number of doctors? Return the specialty and number of doctors.

SQL:

In [118]:
query = """
SELECT 
    specialty, 
    COUNT(*) AS num_doctors 
FROM 
    doctors 
GROUP BY 
    specialty 
ORDER BY 
    CASE 
        WHEN COUNT(*) IS NULL THEN 1 
        ELSE 0 
    END DESC, 
    num_doctors DESC 
LIMIT 2;
"""

sql_output = pd.read_sql_query(query, connection)
sql_output


Unnamed: 0,specialty,num_doctors
0,dermatology,4
1,immunology,3


PyDough:

In [121]:
%%pydough

output = PARTITION(Doctors, name="part", by=speciality)(
    specialty=speciality,
    num_doctors=COUNT(part.doc_id)
).TOP_K(2, by=num_doctors.DESC())

pydough_output = pydough.to_df(output)
pydough_output

Unnamed: 0,specialty,num_doctors
0,dermatology,4
1,immunology,3


Now we are going to compare the results in SQL and PyDough:

In [120]:
# Check the PyDough frame against the SQL reference frame for this question.
dfcompare.compare_df(
    pydough_output,
    sql_output,
    query_category="basic_group_order_limit",
    question="What are the top 2 specialties by number of doctors? Return the specialty and number of doctors.",
)

True

## Query 23

What are the top 3 diagnoses by maximum itch VAS score at day 100 and number of distinct patients? Return the diagnosis name, number of patients, and maximum itch score.

SQL:

In [122]:
# Reference SQL: inner-join treatments to their diagnosis and outcomes, then per
# diagnosis name count the distinct patients and take the maximum day-100 itch
# VAS score. Rows are ranked by max score, then by patient count (both
# descending) and limited to 3.
# Each CASE ... IS NULL key is sorted DESC, so a NULL aggregate would rank first.
query = """
SELECT 
    di.diag_name, 
    COUNT(DISTINCT t.patient_id) AS num_patients, 
    MAX(o.day100_itch_vas) AS max_itch_score
FROM 
    treatments AS t
JOIN 
    diagnoses AS di 
    ON t.diag_id = di.diag_id
JOIN 
    outcomes AS o 
    ON t.treatment_id = o.treatment_id
GROUP BY 
    di.diag_name
ORDER BY 
    CASE 
        WHEN max_itch_score IS NULL THEN 1 
        ELSE 0 
    END DESC, 
    max_itch_score DESC, 
    CASE 
        WHEN num_patients IS NULL THEN 1 
        ELSE 0 
    END DESC, 
    num_patients DESC
LIMIT 3;
"""

# Run the query on the SQLite connection opened in the setup cell and display it.
sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,diag_name,num_patients,max_itch_score
0,Other psoriatic arthropathy,1,70
1,Psoriasis vulgaris,3,40
2,Other psoriasis,2,40


PyDough:

In [124]:
%%pydough

selected_lines = Treatments(diag_name=diagnosis.diag_name).WHERE(HAS(outcome_records))

output = PARTITION(selected_lines, name="partit", by=diag_name)(
    diag_name=diag_name,
    num_patients=NDISTINCT(partit.patient_id),
    max_itch_score=MAX(partit.outcome_records.day100_itch_vas)
).TOP_K(3, (max_itch_score.DESC(), num_patients.DESC()))

pydough_output = pydough.to_df(output)
pydough_output

Unnamed: 0,diag_name,num_patients,max_itch_score
0,Other psoriatic arthropathy,1,70
1,Psoriasis vulgaris,3,40
2,Other psoriasis,2,40


Now we are going to compare the results in SQL and PyDough:

In [125]:
# Check the PyDough frame against the SQL reference frame for this question.
dfcompare.compare_df(
    pydough_output,
    sql_output,
    query_category="basic_join_group_order_limit",
    question="What are the top 3 diagnoses by maximum itch VAS score at day 100 and number of distinct patients? Return the diagnosis name, number of patients, and maximum itch score.",
)

True

## Query 24

What are the top 3 doctor specialties by total drug amount prescribed for treatments started in the past 6 calendar months? Return the specialty, number of treatments, and total drug amount.

SQL:

In [129]:
# Reference SQL: per doctor specialty, count the treatments started in the last
# six months and sum their total drug amount; keep the top 3 by total amount.
#
# "now" is pinned to 2025-02-07 (the day the notebook was written) instead of
# DATE('now', ...). The cutoff therefore always evaluates to 2024-08-07, the same
# literal the PyDough version of this query uses, so the comparison stays
# reproducible on any later run date.
query = """
SELECT 
    d.specialty, 
    COUNT(*) AS num_treatments, 
    SUM(t.tot_drug_amt) AS total_drug_amt 
FROM 
    treatments AS t 
JOIN 
    doctors AS d 
ON 
    t.doc_id = d.doc_id 
WHERE 
    t.start_dt >= DATE('2025-02-07', '-6 months') 
GROUP BY 
    d.specialty 
ORDER BY 
    total_drug_amt DESC 
LIMIT 3;

"""

# Run the query on the SQLite connection opened in the setup cell and display it.
sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,specialty,num_treatments,total_drug_amt
0,immunology,3,580.0
1,dermatology,3,450.0
2,general,1,300.0


PyDough has no equivalent of SQL's DATE('now', '-6 months'). We therefore treat 2025-02-07 (the day this notebook was written) as "now", which puts the cutoff six months earlier at "2024-08-07".

PyDough:

In [128]:
%%pydough
# In PyDough we do not have a method DATE('now', '-6 months') to return the date. 
# Then we take 2025-02-07 (current day when it was made) as "now" to calculate the date if we go 6 months in the past. 
# This would be: "2024-08-07"

selected_treatments = Treatments(speciality=doctor.speciality).WHERE(start_dt >= "2024-08-07")

output = PARTITION(selected_treatments, name="part", by=speciality)(
    speciality=speciality,
    num_treatments= COUNT(part),
    total_drug_amt=SUM(part.tot_drug_amt)
).TOP_K(3, by=total_drug_amt.DESC())

pydough_output = pydough.to_df(output)
pydough_output

Unnamed: 0,speciality,num_treatments,total_drug_amt
0,immunology,3,580.0
1,dermatology,3,450.0
2,general,1,300.0


Now we are going to compare the results in SQL and PyDough:

In [130]:
# Check the PyDough frame against the SQL reference frame for this question.
dfcompare.compare_df(
    pydough_output,
    sql_output,
    query_category="basic_join_date_group_order_limit",
    question="What are the top 3 doctor specialties by total drug amount prescribed for treatments started in the past 6 calendar months? Return the specialty, number of treatments, and total drug amount.",
)

True

## Query 25

What are the top 3 insurance types by average patient height in cm? Return the insurance type, average height and average weight.

SQL:

In [131]:
# Reference SQL: average height and weight of patients per insurance type,
# ranked by average height (descending) and limited to 3 rows.
# The CASE ... IS NULL key is sorted DESC, so a NULL average would rank first.
query = """
SELECT 
    ins_type, 
    AVG(height_cm) AS avg_height, 
    AVG(weight_kg) AS avg_weight 
FROM 
    patients 
GROUP BY 
    ins_type 
ORDER BY 
    CASE 
        WHEN AVG(height_cm) IS NULL THEN 1 
        ELSE 0 
    END DESC, 
    avg_height DESC 
LIMIT 3;
"""

# Run the query on the SQLite connection opened in the setup cell and display it.
sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,ins_type,avg_height,avg_weight
0,medicaid,176.5,80.0
1,medicare,171.666667,71.0
2,private,169.6,67.8


PyDough:

In [133]:
%%pydough
output = PARTITION(Patients, name="par", by=ins_type)(
    ins_type,
    avg_height=AVG(par.height_cm),
    avg_weight=AVG(par.weight_kg)
).TOP_K(3, by=ins_type.ASC())

pydough_output = pydough.to_df(output)
pydough_output

Unnamed: 0,ins_type,avg_height,avg_weight
0,medicaid,176.5,80.0
1,medicare,171.666667,71.0
2,private,169.6,67.8


Now we are going to compare the results in SQL and PyDough:

In [77]:
# Check the PyDough frame against the SQL reference frame for this question.
# NOTE(review): the question string ends with a stray ")" — confirm whether it
# can be removed without affecting how dfcompare records the result.
dfcompare.compare_df(
    pydough_output,
    sql_output,
    query_category="basic_group_order_limit",
    question="What are the top 3 insurance types by average patient height in cm? Return the insurance type, average height and average weight.)",
)

True

## Query 26

What are the top 5 drugs by number of treatments and average drug amount per treatment? Return the drug name, number of treatments, and average drug amount.

SQL:

In [78]:
# Reference SQL: inner-join treatments to drugs, then per drug name count the
# treatments and average tot_drug_amt. Rows are ranked by treatment count, then
# by average amount (both descending) and limited to 5.
# Each CASE ... IS NULL key is sorted DESC, so a NULL aggregate would rank first.
query = """
SELECT 
    d.drug_name, 
    COUNT(*) AS num_treatments, 
    AVG(t.tot_drug_amt) AS avg_drug_amt
FROM 
    treatments AS t
JOIN 
    drugs AS d 
    ON t.drug_id = d.drug_id
GROUP BY 
    d.drug_name
ORDER BY 
    CASE 
        WHEN num_treatments IS NULL THEN 1 
        ELSE 0 
    END DESC, 
    num_treatments DESC, 
    CASE 
        WHEN avg_drug_amt IS NULL THEN 1 
        ELSE 0 
    END DESC, 
    avg_drug_amt DESC
LIMIT 5;
"""

# Run the query on the SQLite connection opened in the setup cell and display it.
sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,drug_name,num_treatments,avg_drug_amt
0,Drugalin,6,206.666667
1,Medicol,6,178.333333
2,Topizol,4,307.5
3,Biologic-X,4,150.0
4,Topicort,3,580.0


PyDough:

In [134]:
%%pydough

selected_lines = Treatments(drug_name=drug.drug_name).WHERE(HAS(drug))

output = PARTITION(selected_lines, name="part", by=drug_name)(
    drug_name=drug_name,
    num_treatments=COUNT(part.treatment_id),
    tot_drug_amt=AVG(part.tot_drug_amt)
).TOP_K(5, (num_treatments.DESC(), tot_drug_amt.DESC()))

pydough_output = pydough.to_df(output)
pydough_output

Unnamed: 0,drug_name,num_treatments,tot_drug_amt
0,Drugalin,6,206.666667
1,Medicol,6,178.333333
2,Topizol,4,307.5
3,Biologic-X,4,150.0
4,Topicort,3,580.0


Now we are going to compare the results in SQL and PyDough:

In [84]:
# Check the PyDough frame against the SQL reference frame for this question.
dfcompare.compare_df(
    pydough_output,
    sql_output,
    query_category="basic_join_group_order_limit",
    question="What are the top 5 drugs by number of treatments and average drug amount per treatment? Return the drug name, number of treatments, and average drug amount.",
)

True

## Query 27

How many treatments for diagnoses containing 'psoriasis' (match with wildcards case-insensitively) involve drugs that have been FDA-approved and the treatments have ended within the last 6 months from today?

SQL:

In [48]:
# Reference SQL: count treatments whose diagnosis name contains 'psoriasis',
# whose drug has an FDA approval date, and that ended within the last six months.
#
# "now" is pinned to 2025-02-07 (the day the notebook was written) instead of
# DATE('now', ...). The cutoff therefore always evaluates to 2024-08-07, the same
# literal the PyDough version of this query uses, so the comparison stays
# reproducible on any later run date.
query = """
SELECT 
    COUNT(*)
FROM 
    treatments AS t
JOIN 
    diagnoses AS d 
    ON t.diag_id = d.diag_id
JOIN 
    drugs AS dr 
    ON t.drug_id = dr.drug_id
WHERE 
    d.diag_name LIKE '%psoriasis%'
    AND dr.fda_appr_dt IS NOT NULL
    AND t.end_dt >= DATE('2025-02-07', '-6 months');
"""

# Run the query on the SQLite connection opened in the setup cell and display it.
sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,COUNT(*)
0,2


PyDough has no equivalent of SQL's DATE('now', '-6 months'). We therefore treat 2025-02-07 (the day this notebook was written) as "now", which puts the cutoff six months earlier at "2024-08-07".

PyDough:

In [51]:
%%pydough
# In PyDough we do not have a method DATE('now', '-6 months') to return the date. 
# Then we take 2025-02-07 (current day when it was made) as "now" to calculate the date if we go 6 months in the past. 
# This would be: "2024-08-07"

treatments = Treatments.WHERE(
    (end_dt >= "2024-08-07") 
    & LIKE(diagnosis.diag_name,'%psoriasis%')
    & PRESENT(drug.fda_appr_dt)
)

output = DermTreatment(COUNT=COUNT(treatments))

pydough_output = pydough.to_df(output)
pydough_output

Unnamed: 0,COUNT
0,2


Now we are going to compare the results in SQL and PyDough:

In [52]:
# Check the PyDough frame against the SQL reference frame for this question.
dfcompare.compare_df(
    pydough_output,
    sql_output,
    query_category="date_functions",
    question="How many treatments for diagnoses containing 'psoriasis' (match with wildcards case-insensitively) involve drugs that have been FDA-approved and the treatments have ended within the last 6 months from today?",
)

True

## Query 28

List the last name, year of registration, and first treatment (date and id) by doctors who were registered 2 years ago.

SQL:

In [63]:
# Reference SQL: the CTE joins doctors to their treatments, keeps doctors whose
# year_reg equals the calendar year two years before the current date, and
# numbers each doctor's treatments by start_dt ascending. The outer query keeps
# rn = 1, i.e. each doctor's earliest treatment.
# The filter uses 'now', so the result depends on the date the cell is run.
query = """
WITH doc_first_treatment AS (
    SELECT 
        d.doc_id, 
        d.last_name, 
        d.year_reg, 
        t.treatment_id, 
        t.start_dt, 
        ROW_NUMBER() OVER (
            PARTITION BY d.doc_id 
            ORDER BY t.start_dt ASC
        ) AS rn
    FROM 
        doctors AS d
    JOIN 
        treatments AS t 
        ON d.doc_id = t.doc_id
    WHERE 
        d.year_reg = strftime('%Y', 'now', '-2 years')
)
SELECT 
    last_name, 
    year_reg, 
    start_dt AS first_treatment_date, 
    treatment_id AS first_treatment_id
FROM 
    doc_first_treatment
WHERE 
    rn = 1;
"""

# NOTE(review): the result is stored in `df`, whereas the other SQL cells use
# `sql_output` — confirm whether this difference is intentional.
df = pd.read_sql_query(query, connection)

df

Unnamed: 0,last_name,year_reg,first_treatment_date,first_treatment_id
0,Doe,2023,2022-01-01,1
1,Smith,2023,2022-02-15,2


PyDough does not support date manipulation, so the year "two years ago" cannot be computed dynamically. For this notebook (written in 2025), two years ago corresponds to 2023.

PyDough:

## Query 29

Return the treatment id, treatment start date, adverse event date and description of all adverse events that occurred within 10 days after starting treatment.

SQL:

In [90]:
# Reference SQL: join adverse events to their treatment and keep the events
# whose reported_dt lies between the treatment's start_dt and start_dt + 10 days
# (BETWEEN is inclusive on both ends).
query = """
SELECT 
    t.treatment_id, 
    t.start_dt, 
    ae.reported_dt, 
    ae.description
FROM 
    adverse_events AS ae
JOIN 
    treatments AS t 
    ON ae.treatment_id = t.treatment_id
WHERE 
    ae.reported_dt BETWEEN t.start_dt AND DATE(t.start_dt, '+10 days');
"""

# Run the query on the SQLite connection opened in the setup cell and display it.
sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,treatment_id,start_dt,reported_dt,description
0,4,2022-04-01,2022-04-10,"Severe allergic reaction, hospitalization requ..."
1,14,2023-02-01,2023-02-05,Mild skin rash


PyDough does not support date manipulation. In this case we cannot work around it with a hard-coded date either, because the offset would have to be added to the `start_dt` of every row.

## Query 30

Show the treatment id, start date and end date of all placebo treatments where concomitant_meds were started within 2 weeks of starting the treatment. Also return the start and end dates of all concomitant drug usage.

SQL:

In [91]:
# Reference SQL: join placebo treatments (is_placebo = 1) to their concomitant
# medications and keep the medications whose start_dt lies between the
# treatment's start_dt and start_dt + 14 days (BETWEEN is inclusive on both
# ends). Rows are ordered by treatment_id.
query = """
SELECT 
    t.treatment_id, 
    t.start_dt AS treatment_start_date, 
    t.end_dt AS treatment_end_date, 
    cm.start_dt AS concomitant_med_start_date, 
    cm.end_dt AS concomitant_med_end_date
FROM 
    treatments AS t
JOIN 
    concomitant_meds AS cm 
    ON t.treatment_id = cm.treatment_id
WHERE 
    t.is_placebo = 1
    AND cm.start_dt BETWEEN t.start_dt AND DATE(t.start_dt, '+14 days')
ORDER BY 
    t.treatment_id;
"""

# Run the query on the SQLite connection opened in the setup cell and display it.
sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,treatment_id,treatment_start_date,treatment_end_date,concomitant_med_start_date,concomitant_med_end_date
0,2,2022-02-15,2022-08-14,2022-02-15,2022-03-15
1,7,2022-07-01,2022-12-31,2022-07-15,2022-07-21


PyDough does not support date manipulation. In this case we cannot work around it with a hard-coded date either, because the offset would have to be added to the `start_dt` of every row.

## Query 31

What is the current average age (in integer years) of all registered male patients with private insurance?

SQL:

In [92]:
# Reference SQL: average age of male patients with private insurance, where age
# is approximated as (current year - birth year).
#
# The current year is taken from the pinned date 2025-02-07 (the day the
# notebook was written) instead of 'now'. It therefore always evaluates to 2025,
# the same constant the PyDough version of this query uses, so the comparison
# stays reproducible in later years.
query = """
SELECT 
    AVG(strftime('%Y', '2025-02-07') - strftime('%Y', date_of_birth)) AS avg_age
FROM 
    patients
WHERE 
    gender = 'Male' 
    AND ins_type = 'private';
"""

# Run the query on the SQLite connection opened in the setup cell and display it.
sql_output = pd.read_sql_query(query, connection)
sql_output

Unnamed: 0,avg_age
0,41.0


PyDough:

In [135]:
%%pydough

# We can not manipulate dates in PyDough. Then we are going to use "2025" like current year to calculate patients age, just by year. 
selected_patients = Patients(age=(2025-YEAR(date_of_birth))).WHERE(
    LIKE(gender, "Male")
    & LIKE(ins_type, "private")
)

output = DermTreatment(avg_age=AVG(selected_patients.age))

pydough_output = pydough.to_df(output)
pydough_output

Unnamed: 0,avg_age
0,41.0


In [95]:
# Check the PyDough frame against the SQL reference frame for this question.
dfcompare.compare_df(
    pydough_output,
    sql_output,
    query_category="date_functions",
    question="what is average age (in integer years) of all registered male patients with private insurance currently?",
)

True