In [96]:
import polars as pl
import polars.selectors as cs
import duckdb

In [97]:
con = duckdb.connect('synthea.duckdb')

In [98]:
con.sql('SHOW TABLES')

┌───────────────────────┐
│         name          │
│        varchar        │
├───────────────────────┤
│ patients              │
│ raw_allergies         │
│ raw_careplans         │
│ raw_conditions        │
│ raw_devices           │
│ raw_encounters        │
│ raw_imaging_studies   │
│ raw_immunizations     │
│ raw_medications       │
│ raw_observations      │
│ raw_organizations     │
│ raw_patients          │
│ raw_payer_transitions │
│ raw_payers            │
│ raw_procedures        │
│ raw_providers         │
│ raw_supplies          │
│ stg_encounters        │
│ stg_medications       │
│ stg_patients          │
├───────────────────────┤
│        20 rows        │
└───────────────────────┘

In [99]:
for table in con.sql('SHOW TABLES').pl().select(pl.col("name")).to_series().to_list():
    print(table)
    print(con.sql(f'SELECT COUNT(*) FROM {table}'))

patients
┌──────────────┐
│ count_star() │
│    int64     │
├──────────────┤
│      3188933 │
└──────────────┘

raw_allergies
┌──────────────┐
│ count_star() │
│    int64     │
├──────────────┤
│        51592 │
└──────────────┘

raw_careplans
┌──────────────┐
│ count_star() │
│    int64     │
├──────────────┤
│       377726 │
└──────────────┘

raw_conditions
┌──────────────┐
│ count_star() │
│    int64     │
├──────────────┤
│      1143900 │
└──────────────┘

raw_devices
┌──────────────┐
│ count_star() │
│    int64     │
├──────────────┤
│        23694 │
└──────────────┘

raw_encounters
┌──────────────┐
│ count_star() │
│    int64     │
├──────────────┤
│      3188675 │
└──────────────┘

raw_imaging_studies
┌──────────────┐
│ count_star() │
│    int64     │
├──────────────┤
│        45609 │
└──────────────┘

raw_immunizations
┌──────────────┐
│ count_star() │
│    int64     │
├──────────────┤
│       168160 │
└──────────────┘

raw_medications
┌──────────────┐
│ count_star() │
│    int6

In [100]:
con.sql('SELECT COUNT(*), COUNT(DISTINCT patient) FROM patients')

┌──────────────┬─────────────────────────┐
│ count_star() │ count(DISTINCT patient) │
│    int64     │          int64          │
├──────────────┼─────────────────────────┤
│      3188933 │                  124150 │
└──────────────┴─────────────────────────┘

In [101]:
con.sql('SELECT * FROM patients ORDER BY patient LIMIT 25')

┌──────────────────────┬──────────────┬───┬──────────────────────┬──────────────────────┬──────────────────────┐
│       patient        │  first_name  │ … │ medication_payer_c…  │ base_medication_cost │ total_medication_c…  │
│       varchar        │   varchar    │   │        double        │        double        │        double        │
├──────────────────────┼──────────────┼───┼──────────────────────┼──────────────────────┼──────────────────────┤
│ 0001049f-9248-47fe…  │ Katharina121 │ … │                  0.0 │               263.49 │              3161.88 │
│ 0001049f-9248-47fe…  │ Katharina121 │ … │                  0.0 │               263.49 │              3161.88 │
│ 0001049f-9248-47fe…  │ Katharina121 │ … │                  0.0 │               263.49 │              3161.88 │
│ 0001049f-9248-47fe…  │ Katharina121 │ … │                 NULL │                 NULL │                 NULL │
│ 0001049f-9248-47fe…  │ Katharina121 │ … │                 NULL │                 NULL │       

In [102]:
con.sql('SELECT COUNT(*), COUNT(DISTINCT patient) FROM stg_patients')

┌──────────────┬─────────────────────────┐
│ count_star() │ count(DISTINCT patient) │
│    int64     │          int64          │
├──────────────┼─────────────────────────┤
│       124150 │                  124150 │
└──────────────┴─────────────────────────┘

In [103]:
con.sql('SELECT COUNT(*), COUNT(DISTINCT patient), COUNT(DISTINCT encounter) FROM stg_encounters')

┌──────────────┬─────────────────────────┬───────────────────────────┐
│ count_star() │ count(DISTINCT patient) │ count(DISTINCT encounter) │
│    int64     │          int64          │           int64           │
├──────────────┼─────────────────────────┼───────────────────────────┤
│      3188675 │                  123892 │                   3188675 │
└──────────────┴─────────────────────────┴───────────────────────────┘

In [104]:
con.sql('SELECT COUNT(*), COUNT(DISTINCT patient) FROM stg_medications')

┌──────────────┬─────────────────────────┐
│ count_star() │ count(DISTINCT patient) │
│    int64     │          int64          │
├──────────────┼─────────────────────────┤
│      2485952 │                   95529 │
└──────────────┴─────────────────────────┘

In [105]:
con.sql("""
    select count(*)
    
    from stg_patients
    
    left join stg_encounters
        on stg_patients.patient = stg_encounters.patient
""")

┌──────────────┐
│ count_star() │
│    int64     │
├──────────────┤
│      3188933 │
└──────────────┘

In [106]:
con.sql("""
    select count(*),
        count(distinct stg_patients.patient),
        count(distinct stg_medications.patient)

    from stg_patients

    left join stg_medications
        on stg_patients.patient = stg_medications.patient
""")

┌──────────────┬──────────────────────────────────────┬─────────────────────────────────────────┐
│ count_star() │ count(DISTINCT stg_patients.patient) │ count(DISTINCT stg_medications.patient) │
│    int64     │                int64                 │                  int64                  │
├──────────────┼──────────────────────────────────────┼─────────────────────────────────────────┤
│      2514573 │                               124150 │                                   95529 │
└──────────────┴──────────────────────────────────────┴─────────────────────────────────────────┘

In [107]:
patients = con.sql('SELECT * FROM patients USING SAMPLE 10000').pl()

In [108]:
patients.glimpse()

Rows: 10000
Columns: 29
$ patient                              <str> baffa93f-ae0e-4c33-a9bc-ef0426398436, 2afd2a13-86fa-4c55-bb02-df9572028442, e3c3cfd9-b7bd-4484-8bf7-5a43d20e105e, 8c15fce6-ca2c-45ca-9860-7a7979ef31e0, c5af8ef6-0518-4e1e-b809-8e8b65f83b55, baf8fdef-3fae-4f1f-8788-d1d54d14ab8e, 3d539b5a-fb2c-49d5-87d2-0d4b91dce9e2, 742ade60-4445-4403-9076-226f0907b80b, c3c743ed-086e-457a-b7d6-193607d2b285, d4277c21-b440-49ab-9062-a9de93266ad2
$ first_name                           <str> Collene784, Louis204, Drew592, Herbert830, Carylon722, Alayna598, Wilbur107, Jerome176, Shannan727, Larissa293
$ last_name                            <str> Spencer878, Kilback373, Donnelly343, Willms744, Christiansen251, Spencer878, Kulas532, Wolff180, Schoen8, Stamm704
$ birth_date                          <date> 1941-01-05, 1948-03-12, 1956-12-06, 1921-06-27, 1976-11-17, 1949-04-09, 1934-08-01, 1959-04-11, 1927-10-06, 1949-02-14
$ death_date                          <date> 2018-01-16, 2018-02-27, Non

In [109]:
(
    patients
    .select(cs.numeric())
    .describe()
)

describe,age,healthcare_expenses,healthcare_coverage,encounter_diag_code,BASE_ENCOUNTER_COST,total_encounter_cost,encounter_payer_coverage,medication_code,medication_diag_code,DISPENSES,medication_payer_coverage,base_medication_cost,total_medication_cost
str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""count""",10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
"""null_count""",0.0,0.0,0.0,7758.0,0.0,0.0,0.0,2164.0,3030.0,2164.0,2164.0,2164.0,2164.0
"""mean""",65.2183,1130300.0,60264.701766,1305000000000.0,128.818978,128.818978,63.536823,642547.143058,260370000000.0,14.867917,102.544158,287.297241,3486.760655
"""std""",21.828772,576831.060505,156654.140153,11008000000000.0,4.184025,4.184025,40.585773,466630.308085,4868600000000.0,52.649607,410.486387,476.794279,22117.901718
"""min""",0.0,2258.32,0.0,6072007.0,77.49,77.49,0.0,106258.0,10509002.0,1.0,0.0,2.64,2.68
"""25%""",53.0,680085.92,7124.74,55822004.0,129.16,129.16,49.16,310798.0,55822004.0,2.0,0.0,258.15,395.64
"""50%""",67.0,1354000.0,14116.53,72892002.0,129.16,129.16,69.16,429503.0,59621000.0,12.0,0.0,263.49,3161.88
"""75%""",80.0,1546300.0,31979.57,185086009.0,129.16,129.16,89.16,895994.0,59621000.0,12.0,188.49,263.49,3161.88
"""max""",114.0,3716300.0,2146400.0,124170000000000.0,129.16,129.16,129.16,2123111.0,132280000000000.0,985.0,6896.28,6971.28,1378300.0


In [110]:
(
    patients
    .select(pl.col("age"))
    .describe()
)

describe,age
str,f64
"""count""",10000.0
"""null_count""",0.0
"""mean""",65.2183
"""std""",21.828772
"""min""",0.0
"""25%""",53.0
"""50%""",67.0
"""75%""",80.0
"""max""",114.0


In [113]:
con.sql('select patient, round(sum(base_medication_cost)) from patients group by patient order by sum(base_medication_cost) desc limit 10')

┌──────────────────────────────────────┬──────────────────────────────────┐
│               patient                │ round(sum(base_medication_cost)) │
│               varchar                │              double              │
├──────────────────────────────────────┼──────────────────────────────────┤
│ 11a5a0db-d943-4b7b-a68d-fadba6465793 │                         250914.0 │
│ c4f5e145-7b24-4003-bcf9-9f52386fb824 │                         238204.0 │
│ f5f2d860-3b86-4ffa-ab5a-8847440afb4b │                         237154.0 │
│ 80295f40-85cc-4558-b93c-df371d647b91 │                         234002.0 │
│ b9731b1c-a616-41e2-a023-5e2fa4deeefd │                         232988.0 │
│ 4627ba78-4b9c-46ae-a06b-b4e8bdb37b39 │                         231534.0 │
│ 552269a5-6496-4e79-be06-a26a697a1a94 │                         222790.0 │
│ c22fcc5b-57fb-4db0-a723-db4e186c5f7a │                         215118.0 │
│ bdc4268b-ebfe-48ef-89df-d4c4261eca9b │                         211236.0 │
│ 5ea73737-5

# Close connection

In [114]:
con.close()