# Run Synthea

This notebook loads the preprocessed data that needs to be present in the duckdb database before we run 'dbt run', and then it runs the dbt commands needed to build the models in the database.

In [49]:
from pathlib import Path

import polars as pl
import duckdb

# Delete database if it exists since we're reloading anyway

In [50]:
# Remove any database file left over from a previous run so the load below
# starts from a clean slate. `missing_ok=True` makes this a no-op when the
# file does not exist yet (e.g. on a fresh checkout), and using pathlib
# avoids depending on a Unix `rm` being available.
Path('synthea.duckdb').unlink(missing_ok=True)

In [51]:
# Open a connection to the on-disk database file `synthea.duckdb`. The file is
# removed in the previous cell, so this starts from an empty database.
con = duckdb.connect('synthea.duckdb')

# Load duckdb with preprocessed seeds from parquet files

In [52]:
# Disabled alternative loader (kept for reference, not executed): it would read
# each *.parquet file under ./seeds/ into a table named raw_<file stem>.
# The CSV load from ./covid/ in the next code cell is what actually runs.

# seed_path = Path('./seeds/')

# for parquet_file in seed_path.glob('*.parquet'):
#     con.sql(
#         f"""
#         CREATE TABLE IF NOT EXISTS raw_{parquet_file.stem} AS 
#         SELECT * FROM read_parquet('{parquet_file}');
#         """
#     )

# Load duckdb with preprocessed seeds from CSV files with the full 1 million COVID-19 patients

In [54]:
# Load every CSV file under ./covid/ into its own table named raw_<file stem>.
seed_path = Path('./covid/')

# Sort so tables are created in a stable order regardless of how the
# filesystem happens to list the directory.
csv_files = sorted(seed_path.glob('*.csv'))
if not csv_files:
    # Fail here instead of silently leaving the database empty.
    raise FileNotFoundError(f"No CSV files found in {seed_path.resolve()}")

for csv_file in csv_files:
    # The table name and file path are interpolated into the SQL text, so
    # escape them: double any embedded quote characters, then wrap the
    # identifier in "..." and the path in '...'. This keeps the statement
    # valid for file names containing hyphens, spaces or quotes.
    table_name = f"raw_{csv_file.stem}".replace('"', '""')
    csv_literal = str(csv_file).replace("'", "''")
    con.sql(
        f"""
        CREATE TABLE IF NOT EXISTS "{table_name}" AS
        SELECT * FROM read_csv_auto('{csv_literal}');
        """
    )

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [55]:
# Sanity check: list the tables created by the load above.
con.sql('SHOW TABLES')

┌───────────────────────┐
│         name          │
│        varchar        │
├───────────────────────┤
│ raw_allergies         │
│ raw_careplans         │
│ raw_conditions        │
│ raw_devices           │
│ raw_encounters        │
│ raw_imaging_studies   │
│ raw_immunizations     │
│ raw_medications       │
│ raw_observations      │
│ raw_organizations     │
│ raw_patients          │
│ raw_payer_transitions │
│ raw_payers            │
│ raw_procedures        │
│ raw_providers         │
│ raw_supplies          │
├───────────────────────┤
│        16 rows        │
└───────────────────────┘

## Close connection

In [56]:
# Release the database file before the dbt commands below run in a separate
# bash process. NOTE(review): presumably required because dbt opens the same
# synthea database for writing — confirm against profiles.yml.
con.close()

# Run dbt commands

In [57]:
%%bash
# Check that dbt can find its project/profile files and connect to the database.
dbt debug

[0m21:06:51  Running with dbt=1.6.0
[0m21:06:51  dbt version: 1.6.0
[0m21:06:51  python version: 3.10.12
[0m21:06:51  python path: /Users/zacklarsen/mambaforge/envs/dbt_duckdb/bin/python
[0m21:06:51  os info: macOS-13.4-arm64-arm-64bit
[0m21:06:51  Using profiles dir at /Users/zacklarsen/Documents/Documents - Zack’s Mac mini/Projects/dbt-synthea
[0m21:06:51  Using profiles.yml file at /Users/zacklarsen/Documents/Documents - Zack’s Mac mini/Projects/dbt-synthea/profiles.yml
[0m21:06:51  Using dbt_project.yml file at /Users/zacklarsen/Documents/Documents - Zack’s Mac mini/Projects/dbt-synthea/dbt_project.yml
[0m21:06:51  adapter type: duckdb
[0m21:06:51  adapter version: 1.6.0
[0m21:06:51  Configuration:
[0m21:06:51    profiles.yml file [[32mOK found and valid[0m]
[0m21:06:51    dbt_project.yml file [[32mOK found and valid[0m]
[0m21:06:51  Required dependencies:
[0m21:06:51   - git [[32mOK found[0m]

[0m21:06:51  Connection:
[0m21:06:51    database: synthea
[0m21:

In [58]:
%%bash
# Build the dbt models in the database that was populated earlier in this notebook.
dbt run

[0m21:06:56  Running with dbt=1.6.0
[0m21:06:56  Registered adapter: duckdb=1.6.0
[0m21:06:56  Found 4 models, 16 seeds, 19 tests, 0 sources, 0 exposures, 0 metrics, 348 macros, 0 groups, 0 semantic models
[0m21:06:56  
[0m21:06:56  Concurrency: 1 threads (target='dev')
[0m21:06:56  
[0m21:06:56  1 of 4 START sql view model main.stg_encounters ................................ [RUN]
[0m21:06:56  1 of 4 OK created sql view model main.stg_encounters ........................... [[32mOK[0m in 0.06s]
[0m21:06:56  2 of 4 START sql view model main.stg_medications ............................... [RUN]
[0m21:06:56  2 of 4 OK created sql view model main.stg_medications .......................... [[32mOK[0m in 0.03s]
[0m21:06:56  3 of 4 START sql view model main.stg_patients .................................. [RUN]
[0m21:06:56  3 of 4 OK created sql view model main.stg_patients ............................. [[32mOK[0m in 0.03s]
[0m21:06:56  4 of 4 START sql table model main.pati

In [59]:
# Reopen the database now that the dbt commands have finished, to inspect what they built.
con = duckdb.connect('synthea.duckdb')

In [60]:
# List tables again: the objects built by `dbt run` should now appear
# alongside the raw_* tables loaded earlier.
con.sql('SHOW TABLES')

┌───────────────────────┐
│         name          │
│        varchar        │
├───────────────────────┤
│ patients              │
│ raw_allergies         │
│ raw_careplans         │
│ raw_conditions        │
│ raw_devices           │
│ raw_encounters        │
│ raw_imaging_studies   │
│ raw_immunizations     │
│ raw_medications       │
│ raw_observations      │
│ raw_organizations     │
│ raw_patients          │
│ raw_payer_transitions │
│ raw_payers            │
│ raw_procedures        │
│ raw_providers         │
│ raw_supplies          │
│ stg_encounters        │
│ stg_medications       │
│ stg_patients          │
├───────────────────────┤
│        20 rows        │
└───────────────────────┘

In [61]:
# Row count of the `patients` table.
# NOTE(review): the recorded output shows 3,188,933 rows, which is more than
# the ~1 million patients mentioned in the load section's heading — confirm
# whether `patients` is meant to hold one row per patient.
con.sql('select count(*) from patients')

┌──────────────┐
│ count_star() │
│    int64     │
├──────────────┤
│      3188933 │
└──────────────┘

In [62]:
# Preview a few rows of `patients`. There is no ORDER BY, so which 10 rows
# come back is not guaranteed to be the same from run to run.
con.sql('select * from patients limit 10')

┌──────────────────────┬────────────┬───┬──────────────────────┬──────────────────────┬──────────────────────┐
│       patient        │ first_name │ … │ medication_payer_c…  │ base_medication_cost │ total_medication_c…  │
│       varchar        │  varchar   │   │        double        │        double        │        double        │
├──────────────────────┼────────────┼───┼──────────────────────┼──────────────────────┼──────────────────────┤
│ 4336e18c-8c0b-48d5…  │ Erwin847   │ … │               223.49 │               263.49 │              3161.88 │
│ 56f31489-d62a-4f97…  │ Carmine137 │ … │                  0.0 │                19.33 │              2764.19 │
│ f830c73b-fd2b-451a…  │ Lorena247  │ … │                  0.0 │               263.49 │              3161.88 │
│ 27d7982f-ae30-43e2…  │ Felisha640 │ … │                  0.0 │               263.49 │              2898.39 │
│ be0cf070-4814-44af…  │ Antonio44  │ … │                  0.0 │               263.49 │              3161.88 │
│

In [63]:
# Done inspecting: close the connection so the database file is released.
con.close()