# Demo CMS + DuckDB

In [1]:
import pandas as pd 
import numpy as np
import duckdb

## Start connection to DuckDB

In [2]:
# Open an in-memory DuckDB database (':memory:' is the default target of
# duckdb.connect()); tables created on it live only for this session.
conn = duckdb.connect(database=':memory:')

## Design DB + parquet storage

<img src="diagram.png" width="500">

In [3]:
# Declare the three target tables: beneficiaries (mbsf), yearly enrollments
# (enroll) and hospital admissions (hosp).
# IF NOT EXISTS keeps this cell re-runnable on a live kernel: a plain
# CREATE TABLE raises as soon as the table already exists on the connection.
conn.sql("""
CREATE TABLE IF NOT EXISTS mbsf(
    DESYNPUF_ID VARCHAR(20) PRIMARY KEY,
    BENE_BIRTH_DT DATE,
    BENE_SEX_IDENT_CD INT,
    BENE_RACE_CD INT
)
""")

# One row per beneficiary and year, hence the composite primary key.
conn.sql("""
CREATE TABLE IF NOT EXISTS enroll(
    DESYNPUF_ID VARCHAR(20),
    YEAR INT,
    SP_STATE_CODE INT,
    BENE_HMO_CVRAGE_TOT_MONS INT,
    PRIMARY KEY (DESYNPUF_ID, YEAR)
)
""")

conn.sql("""
CREATE TABLE IF NOT EXISTS hosp(
    CLM_ID VARCHAR(20) PRIMARY KEY,
    DESYNPUF_ID VARCHAR(20),
    CLM_ADMSN_DT DATE
)
""")

## Queries

1. Prepare beneficiaries table

In [4]:
# Build the beneficiaries frame from every CSV under ../datapond/csv/mbsf/.
# Birth dates are parsed with the '%Y%m%d' format; when several rows share the
# same ID and sex/race codes, the earliest birth date is kept.
# The death date is intentionally not parsed here: the result never used it.
mbsf = conn.execute("""
WITH bene AS (
    SELECT
        DESYNPUF_ID,
        strptime(BENE_BIRTH_DT, '%Y%m%d') AS BENE_BIRTH_DT,
        BENE_SEX_IDENT_CD,
        BENE_RACE_CD
    FROM '../datapond/csv/mbsf/*.csv')
SELECT
    DESYNPUF_ID,
    MIN(BENE_BIRTH_DT) AS BENE_BIRTH_DT,
    BENE_SEX_IDENT_CD,
    BENE_RACE_CD
FROM
    bene
GROUP BY
    DESYNPUF_ID, BENE_SEX_IDENT_CD, BENE_RACE_CD
""").fetch_df()

# The GROUP BY also contains the sex and race codes, so an ID whose codes
# differ between input rows would come out more than once. Fail loudly here
# rather than persist rows that break the DESYNPUF_ID primary key declared
# for the mbsf table.
assert mbsf["DESYNPUF_ID"].is_unique, (
    "mbsf has duplicate DESYNPUF_ID values: sex/race codes differ between input rows"
)

2. Prepare enrollments table

In [5]:
# Build the enrollments frame: one SELECT per yearly beneficiary summary file
# set, each tagged with its year as a literal column, stacked with UNION ALL.
# The per-year SELECT is generated from ENROLL_YEARS instead of being pasted
# three times, so adding a year is a one-token change. The years are integer
# constants defined right here, so formatting them into the SQL text is safe.
ENROLL_YEARS = (2008, 2009, 2010)

enroll_sql = "\nUNION ALL\n".join(
    f"SELECT DESYNPUF_ID, {year} AS year, SP_STATE_CODE, BENE_HMO_CVRAGE_TOT_MONS "
    f"FROM '../datapond/csv/mbsf/DE1_0_{year}_Beneficiary_Summary_File_*.csv'"
    for year in ENROLL_YEARS
)

enroll = conn.execute(enroll_sql).fetch_df()

3. Prepare hospitalizations table

In [15]:
# Build the hospitalizations frame from the inpatient claims CSVs: one row per
# claim ID, carrying a beneficiary ID and the earliest admission date parsed
# with the '%Y%m%d' format.
# No DISTINCT is needed in the CTE: the outer GROUP BY already collapses the
# rows to one per CLM_ID, and MIN / ANY_VALUE are unaffected by duplicates.
hosp = conn.execute("""
WITH adm AS (
    SELECT
        CLM_ID,
        DESYNPUF_ID,
        strptime(CLM_ADMSN_DT, '%Y%m%d') AS CLM_ADMSN_DT
    FROM '../datapond/csv/inpatient/DE1_0_2008_to_2010_Inpatient_Claims_*.csv'
)
SELECT
    CLM_ID,
    ANY_VALUE(DESYNPUF_ID) AS DESYNPUF_ID,
    MIN(CLM_ADMSN_DT) AS CLM_ADMSN_DT
FROM
    adm
GROUP BY
    CLM_ID
""").fetch_df()

In [7]:
# Persist each prepared frame as a parquet file under ../datapond/.
for frame, target in (
    (mbsf, '../datapond/mbsf.parquet'),
    (enroll, '../datapond/enroll.parquet'),
    (hosp, '../datapond/hosp.parquet'),
):
    frame.to_parquet(target)