# Demo CMS + DuckDB

In [2]:
import pandas as pd 
import numpy as np
import duckdb

## Start connection to DuckDB

In [3]:
conn = duckdb.connect("../datapond/cms.db")

In [3]:
conn.table('mbsf').show()

┌──────────────────┬───────────────┬───────────────────┬──────────────┐
│   DESYNPUF_ID    │ BENE_BIRTH_DT │ BENE_SEX_IDENT_CD │ BENE_RACE_CD │
│     varchar      │     date      │       int32       │    int32     │
├──────────────────┼───────────────┼───────────────────┼──────────────┤
│ 0002612EC022F8CF │ 1936-07-01    │                 1 │            5 │
│ 000F4AA60BEEE311 │ 1930-03-01    │                 2 │            1 │
│ 001346ECBC5A3E04 │ 1943-08-01    │                 2 │            2 │
│ 001A2F5A32636C93 │ 1927-05-01    │                 1 │            1 │
│ 00254477082A125A │ 1941-11-01    │                 1 │            1 │
│ 002815D945877635 │ 1954-05-01    │                 1 │            1 │
│ 00336B4C947E4F83 │ 1949-02-01    │                 2 │            1 │
│ 00347CF73A6BE3CF │ 1931-09-01    │                 1 │            1 │
│ 004835EA541AF6A9 │ 1941-09-01    │                 2 │            1 │
│ 004D9491E91B1CCD │ 1941-09-01    │                 1 │        

In [4]:
conn.table('enroll').show()

┌──────────────────┬───────┬───────────────┬──────────────────────────┐
│   DESYNPUF_ID    │ YEAR  │ SP_STATE_CODE │ BENE_HMO_CVRAGE_TOT_MONS │
│     varchar      │ int32 │     int32     │          int32           │
├──────────────────┼───────┼───────────────┼──────────────────────────┤
│ 000102649ED5601B │  2008 │            49 │                        0 │
│ 0002278C944E240A │  2008 │            34 │                        0 │
│ 000330E625C93700 │  2008 │             4 │                        0 │
│ 000374D5E110EDA6 │  2008 │             5 │                        0 │
│ 0003950E4B4FEC8D │  2008 │            19 │                        0 │
│ 000418576FBD8D9F │  2008 │            45 │                        0 │
│ 0004A3FCF0BA5D32 │  2008 │            44 │                        0 │
│ 0005B57E395C7B77 │  2008 │             1 │                        0 │
│ 0005FE4AE779FCD7 │  2008 │            15 │                       12 │
│ 00064590936C6AC9 │  2008 │            45 │                    

In [5]:
conn.table('hosp').show()

┌─────────────────┬──────────────────┬──────────────┐
│     CLM_ID      │   DESYNPUF_ID    │ CLM_ADMSN_DT │
│     varchar     │     varchar      │     date     │
├─────────────────┼──────────────────┼──────────────┤
│ 744701196219777 │ 6A7F8ACAFA99992C │ 2008-09-24   │
│ 744791196262015 │ 7BA6BA4A5360BE9B │ 2010-03-27   │
│ 744331196244951 │ 017965E46672A153 │ 2008-01-30   │
│ 744451196246617 │ 2676FD382E8E88A3 │ 2009-05-20   │
│ 744871196211802 │ 3AA3D5E0A54A4FA8 │ 2009-03-09   │
│ 744851196234501 │ 49FC50208172D341 │ 2009-02-25   │
│ 744881196240908 │ 74616556359DD790 │ 2008-05-04   │
│ 744011196252551 │ 79132022F3A6AEA8 │ 2009-02-26   │
│ 744471196228201 │ 00DEC1531EBD914D │ 2008-01-26   │
│ 744261196232408 │ 0C3E240D702EC8F9 │ 2008-01-06   │
│        ·        │        ·         │     ·        │
│        ·        │        ·         │     ·        │
│        ·        │        ·         │     ·        │
│ 241571127006153 │ AF094B47134FACCB │ 2008-09-02   │
│ 241681127025063 │ B3D04C73

## Queries

4. I will obtain all-cause hospitalization counts for each strata (sex, race, age prespecified groups)

In [25]:
conn.execute("""
SELECT
    h.CLM_ID,
    h.YEAR,
    h.DESYNPUF_ID,
    CASE WHEN b.age < 65 THEN '<65'
        WHEN b.age BETWEEN 65 AND 75 THEN '65-75'
        WHEN b.age BETWEEN 75 AND 85 THEN '75-85'
        ELSE '>85'
    END AS AGE_GROUP,
    b.BENE_SEX_IDENT_CD, 
    b.BENE_RACE_CD
FROM (
    SELECT
        CLM_ID,
        YEAR(CLM_ADMSN_DT) AS YEAR, 
        DESYNPUF_ID
    FROM hosp
) h
LEFT JOIN (
    SELECT 
        m.DESYNPUF_ID, 
        e.YEAR,
        e.YEAR - YEAR(m.BENE_BIRTH_DT) AS AGE, 
        m.BENE_SEX_IDENT_CD,
        m.BENE_RACE_CD
    FROM mbsf m INNER JOIN enroll e
    ON m.DESYNPUF_ID=e.DESYNPUF_ID
) b
ON h.DESYNPUF_ID=b.DESYNPUF_ID AND h.YEAR=b.YEAR
""").fetch_df()

Unnamed: 0,CLM_ID,YEAR,DESYNPUF_ID,AGE_GROUP,BENE_SEX_IDENT_CD,BENE_RACE_CD
0,992811161653536,2008,000B8723158A16DA,65-75,2.0,1.0
1,992401161600831,2008,000CE3CED9C9DEFC,>85,1.0,1.0
2,992181161642783,2008,00103749AB3FE139,75-85,2.0,1.0
3,992261161634571,2008,0014E5490986F7B6,65-75,2.0,1.0
4,992771161603158,2008,001A9EF6C406F486,65-75,1.0,1.0
...,...,...,...,...,...,...
1331528,790051146246190,2007,E825F1F76946CFB9,>85,,
1331529,939561173182760,2007,8FE34E77EBF55C35,>85,,
1331530,196481177008842,2010,8A01BE17E4AFE93E,>85,,
1331531,196761176981562,2010,A6172EC3C0985454,>85,,


In [26]:
conn.execute("""
WITH hospitalizations AS (
    SELECT
        h.CLM_ID,
        h.YEAR,
        h.DESYNPUF_ID,
        CASE WHEN b.age < 65 THEN '<65'
            WHEN b.age BETWEEN 65 AND 75 THEN '65-75'
            WHEN b.age BETWEEN 75 AND 85 THEN '75-85'
            ELSE '>85'
        END AS AGE_GROUP,
        b.BENE_SEX_IDENT_CD, 
        b.BENE_RACE_CD
    FROM (
        SELECT
            CLM_ID,
            YEAR(CLM_ADMSN_DT) AS YEAR, 
            DESYNPUF_ID
        FROM hosp
    ) h
    LEFT JOIN (
        SELECT 
            m.DESYNPUF_ID, 
            e.YEAR,
            e.YEAR - YEAR(m.BENE_BIRTH_DT) AS AGE, 
            m.BENE_SEX_IDENT_CD,
            m.BENE_RACE_CD
        FROM mbsf m INNER JOIN enroll e
        ON m.DESYNPUF_ID=e.DESYNPUF_ID
    ) b
    ON h.DESYNPUF_ID=b.DESYNPUF_ID AND h.YEAR=b.YEAR
)
SELECT 
    AGE_GROUP,
    BENE_SEX_IDENT_CD, 
    BENE_RACE_CD, 
    COUNT(CLM_ID) as n_hospitalizations
FROM
    hospitalizations
GROUP BY
    AGE_GROUP,
    BENE_SEX_IDENT_CD, 
    BENE_RACE_CD
""").fetch_df()

Unnamed: 0,AGE_GROUP,BENE_SEX_IDENT_CD,BENE_RACE_CD,n_hospitalizations
0,65-75,2.0,1.0,213158
1,65-75,1.0,1.0,183262
2,75-85,2.0,1.0,204272
3,<65,1.0,1.0,77259
4,>85,2.0,1.0,132371
5,>85,1.0,2.0,4055
6,75-85,1.0,1.0,149085
7,65-75,2.0,2.0,26352
8,<65,2.0,3.0,3885
9,<65,2.0,2.0,24045


5. I will obtain all-cause hospitalization county prevalence (proportion)

In [32]:
conn.execute("""
SELECT
    h.DESYNPUF_ID, 
    h.YEAR, 
    b.SP_STATE_CODE
FROM (
    SELECT
        CLM_ID,
        DESYNPUF_ID, 
        YEAR(hosp.CLM_ADMSN_DT) AS YEAR
    FROM hosp
) h INNER JOIN (
    SELECT 
        m.DESYNPUF_ID, 
        e.YEAR,
        e.SP_STATE_CODE
    FROM mbsf m INNER JOIN enroll e 
    ON m.DESYNPUF_ID=e.DESYNPUF_ID
) b
ON h.DESYNPUF_ID=b.DESYNPUF_ID AND h.YEAR=b.YEAR
""").fetch_df()

Unnamed: 0,DESYNPUF_ID,YEAR,SP_STATE_CODE
0,000B8723158A16DA,2008,49
1,000CE3CED9C9DEFC,2008,33
2,00103749AB3FE139,2008,36
3,001A9EF6C406F486,2008,33
4,0023D900E21C858E,2008,39
...,...,...,...
1313641,FFEEEB9518CCA6AA,2009,3
1313642,FFE86ABB4E999544,2009,37
1313643,FFEF2034E7D78542,2009,50
1313644,FFFB5B13E2D0009C,2009,37


In [33]:
conn.execute("""
SELECT
    COUNT(CLM_ID) AS n_hospitalizations,  
    SP_STATE_CODE
FROM (
    SELECT
        CLM_ID,
        DESYNPUF_ID, 
        YEAR(hosp.CLM_ADMSN_DT) AS YEAR
    FROM hosp
) h INNER JOIN (
    SELECT 
        m.DESYNPUF_ID, 
        e.YEAR,
        e.SP_STATE_CODE
    FROM mbsf m INNER JOIN enroll e 
    ON m.DESYNPUF_ID=e.DESYNPUF_ID
) b
ON h.DESYNPUF_ID=b.DESYNPUF_ID AND h.YEAR=b.YEAR
GROUP BY
    SP_STATE_CODE
""").fetch_df()

Unnamed: 0,n_hospitalizations,SP_STATE_CODE
0,26509,1
1,1756,2
2,20708,3
3,16988,4
4,101870,5
5,14831,6
6,16378,7
7,4830,8
8,2193,9
9,92391,10


In [35]:
conn.execute("""
SELECT 
    COUNT(m.DESYNPUF_ID) AS n_beneficiaries, 
    e.SP_STATE_CODE
FROM 
    mbsf m INNER JOIN enroll e 
ON 
    m.DESYNPUF_ID=e.DESYNPUF_ID
GROUP BY
    e.SP_STATE_CODE
""").fetch_df()

Unnamed: 0,n_beneficiaries,SP_STATE_CODE
0,149143,1
1,13220,2
2,133049,3
3,106970,4
4,585895,5
5,114409,6
6,87687,7
7,28950,8
8,17568,9
9,450389,10


In [45]:
conn.execute("""
WITH num AS(
    SELECT
        COUNT(CLM_ID) AS n_hospitalizations,  
        SP_STATE_CODE
    FROM (
        SELECT
            CLM_ID,
            DESYNPUF_ID, 
            YEAR(hosp.CLM_ADMSN_DT) AS YEAR
        FROM hosp
    ) h INNER JOIN (
        SELECT 
            m.DESYNPUF_ID, 
            e.YEAR,
            e.SP_STATE_CODE
        FROM mbsf m INNER JOIN enroll e 
        ON m.DESYNPUF_ID=e.DESYNPUF_ID
    ) b
    ON h.DESYNPUF_ID=b.DESYNPUF_ID AND h.YEAR=b.YEAR
    GROUP BY
        SP_STATE_CODE),
denom AS (
    SELECT 
        COUNT(m.DESYNPUF_ID) AS n_beneficiaries, 
        e.SP_STATE_CODE
    FROM 
        mbsf m INNER JOIN enroll e 
    ON 
        m.DESYNPUF_ID=e.DESYNPUF_ID
    GROUP BY
        e.SP_STATE_CODE
)
SELECT
    n_hospitalizations,
    n_beneficiaries,
    n_hospitalizations * 100 / n_beneficiaries AS prevalence_porcentage,
    denom.SP_STATE_CODE
FROM num INNER JOIN denom ON denom.SP_STATE_CODE = num.SP_STATE_CODE
""").fetch_df()

Unnamed: 0,n_hospitalizations,n_beneficiaries,prevalence,SP_STATE_CODE
0,26509,149143,17,1
1,1756,13220,13,2
2,20708,133049,15,3
3,16988,106970,15,4
4,101870,585895,17,5
5,14831,114409,12,6
6,16378,87687,18,7
7,4830,28950,16,8
8,2193,17568,12,9
9,92391,450389,20,10
