In [1]:
import pandas as pd
import numpy as np

import time

import sqlalchemy
import psycopg2

from tqdm import tqdm
from datetime import datetime

pd.set_option('display.max_columns', 500)   # to display 500 columns
pd.set_option('display.max_rows', 500) # to display 500 rows

In [2]:
#![](FileStructure.png)
#from IPython.display import Image
#Image(filename='FileStructure.png')

In [3]:
#import psycopg2
import os
from urllib.parse import quote

# DSN (data source name) format for database connections:
# [protocol / database name]://[username]:[password]@[hostname / ip]:[port]/[database name here]
#
# The user name and password default to a local, password-less postgres install but
# can be overridden through the standard libpq environment variables (PGUSER /
# PGPASSWORD), so real credentials never have to be typed into the notebook.

# on your computer you are the user postgres (full administrative access)
db_user = os.environ.get('PGUSER', 'postgres')
# if you need a password to access a database, export PGPASSWORD rather than writing it here
db_password = os.environ.get('PGPASSWORD', '')
# on your computer, use localhost
db_host = 'localhost'
# the default port for postgres is 5432
db_port = 5432
# we want to connect to the CMS claims database
database =   'cms_claims' #'cms_medicare_claims'  #

# Percent-encode the credentials: characters such as '@', ':' or '/' inside a user
# name or password would otherwise be parsed as URI delimiters. With the defaults
# above the resulting string is identical to the plain f-string version.
conn_str = (
    f'postgresql://{quote(db_user, safe="")}:{quote(db_password, safe="")}'
    f'@{db_host}:{db_port}/{database}'
)
conn = psycopg2.connect(conn_str)

In [4]:
# Check Tables in the Database
query = """
SELECT tablename 
FROM pg_catalog.pg_tables 
WHERE schemaname='public'
"""

pd.read_sql(query, con=conn)

Unnamed: 0,tablename
0,icd9_diagonsis
1,icd9_procedures
2,hcpcs
3,beneficiary2008
4,beneficiary2009
5,beneficiary2010
6,inpatient_claims
7,prescription_drug_events
8,outpatient_claims


In [5]:
# To view All tables and All columns
''' query = """
SELECT table_name, column_name, data_type, table_schema
FROM information_schema.columns
WHERE table_schema = 'public'
order by table_name
"""
pd.read_sql(query, con=conn) '''

' query = """\nSELECT table_name, column_name, data_type, table_schema\nFROM information_schema.columns\nWHERE table_schema = \'public\'\norder by table_name\n"""\npd.read_sql(query, con=conn) '

#### Function

In [5]:
def query_func(query: str, conn) -> pd.DataFrame:
    """Run a SQL statement and return its result set as a DataFrame.

    Parameters
    ----------
    query : str
        SQL text to execute.
    conn
        Open database connection, passed straight through to ``pandas.read_sql``.

    Returns
    -------
    pandas.DataFrame
        One row per result row, one column per selected column.
    """
    return pd.read_sql(sql=query, con=conn)

### Data processing & cleaning (Datetime conversion) for Beneficiary tables

In [7]:
# Load every row and column of the beneficiary2008 table into a DataFrame,
# then preview the first two rows.
q = '''SELECT * FROM beneficiary2008 '''

Beneficiary2008DF = query_func(q, conn)
Beneficiary2008DF.head(2)

Unnamed: 0,DESYNPUF_ID,BENE_BIRTH_DT,BENE_DEATH_DT,BENE_SEX_IDENT_CD,BENE_RACE_CD,BENE_ESRD_IND,SP_STATE_CODE,BENE_COUNTY_CD,BENE_HI_CVRAGE_TOT_MONS,BENE_SMI_CVRAGE_TOT_MONS,BENE_HMO_CVRAGE_TOT_MONS,PLAN_CVRG_MOS_NUM,SP_ALZHDMTA,SP_CHF,SP_CHRNKIDN,SP_CNCR,SP_COPD,SP_DEPRESSN,SP_DIABETES,SP_ISCHMCHT,SP_OSTEOPRS,SP_RA_OA,SP_STRKETIA,MEDREIMB_IP,BENRES_IP,PPPYMT_IP,MEDREIMB_OP,BENRES_OP,PPPYMT_OP,MEDREIMB_CAR,BENRES_CAR,PPPYMT_CAR
0,00013D2EFD8E45D1,1923-05-01,NaT,1,1,0,26,950,12,12,12,12,2,2,2,2,2,2,2,2,2,2,2,0.0,0.0,0.0,50.0,10.0,0.0,0.0,0.0,0.0
1,00016F745862898F,1943-01-01,NaT,1,1,0,39,230,12,12,0,0,2,2,2,2,2,2,2,2,2,2,2,0.0,0.0,0.0,0.0,0.0,0.0,700.0,240.0,0.0


In [8]:
print(Beneficiary2008DF.shape)
Beneficiary2008DF.dtypes

(232747, 32)


DESYNPUF_ID                         object
BENE_BIRTH_DT               datetime64[ns]
BENE_DEATH_DT               datetime64[ns]
BENE_SEX_IDENT_CD                    int64
BENE_RACE_CD                         int64
BENE_ESRD_IND                       object
SP_STATE_CODE                        int64
BENE_COUNTY_CD                       int64
BENE_HI_CVRAGE_TOT_MONS              int64
BENE_SMI_CVRAGE_TOT_MONS             int64
BENE_HMO_CVRAGE_TOT_MONS             int64
PLAN_CVRG_MOS_NUM                    int64
SP_ALZHDMTA                          int64
SP_CHF                               int64
SP_CHRNKIDN                          int64
SP_CNCR                              int64
SP_COPD                              int64
SP_DEPRESSN                          int64
SP_DIABETES                          int64
SP_ISCHMCHT                          int64
SP_OSTEOPRS                          int64
SP_RA_OA                             int64
SP_STRKETIA                          int64
MEDREIMB_IP

In [9]:
Beneficiary2008DF.SP_ISCHMCHT.value_counts(normalize=True)  # 1 means have disease, 2 means no disease

2    0.579939
1    0.420061
Name: SP_ISCHMCHT, dtype: float64

In [10]:
# Load every row and column of the beneficiary2009 table into a DataFrame,
# then preview the first two rows.
q = '''SELECT * FROM beneficiary2009 '''

Beneficiary2009DF = query_func(q, conn)
Beneficiary2009DF.head(2)

Unnamed: 0,DESYNPUF_ID,BENE_BIRTH_DT,BENE_DEATH_DT,BENE_SEX_IDENT_CD,BENE_RACE_CD,BENE_ESRD_IND,SP_STATE_CODE,BENE_COUNTY_CD,BENE_HI_CVRAGE_TOT_MONS,BENE_SMI_CVRAGE_TOT_MONS,BENE_HMO_CVRAGE_TOT_MONS,PLAN_CVRG_MOS_NUM,SP_ALZHDMTA,SP_CHF,SP_CHRNKIDN,SP_CNCR,SP_COPD,SP_DEPRESSN,SP_DIABETES,SP_ISCHMCHT,SP_OSTEOPRS,SP_RA_OA,SP_STRKETIA,MEDREIMB_IP,BENRES_IP,PPPYMT_IP,MEDREIMB_OP,BENRES_OP,PPPYMT_OP,MEDREIMB_CAR,BENRES_CAR,PPPYMT_CAR
0,00013D2EFD8E45D1,1923-05-01,NaT,1,1,0,26,950,12,12,12,12,2,2,2,2,2,2,2,2,2,2,2,0.0,0.0,0.0,0.0,0.0,0.0,100.0,20.0,0.0
1,00016F745862898F,1943-01-01,NaT,1,1,0,39,230,12,12,0,10,1,2,1,2,2,1,1,1,2,1,1,36000.0,3204.0,0.0,60.0,70.0,0.0,1350.0,530.0,100.0


In [11]:
print(Beneficiary2009DF.shape)
Beneficiary2009DF.dtypes

(229156, 32)


DESYNPUF_ID                         object
BENE_BIRTH_DT               datetime64[ns]
BENE_DEATH_DT               datetime64[ns]
BENE_SEX_IDENT_CD                    int64
BENE_RACE_CD                         int64
BENE_ESRD_IND                       object
SP_STATE_CODE                        int64
BENE_COUNTY_CD                       int64
BENE_HI_CVRAGE_TOT_MONS              int64
BENE_SMI_CVRAGE_TOT_MONS             int64
BENE_HMO_CVRAGE_TOT_MONS             int64
PLAN_CVRG_MOS_NUM                    int64
SP_ALZHDMTA                          int64
SP_CHF                               int64
SP_CHRNKIDN                          int64
SP_CNCR                              int64
SP_COPD                              int64
SP_DEPRESSN                          int64
SP_DIABETES                          int64
SP_ISCHMCHT                          int64
SP_OSTEOPRS                          int64
SP_RA_OA                             int64
SP_STRKETIA                          int64
MEDREIMB_IP

In [12]:
# Load every row and column of the beneficiary2010 table into a DataFrame,
# then preview the first two rows.
q = '''SELECT * FROM beneficiary2010 '''

Beneficiary2010DF = query_func(q, conn)
Beneficiary2010DF.head(2)

Unnamed: 0,DESYNPUF_ID,BENE_BIRTH_DT,BENE_DEATH_DT,BENE_SEX_IDENT_CD,BENE_RACE_CD,BENE_ESRD_IND,SP_STATE_CODE,BENE_COUNTY_CD,BENE_HI_CVRAGE_TOT_MONS,BENE_SMI_CVRAGE_TOT_MONS,BENE_HMO_CVRAGE_TOT_MONS,PLAN_CVRG_MOS_NUM,SP_ALZHDMTA,SP_CHF,SP_CHRNKIDN,SP_CNCR,SP_COPD,SP_DEPRESSN,SP_DIABETES,SP_ISCHMCHT,SP_OSTEOPRS,SP_RA_OA,SP_STRKETIA,MEDREIMB_IP,BENRES_IP,PPPYMT_IP,MEDREIMB_OP,BENRES_OP,PPPYMT_OP,MEDREIMB_CAR,BENRES_CAR,PPPYMT_CAR
0,000002F7E0A96C32,1919-07-01,NaT,2,2,0,5,400,0,0,0,0,2,2,2,2,2,2,2,2,2,2,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,00001C24EE7B06AC,1936-05-01,NaT,1,1,0,11,530,12,12,0,12,2,2,2,2,2,2,2,1,2,2,2,0.0,0.0,0.0,500.0,0.0,0.0,90.0,60.0,0.0


In [13]:
print(Beneficiary2010DF.shape)
Beneficiary2010DF.dtypes

(225656, 32)


DESYNPUF_ID                         object
BENE_BIRTH_DT               datetime64[ns]
BENE_DEATH_DT               datetime64[ns]
BENE_SEX_IDENT_CD                    int64
BENE_RACE_CD                         int64
BENE_ESRD_IND                       object
SP_STATE_CODE                        int64
BENE_COUNTY_CD                       int64
BENE_HI_CVRAGE_TOT_MONS              int64
BENE_SMI_CVRAGE_TOT_MONS             int64
BENE_HMO_CVRAGE_TOT_MONS             int64
PLAN_CVRG_MOS_NUM                    int64
SP_ALZHDMTA                          int64
SP_CHF                               int64
SP_CHRNKIDN                          int64
SP_CNCR                              int64
SP_COPD                              int64
SP_DEPRESSN                          int64
SP_DIABETES                          int64
SP_ISCHMCHT                          int64
SP_OSTEOPRS                          int64
SP_RA_OA                             int64
SP_STRKETIA                          int64
MEDREIMB_IP

In [14]:
# Load the ICD-9 diagnosis lookup table (code -> long / short description).
# NOTE: the table is spelled "icd9_diagonsis" in the database itself (see the
# table listing above), so the spelling here must not be "corrected".
q = '''SELECT * FROM icd9_diagonsis '''

ICD9_DiagonsisDF = query_func(q, conn)
ICD9_DiagonsisDF.head(2)

Unnamed: 0,diagnosis_cd,long_desc,short_desc
0,10,Cholera due to vibrio cholerae,Cholera d/t vib cholerae
1,11,Cholera due to vibrio cholerae el tor,Cholera d/t vib el tor


In [15]:
print(ICD9_DiagonsisDF.shape)
ICD9_DiagonsisDF.dtypes

(14572, 3)


diagnosis_cd    object
long_desc       object
short_desc      object
dtype: object

In [16]:
# Load the ICD-9 procedure lookup table (code -> long / short description).
# tail(2) is shown rather than head().
# NOTE(review): in the recorded output the last two rows are entirely empty, which
# suggests blank trailing rows in the table — confirm and consider dropping them.
q = '''SELECT * FROM icd9_procedures '''

ICD9_ProcedureDF = query_func(q, conn)
ICD9_ProcedureDF.tail(2)

Unnamed: 0,procedure_cd,long_desc,short_desc
3885,,,
3886,,,


In [17]:
print(ICD9_ProcedureDF.shape)
ICD9_ProcedureDF.dtypes

(3887, 3)


procedure_cd    object
long_desc       object
short_desc      object
dtype: object

In [18]:
# Load the HCPCS lookup table (columns "HCPCS" and "DESCRIPTION"),
# then preview the first two rows.
q = '''SELECT * FROM hcpcs '''

hcpcsDF = query_func(q, conn)
hcpcsDF.head(2)

Unnamed: 0,HCPCS,DESCRIPTION
0,A0021,Outside state ambulance serv
1,A0080,Noninterest escort in non er


In [19]:
print(hcpcsDF.shape)
hcpcsDF.dtypes

(13040, 2)


HCPCS          object
DESCRIPTION    object
dtype: object

### Data processing & cleaning for outpatient_claims

In [20]:
datetime.now()

datetime.datetime(2020, 5, 10, 14, 49, 25, 781831)

In [21]:
# Pull the complete outpatient_claims table into memory (roughly 1.58M rows x 76
# columns according to the recorded shape below) and preview the first three rows.
# The query has no ORDER BY, so the row order is whatever the database returns.
q = '''SELECT * FROM outpatient_claims '''

OutpatientClaimsDF = query_func(q, conn)
OutpatientClaimsDF.head(3)

Unnamed: 0,DESYNPUF_ID,CLM_ID,SEGMENT,CLM_FROM_DT,CLM_THRU_DT,PRVDR_NUM,CLM_PMT_AMT,NCH_PRMRY_PYR_CLM_PD_AMT,AT_PHYSN_NPI,OP_PHYSN_NPI,OT_PHYSN_NPI,NCH_BENE_BLOOD_DDCTBL_LBLTY_AM,ICD9_DGNS_CD_1,ICD9_DGNS_CD_2,ICD9_DGNS_CD_3,ICD9_DGNS_CD_4,ICD9_DGNS_CD_5,ICD9_DGNS_CD_6,ICD9_DGNS_CD_7,ICD9_DGNS_CD_8,ICD9_DGNS_CD_9,ICD9_DGNS_CD_10,ICD9_PRCDR_CD_1,ICD9_PRCDR_CD_2,ICD9_PRCDR_CD_3,ICD9_PRCDR_CD_4,ICD9_PRCDR_CD_5,ICD9_PRCDR_CD_6,NCH_BENE_PTB_DDCTBL_AMT,NCH_BENE_PTB_COINSRNC_AMT,ADMTNG_ICD9_DGNS_CD,HCPCS_CD_1,HCPCS_CD_2,HCPCS_CD_3,HCPCS_CD_4,HCPCS_CD_5,HCPCS_CD_6,HCPCS_CD_7,HCPCS_CD_8,HCPCS_CD_9,HCPCS_CD_10,HCPCS_CD_11,HCPCS_CD_12,HCPCS_CD_13,HCPCS_CD_14,HCPCS_CD_15,HCPCS_CD_16,HCPCS_CD_17,HCPCS_CD_18,HCPCS_CD_19,HCPCS_CD_20,HCPCS_CD_21,HCPCS_CD_22,HCPCS_CD_23,HCPCS_CD_24,HCPCS_CD_25,HCPCS_CD_26,HCPCS_CD_27,HCPCS_CD_28,HCPCS_CD_29,HCPCS_CD_30,HCPCS_CD_31,HCPCS_CD_32,HCPCS_CD_33,HCPCS_CD_34,HCPCS_CD_35,HCPCS_CD_36,HCPCS_CD_37,HCPCS_CD_38,HCPCS_CD_39,HCPCS_CD_40,HCPCS_CD_41,HCPCS_CD_42,HCPCS_CD_43,HCPCS_CD_44,HCPCS_CD_45
0,00013D2EFD8E45D1,542192281063886,1,2008-09-04,2008-09-04,2600RA,50.0,0.0,4824842000.0,,,0.0,V5841,,,,,,,,,,,,,,,,0.0,10.0,V5883,85610,84153,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,00016F745862898F,542272281166593,1,2009-06-02,2009-06-02,3901GS,30.0,0.0,2963420000.0,,2963420000.0,0.0,V5832,V5861,2724.0,3182.0,V5869,42731.0,,,,,,,,,,,0.0,0.0,,85610,80048,80061.0,82306.0,96372.0,87088.0,85025.0,80076.0,84075.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,00016F745862898F,542282281644416,1,2009-06-23,2009-06-23,3939PG,30.0,0.0,5737808000.0,,5737808000.0,0.0,9594,E9174,4019.0,,,,,,,,,,,,,,0.0,70.0,,71101,78480,94060.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [22]:
print(OutpatientClaimsDF.shape)
OutpatientClaimsDF.dtypes

(1583352, 76)


DESYNPUF_ID                               object
CLM_ID                                     int64
SEGMENT                                    int64
CLM_FROM_DT                       datetime64[ns]
CLM_THRU_DT                       datetime64[ns]
PRVDR_NUM                                 object
CLM_PMT_AMT                              float64
NCH_PRMRY_PYR_CLM_PD_AMT                 float64
AT_PHYSN_NPI                             float64
OP_PHYSN_NPI                             float64
OT_PHYSN_NPI                             float64
NCH_BENE_BLOOD_DDCTBL_LBLTY_AM           float64
ICD9_DGNS_CD_1                            object
ICD9_DGNS_CD_2                            object
ICD9_DGNS_CD_3                            object
ICD9_DGNS_CD_4                            object
ICD9_DGNS_CD_5                            object
ICD9_DGNS_CD_6                            object
ICD9_DGNS_CD_7                            object
ICD9_DGNS_CD_8                            object
ICD9_DGNS_CD_9      

In [23]:
datetime.now()

datetime.datetime(2020, 5, 10, 14, 50, 20, 849513)

#### Adding lookup for 10 Diagnostic codes for outpatient claims

In [24]:
# Translate each of the ten ICD-9 diagnosis code columns of an outpatient claim into
# its long description by LEFT JOINing icd9_diagonsis once per code column; the LEFT
# JOIN keeps claims whose code is missing or has no match (description is then null).
# The column aliases are unquoted, so they come back lower-cased (dgns_cd_N_desc).
# NOTE(review): the result carries no claim-level key (only DESYNPUF_ID) and the query
# has no ORDER BY, yet a later cell attaches it to OutpatientClaimsDF by row position.
# SQL does not guarantee row order — consider also selecting "CLM_ID" and merging on it.
q = '''SELECT  
            OPD."DESYNPUF_ID", 
            ICD9D1.long_desc as DGNS_CD_1_desc,
            ICD9D2.long_desc as DGNS_CD_2_desc,
            ICD9D3.long_desc as DGNS_CD_3_desc,
            ICD9D4.long_desc as DGNS_CD_4_desc,
            ICD9D5.long_desc as DGNS_CD_5_desc,
            ICD9D6.long_desc as DGNS_CD_6_desc,
            ICD9D7.long_desc as DGNS_CD_7_desc,
            ICD9D8.long_desc as DGNS_CD_8_desc,
            ICD9D9.long_desc as DGNS_CD_9_desc,
            ICD9D10.long_desc as DGNS_CD_10_desc
                 
       FROM 
                      outpatient_claims as OPD
            LEFT JOIN icd9_diagonsis as ICD9D1 ON OPD."ICD9_DGNS_CD_1" = ICD9D1.diagnosis_cd
            LEFT JOIN icd9_diagonsis as ICD9D2 ON OPD."ICD9_DGNS_CD_2" = ICD9D2.diagnosis_cd
            LEFT JOIN icd9_diagonsis as ICD9D3 ON OPD."ICD9_DGNS_CD_3" = ICD9D3.diagnosis_cd
            LEFT JOIN icd9_diagonsis as ICD9D4 ON OPD."ICD9_DGNS_CD_4" = ICD9D4.diagnosis_cd
            LEFT JOIN icd9_diagonsis as ICD9D5 ON OPD."ICD9_DGNS_CD_5" = ICD9D5.diagnosis_cd
            LEFT JOIN icd9_diagonsis as ICD9D6 ON OPD."ICD9_DGNS_CD_6" = ICD9D6.diagnosis_cd
            LEFT JOIN icd9_diagonsis as ICD9D7 ON OPD."ICD9_DGNS_CD_7" = ICD9D7.diagnosis_cd
            LEFT JOIN icd9_diagonsis as ICD9D8 ON OPD."ICD9_DGNS_CD_8" = ICD9D8.diagnosis_cd
            LEFT JOIN icd9_diagonsis as ICD9D9 ON OPD."ICD9_DGNS_CD_9" = ICD9D9.diagnosis_cd
            LEFT JOIN icd9_diagonsis as ICD9D10 ON OPD."ICD9_DGNS_CD_10" = ICD9D10.diagnosis_cd
            ;
    '''

ICD_descDF_outpt = query_func(q, conn)
ICD_descDF_outpt.head(1)

Unnamed: 0,DESYNPUF_ID,dgns_cd_1_desc,dgns_cd_2_desc,dgns_cd_3_desc,dgns_cd_4_desc,dgns_cd_5_desc,dgns_cd_6_desc,dgns_cd_7_desc,dgns_cd_8_desc,dgns_cd_9_desc,dgns_cd_10_desc
0,00013D2EFD8E45D1,Encounter for planned post-operative wound clo...,,,,,,,,,


In [25]:
print(ICD_descDF_outpt.shape)
ICD_descDF_outpt.head(2)

(1583352, 11)


Unnamed: 0,DESYNPUF_ID,dgns_cd_1_desc,dgns_cd_2_desc,dgns_cd_3_desc,dgns_cd_4_desc,dgns_cd_5_desc,dgns_cd_6_desc,dgns_cd_7_desc,dgns_cd_8_desc,dgns_cd_9_desc,dgns_cd_10_desc
0,00013D2EFD8E45D1,Encounter for planned post-operative wound clo...,,,,,,,,,
1,00016F745862898F,Encounter for removal of sutures,Long-term (current) use of anticoagulants,Other and unspecified hyperlipidemia,Profound intellectual disabilities,Long-term (current) use of other medications,Atrial fibrillation,,,,


In [26]:
#print(OutpatientClaimsDF.shape)
#OutpatientClaimsDF.head(10)

In [27]:
datetime.now()

datetime.datetime(2020, 5, 10, 14, 50, 31, 143397)

#### Procedure code lookup for outpatient claims

In [28]:
# Translate the six ICD-9 procedure code columns of an outpatient claim into their
# long descriptions, one LEFT JOIN against icd9_procedures per code column. Both
# sides of every join are CAST to varchar, so the comparison is done on text.
# NOTE(review): as with the diagnosis lookup, the result has no claim-level key and
# no ORDER BY, yet a later cell attaches it to OutpatientClaimsDF by row position.
q = '''SELECT  
            OPD."DESYNPUF_ID", 
            ICD9P1.long_desc as PRCDR_CD_1_desc,
            ICD9P2.long_desc as PRCDR_CD_2_desc,
            ICD9P3.long_desc as PRCDR_CD_3_desc,
            ICD9P4.long_desc as PRCDR_CD_4_desc,
            ICD9P5.long_desc as PRCDR_CD_5_desc,
            ICD9P6.long_desc as PRCDR_CD_6_desc
                 
       FROM 
                      outpatient_claims as OPD
            LEFT JOIN icd9_procedures as ICD9P1 ON CAST(OPD."ICD9_PRCDR_CD_1" as varchar) = CAST(ICD9P1.procedure_cd as varchar)
            LEFT JOIN icd9_procedures as ICD9P2 ON CAST(OPD."ICD9_PRCDR_CD_2" as varchar) = CAST(ICD9P2.procedure_cd as varchar)
            LEFT JOIN icd9_procedures as ICD9P3 ON CAST(OPD."ICD9_PRCDR_CD_3" as varchar) = CAST(ICD9P3.procedure_cd as varchar)
            LEFT JOIN icd9_procedures as ICD9P4 ON CAST(OPD."ICD9_PRCDR_CD_4" as varchar) = CAST(ICD9P4.procedure_cd as varchar)
            LEFT JOIN icd9_procedures as ICD9P5 ON CAST(OPD."ICD9_PRCDR_CD_5" as varchar) = CAST(ICD9P5.procedure_cd as varchar)
            LEFT JOIN icd9_procedures as ICD9P6 ON CAST(OPD."ICD9_PRCDR_CD_6" as varchar) = CAST(ICD9P6.procedure_cd as varchar)
            ;
    '''

ICDProc_descDF_outpt = query_func(q, conn)
ICDProc_descDF_outpt.head(2)

Unnamed: 0,DESYNPUF_ID,prcdr_cd_1_desc,prcdr_cd_2_desc,prcdr_cd_3_desc,prcdr_cd_4_desc,prcdr_cd_5_desc,prcdr_cd_6_desc
0,00013D2EFD8E45D1,,,,,,
1,00016F745862898F,,,,,,


In [29]:
print(ICDProc_descDF_outpt.shape)
ICDProc_descDF_outpt.head(2)

(1583352, 7)


Unnamed: 0,DESYNPUF_ID,prcdr_cd_1_desc,prcdr_cd_2_desc,prcdr_cd_3_desc,prcdr_cd_4_desc,prcdr_cd_5_desc,prcdr_cd_6_desc
0,00013D2EFD8E45D1,,,,,,
1,00016F745862898F,,,,,,


In [30]:
datetime.now()

datetime.datetime(2020, 5, 10, 14, 50, 35, 682298)

#### HCPCS code lookup for outpatient claims

In [31]:
datetime.now()

datetime.datetime(2020, 5, 10, 14, 50, 35, 687506)

In [32]:
# Look up the HCPCS description for the HCPCS code columns of an outpatient claim,
# one LEFT JOIN against hcpcs per code column (both sides CAST to varchar).
# NOTE(review): only HCPCS_CD_1 .. HCPCS_CD_11 are translated although the claims
# table has HCPCS_CD_1 .. HCPCS_CD_45 — confirm that leaving out 12-45 is intentional.
# NOTE(review): the result has no claim-level key and no ORDER BY, yet a later cell
# attaches it to OutpatientClaimsDF by row position.
q = '''SELECT 
            OPD."DESYNPUF_ID", 
            h1."DESCRIPTION" as hcpcs_CD_1_desc,
            h2."DESCRIPTION" as hcpcs_CD_2_desc,
            h3."DESCRIPTION" as hcpcs_CD_3_desc,
            h4."DESCRIPTION" as hcpcs_CD_4_desc,
            h5."DESCRIPTION" as hcpcs_CD_5_desc,
            h6."DESCRIPTION" as hcpcs_CD_6_desc,
            h7."DESCRIPTION" as hcpcs_CD_7_desc,
            h8."DESCRIPTION" as hcpcs_CD_8_desc,
            h9."DESCRIPTION" as hcpcs_CD_9_desc,
            h10."DESCRIPTION" as hcpcs_CD_10_desc,
            h11."DESCRIPTION" as hcpcs_CD_11_desc
           
       FROM 
                      outpatient_claims as OPD
            LEFT JOIN hcpcs as h1 ON CAST(OPD."HCPCS_CD_1" as varchar) = CAST(h1."HCPCS" as varchar)
            LEFT JOIN hcpcs as h2 ON CAST(OPD."HCPCS_CD_2" as varchar) = CAST(h2."HCPCS" as varchar)
            LEFT JOIN hcpcs as h3 ON CAST(OPD."HCPCS_CD_3" as varchar) = CAST(h3."HCPCS" as varchar)
            LEFT JOIN hcpcs as h4 ON CAST(OPD."HCPCS_CD_4" as varchar) = CAST(h4."HCPCS" as varchar)
            LEFT JOIN hcpcs as h5 ON CAST(OPD."HCPCS_CD_5" as varchar) = CAST(h5."HCPCS" as varchar)
            LEFT JOIN hcpcs as h6 ON CAST(OPD."HCPCS_CD_6" as varchar) = CAST(h6."HCPCS" as varchar)
            LEFT JOIN hcpcs as h7 ON CAST(OPD."HCPCS_CD_7" as varchar) = CAST(h7."HCPCS" as varchar)
            LEFT JOIN hcpcs as h8 ON CAST(OPD."HCPCS_CD_8" as varchar) = CAST(h8."HCPCS" as varchar)
            LEFT JOIN hcpcs as h9 ON CAST(OPD."HCPCS_CD_9" as varchar) = CAST(h9."HCPCS" as varchar)
            LEFT JOIN hcpcs as h10 ON CAST(OPD."HCPCS_CD_10" as varchar) = CAST(h10."HCPCS" as varchar)
            LEFT JOIN hcpcs as h11 ON CAST(OPD."HCPCS_CD_11" as varchar) = CAST(h11."HCPCS" as varchar)
            ;    
    '''    

hcpcsDF_outpt = query_func(q, conn)
hcpcsDF_outpt.head(2)

Unnamed: 0,DESYNPUF_ID,hcpcs_cd_1_desc,hcpcs_cd_2_desc,hcpcs_cd_3_desc,hcpcs_cd_4_desc,hcpcs_cd_5_desc,hcpcs_cd_6_desc,hcpcs_cd_7_desc,hcpcs_cd_8_desc,hcpcs_cd_9_desc,hcpcs_cd_10_desc,hcpcs_cd_11_desc
0,00013D2EFD8E45D1,Prothrombin time,"Assay of psa, total",,,,,,,,,
1,00016F745862898F,Prothrombin time,Metabolic panel total ca,Lipid panel,"Vitamin d, 25 hydroxy","Ther/proph/diag inj, sc/im",Urine bacteria culture,Complete cbc w/auto diff wbc,Hepatic function panel,Assay alkaline phosphatase,,


In [33]:
datetime.now()

datetime.datetime(2020, 5, 10, 14, 50, 48, 781920)

In [34]:
print(hcpcsDF_outpt.shape)
hcpcsDF_outpt.head(5)

(1583352, 12)


Unnamed: 0,DESYNPUF_ID,hcpcs_cd_1_desc,hcpcs_cd_2_desc,hcpcs_cd_3_desc,hcpcs_cd_4_desc,hcpcs_cd_5_desc,hcpcs_cd_6_desc,hcpcs_cd_7_desc,hcpcs_cd_8_desc,hcpcs_cd_9_desc,hcpcs_cd_10_desc,hcpcs_cd_11_desc
0,00013D2EFD8E45D1,Prothrombin time,"Assay of psa, total",,,,,,,,,
1,00016F745862898F,Prothrombin time,Metabolic panel total ca,Lipid panel,"Vitamin d, 25 hydroxy","Ther/proph/diag inj, sc/im",Urine bacteria culture,Complete cbc w/auto diff wbc,Hepatic function panel,Assay alkaline phosphatase,,
2,00016F745862898F,X-ray exam of ribs/chest,,Evaluation of wheezing,,,,,,,,
3,0001FDD721E223DC,Routine venipuncture,,,,,,,,,,
4,00024B3D2352D2D0,"Us, transrectal",Calculus spectroscopy,,,,,,,,,


In [35]:
#duplicateRowsDF = hcpcsDF_outpt[hcpcsDF_outpt.duplicated(keep='last')]
#duplicateRowsDF.shape

#### Merging ICD diagnostic description to outpatient claims DF

In [36]:
print(OutpatientClaimsDF.shape)
OutpatientClaimsDF.head(2)

(1583352, 76)


Unnamed: 0,DESYNPUF_ID,CLM_ID,SEGMENT,CLM_FROM_DT,CLM_THRU_DT,PRVDR_NUM,CLM_PMT_AMT,NCH_PRMRY_PYR_CLM_PD_AMT,AT_PHYSN_NPI,OP_PHYSN_NPI,OT_PHYSN_NPI,NCH_BENE_BLOOD_DDCTBL_LBLTY_AM,ICD9_DGNS_CD_1,ICD9_DGNS_CD_2,ICD9_DGNS_CD_3,ICD9_DGNS_CD_4,ICD9_DGNS_CD_5,ICD9_DGNS_CD_6,ICD9_DGNS_CD_7,ICD9_DGNS_CD_8,ICD9_DGNS_CD_9,ICD9_DGNS_CD_10,ICD9_PRCDR_CD_1,ICD9_PRCDR_CD_2,ICD9_PRCDR_CD_3,ICD9_PRCDR_CD_4,ICD9_PRCDR_CD_5,ICD9_PRCDR_CD_6,NCH_BENE_PTB_DDCTBL_AMT,NCH_BENE_PTB_COINSRNC_AMT,ADMTNG_ICD9_DGNS_CD,HCPCS_CD_1,HCPCS_CD_2,HCPCS_CD_3,HCPCS_CD_4,HCPCS_CD_5,HCPCS_CD_6,HCPCS_CD_7,HCPCS_CD_8,HCPCS_CD_9,HCPCS_CD_10,HCPCS_CD_11,HCPCS_CD_12,HCPCS_CD_13,HCPCS_CD_14,HCPCS_CD_15,HCPCS_CD_16,HCPCS_CD_17,HCPCS_CD_18,HCPCS_CD_19,HCPCS_CD_20,HCPCS_CD_21,HCPCS_CD_22,HCPCS_CD_23,HCPCS_CD_24,HCPCS_CD_25,HCPCS_CD_26,HCPCS_CD_27,HCPCS_CD_28,HCPCS_CD_29,HCPCS_CD_30,HCPCS_CD_31,HCPCS_CD_32,HCPCS_CD_33,HCPCS_CD_34,HCPCS_CD_35,HCPCS_CD_36,HCPCS_CD_37,HCPCS_CD_38,HCPCS_CD_39,HCPCS_CD_40,HCPCS_CD_41,HCPCS_CD_42,HCPCS_CD_43,HCPCS_CD_44,HCPCS_CD_45
0,00013D2EFD8E45D1,542192281063886,1,2008-09-04,2008-09-04,2600RA,50.0,0.0,4824842000.0,,,0.0,V5841,,,,,,,,,,,,,,,,0.0,10.0,V5883,85610,84153,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,00016F745862898F,542272281166593,1,2009-06-02,2009-06-02,3901GS,30.0,0.0,2963420000.0,,2963420000.0,0.0,V5832,V5861,2724.0,3182.0,V5869,42731.0,,,,,,,,,,,0.0,0.0,,85610,80048,80061.0,82306.0,96372.0,87088.0,85025.0,80076.0,84075.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [37]:
# checking Indexes 

OutpatientClaimsDF.index, ICD_descDF_outpt.index, ICDProc_descDF_outpt.index, hcpcsDF_outpt.index

(RangeIndex(start=0, stop=1583352, step=1),
 RangeIndex(start=0, stop=1583352, step=1),
 RangeIndex(start=0, stop=1583352, step=1),
 RangeIndex(start=0, stop=1583352, step=1))

In [38]:
datetime.now()  

datetime.datetime(2020, 5, 10, 14, 50, 48, 870056)

In [39]:
# Attach the diagnosis descriptions to the claims row by row.
#
# ICD_descDF_outpt has no claim-level key (only DESYNPUF_ID, which repeats across a
# beneficiary's claims), so the two frames are aligned on their shared RangeIndex.
# Passing `on=` together with `left_index=True, right_index=True` is contradictory
# and current pandas rejects it with a MergeError, so the index join is spelled out
# explicitly. Because the alignment is purely positional, first verify that both
# frames name the same beneficiary on every row instead of silently mis-pairing them.
if not OutpatientClaimsDF['DESYNPUF_ID'].equals(ICD_descDF_outpt['DESYNPUF_ID']):
    raise ValueError('ICD_descDF_outpt rows are not aligned with OutpatientClaimsDF; '
                     're-run both queries before merging.')

# The key column is dropped from the right-hand side so it is not duplicated.
# Re-running this cell raises (overlapping columns) rather than adding _x/_y copies.
OutpatientClaimsDF = OutpatientClaimsDF.join(
    ICD_descDF_outpt.drop(columns='DESYNPUF_ID'), how='inner'
)

In [40]:
datetime.now()  

datetime.datetime(2020, 5, 10, 14, 50, 58, 282194)

In [41]:
print(OutpatientClaimsDF.shape)
OutpatientClaimsDF.head(2)

(1583352, 86)


Unnamed: 0,DESYNPUF_ID,CLM_ID,SEGMENT,CLM_FROM_DT,CLM_THRU_DT,PRVDR_NUM,CLM_PMT_AMT,NCH_PRMRY_PYR_CLM_PD_AMT,AT_PHYSN_NPI,OP_PHYSN_NPI,OT_PHYSN_NPI,NCH_BENE_BLOOD_DDCTBL_LBLTY_AM,ICD9_DGNS_CD_1,ICD9_DGNS_CD_2,ICD9_DGNS_CD_3,ICD9_DGNS_CD_4,ICD9_DGNS_CD_5,ICD9_DGNS_CD_6,ICD9_DGNS_CD_7,ICD9_DGNS_CD_8,ICD9_DGNS_CD_9,ICD9_DGNS_CD_10,ICD9_PRCDR_CD_1,ICD9_PRCDR_CD_2,ICD9_PRCDR_CD_3,ICD9_PRCDR_CD_4,ICD9_PRCDR_CD_5,ICD9_PRCDR_CD_6,NCH_BENE_PTB_DDCTBL_AMT,NCH_BENE_PTB_COINSRNC_AMT,ADMTNG_ICD9_DGNS_CD,HCPCS_CD_1,HCPCS_CD_2,HCPCS_CD_3,HCPCS_CD_4,HCPCS_CD_5,HCPCS_CD_6,HCPCS_CD_7,HCPCS_CD_8,HCPCS_CD_9,HCPCS_CD_10,HCPCS_CD_11,HCPCS_CD_12,HCPCS_CD_13,HCPCS_CD_14,HCPCS_CD_15,HCPCS_CD_16,HCPCS_CD_17,HCPCS_CD_18,HCPCS_CD_19,HCPCS_CD_20,HCPCS_CD_21,HCPCS_CD_22,HCPCS_CD_23,HCPCS_CD_24,HCPCS_CD_25,HCPCS_CD_26,HCPCS_CD_27,HCPCS_CD_28,HCPCS_CD_29,HCPCS_CD_30,HCPCS_CD_31,HCPCS_CD_32,HCPCS_CD_33,HCPCS_CD_34,HCPCS_CD_35,HCPCS_CD_36,HCPCS_CD_37,HCPCS_CD_38,HCPCS_CD_39,HCPCS_CD_40,HCPCS_CD_41,HCPCS_CD_42,HCPCS_CD_43,HCPCS_CD_44,HCPCS_CD_45,dgns_cd_1_desc,dgns_cd_2_desc,dgns_cd_3_desc,dgns_cd_4_desc,dgns_cd_5_desc,dgns_cd_6_desc,dgns_cd_7_desc,dgns_cd_8_desc,dgns_cd_9_desc,dgns_cd_10_desc
0,00013D2EFD8E45D1,542192281063886,1,2008-09-04,2008-09-04,2600RA,50.0,0.0,4824842000.0,,,0.0,V5841,,,,,,,,,,,,,,,,0.0,10.0,V5883,85610,84153,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Encounter for planned post-operative wound clo...,,,,,,,,,
1,00016F745862898F,542272281166593,1,2009-06-02,2009-06-02,3901GS,30.0,0.0,2963420000.0,,2963420000.0,0.0,V5832,V5861,2724.0,3182.0,V5869,42731.0,,,,,,,,,,,0.0,0.0,,85610,80048,80061.0,82306.0,96372.0,87088.0,85025.0,80076.0,84075.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Encounter for removal of sutures,Long-term (current) use of anticoagulants,Other and unspecified hyperlipidemia,Profound intellectual disabilities,Long-term (current) use of other medications,Atrial fibrillation,,,,


#### Merging ICD procedure description to outpatient claims DF

In [42]:
datetime.now()  

datetime.datetime(2020, 5, 10, 14, 50, 58, 365368)

In [43]:
# Attach the procedure descriptions to the claims row by row.
#
# ICDProc_descDF_outpt has no claim-level key (only DESYNPUF_ID, which repeats across
# a beneficiary's claims), so the two frames are aligned on their shared RangeIndex.
# Passing `on=` together with `left_index=True, right_index=True` is contradictory
# and current pandas rejects it with a MergeError, so the index join is spelled out
# explicitly. Because the alignment is purely positional, first verify that both
# frames name the same beneficiary on every row instead of silently mis-pairing them.
if not OutpatientClaimsDF['DESYNPUF_ID'].equals(ICDProc_descDF_outpt['DESYNPUF_ID']):
    raise ValueError('ICDProc_descDF_outpt rows are not aligned with OutpatientClaimsDF; '
                     're-run both queries before merging.')

# The key column is dropped from the right-hand side so it is not duplicated.
# Re-running this cell raises (overlapping columns) rather than adding _x/_y copies.
OutpatientClaimsDF = OutpatientClaimsDF.join(
    ICDProc_descDF_outpt.drop(columns='DESYNPUF_ID'), how='inner'
)

In [44]:
datetime.now() 

datetime.datetime(2020, 5, 10, 14, 51, 5, 676247)

In [45]:
print(OutpatientClaimsDF.shape)
OutpatientClaimsDF.head(2)

(1583352, 92)


Unnamed: 0,DESYNPUF_ID,CLM_ID,SEGMENT,CLM_FROM_DT,CLM_THRU_DT,PRVDR_NUM,CLM_PMT_AMT,NCH_PRMRY_PYR_CLM_PD_AMT,AT_PHYSN_NPI,OP_PHYSN_NPI,OT_PHYSN_NPI,NCH_BENE_BLOOD_DDCTBL_LBLTY_AM,ICD9_DGNS_CD_1,ICD9_DGNS_CD_2,ICD9_DGNS_CD_3,ICD9_DGNS_CD_4,ICD9_DGNS_CD_5,ICD9_DGNS_CD_6,ICD9_DGNS_CD_7,ICD9_DGNS_CD_8,ICD9_DGNS_CD_9,ICD9_DGNS_CD_10,ICD9_PRCDR_CD_1,ICD9_PRCDR_CD_2,ICD9_PRCDR_CD_3,ICD9_PRCDR_CD_4,ICD9_PRCDR_CD_5,ICD9_PRCDR_CD_6,NCH_BENE_PTB_DDCTBL_AMT,NCH_BENE_PTB_COINSRNC_AMT,ADMTNG_ICD9_DGNS_CD,HCPCS_CD_1,HCPCS_CD_2,HCPCS_CD_3,HCPCS_CD_4,HCPCS_CD_5,HCPCS_CD_6,HCPCS_CD_7,HCPCS_CD_8,HCPCS_CD_9,HCPCS_CD_10,HCPCS_CD_11,HCPCS_CD_12,HCPCS_CD_13,HCPCS_CD_14,HCPCS_CD_15,HCPCS_CD_16,HCPCS_CD_17,HCPCS_CD_18,HCPCS_CD_19,HCPCS_CD_20,HCPCS_CD_21,HCPCS_CD_22,HCPCS_CD_23,HCPCS_CD_24,HCPCS_CD_25,HCPCS_CD_26,HCPCS_CD_27,HCPCS_CD_28,HCPCS_CD_29,HCPCS_CD_30,HCPCS_CD_31,HCPCS_CD_32,HCPCS_CD_33,HCPCS_CD_34,HCPCS_CD_35,HCPCS_CD_36,HCPCS_CD_37,HCPCS_CD_38,HCPCS_CD_39,HCPCS_CD_40,HCPCS_CD_41,HCPCS_CD_42,HCPCS_CD_43,HCPCS_CD_44,HCPCS_CD_45,dgns_cd_1_desc,dgns_cd_2_desc,dgns_cd_3_desc,dgns_cd_4_desc,dgns_cd_5_desc,dgns_cd_6_desc,dgns_cd_7_desc,dgns_cd_8_desc,dgns_cd_9_desc,dgns_cd_10_desc,prcdr_cd_1_desc,prcdr_cd_2_desc,prcdr_cd_3_desc,prcdr_cd_4_desc,prcdr_cd_5_desc,prcdr_cd_6_desc
0,00013D2EFD8E45D1,542192281063886,1,2008-09-04,2008-09-04,2600RA,50.0,0.0,4824842000.0,,,0.0,V5841,,,,,,,,,,,,,,,,0.0,10.0,V5883,85610,84153,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Encounter for planned post-operative wound clo...,,,,,,,,,,,,,,,
1,00016F745862898F,542272281166593,1,2009-06-02,2009-06-02,3901GS,30.0,0.0,2963420000.0,,2963420000.0,0.0,V5832,V5861,2724.0,3182.0,V5869,42731.0,,,,,,,,,,,0.0,0.0,,85610,80048,80061.0,82306.0,96372.0,87088.0,85025.0,80076.0,84075.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Encounter for removal of sutures,Long-term (current) use of anticoagulants,Other and unspecified hyperlipidemia,Profound intellectual disabilities,Long-term (current) use of other medications,Atrial fibrillation,,,,,,,,,,


#### Merging HCPCS description to outpatient claims DF

In [46]:
datetime.now()

datetime.datetime(2020, 5, 10, 14, 51, 5, 743397)

In [47]:
# Attach the HCPCS descriptions to the claims row by row.
#
# hcpcsDF_outpt has no claim-level key (only DESYNPUF_ID, which repeats across a
# beneficiary's claims), so the two frames are aligned on their shared RangeIndex.
# Passing `on=` together with `left_index=True, right_index=True` is contradictory
# and current pandas rejects it with a MergeError, so the index join is spelled out
# explicitly. Because the alignment is purely positional, first verify that both
# frames name the same beneficiary on every row instead of silently mis-pairing them.
if not OutpatientClaimsDF['DESYNPUF_ID'].equals(hcpcsDF_outpt['DESYNPUF_ID']):
    raise ValueError('hcpcsDF_outpt rows are not aligned with OutpatientClaimsDF; '
                     're-run both queries before merging.')

# The key column is dropped from the right-hand side so it is not duplicated.
# Re-running this cell raises (overlapping columns) rather than adding _x/_y copies.
OutpatientClaimsDF = OutpatientClaimsDF.join(
    hcpcsDF_outpt.drop(columns='DESYNPUF_ID'), how='inner'
)

In [48]:
datetime.now() 

datetime.datetime(2020, 5, 10, 14, 51, 21, 434296)

In [49]:
print(OutpatientClaimsDF.shape)
OutpatientClaimsDF.head(2)

(1583352, 103)


Unnamed: 0,DESYNPUF_ID,CLM_ID,SEGMENT,CLM_FROM_DT,CLM_THRU_DT,PRVDR_NUM,CLM_PMT_AMT,NCH_PRMRY_PYR_CLM_PD_AMT,AT_PHYSN_NPI,OP_PHYSN_NPI,OT_PHYSN_NPI,NCH_BENE_BLOOD_DDCTBL_LBLTY_AM,ICD9_DGNS_CD_1,ICD9_DGNS_CD_2,ICD9_DGNS_CD_3,ICD9_DGNS_CD_4,ICD9_DGNS_CD_5,ICD9_DGNS_CD_6,ICD9_DGNS_CD_7,ICD9_DGNS_CD_8,ICD9_DGNS_CD_9,ICD9_DGNS_CD_10,ICD9_PRCDR_CD_1,ICD9_PRCDR_CD_2,ICD9_PRCDR_CD_3,ICD9_PRCDR_CD_4,ICD9_PRCDR_CD_5,ICD9_PRCDR_CD_6,NCH_BENE_PTB_DDCTBL_AMT,NCH_BENE_PTB_COINSRNC_AMT,ADMTNG_ICD9_DGNS_CD,HCPCS_CD_1,HCPCS_CD_2,HCPCS_CD_3,HCPCS_CD_4,HCPCS_CD_5,HCPCS_CD_6,HCPCS_CD_7,HCPCS_CD_8,HCPCS_CD_9,HCPCS_CD_10,HCPCS_CD_11,HCPCS_CD_12,HCPCS_CD_13,HCPCS_CD_14,HCPCS_CD_15,HCPCS_CD_16,HCPCS_CD_17,HCPCS_CD_18,HCPCS_CD_19,HCPCS_CD_20,HCPCS_CD_21,HCPCS_CD_22,HCPCS_CD_23,HCPCS_CD_24,HCPCS_CD_25,HCPCS_CD_26,HCPCS_CD_27,HCPCS_CD_28,HCPCS_CD_29,HCPCS_CD_30,HCPCS_CD_31,HCPCS_CD_32,HCPCS_CD_33,HCPCS_CD_34,HCPCS_CD_35,HCPCS_CD_36,HCPCS_CD_37,HCPCS_CD_38,HCPCS_CD_39,HCPCS_CD_40,HCPCS_CD_41,HCPCS_CD_42,HCPCS_CD_43,HCPCS_CD_44,HCPCS_CD_45,dgns_cd_1_desc,dgns_cd_2_desc,dgns_cd_3_desc,dgns_cd_4_desc,dgns_cd_5_desc,dgns_cd_6_desc,dgns_cd_7_desc,dgns_cd_8_desc,dgns_cd_9_desc,dgns_cd_10_desc,prcdr_cd_1_desc,prcdr_cd_2_desc,prcdr_cd_3_desc,prcdr_cd_4_desc,prcdr_cd_5_desc,prcdr_cd_6_desc,hcpcs_cd_1_desc,hcpcs_cd_2_desc,hcpcs_cd_3_desc,hcpcs_cd_4_desc,hcpcs_cd_5_desc,hcpcs_cd_6_desc,hcpcs_cd_7_desc,hcpcs_cd_8_desc,hcpcs_cd_9_desc,hcpcs_cd_10_desc,hcpcs_cd_11_desc
0,00013D2EFD8E45D1,542192281063886,1,2008-09-04,2008-09-04,2600RA,50.0,0.0,4824842000.0,,,0.0,V5841,,,,,,,,,,,,,,,,0.0,10.0,V5883,85610,84153,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Encounter for planned post-operative wound clo...,,,,,,,,,,,,,,,,Prothrombin time,"Assay of psa, total",,,,,,,,,
1,00016F745862898F,542272281166593,1,2009-06-02,2009-06-02,3901GS,30.0,0.0,2963420000.0,,2963420000.0,0.0,V5832,V5861,2724.0,3182.0,V5869,42731.0,,,,,,,,,,,0.0,0.0,,85610,80048,80061.0,82306.0,96372.0,87088.0,85025.0,80076.0,84075.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Encounter for removal of sutures,Long-term (current) use of anticoagulants,Other and unspecified hyperlipidemia,Profound intellectual disabilities,Long-term (current) use of other medications,Atrial fibrillation,,,,,,,,,,,Prothrombin time,Metabolic panel total ca,Lipid panel,"Vitamin d, 25 hydroxy","Ther/proph/diag inj, sc/im",Urine bacteria culture,Complete cbc w/auto diff wbc,Hepatic function panel,Assay alkaline phosphatase,,


### Data processing & cleaning for inpatient_claims

In [50]:
# Pull the complete inpatient_claims table into memory (133,267 rows x 81 columns
# according to the recorded shape below) and preview the first two rows.
# The query has no ORDER BY, so the row order is whatever the database returns.
q = '''SELECT * FROM inpatient_claims '''

InpatientClaimsDF = query_func(q, conn)
InpatientClaimsDF.head(2)

Unnamed: 0,DESYNPUF_ID,CLM_ID,SEGMENT,CLM_FROM_DT,CLM_THRU_DT,PRVDR_NUM,CLM_PMT_AMT,NCH_PRMRY_PYR_CLM_PD_AMT,AT_PHYSN_NPI,OP_PHYSN_NPI,OT_PHYSN_NPI,CLM_ADMSN_DT,ADMTNG_ICD9_DGNS_CD,CLM_PASS_THRU_PER_DIEM_AMT,NCH_BENE_IP_DDCTBL_AMT,NCH_BENE_PTA_COINSRNC_LBLTY_AM,NCH_BENE_BLOOD_DDCTBL_LBLTY_AM,CLM_UTLZTN_DAY_CNT,NCH_BENE_DSCHRG_DT,CLM_DRG_CD,ICD9_DGNS_CD_1,ICD9_DGNS_CD_2,ICD9_DGNS_CD_3,ICD9_DGNS_CD_4,ICD9_DGNS_CD_5,ICD9_DGNS_CD_6,ICD9_DGNS_CD_7,ICD9_DGNS_CD_8,ICD9_DGNS_CD_9,ICD9_DGNS_CD_10,ICD9_PRCDR_CD_1,ICD9_PRCDR_CD_2,ICD9_PRCDR_CD_3,ICD9_PRCDR_CD_4,ICD9_PRCDR_CD_5,ICD9_PRCDR_CD_6,HCPCS_CD_1,HCPCS_CD_2,HCPCS_CD_3,HCPCS_CD_4,HCPCS_CD_5,HCPCS_CD_6,HCPCS_CD_7,HCPCS_CD_8,HCPCS_CD_9,HCPCS_CD_10,HCPCS_CD_11,HCPCS_CD_12,HCPCS_CD_13,HCPCS_CD_14,HCPCS_CD_15,HCPCS_CD_16,HCPCS_CD_17,HCPCS_CD_18,HCPCS_CD_19,HCPCS_CD_20,HCPCS_CD_21,HCPCS_CD_22,HCPCS_CD_23,HCPCS_CD_24,HCPCS_CD_25,HCPCS_CD_26,HCPCS_CD_27,HCPCS_CD_28,HCPCS_CD_29,HCPCS_CD_30,HCPCS_CD_31,HCPCS_CD_32,HCPCS_CD_33,HCPCS_CD_34,HCPCS_CD_35,HCPCS_CD_36,HCPCS_CD_37,HCPCS_CD_38,HCPCS_CD_39,HCPCS_CD_40,HCPCS_CD_41,HCPCS_CD_42,HCPCS_CD_43,HCPCS_CD_44,HCPCS_CD_45
0,00013D2EFD8E45D1,196661176988405,1,2010-03-12,2010-03-13,2600GD,4000.0,0.0,3139084000.0,,,20100312,4580,0.0,1100.0,0.0,0.0,1.0,20100313,217,7802,78820,V4501,4280,2720,4019,V4502,73300,E9330,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,00016F745862898F,196201177000368,1,2009-04-12,2009-04-18,3900MB,26000.0,0.0,6476809000.0,,,20090412,7866,0.0,1068.0,0.0,0.0,6.0,20090418,201,1970,4019,5853,7843,2768,71590,2724,19889,5849,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [51]:
print(InpatientClaimsDF.shape)
InpatientClaimsDF.dtypes

(133267, 81)


DESYNPUF_ID                               object
CLM_ID                                     int64
SEGMENT                                    int64
CLM_FROM_DT                       datetime64[ns]
CLM_THRU_DT                       datetime64[ns]
PRVDR_NUM                                 object
CLM_PMT_AMT                              float64
NCH_PRMRY_PYR_CLM_PD_AMT                 float64
AT_PHYSN_NPI                             float64
OP_PHYSN_NPI                             float64
OT_PHYSN_NPI                             float64
CLM_ADMSN_DT                               int64
ADMTNG_ICD9_DGNS_CD                       object
CLM_PASS_THRU_PER_DIEM_AMT               float64
NCH_BENE_IP_DDCTBL_AMT                   float64
NCH_BENE_PTA_COINSRNC_LBLTY_AM           float64
NCH_BENE_BLOOD_DDCTBL_LBLTY_AM           float64
CLM_UTLZTN_DAY_CNT                       float64
NCH_BENE_DSCHRG_DT                         int64
CLM_DRG_CD                                object
ICD9_DGNS_CD_1      

#### Adding lookup for 10 Diagnostic codes for inpatient claims

In [52]:
# Wall-clock checkpoint before the ICD-9 diagnosis lookup query; shown as the cell output.
datetime.now()

datetime.datetime(2020, 5, 10, 14, 51, 27, 762330)

In [53]:
# Fetch the long description for each of the 10 ICD-9 diagnosis codes on every
# inpatient claim: one LEFT JOIN against icd9_diagonsis per ICD9_DGNS_CD_n column,
# so a claim row is kept even when a code has no match (its description is NULL).
# The unquoted aliases (DGNS_CD_1_desc, ...) come back lower-cased, as the output shows.
# NOTE(review): only DESYNPUF_ID is selected and there is no ORDER BY. The later merge
# pairs these rows with InpatientClaimsDF by index position, which relies on the
# database returning rows in the same order as the claims frame — consider also
# selecting a per-claim key (e.g. CLM_ID) and merging on it.
q = '''SELECT  
            IPD."DESYNPUF_ID", 
            ICD9D1.long_desc as DGNS_CD_1_desc,
            ICD9D2.long_desc as DGNS_CD_2_desc,
            ICD9D3.long_desc as DGNS_CD_3_desc,
            ICD9D4.long_desc as DGNS_CD_4_desc,
            ICD9D5.long_desc as DGNS_CD_5_desc,
            ICD9D6.long_desc as DGNS_CD_6_desc,
            ICD9D7.long_desc as DGNS_CD_7_desc,
            ICD9D8.long_desc as DGNS_CD_8_desc,
            ICD9D9.long_desc as DGNS_CD_9_desc,
            ICD9D10.long_desc as DGNS_CD_10_desc
                 
       FROM 
                      inpatient_claims as IPD
            LEFT JOIN icd9_diagonsis as ICD9D1 ON IPD."ICD9_DGNS_CD_1" = ICD9D1.diagnosis_cd
            LEFT JOIN icd9_diagonsis as ICD9D2 ON IPD."ICD9_DGNS_CD_2" = ICD9D2.diagnosis_cd
            LEFT JOIN icd9_diagonsis as ICD9D3 ON IPD."ICD9_DGNS_CD_3" = ICD9D3.diagnosis_cd
            LEFT JOIN icd9_diagonsis as ICD9D4 ON IPD."ICD9_DGNS_CD_4" = ICD9D4.diagnosis_cd
            LEFT JOIN icd9_diagonsis as ICD9D5 ON IPD."ICD9_DGNS_CD_5" = ICD9D5.diagnosis_cd
            LEFT JOIN icd9_diagonsis as ICD9D6 ON IPD."ICD9_DGNS_CD_6" = ICD9D6.diagnosis_cd
            LEFT JOIN icd9_diagonsis as ICD9D7 ON IPD."ICD9_DGNS_CD_7" = ICD9D7.diagnosis_cd
            LEFT JOIN icd9_diagonsis as ICD9D8 ON IPD."ICD9_DGNS_CD_8" = ICD9D8.diagnosis_cd
            LEFT JOIN icd9_diagonsis as ICD9D9 ON IPD."ICD9_DGNS_CD_9" = ICD9D9.diagnosis_cd
            LEFT JOIN icd9_diagonsis as ICD9D10 ON IPD."ICD9_DGNS_CD_10" = ICD9D10.diagnosis_cd
            ;
    '''

# Run the query through the notebook's read_sql wrapper and preview the first row.
ICD_descDF_inpt = query_func(q, conn)
ICD_descDF_inpt.head(1)

Unnamed: 0,DESYNPUF_ID,dgns_cd_1_desc,dgns_cd_2_desc,dgns_cd_3_desc,dgns_cd_4_desc,dgns_cd_5_desc,dgns_cd_6_desc,dgns_cd_7_desc,dgns_cd_8_desc,dgns_cd_9_desc,dgns_cd_10_desc
0,00013D2EFD8E45D1,Syncope and collapse,"Retention of urine, unspecified",Cardiac pacemaker in situ,"Congestive heart failure, unspecified",Pure hypercholesterolemia,Unspecified essential hypertension,Automatic implantable cardiac defibrillator in...,"Osteoporosis, unspecified",Antiallergic and antiemetic drugs causing adve...,


In [54]:
# Row/column count of the diagnosis lookup result; the row count should equal the
# claims frame's, since the two are later combined row by row.
print(ICD_descDF_inpt.shape)
ICD_descDF_inpt.head(1)

(133267, 11)


Unnamed: 0,DESYNPUF_ID,dgns_cd_1_desc,dgns_cd_2_desc,dgns_cd_3_desc,dgns_cd_4_desc,dgns_cd_5_desc,dgns_cd_6_desc,dgns_cd_7_desc,dgns_cd_8_desc,dgns_cd_9_desc,dgns_cd_10_desc
0,00013D2EFD8E45D1,Syncope and collapse,"Retention of urine, unspecified",Cardiac pacemaker in situ,"Congestive heart failure, unspecified",Pure hypercholesterolemia,Unspecified essential hypertension,Automatic implantable cardiac defibrillator in...,"Osteoporosis, unspecified",Antiallergic and antiemetic drugs causing adve...,


In [55]:
# Wall-clock checkpoint after the ICD-9 diagnosis lookup; shown as the cell output.
datetime.now()

datetime.datetime(2020, 5, 10, 14, 51, 29, 703040)

#### Procedure code lookup for inpatient claims

In [56]:
# Wall-clock checkpoint before the ICD-9 procedure lookup query; shown as the cell output.
datetime.now()

datetime.datetime(2020, 5, 10, 14, 51, 29, 709072)

In [57]:
# Fetch the long description for each of the 6 ICD-9 procedure codes on every
# inpatient claim: one LEFT JOIN against icd9_procedures per ICD9_PRCDR_CD_n column.
# Both sides of each join condition are CAST to varchar, so codes are compared as text.
# NOTE(review): the previewed rows have no procedure descriptions at all; if either
# side is stored as a number its text form may not match the other side — verify the
# match rate. As with the diagnosis lookup, there is no ORDER BY or per-claim key, and
# the later merge pairs rows with InpatientClaimsDF by index position.
q = '''SELECT  
            IPD."DESYNPUF_ID", 
            ICD9P1.long_desc as PRCDR_CD_1_desc,
            ICD9P2.long_desc as PRCDR_CD_2_desc,
            ICD9P3.long_desc as PRCDR_CD_3_desc,
            ICD9P4.long_desc as PRCDR_CD_4_desc,
            ICD9P5.long_desc as PRCDR_CD_5_desc,
            ICD9P6.long_desc as PRCDR_CD_6_desc
                 
       FROM 
                      inpatient_claims as IPD
            LEFT JOIN icd9_procedures as ICD9P1 ON CAST(IPD."ICD9_PRCDR_CD_1" as varchar) = CAST(ICD9P1.procedure_cd as varchar)
            LEFT JOIN icd9_procedures as ICD9P2 ON CAST(IPD."ICD9_PRCDR_CD_2" as varchar) = CAST(ICD9P2.procedure_cd as varchar)
            LEFT JOIN icd9_procedures as ICD9P3 ON CAST(IPD."ICD9_PRCDR_CD_3" as varchar) = CAST(ICD9P3.procedure_cd as varchar)
            LEFT JOIN icd9_procedures as ICD9P4 ON CAST(IPD."ICD9_PRCDR_CD_4" as varchar) = CAST(ICD9P4.procedure_cd as varchar)
            LEFT JOIN icd9_procedures as ICD9P5 ON CAST(IPD."ICD9_PRCDR_CD_5" as varchar) = CAST(ICD9P5.procedure_cd as varchar)
            LEFT JOIN icd9_procedures as ICD9P6 ON CAST(IPD."ICD9_PRCDR_CD_6" as varchar) = CAST(ICD9P6.procedure_cd as varchar)
            ;
    '''

# Run the query through the notebook's read_sql wrapper and preview the first two rows.
ICDProc_descDF_inpt = query_func(q, conn)
ICDProc_descDF_inpt.head(2)

Unnamed: 0,DESYNPUF_ID,prcdr_cd_1_desc,prcdr_cd_2_desc,prcdr_cd_3_desc,prcdr_cd_4_desc,prcdr_cd_5_desc,prcdr_cd_6_desc
0,00013D2EFD8E45D1,,,,,,
1,00016F745862898F,,,,,,


In [58]:
# Row/column count of the procedure lookup result; the row count should equal the
# claims frame's, since the two are later combined row by row.
print(ICDProc_descDF_inpt.shape)
ICDProc_descDF_inpt.head(1)

(133267, 7)


Unnamed: 0,DESYNPUF_ID,prcdr_cd_1_desc,prcdr_cd_2_desc,prcdr_cd_3_desc,prcdr_cd_4_desc,prcdr_cd_5_desc,prcdr_cd_6_desc
0,00013D2EFD8E45D1,,,,,,


In [59]:
# Wall-clock checkpoint after the ICD-9 procedure lookup; shown as the cell output.
datetime.now()

datetime.datetime(2020, 5, 10, 14, 51, 30, 384982)

#### HCPCS code lookup for inpatient claims

In [60]:
# Wall-clock checkpoint before the HCPCS lookup query; shown as the cell output.
datetime.now()

datetime.datetime(2020, 5, 10, 14, 51, 30, 392586)

In [61]:
# Fetch the HCPCS description for HCPCS_CD_1 .. HCPCS_CD_11 on every inpatient claim:
# one LEFT JOIN against hcpcs per code column, comparing both sides as varchar.
# NOTE(review): the claims frame carries HCPCS_CD_1 .. HCPCS_CD_45, but only the first
# 11 are looked up here — confirm that is intentional. As with the other lookups, there
# is no ORDER BY or per-claim key, and the later merge pairs rows with
# InpatientClaimsDF by index position.
q = '''SELECT  
            IPD."DESYNPUF_ID", 
            h1."DESCRIPTION" as hcpcs_CD_1_desc,
            h2."DESCRIPTION" as hcpcs_CD_2_desc,
            h3."DESCRIPTION" as hcpcs_CD_3_desc,
            h4."DESCRIPTION" as hcpcs_CD_4_desc,
            h5."DESCRIPTION" as hcpcs_CD_5_desc,
            h6."DESCRIPTION" as hcpcs_CD_6_desc,
            h7."DESCRIPTION" as hcpcs_CD_7_desc,
            h8."DESCRIPTION" as hcpcs_CD_8_desc,
            h9."DESCRIPTION" as hcpcs_CD_9_desc,
            h10."DESCRIPTION" as hcpcs_CD_10_desc,
            h11."DESCRIPTION" as hcpcs_CD_11_desc
            
       FROM 
                      inpatient_claims as IPD
            LEFT JOIN hcpcs as h1 ON CAST(IPD."HCPCS_CD_1" as varchar) = CAST(h1."HCPCS" as varchar)
            LEFT JOIN hcpcs as h2 ON CAST(IPD."HCPCS_CD_2" as varchar) = CAST(h2."HCPCS" as varchar)
            LEFT JOIN hcpcs as h3 ON CAST(IPD."HCPCS_CD_3" as varchar) = CAST(h3."HCPCS" as varchar)
            LEFT JOIN hcpcs as h4 ON CAST(IPD."HCPCS_CD_4" as varchar) = CAST(h4."HCPCS" as varchar)
            LEFT JOIN hcpcs as h5 ON CAST(IPD."HCPCS_CD_5" as varchar) = CAST(h5."HCPCS" as varchar)
            LEFT JOIN hcpcs as h6 ON CAST(IPD."HCPCS_CD_6" as varchar) = CAST(h6."HCPCS" as varchar)
            LEFT JOIN hcpcs as h7 ON CAST(IPD."HCPCS_CD_7" as varchar) = CAST(h7."HCPCS" as varchar)
            LEFT JOIN hcpcs as h8 ON CAST(IPD."HCPCS_CD_8" as varchar) = CAST(h8."HCPCS" as varchar)
            LEFT JOIN hcpcs as h9 ON CAST(IPD."HCPCS_CD_9" as varchar) = CAST(h9."HCPCS" as varchar)
            LEFT JOIN hcpcs as h10 ON CAST(IPD."HCPCS_CD_10" as varchar) = CAST(h10."HCPCS" as varchar)
            LEFT JOIN hcpcs as h11 ON CAST(IPD."HCPCS_CD_11" as varchar) = CAST(h11."HCPCS" as varchar)

            ;
    '''

# Run the query through the notebook's read_sql wrapper and preview the first two rows.
hcpcsDF_inpt = query_func(q, conn)
hcpcsDF_inpt.head(2)

Unnamed: 0,DESYNPUF_ID,hcpcs_cd_1_desc,hcpcs_cd_2_desc,hcpcs_cd_3_desc,hcpcs_cd_4_desc,hcpcs_cd_5_desc,hcpcs_cd_6_desc,hcpcs_cd_7_desc,hcpcs_cd_8_desc,hcpcs_cd_9_desc,hcpcs_cd_10_desc,hcpcs_cd_11_desc
0,00013D2EFD8E45D1,,,,,,,,,,,
1,00016F745862898F,,,,,,,,,,,


In [62]:
# Row/column count of the HCPCS lookup result; the row count should equal the
# claims frame's, since the two are later combined row by row.
print(hcpcsDF_inpt.shape)
hcpcsDF_inpt.head(1)

(133267, 12)


Unnamed: 0,DESYNPUF_ID,hcpcs_cd_1_desc,hcpcs_cd_2_desc,hcpcs_cd_3_desc,hcpcs_cd_4_desc,hcpcs_cd_5_desc,hcpcs_cd_6_desc,hcpcs_cd_7_desc,hcpcs_cd_8_desc,hcpcs_cd_9_desc,hcpcs_cd_10_desc,hcpcs_cd_11_desc
0,00013D2EFD8E45D1,,,,,,,,,,,


In [63]:
# Wall-clock checkpoint after the HCPCS lookup; shown as the cell output.
datetime.now()

datetime.datetime(2020, 5, 10, 14, 51, 31, 33616)

#### Merging ICD diagnostic description to inpatient claims DF

In [64]:
# Show the index of the claims frame and of the three lookup frames side by side.
# The merges that follow align rows on the index, so all four must be identical.

InpatientClaimsDF.index, ICD_descDF_inpt.index, ICDProc_descDF_inpt.index, hcpcsDF_inpt.index

(RangeIndex(start=0, stop=133267, step=1),
 RangeIndex(start=0, stop=133267, step=1),
 RangeIndex(start=0, stop=133267, step=1),
 RangeIndex(start=0, stop=133267, step=1))

In [65]:
# Wall-clock checkpoint after the index check; shown as the cell output.
datetime.now()

datetime.datetime(2020, 5, 10, 14, 51, 31, 44772)

In [66]:
# Baseline shape and first row of the claims frame before any description columns are added.
print(InpatientClaimsDF.shape)
InpatientClaimsDF.head(1)

(133267, 81)


Unnamed: 0,DESYNPUF_ID,CLM_ID,SEGMENT,CLM_FROM_DT,CLM_THRU_DT,PRVDR_NUM,CLM_PMT_AMT,NCH_PRMRY_PYR_CLM_PD_AMT,AT_PHYSN_NPI,OP_PHYSN_NPI,OT_PHYSN_NPI,CLM_ADMSN_DT,ADMTNG_ICD9_DGNS_CD,CLM_PASS_THRU_PER_DIEM_AMT,NCH_BENE_IP_DDCTBL_AMT,NCH_BENE_PTA_COINSRNC_LBLTY_AM,NCH_BENE_BLOOD_DDCTBL_LBLTY_AM,CLM_UTLZTN_DAY_CNT,NCH_BENE_DSCHRG_DT,CLM_DRG_CD,ICD9_DGNS_CD_1,ICD9_DGNS_CD_2,ICD9_DGNS_CD_3,ICD9_DGNS_CD_4,ICD9_DGNS_CD_5,ICD9_DGNS_CD_6,ICD9_DGNS_CD_7,ICD9_DGNS_CD_8,ICD9_DGNS_CD_9,ICD9_DGNS_CD_10,ICD9_PRCDR_CD_1,ICD9_PRCDR_CD_2,ICD9_PRCDR_CD_3,ICD9_PRCDR_CD_4,ICD9_PRCDR_CD_5,ICD9_PRCDR_CD_6,HCPCS_CD_1,HCPCS_CD_2,HCPCS_CD_3,HCPCS_CD_4,HCPCS_CD_5,HCPCS_CD_6,HCPCS_CD_7,HCPCS_CD_8,HCPCS_CD_9,HCPCS_CD_10,HCPCS_CD_11,HCPCS_CD_12,HCPCS_CD_13,HCPCS_CD_14,HCPCS_CD_15,HCPCS_CD_16,HCPCS_CD_17,HCPCS_CD_18,HCPCS_CD_19,HCPCS_CD_20,HCPCS_CD_21,HCPCS_CD_22,HCPCS_CD_23,HCPCS_CD_24,HCPCS_CD_25,HCPCS_CD_26,HCPCS_CD_27,HCPCS_CD_28,HCPCS_CD_29,HCPCS_CD_30,HCPCS_CD_31,HCPCS_CD_32,HCPCS_CD_33,HCPCS_CD_34,HCPCS_CD_35,HCPCS_CD_36,HCPCS_CD_37,HCPCS_CD_38,HCPCS_CD_39,HCPCS_CD_40,HCPCS_CD_41,HCPCS_CD_42,HCPCS_CD_43,HCPCS_CD_44,HCPCS_CD_45
0,00013D2EFD8E45D1,196661176988405,1,2010-03-12,2010-03-13,2600GD,4000.0,0.0,3139084000.0,,,20100312,4580,0.0,1100.0,0.0,0.0,1.0,20100313,217,7802,78820,V4501,4280,2720,4019,V4502,73300,E9330,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [67]:
# Attach the ICD-9 diagnosis descriptions to the claims, row by row.
# The lookup frame has one row per claim in the same order, so this is an index
# (positional) join. Passing `on=` together with `left_index=True, right_index=True`
# to `merge` is contradictory and current pandas rejects it with a MergeError, so
# join on the index only and verify the shared key column explicitly.
if not InpatientClaimsDF['DESYNPUF_ID'].equals(ICD_descDF_inpt['DESYNPUF_ID']):
    # A mismatch means the SQL result is not in claim order; joining would
    # silently attach descriptions to the wrong claims.
    raise ValueError('ICD_descDF_inpt is not row-aligned with InpatientClaimsDF '
                     '(index or DESYNPUF_ID differs)')

# Drop the duplicate key column from the right side so only one DESYNPUF_ID remains.
InpatientClaimsDF = InpatientClaimsDF.join(ICD_descDF_inpt.drop(columns='DESYNPUF_ID'),
                                           how='inner')

In [68]:
# Wall-clock checkpoint after merging the diagnosis descriptions; shown as the cell output.
datetime.now()

datetime.datetime(2020, 5, 10, 14, 51, 31, 505217)

In [69]:
# Shape and first row after the diagnosis merge: same row count, 10 extra description columns.
print(InpatientClaimsDF.shape)
InpatientClaimsDF.head(1)

(133267, 91)


Unnamed: 0,DESYNPUF_ID,CLM_ID,SEGMENT,CLM_FROM_DT,CLM_THRU_DT,PRVDR_NUM,CLM_PMT_AMT,NCH_PRMRY_PYR_CLM_PD_AMT,AT_PHYSN_NPI,OP_PHYSN_NPI,OT_PHYSN_NPI,CLM_ADMSN_DT,ADMTNG_ICD9_DGNS_CD,CLM_PASS_THRU_PER_DIEM_AMT,NCH_BENE_IP_DDCTBL_AMT,NCH_BENE_PTA_COINSRNC_LBLTY_AM,NCH_BENE_BLOOD_DDCTBL_LBLTY_AM,CLM_UTLZTN_DAY_CNT,NCH_BENE_DSCHRG_DT,CLM_DRG_CD,ICD9_DGNS_CD_1,ICD9_DGNS_CD_2,ICD9_DGNS_CD_3,ICD9_DGNS_CD_4,ICD9_DGNS_CD_5,ICD9_DGNS_CD_6,ICD9_DGNS_CD_7,ICD9_DGNS_CD_8,ICD9_DGNS_CD_9,ICD9_DGNS_CD_10,ICD9_PRCDR_CD_1,ICD9_PRCDR_CD_2,ICD9_PRCDR_CD_3,ICD9_PRCDR_CD_4,ICD9_PRCDR_CD_5,ICD9_PRCDR_CD_6,HCPCS_CD_1,HCPCS_CD_2,HCPCS_CD_3,HCPCS_CD_4,HCPCS_CD_5,HCPCS_CD_6,HCPCS_CD_7,HCPCS_CD_8,HCPCS_CD_9,HCPCS_CD_10,HCPCS_CD_11,HCPCS_CD_12,HCPCS_CD_13,HCPCS_CD_14,HCPCS_CD_15,HCPCS_CD_16,HCPCS_CD_17,HCPCS_CD_18,HCPCS_CD_19,HCPCS_CD_20,HCPCS_CD_21,HCPCS_CD_22,HCPCS_CD_23,HCPCS_CD_24,HCPCS_CD_25,HCPCS_CD_26,HCPCS_CD_27,HCPCS_CD_28,HCPCS_CD_29,HCPCS_CD_30,HCPCS_CD_31,HCPCS_CD_32,HCPCS_CD_33,HCPCS_CD_34,HCPCS_CD_35,HCPCS_CD_36,HCPCS_CD_37,HCPCS_CD_38,HCPCS_CD_39,HCPCS_CD_40,HCPCS_CD_41,HCPCS_CD_42,HCPCS_CD_43,HCPCS_CD_44,HCPCS_CD_45,dgns_cd_1_desc,dgns_cd_2_desc,dgns_cd_3_desc,dgns_cd_4_desc,dgns_cd_5_desc,dgns_cd_6_desc,dgns_cd_7_desc,dgns_cd_8_desc,dgns_cd_9_desc,dgns_cd_10_desc
0,00013D2EFD8E45D1,196661176988405,1,2010-03-12,2010-03-13,2600GD,4000.0,0.0,3139084000.0,,,20100312,4580,0.0,1100.0,0.0,0.0,1.0,20100313,217,7802,78820,V4501,4280,2720,4019,V4502,73300,E9330,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Syncope and collapse,"Retention of urine, unspecified",Cardiac pacemaker in situ,"Congestive heart failure, unspecified",Pure hypercholesterolemia,Unspecified essential hypertension,Automatic implantable cardiac defibrillator in...,"Osteoporosis, unspecified",Antiallergic and antiemetic drugs causing adve...,


#### Merging ICD Procedures description to inpatient claims DF

In [70]:
# Wall-clock checkpoint before merging the procedure descriptions; shown as the cell output.
datetime.now()

datetime.datetime(2020, 5, 10, 14, 51, 31, 564099)

In [71]:
# Attach the ICD-9 procedure descriptions to the claims, row by row.
# The lookup frame has one row per claim in the same order, so this is an index
# (positional) join. Passing `on=` together with `left_index=True, right_index=True`
# to `merge` is contradictory and current pandas rejects it with a MergeError, so
# join on the index only and verify the shared key column explicitly.
if not InpatientClaimsDF['DESYNPUF_ID'].equals(ICDProc_descDF_inpt['DESYNPUF_ID']):
    # A mismatch means the SQL result is not in claim order; joining would
    # silently attach descriptions to the wrong claims.
    raise ValueError('ICDProc_descDF_inpt is not row-aligned with InpatientClaimsDF '
                     '(index or DESYNPUF_ID differs)')

# Drop the duplicate key column from the right side so only one DESYNPUF_ID remains.
InpatientClaimsDF = InpatientClaimsDF.join(ICDProc_descDF_inpt.drop(columns='DESYNPUF_ID'),
                                           how='inner')

In [72]:
# Wall-clock checkpoint after merging the procedure descriptions; shown as the cell output.
datetime.now()

datetime.datetime(2020, 5, 10, 14, 51, 32, 88727)

In [73]:
# Shape and first row after the procedure merge: same row count, 6 extra description columns.
print(InpatientClaimsDF.shape)
InpatientClaimsDF.head(1)

(133267, 97)


Unnamed: 0,DESYNPUF_ID,CLM_ID,SEGMENT,CLM_FROM_DT,CLM_THRU_DT,PRVDR_NUM,CLM_PMT_AMT,NCH_PRMRY_PYR_CLM_PD_AMT,AT_PHYSN_NPI,OP_PHYSN_NPI,OT_PHYSN_NPI,CLM_ADMSN_DT,ADMTNG_ICD9_DGNS_CD,CLM_PASS_THRU_PER_DIEM_AMT,NCH_BENE_IP_DDCTBL_AMT,NCH_BENE_PTA_COINSRNC_LBLTY_AM,NCH_BENE_BLOOD_DDCTBL_LBLTY_AM,CLM_UTLZTN_DAY_CNT,NCH_BENE_DSCHRG_DT,CLM_DRG_CD,ICD9_DGNS_CD_1,ICD9_DGNS_CD_2,ICD9_DGNS_CD_3,ICD9_DGNS_CD_4,ICD9_DGNS_CD_5,ICD9_DGNS_CD_6,ICD9_DGNS_CD_7,ICD9_DGNS_CD_8,ICD9_DGNS_CD_9,ICD9_DGNS_CD_10,ICD9_PRCDR_CD_1,ICD9_PRCDR_CD_2,ICD9_PRCDR_CD_3,ICD9_PRCDR_CD_4,ICD9_PRCDR_CD_5,ICD9_PRCDR_CD_6,HCPCS_CD_1,HCPCS_CD_2,HCPCS_CD_3,HCPCS_CD_4,HCPCS_CD_5,HCPCS_CD_6,HCPCS_CD_7,HCPCS_CD_8,HCPCS_CD_9,HCPCS_CD_10,HCPCS_CD_11,HCPCS_CD_12,HCPCS_CD_13,HCPCS_CD_14,HCPCS_CD_15,HCPCS_CD_16,HCPCS_CD_17,HCPCS_CD_18,HCPCS_CD_19,HCPCS_CD_20,HCPCS_CD_21,HCPCS_CD_22,HCPCS_CD_23,HCPCS_CD_24,HCPCS_CD_25,HCPCS_CD_26,HCPCS_CD_27,HCPCS_CD_28,HCPCS_CD_29,HCPCS_CD_30,HCPCS_CD_31,HCPCS_CD_32,HCPCS_CD_33,HCPCS_CD_34,HCPCS_CD_35,HCPCS_CD_36,HCPCS_CD_37,HCPCS_CD_38,HCPCS_CD_39,HCPCS_CD_40,HCPCS_CD_41,HCPCS_CD_42,HCPCS_CD_43,HCPCS_CD_44,HCPCS_CD_45,dgns_cd_1_desc,dgns_cd_2_desc,dgns_cd_3_desc,dgns_cd_4_desc,dgns_cd_5_desc,dgns_cd_6_desc,dgns_cd_7_desc,dgns_cd_8_desc,dgns_cd_9_desc,dgns_cd_10_desc,prcdr_cd_1_desc,prcdr_cd_2_desc,prcdr_cd_3_desc,prcdr_cd_4_desc,prcdr_cd_5_desc,prcdr_cd_6_desc
0,00013D2EFD8E45D1,196661176988405,1,2010-03-12,2010-03-13,2600GD,4000.0,0.0,3139084000.0,,,20100312,4580,0.0,1100.0,0.0,0.0,1.0,20100313,217,7802,78820,V4501,4280,2720,4019,V4502,73300,E9330,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Syncope and collapse,"Retention of urine, unspecified",Cardiac pacemaker in situ,"Congestive heart failure, unspecified",Pure hypercholesterolemia,Unspecified essential hypertension,Automatic implantable cardiac defibrillator in...,"Osteoporosis, unspecified",Antiallergic and antiemetic drugs causing adve...,,,,,,,


#### Merging HCPCS description to inpatient claims DF

In [74]:
# Wall-clock checkpoint before merging the HCPCS descriptions; shown as the cell output.
datetime.now()

datetime.datetime(2020, 5, 10, 14, 51, 32, 155052)

In [75]:
# Attach the HCPCS descriptions to the claims, row by row.
# The lookup frame has one row per claim in the same order, so this is an index
# (positional) join. Passing `on=` together with `left_index=True, right_index=True`
# to `merge` is contradictory and current pandas rejects it with a MergeError, so
# join on the index only and verify the shared key column explicitly.
if not InpatientClaimsDF['DESYNPUF_ID'].equals(hcpcsDF_inpt['DESYNPUF_ID']):
    # A mismatch means the SQL result is not in claim order; joining would
    # silently attach descriptions to the wrong claims.
    raise ValueError('hcpcsDF_inpt is not row-aligned with InpatientClaimsDF '
                     '(index or DESYNPUF_ID differs)')

# Drop the duplicate key column from the right side so only one DESYNPUF_ID remains.
InpatientClaimsDF = InpatientClaimsDF.join(hcpcsDF_inpt.drop(columns='DESYNPUF_ID'),
                                           how='inner')

In [76]:
# Wall-clock checkpoint after merging the HCPCS descriptions; shown as the cell output.
datetime.now()

datetime.datetime(2020, 5, 10, 14, 51, 32, 644363)

In [77]:
# Shape and first row after the HCPCS merge: same row count, 11 extra description columns.
print(InpatientClaimsDF.shape)
InpatientClaimsDF.head(1)

(133267, 108)


Unnamed: 0,DESYNPUF_ID,CLM_ID,SEGMENT,CLM_FROM_DT,CLM_THRU_DT,PRVDR_NUM,CLM_PMT_AMT,NCH_PRMRY_PYR_CLM_PD_AMT,AT_PHYSN_NPI,OP_PHYSN_NPI,OT_PHYSN_NPI,CLM_ADMSN_DT,ADMTNG_ICD9_DGNS_CD,CLM_PASS_THRU_PER_DIEM_AMT,NCH_BENE_IP_DDCTBL_AMT,NCH_BENE_PTA_COINSRNC_LBLTY_AM,NCH_BENE_BLOOD_DDCTBL_LBLTY_AM,CLM_UTLZTN_DAY_CNT,NCH_BENE_DSCHRG_DT,CLM_DRG_CD,ICD9_DGNS_CD_1,ICD9_DGNS_CD_2,ICD9_DGNS_CD_3,ICD9_DGNS_CD_4,ICD9_DGNS_CD_5,ICD9_DGNS_CD_6,ICD9_DGNS_CD_7,ICD9_DGNS_CD_8,ICD9_DGNS_CD_9,ICD9_DGNS_CD_10,ICD9_PRCDR_CD_1,ICD9_PRCDR_CD_2,ICD9_PRCDR_CD_3,ICD9_PRCDR_CD_4,ICD9_PRCDR_CD_5,ICD9_PRCDR_CD_6,HCPCS_CD_1,HCPCS_CD_2,HCPCS_CD_3,HCPCS_CD_4,HCPCS_CD_5,HCPCS_CD_6,HCPCS_CD_7,HCPCS_CD_8,HCPCS_CD_9,HCPCS_CD_10,HCPCS_CD_11,HCPCS_CD_12,HCPCS_CD_13,HCPCS_CD_14,HCPCS_CD_15,HCPCS_CD_16,HCPCS_CD_17,HCPCS_CD_18,HCPCS_CD_19,HCPCS_CD_20,HCPCS_CD_21,HCPCS_CD_22,HCPCS_CD_23,HCPCS_CD_24,HCPCS_CD_25,HCPCS_CD_26,HCPCS_CD_27,HCPCS_CD_28,HCPCS_CD_29,HCPCS_CD_30,HCPCS_CD_31,HCPCS_CD_32,HCPCS_CD_33,HCPCS_CD_34,HCPCS_CD_35,HCPCS_CD_36,HCPCS_CD_37,HCPCS_CD_38,HCPCS_CD_39,HCPCS_CD_40,HCPCS_CD_41,HCPCS_CD_42,HCPCS_CD_43,HCPCS_CD_44,HCPCS_CD_45,dgns_cd_1_desc,dgns_cd_2_desc,dgns_cd_3_desc,dgns_cd_4_desc,dgns_cd_5_desc,dgns_cd_6_desc,dgns_cd_7_desc,dgns_cd_8_desc,dgns_cd_9_desc,dgns_cd_10_desc,prcdr_cd_1_desc,prcdr_cd_2_desc,prcdr_cd_3_desc,prcdr_cd_4_desc,prcdr_cd_5_desc,prcdr_cd_6_desc,hcpcs_cd_1_desc,hcpcs_cd_2_desc,hcpcs_cd_3_desc,hcpcs_cd_4_desc,hcpcs_cd_5_desc,hcpcs_cd_6_desc,hcpcs_cd_7_desc,hcpcs_cd_8_desc,hcpcs_cd_9_desc,hcpcs_cd_10_desc,hcpcs_cd_11_desc
0,00013D2EFD8E45D1,196661176988405,1,2010-03-12,2010-03-13,2600GD,4000.0,0.0,3139084000.0,,,20100312,4580,0.0,1100.0,0.0,0.0,1.0,20100313,217,7802,78820,V4501,4280,2720,4019,V4502,73300,E9330,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Syncope and collapse,"Retention of urine, unspecified",Cardiac pacemaker in situ,"Congestive heart failure, unspecified",Pure hypercholesterolemia,Unspecified essential hypertension,Automatic implantable cardiac defibrillator in...,"Osteoporosis, unspecified",Antiallergic and antiemetic drugs causing adve...,,,,,,,,,,,,,,,,,,


### Data Cleaning for prescription_drug_events table (date time format)

In [78]:
# Read the whole prescription_drug_events table into memory and preview one row.
# NOTE(review): per the shape printed in the next cell this is about 11.1M rows, and the frame is
# only inspected for dtypes before being released — a LIMIT or a schema query would
# be far cheaper if no transformation is needed.
q = '''SELECT * FROM prescription_drug_events '''

drug_eventsDF = query_func(q, conn)
drug_eventsDF.head(1) 

Unnamed: 0,DESYNPUF_ID,PDE_ID,SRVC_DT,PROD_SRVC_ID,QTY_DSPNSD_NUM,DAYS_SUPLY_NUM,PTNT_PAY_AMT,TOT_RX_CST_AMT
0,00013D2EFD8E45D1,233664490397622,2008-01-03,247037252,30.0,20,10.0,120.0


In [79]:
# Report (rows, columns) and per-column dtypes; SRVC_DT already arrives as datetime64.
print(drug_eventsDF.shape)
drug_eventsDF.dtypes

(11113575, 8)


DESYNPUF_ID               object
PDE_ID                     int64
SRVC_DT           datetime64[ns]
PROD_SRVC_ID              object
QTY_DSPNSD_NUM           float64
DAYS_SUPLY_NUM             int64
PTNT_PAY_AMT             float64
TOT_RX_CST_AMT           float64
dtype: object

### Adding these DataFrames to the Database (overwriting)

In [80]:
# Create a SQLAlchemy engine from a URL that names the server but no database.
# NOTE(review): this engine is superseded a few cells later by one built from
# conn_str (which includes the database name); it is only used for the
# connect / commit / close sequence that follows.
conn_postgres = f'postgresql://{db_user}:{db_password}@{db_host}:{db_port}'
engine = sqlalchemy.engine.create_engine(conn_postgres)

In [81]:
# Open a connection from the SQLAlchemy engine created above.
connection = engine.connect()

In [82]:
# Send an explicit COMMIT on the SQLAlchemy connection.
# Raw SQL strings must be wrapped in sqlalchemy.text(): passing a plain str to
# Connection.execute() is deprecated in SQLAlchemy 1.4 and rejected in 2.0.
connection.execute(sqlalchemy.text('commit'))

<sqlalchemy.engine.result.ResultProxy at 0x120f13c10>

In [83]:
# Close the SQLAlchemy connection opened above; the engine itself stays usable.
connection.close()

In [84]:
# Rebuild the DSN for the target database and open a fresh psycopg2 connection.
# `conn` is rebound here, so the connection opened at the top of the notebook is no
# longer referenced by this name.
# NOTE(review): the previous connection is not closed explicitly — consider calling
# conn.close() before rebinding.
conn_str = f'postgresql://{db_user}:{db_password}@{db_host}:{db_port}/{database}'
conn = psycopg2.connect(conn_str)

In [85]:
# Create the SQLAlchemy engine bound to the target database; this replaces the
# earlier database-less engine and is the `con` used by the to_sql() calls below.
engine = sqlalchemy.engine.create_engine(conn_str)

### Loading files to the Database

##### Beneficiary files

In [86]:
# Wall-clock checkpoint before the (currently commented-out) beneficiary table writes.
datetime.now()

datetime.datetime(2020, 5, 10, 14, 52, 58, 540781)

In [87]:
# No changes hence no need to replace them

#Beneficiary2008DF.to_sql(name = 'beneficiary2008', con = engine, if_exists = 'replace', index = False)
#Beneficiary2009DF.to_sql(name = 'beneficiary2009', con = engine, if_exists = 'replace', index = False)
#Beneficiary2010DF.to_sql(name = 'beneficiary2010', con = engine, if_exists = 'replace', index = False) 

In [88]:
# Wall-clock checkpoint after the (currently commented-out) beneficiary table writes.
datetime.now()

datetime.datetime(2020, 5, 10, 14, 52, 58, 554181)

#### ICD lookup files

In [89]:
# No changes hence no need to replace them

#ICD9_DiagonsisDF.to_sql(name = 'icd9_diagonsis', con = engine, if_exists = 'replace', index = False)
#ICD9_ProcedureDF.to_sql(name = 'icd9_procedures', con = engine, if_exists = 'replace', index = False)

#### HCPCS lookup

In [90]:
# No changes hence no need to replace them

#hcpcsDF.to_sql(name = 'hcpcs', con = engine, if_exists = 'replace', index = False)

#### Prescription Events files

In [91]:
# No changes hence no need to replace them

# Caution: large file (0.5GB) - it takes approx 15 mins for this file alone

#drug_eventsDF.to_sql(name = 'prescription_drug_events', con = engine, if_exists = 'replace', index = False)

#### Deleting DataFrames and freeing up RAM

In [92]:
# Commit on the psycopg2 connection `conn` (the one used for the read_sql queries).
# NOTE(review): presumably this ends the open read transaction so it holds no locks
# on tables that to_sql(if_exists='replace') is about to drop — confirm. It does not
# affect writes made through `engine`, which is a separate SQLAlchemy connection.
conn.commit()

In [93]:
# Unbind the large frames that are no longer needed so their memory can be reclaimed.
del Beneficiary2008DF, Beneficiary2009DF, Beneficiary2010DF
del ICD9_DiagonsisDF, ICD9_ProcedureDF, hcpcsDF
del drug_eventsDF

# (An explicit gc.collect() could follow here; it is left disabled.)

# Re-create every name as an empty frame so later references do not raise NameError.
Beneficiary2008DF, Beneficiary2009DF, Beneficiary2010DF = pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
ICD9_DiagonsisDF, ICD9_ProcedureDF, hcpcsDF = pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
drug_eventsDF = pd.DataFrame()

#### Inpatient files

In [94]:
# Wall-clock checkpoint before writing the inpatient claims table; shown as the cell output.
datetime.now()

datetime.datetime(2020, 5, 10, 14, 52, 58, 609379)

In [95]:
# Takes approx 2 mins
# Write the enriched claims frame back through the SQLAlchemy engine.
# if_exists='replace' drops and recreates the `inpatient_claims` table, and
# index=False keeps the RangeIndex out of the table.
# NOTE(review): the lookup queries above read from this same table; after this
# write it also contains the description columns, so re-running the notebook will
# start from the enriched table rather than the raw one — confirm that is intended.
InpatientClaimsDF.to_sql(name = 'inpatient_claims', con = engine, if_exists = 'replace', index = False)

In [96]:
# Wall-clock checkpoint after writing the inpatient claims table; shown as the cell output.
datetime.now()

datetime.datetime(2020, 5, 10, 14, 54, 34, 950431)

In [97]:
# Commit on the psycopg2 connection `conn`.
# NOTE(review): the to_sql() write above went through `engine`, not `conn`, so this
# commit does not act on that write — confirm whether it is still needed.
conn.commit()

#### Deleting DataFrames and freeing up RAM

In [98]:
# Unbind the inpatient claims frame now that it has been written to the database.
del InpatientClaimsDF

# (An explicit gc.collect() could follow here; it is left disabled.)

# Leave an empty placeholder under the same name.
InpatientClaimsDF = pd.DataFrame()

#### Outpatient file

In [99]:
# Wall-clock checkpoint before writing the outpatient claims table; shown as the cell output.
datetime.now()

datetime.datetime(2020, 5, 10, 14, 54, 35, 140061)

In [None]:
# Should take approx 15 mins to load (when data is for 1 sample folder)
# Write the outpatient claims frame through the SQLAlchemy engine, dropping and
# recreating the `outpatient_claims` table; index=False keeps the index out of the table.

OutpatientClaimsDF.to_sql(name = 'outpatient_claims', con = engine, if_exists = 'replace', index = False)

In [None]:
# Wall-clock checkpoint after writing the outpatient claims table.
datetime.now()

In [None]:
# Commit on the psycopg2 connection `conn`.
# NOTE(review): the to_sql() write above went through `engine`, not `conn`, so this
# commit does not act on that write — confirm whether it is still needed.
conn.commit()

In [None]:
# Unbind the outpatient claims frame now that it has been written to the database.
del OutpatientClaimsDF
# (An explicit gc.collect() could follow here; it is left disabled.)

# Leave an empty placeholder under the same name.
OutpatientClaimsDF = pd.DataFrame()

In [6]:
# Final commit on the psycopg2 connection `conn`.
# NOTE(review): the execution count restarts here (In [6]), so this cell was run in a
# different kernel session from the cells above — `conn` must have been re-created first.
conn.commit()   #engine.commit()

In [7]:
# Wall-clock checkpoint at the end of the notebook; shown as the cell output.
datetime.now()

datetime.datetime(2020, 5, 10, 17, 6, 35, 872680)