In [44]:
import pandas as pd
import numpy as np

import time

import sqlalchemy
import psycopg2

from tqdm import tqdm
from datetime import datetime

pd.set_option('display.max_columns', 500)   # to display 500 columns
pd.set_option('display.max_rows', 500) # to display 500 rows

In [45]:
#![](FileStructure.png)
#from IPython.display import Image
#Image(filename='FileStructure.png')

In [46]:
#import psycopg2

# DSN (data source name) format for database connections:  
# [protocol / database  name]://[username]:[password]@[hostname / ip]:[port]/[database name here]

# on your computer you are the user postgres (full administrative access)
db_user = 'postgres'
# if you need a password to access a database, put it here
db_password = ''
# on your computer, use localhost
db_host = 'localhost'
# the default port for postgres is 5432
db_port = 5432
# we want to connect to the northwind database
database =  'cms_claims' # 'cms_medicare_claims'

conn_str = f'postgresql://{db_user}:{db_password}@{db_host}:{db_port}/{database}'
conn = psycopg2.connect(conn_str)

In [47]:
# Check Tables in the Database
query = """
SELECT tablename 
FROM pg_catalog.pg_tables 
WHERE schemaname='public'
"""

pd.read_sql(query, con=conn)

Unnamed: 0,tablename
0,carrier_claims
1,hcpcs
2,inpatient_claims
3,outpatient_claims
4,beneficiary2009
5,beneficiary2008
6,beneficiary2010
7,icd9_diagonsis
8,icd9_procedures
9,prescription_drug_events


In [48]:
# to view All tables and columns
''' query = """
SELECT table_name, column_name, data_type, table_schema
FROM information_schema.columns
WHERE table_schema = 'public'
order by table_name
"""
pd.read_sql(query, con=conn) '''

' query = """\nSELECT table_name, column_name, data_type, table_schema\nFROM information_schema.columns\nWHERE table_schema = \'public\'\norder by table_name\n"""\npd.read_sql(query, con=conn) '

#### Function

In [49]:
def query_func(query, conn):
    df = pd.read_sql(query , con=conn)
    return df

### Data processing & cleaning (Datetime conversion) for Benefeciary tables

In [50]:
q = '''SELECT * FROM beneficiary2008 '''

Beneficiary2008DF = query_func(q, conn)

print(Beneficiary2008DF.shape)
Beneficiary2008DF.head(1)

(116352, 32)


Unnamed: 0,DESYNPUF_ID,BENE_BIRTH_DT,BENE_DEATH_DT,BENE_SEX_IDENT_CD,BENE_RACE_CD,BENE_ESRD_IND,SP_STATE_CODE,BENE_COUNTY_CD,BENE_HI_CVRAGE_TOT_MONS,BENE_SMI_CVRAGE_TOT_MONS,BENE_HMO_CVRAGE_TOT_MONS,PLAN_CVRG_MOS_NUM,SP_ALZHDMTA,SP_CHF,SP_CHRNKIDN,SP_CNCR,SP_COPD,SP_DEPRESSN,SP_DIABETES,SP_ISCHMCHT,SP_OSTEOPRS,SP_RA_OA,SP_STRKETIA,MEDREIMB_IP,BENRES_IP,PPPYMT_IP,MEDREIMB_OP,BENRES_OP,PPPYMT_OP,MEDREIMB_CAR,BENRES_CAR,PPPYMT_CAR
0,00013D2EFD8E45D1,1923-05-01,NaT,1,1,0,26,950,12,12,12,12,2,2,2,2,2,2,2,2,2,2,2,0.0,0.0,0.0,50.0,10.0,0.0,0.0,0.0,0.0


In [51]:
Beneficiary2008DF.SP_ISCHMCHT.value_counts(normalize=True)  # 1 means have disease, 2 means no disease

2    0.579363
1    0.420637
Name: SP_ISCHMCHT, dtype: float64

In [52]:
# working with date time conversion
print(type(Beneficiary2008DF['BENE_BIRTH_DT'][7]))
print(type(datetime.strptime(str(Beneficiary2008DF['BENE_BIRTH_DT'][100]), '%Y%m%d')))
print(datetime.strptime(str(Beneficiary2008DF['BENE_BIRTH_DT'][100]), '%Y%m%d'))
datetime.strptime(str(Beneficiary2008DF['BENE_BIRTH_DT'][100]), '%Y%m%d')

<class 'pandas._libs.tslibs.timestamps.Timestamp'>


ValueError: time data '1942-05-01 00:00:00' does not match format '%Y%m%d'

In [None]:
Beneficiary2008DF['BENE_BIRTH_DT'] = pd.to_datetime(Beneficiary2008DF['BENE_BIRTH_DT'], format='%Y%m%d', errors='coerce')
Beneficiary2008DF['BENE_BIRTH_DT'].head(1)

In [None]:
Beneficiary2008DF['BENE_DEATH_DT'] = pd.to_datetime(Beneficiary2008DF['BENE_DEATH_DT'], format='%Y%m%d', errors='coerce')
Beneficiary2008DF['BENE_DEATH_DT'].head(1)

In [None]:
q = '''SELECT * FROM beneficiary2009 '''

Beneficiary2009DF = query_func(q, conn)

print(Beneficiary2009DF.shape)
Beneficiary2009DF.head(1)

In [None]:
# Converting  int to Datetime format
Beneficiary2009DF['BENE_BIRTH_DT'] = pd.to_datetime(Beneficiary2009DF['BENE_BIRTH_DT'], format='%Y%m%d', errors='coerce')
Beneficiary2009DF['BENE_DEATH_DT'] = pd.to_datetime(Beneficiary2009DF['BENE_DEATH_DT'], format='%Y%m%d', errors='coerce')

In [None]:
q = '''SELECT * FROM beneficiary2010 '''

Beneficiary2010DF = query_func(q, conn)

print(Beneficiary2010DF.shape)
Beneficiary2010DF.head(2)

In [None]:
# Converting  int to Datetime format
Beneficiary2010DF['BENE_BIRTH_DT'] = pd.to_datetime(Beneficiary2010DF['BENE_BIRTH_DT'], format='%Y%m%d', errors='coerce')
Beneficiary2010DF['BENE_DEATH_DT'] = pd.to_datetime(Beneficiary2010DF['BENE_DEATH_DT'], format='%Y%m%d', errors='coerce')

In [None]:
q = '''SELECT * FROM icd9_diagonsis '''

ICD9_DiagonsisDF = query_func(q, conn)
print(ICD9_DiagonsisDF.shape)
ICD9_DiagonsisDF.head(2)

In [None]:
ICD9_DiagonsisDF.dtypes

In [None]:
q = '''SELECT * FROM icd9_procedures '''

ICD9_ProcedureDF = query_func(q, conn)
print(ICD9_ProcedureDF.shape)
ICD9_ProcedureDF.tail(2)

In [53]:
ICD9_ProcedureDF.dtypes

procedure_cd     int64
long_desc       object
short_desc      object
dtype: object

In [54]:
#ICD9_DiagonsisDF[ICD9_DiagonsisDF.diagnosis_cd.notnull()].head(2)

NameError: name 'ICD9_DiagonsisDF' is not defined

In [55]:
q = '''SELECT * FROM hcpcs '''

hcpcsDF = query_func(q, conn)

print(hcpcsDF.shape)
hcpcsDF.head(2)

(15079, 2)


Unnamed: 0,HCPCS,DESCRIPTION
0,A0021,Outside state ambulance serv
1,A0080,Noninterest escort in non er


In [56]:
hcpcsDF.dtypes

HCPCS          object
DESCRIPTION    object
dtype: object

In [57]:
hcpcsDF.shape

(15079, 2)

### Data processing & cleaning for outpatient_claims

In [24]:
q = '''SELECT * FROM outpatient_claims '''

OutpatientClaimsDF = query_func(q, conn)
print(OutpatientClaimsDF.shape)
OutpatientClaimsDF.head(3)

(790790, 86)


Unnamed: 0,DESYNPUF_ID,CLM_ID,SEGMENT,CLM_FROM_DT,CLM_THRU_DT,PRVDR_NUM,CLM_PMT_AMT,NCH_PRMRY_PYR_CLM_PD_AMT,AT_PHYSN_NPI,OP_PHYSN_NPI,OT_PHYSN_NPI,NCH_BENE_BLOOD_DDCTBL_LBLTY_AM,ICD9_DGNS_CD_1,ICD9_DGNS_CD_2,ICD9_DGNS_CD_3,ICD9_DGNS_CD_4,ICD9_DGNS_CD_5,ICD9_DGNS_CD_6,ICD9_DGNS_CD_7,ICD9_DGNS_CD_8,ICD9_DGNS_CD_9,ICD9_DGNS_CD_10,ICD9_PRCDR_CD_1,ICD9_PRCDR_CD_2,ICD9_PRCDR_CD_3,ICD9_PRCDR_CD_4,ICD9_PRCDR_CD_5,ICD9_PRCDR_CD_6,NCH_BENE_PTB_DDCTBL_AMT,NCH_BENE_PTB_COINSRNC_AMT,ADMTNG_ICD9_DGNS_CD,HCPCS_CD_1,HCPCS_CD_2,HCPCS_CD_3,HCPCS_CD_4,HCPCS_CD_5,HCPCS_CD_6,HCPCS_CD_7,HCPCS_CD_8,HCPCS_CD_9,HCPCS_CD_10,HCPCS_CD_11,HCPCS_CD_12,HCPCS_CD_13,HCPCS_CD_14,HCPCS_CD_15,HCPCS_CD_16,HCPCS_CD_17,HCPCS_CD_18,HCPCS_CD_19,HCPCS_CD_20,HCPCS_CD_21,HCPCS_CD_22,HCPCS_CD_23,HCPCS_CD_24,HCPCS_CD_25,HCPCS_CD_26,HCPCS_CD_27,HCPCS_CD_28,HCPCS_CD_29,HCPCS_CD_30,HCPCS_CD_31,HCPCS_CD_32,HCPCS_CD_33,HCPCS_CD_34,HCPCS_CD_35,HCPCS_CD_36,HCPCS_CD_37,HCPCS_CD_38,HCPCS_CD_39,HCPCS_CD_40,HCPCS_CD_41,HCPCS_CD_42,HCPCS_CD_43,HCPCS_CD_44,HCPCS_CD_45,dgns_cd_1_desc,dgns_cd_2_desc,dgns_cd_3_desc,dgns_cd_4_desc,dgns_cd_5_desc,dgns_cd_6_desc,dgns_cd_7_desc,dgns_cd_8_desc,dgns_cd_9_desc,dgns_cd_10_desc
0,00013D2EFD8E45D1,542192281063886,1,2008-09-04,2008-09-04,2600RA,50.0,0.0,4824842000.0,,,0.0,V5841,,,,,,,,,,,,,,,,0.0,10.0,V5883,85610,84153,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Encounter for planned post-operative wound clo...,,,,,,,,,
1,00016F745862898F,542272281166593,1,2009-06-02,2009-06-02,3901GS,30.0,0.0,2963420000.0,,2963420000.0,0.0,V5832,V5861,2724.0,3182.0,V5869,42731.0,,,,,,,,,,,0.0,0.0,,85610,80048,80061.0,82306.0,96372.0,87088.0,85025.0,80076.0,84075.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Encounter for removal of sutures,Long-term (current) use of anticoagulants,Other and unspecified hyperlipidemia,Profound intellectual disabilities,Long-term (current) use of other medications,Atrial fibrillation,,,,
2,00016F745862898F,542282281644416,1,2009-06-23,2009-06-23,3939PG,30.0,0.0,5737808000.0,,5737808000.0,0.0,9594,E9174,4019.0,,,,,,,,,,,,,,0.0,70.0,,71101,78480,94060.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"Hand, except finger injury",Striking against or struck accidentally by oth...,Unspecified essential hypertension,,,,,,,


In [None]:
OutpatientClaimsDF[OutpatientClaimsDF.ICD9_PRCDR_CD_1.notnull()].head(1).index
#OutpatientClaimsDF.ICD9_PRCDR_CD_1.dtypes

In [None]:
#OutpatientClaimsDF['ICD9_PRCDR_CD_1'] = OutpatientClaimsDF['ICD9_PRCDR_CD_1'].replace(np.nan, 0)
# Now converting to int

In [None]:
# Working with date time conversion - Debug code

print(type(OutpatientClaimsDF['CLM_FROM_DT'][7]))
OutpatientClaimsDF['CLM_FROM_DT'][7]

# To get rid of .0 in the end lets convert it to int
print(type(int(OutpatientClaimsDF['CLM_FROM_DT'][7])))
int(OutpatientClaimsDF['CLM_FROM_DT'][7])

print(type(datetime.strptime(str(int(OutpatientClaimsDF['CLM_FROM_DT'][100])), '%Y%m%d')))
print(datetime.strptime(str(int(OutpatientClaimsDF['CLM_FROM_DT'][100])), '%Y%m%d'))

In [None]:
# Converting to Datetime format

datetime.strptime(str(int(OutpatientClaimsDF['CLM_FROM_DT'][100])), '%Y%m%d')
OutpatientClaimsDF['CLM_FROM_DT'] = pd.to_datetime(OutpatientClaimsDF['CLM_FROM_DT'], format='%Y%m%d', errors='coerce')
OutpatientClaimsDF['CLM_THRU_DT'] = pd.to_datetime(OutpatientClaimsDF['CLM_THRU_DT'], format='%Y%m%d', errors='coerce')

In [41]:
print(OutpatientClaimsDF.shape)
OutpatientClaimsDF.head(2)

(790790, 86)


Unnamed: 0,DESYNPUF_ID,CLM_ID,SEGMENT,CLM_FROM_DT,CLM_THRU_DT,PRVDR_NUM,CLM_PMT_AMT,NCH_PRMRY_PYR_CLM_PD_AMT,AT_PHYSN_NPI,OP_PHYSN_NPI,OT_PHYSN_NPI,NCH_BENE_BLOOD_DDCTBL_LBLTY_AM,ICD9_DGNS_CD_1,ICD9_DGNS_CD_2,ICD9_DGNS_CD_3,ICD9_DGNS_CD_4,ICD9_DGNS_CD_5,ICD9_DGNS_CD_6,ICD9_DGNS_CD_7,ICD9_DGNS_CD_8,ICD9_DGNS_CD_9,ICD9_DGNS_CD_10,ICD9_PRCDR_CD_1,ICD9_PRCDR_CD_2,ICD9_PRCDR_CD_3,ICD9_PRCDR_CD_4,ICD9_PRCDR_CD_5,ICD9_PRCDR_CD_6,NCH_BENE_PTB_DDCTBL_AMT,NCH_BENE_PTB_COINSRNC_AMT,ADMTNG_ICD9_DGNS_CD,HCPCS_CD_1,HCPCS_CD_2,HCPCS_CD_3,HCPCS_CD_4,HCPCS_CD_5,HCPCS_CD_6,HCPCS_CD_7,HCPCS_CD_8,HCPCS_CD_9,HCPCS_CD_10,HCPCS_CD_11,HCPCS_CD_12,HCPCS_CD_13,HCPCS_CD_14,HCPCS_CD_15,HCPCS_CD_16,HCPCS_CD_17,HCPCS_CD_18,HCPCS_CD_19,HCPCS_CD_20,HCPCS_CD_21,HCPCS_CD_22,HCPCS_CD_23,HCPCS_CD_24,HCPCS_CD_25,HCPCS_CD_26,HCPCS_CD_27,HCPCS_CD_28,HCPCS_CD_29,HCPCS_CD_30,HCPCS_CD_31,HCPCS_CD_32,HCPCS_CD_33,HCPCS_CD_34,HCPCS_CD_35,HCPCS_CD_36,HCPCS_CD_37,HCPCS_CD_38,HCPCS_CD_39,HCPCS_CD_40,HCPCS_CD_41,HCPCS_CD_42,HCPCS_CD_43,HCPCS_CD_44,HCPCS_CD_45,dgns_cd_1_desc,dgns_cd_2_desc,dgns_cd_3_desc,dgns_cd_4_desc,dgns_cd_5_desc,dgns_cd_6_desc,dgns_cd_7_desc,dgns_cd_8_desc,dgns_cd_9_desc,dgns_cd_10_desc
0,00013D2EFD8E45D1,542192281063886,1,2008-09-04,2008-09-04,2600RA,50.0,0.0,4824842000.0,,,0.0,V5841,,,,,,,,,,,,,,,,0.0,10.0,V5883,85610,84153,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Encounter for planned post-operative wound clo...,,,,,,,,,
1,00016F745862898F,542272281166593,1,2009-06-02,2009-06-02,3901GS,30.0,0.0,2963420000.0,,2963420000.0,0.0,V5832,V5861,2724.0,3182.0,V5869,42731.0,,,,,,,,,,,0.0,0.0,,85610,80048,80061.0,82306.0,96372.0,87088.0,85025.0,80076.0,84075.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Encounter for removal of sutures,Long-term (current) use of anticoagulants,Other and unspecified hyperlipidemia,Profound intellectual disabilities,Long-term (current) use of other medications,Atrial fibrillation,,,,


In [None]:
datetime.now()

#### Adding lookup for 10 Diagnostic codes for outpatient claims

In [None]:
q = '''SELECT  
            OPD."DESYNPUF_ID", 
            ICD9D1.long_desc as DGNS_CD_1_desc,
            ICD9D2.long_desc as DGNS_CD_2_desc,
            ICD9D3.long_desc as DGNS_CD_3_desc,
            ICD9D4.long_desc as DGNS_CD_4_desc,
            ICD9D5.long_desc as DGNS_CD_5_desc,
            ICD9D6.long_desc as DGNS_CD_6_desc,
            ICD9D7.long_desc as DGNS_CD_7_desc,
            ICD9D8.long_desc as DGNS_CD_8_desc,
            ICD9D9.long_desc as DGNS_CD_9_desc,
            ICD9D10.long_desc as DGNS_CD_10_desc
                 
       FROM 
                      outpatient_claims as OPD
            LEFT JOIN icd9_diagonsis as ICD9D1 ON OPD."ICD9_DGNS_CD_1" = ICD9D1.diagnosis_cd
            LEFT JOIN icd9_diagonsis as ICD9D2 ON OPD."ICD9_DGNS_CD_2" = ICD9D2.diagnosis_cd
            LEFT JOIN icd9_diagonsis as ICD9D3 ON OPD."ICD9_DGNS_CD_3" = ICD9D3.diagnosis_cd
            LEFT JOIN icd9_diagonsis as ICD9D4 ON OPD."ICD9_DGNS_CD_4" = ICD9D4.diagnosis_cd
            LEFT JOIN icd9_diagonsis as ICD9D5 ON OPD."ICD9_DGNS_CD_5" = ICD9D5.diagnosis_cd
            LEFT JOIN icd9_diagonsis as ICD9D6 ON OPD."ICD9_DGNS_CD_6" = ICD9D6.diagnosis_cd
            LEFT JOIN icd9_diagonsis as ICD9D7 ON OPD."ICD9_DGNS_CD_7" = ICD9D7.diagnosis_cd
            LEFT JOIN icd9_diagonsis as ICD9D8 ON OPD."ICD9_DGNS_CD_8" = ICD9D8.diagnosis_cd
            LEFT JOIN icd9_diagonsis as ICD9D9 ON OPD."ICD9_DGNS_CD_9" = ICD9D9.diagnosis_cd
            LEFT JOIN icd9_diagonsis as ICD9D10 ON OPD."ICD9_DGNS_CD_10" = ICD9D10.diagnosis_cd
            ;
    '''

ICD_descDF = query_func(q, conn)
ICD_descDF.head(1)

In [None]:
print(ICD_descDF.shape)
ICD_descDF.head(10)

In [None]:
print(OutpatientClaimsDF.shape)
OutpatientClaimsDF.head(10)

In [None]:
datetime.now()

#### Procedure code lookup for outpatient claims

In [43]:
q = '''SELECT  
            OPD."DESYNPUF_ID", 
            ICD9P1.long_desc as PRCDR_CD_1_desc,
            ICD9P2.long_desc as PRCDR_CD_2_desc,
            ICD9P3.long_desc as PRCDR_CD_3_desc,
            ICD9P4.long_desc as PRCDR_CD_4_desc,
            ICD9P5.long_desc as PRCDR_CD_5_desc,
            ICD9P6.long_desc as PRCDR_CD_6_desc
                 
       FROM 
                      outpatient_claims as OPD
            LEFT JOIN icd9_procedures as ICD9P1 ON CAST(OPD."ICD9_PRCDR_CD_1" as varchar) = CAST(ICD9P1.procedure_cd as varchar)
            LEFT JOIN icd9_procedures as ICD9P2 ON CAST(OPD."ICD9_PRCDR_CD_2" as varchar) = CAST(ICD9P2.procedure_cd as varchar)
            LEFT JOIN icd9_procedures as ICD9P3 ON CAST(OPD."ICD9_PRCDR_CD_3" as varchar) = CAST(ICD9P3.procedure_cd as varchar)
            LEFT JOIN icd9_procedures as ICD9P4 ON CAST(OPD."ICD9_PRCDR_CD_4" as varchar) = CAST(ICD9P4.procedure_cd as varchar)
            LEFT JOIN icd9_procedures as ICD9P5 ON CAST(OPD."ICD9_PRCDR_CD_5" as varchar) = CAST(ICD9P5.procedure_cd as varchar)
            LEFT JOIN icd9_procedures as ICD9P6 ON CAST(OPD."ICD9_PRCDR_CD_6" as varchar) = CAST(ICD9P6.procedure_cd as varchar)
            ;
    '''

ICDProc_descDF_outpt = query_func(q, conn)
ICDProc_descDF_outpt.head(2)

Unnamed: 0,DESYNPUF_ID,prcdr_cd_1_desc,prcdr_cd_2_desc,prcdr_cd_3_desc,prcdr_cd_4_desc,prcdr_cd_5_desc,prcdr_cd_6_desc
0,5A3F4B7A17890666,Excision of lesion or tissue of cerebral meninges,,,,,
1,61944ED4227DE21A,Other operations on extraocular muscles and te...,,,,,


#### HCPCS code lookup for outpatient claims

In [None]:
datetime.now()

In [None]:
q = '''SELECT  
            OPD."DESYNPUF_ID", 
            h1."DESCRIPTION" as hcpcs_CD_1_desc,
            h2."DESCRIPTION" as hcpcs_CD_2_desc,
            h3."DESCRIPTION" as hcpcs_CD_3_desc,
            h4."DESCRIPTION" as hcpcs_CD_4_desc,
            h5."DESCRIPTION" as hcpcs_CD_5_desc,
            h6."DESCRIPTION" as hcpcs_CD_6_desc,
            h7."DESCRIPTION" as hcpcs_CD_7_desc,
            h8."DESCRIPTION" as hcpcs_CD_8_desc,
            h9."DESCRIPTION" as hcpcs_CD_9_desc,
            h10."DESCRIPTION" as hcpcs_CD_10_desc,
            h11."DESCRIPTION" as hcpcs_CD_11_desc
           
       FROM 
                      outpatient_claims as OPD
            LEFT JOIN hcpcs as h1 ON OPD."HCPCS_CD_1" = h1."HCPCS"
            LEFT JOIN hcpcs as h2 ON OPD."HCPCS_CD_2" = h2."HCPCS"
            LEFT JOIN hcpcs as h3 ON OPD."HCPCS_CD_3" = h3."HCPCS"
            LEFT JOIN hcpcs as h4 ON OPD."HCPCS_CD_4" = h4."HCPCS"
            LEFT JOIN hcpcs as h5 ON OPD."HCPCS_CD_5" = h5."HCPCS"
            LEFT JOIN hcpcs as h6 ON OPD."HCPCS_CD_6" = h6."HCPCS"
            LEFT JOIN hcpcs as h7 ON OPD."HCPCS_CD_7" = h7."HCPCS"
            LEFT JOIN hcpcs as h8 ON OPD."HCPCS_CD_8" = h8."HCPCS"
            LEFT JOIN hcpcs as h9 ON OPD."HCPCS_CD_9" = h9."HCPCS"
            LEFT JOIN hcpcs as h10 ON OPD."HCPCS_CD_10" = h10."HCPCS"
            LEFT JOIN hcpcs as h11 ON OPD."HCPCS_CD_11" = h11."HCPCS"
           
            ;
    '''

hcpcsDF = query_func(q, conn)
hcpcsDF.head(10)

In [None]:
#hcpcsDF.head(15)

#### Merging ICD diagnostic description to outpatient claims DF

In [None]:
OutpatientClaimsDF.index

In [None]:
ICD_descDF.index

In [None]:
#OutpatientClaimsDF = pd.merge(OutpatientClaimsDF, ICD_descDF, on='DESYNPUF_ID')

In [None]:
OutpatientClaimsDF = OutpatientClaimsDF.merge(ICD_descDF, on='DESYNPUF_ID', 
                                              how='inner',  left_index=True, right_index=True)

In [None]:
datetime.now()  

In [None]:
print(OutpatientClaimsDF.shape)
OutpatientClaimsDF.head(2)

### Data processing & cleaning for inpatient_claims

In [25]:
q = '''SELECT * FROM inpatient_claims '''

InpatientClaimsDF = query_func(q, conn)
print(InpatientClaimsDF.shape)
InpatientClaimsDF.head(2)

(178561, 101)


Unnamed: 0,DESYNPUF_ID,CLM_ID,SEGMENT,CLM_FROM_DT,CLM_THRU_DT,PRVDR_NUM,CLM_PMT_AMT,NCH_PRMRY_PYR_CLM_PD_AMT,AT_PHYSN_NPI,OP_PHYSN_NPI,OT_PHYSN_NPI,CLM_ADMSN_DT,ADMTNG_ICD9_DGNS_CD,CLM_PASS_THRU_PER_DIEM_AMT,NCH_BENE_IP_DDCTBL_AMT,NCH_BENE_PTA_COINSRNC_LBLTY_AM,NCH_BENE_BLOOD_DDCTBL_LBLTY_AM,CLM_UTLZTN_DAY_CNT,NCH_BENE_DSCHRG_DT,CLM_DRG_CD,ICD9_DGNS_CD_1,ICD9_DGNS_CD_2,ICD9_DGNS_CD_3,ICD9_DGNS_CD_4,ICD9_DGNS_CD_5,ICD9_DGNS_CD_6,ICD9_DGNS_CD_7,ICD9_DGNS_CD_8,ICD9_DGNS_CD_9,ICD9_DGNS_CD_10,ICD9_PRCDR_CD_1,ICD9_PRCDR_CD_2,ICD9_PRCDR_CD_3,ICD9_PRCDR_CD_4,ICD9_PRCDR_CD_5,ICD9_PRCDR_CD_6,HCPCS_CD_1,HCPCS_CD_2,HCPCS_CD_3,HCPCS_CD_4,HCPCS_CD_5,HCPCS_CD_6,HCPCS_CD_7,HCPCS_CD_8,HCPCS_CD_9,HCPCS_CD_10,HCPCS_CD_11,HCPCS_CD_12,HCPCS_CD_13,HCPCS_CD_14,HCPCS_CD_15,HCPCS_CD_16,HCPCS_CD_17,HCPCS_CD_18,HCPCS_CD_19,HCPCS_CD_20,HCPCS_CD_21,HCPCS_CD_22,HCPCS_CD_23,HCPCS_CD_24,HCPCS_CD_25,HCPCS_CD_26,HCPCS_CD_27,HCPCS_CD_28,HCPCS_CD_29,HCPCS_CD_30,HCPCS_CD_31,HCPCS_CD_32,HCPCS_CD_33,HCPCS_CD_34,HCPCS_CD_35,HCPCS_CD_36,HCPCS_CD_37,HCPCS_CD_38,HCPCS_CD_39,HCPCS_CD_40,HCPCS_CD_41,HCPCS_CD_42,HCPCS_CD_43,HCPCS_CD_44,HCPCS_CD_45,dgns_cd_1_desc_x,dgns_cd_2_desc_x,dgns_cd_3_desc_x,dgns_cd_4_desc_x,dgns_cd_5_desc_x,dgns_cd_6_desc_x,dgns_cd_7_desc_x,dgns_cd_8_desc_x,dgns_cd_9_desc_x,dgns_cd_10_desc_x,dgns_cd_1_desc_y,dgns_cd_2_desc_y,dgns_cd_3_desc_y,dgns_cd_4_desc_y,dgns_cd_5_desc_y,dgns_cd_6_desc_y,dgns_cd_7_desc_y,dgns_cd_8_desc_y,dgns_cd_9_desc_y,dgns_cd_10_desc_y
0,00013D2EFD8E45D1,196661176988405,1,2010-03-12,2010-03-13,2600GD,4000.0,0.0,3139084000.0,,,20100312,4580,0.0,1100.0,0.0,0.0,1.0,20100313,217,7802,78820,V4501,4280,2720,4019,V4502,73300,E9330,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Syncope and collapse,"Retention of urine, unspecified",Cardiac pacemaker in situ,"Congestive heart failure, unspecified",Pure hypercholesterolemia,Unspecified essential hypertension,Automatic implantable cardiac defibrillator in...,"Osteoporosis, unspecified",Antiallergic and antiemetic drugs causing adve...,,Syncope and collapse,"Retention of urine, unspecified",Cardiac pacemaker in situ,"Congestive heart failure, unspecified",Pure hypercholesterolemia,Unspecified essential hypertension,Automatic implantable cardiac defibrillator in...,"Osteoporosis, unspecified",Antiallergic and antiemetic drugs causing adve...,
1,00016F745862898F,196201177000368,1,2009-04-12,2009-04-18,3900MB,26000.0,0.0,6476809000.0,,,20090412,7866,0.0,1068.0,0.0,0.0,6.0,20090418,201,1970,4019,5853,7843,2768,71590,2724,19889,5849,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Secondary malignant neoplasm of lung,Unspecified essential hypertension,"Chronic kidney disease, Stage III (moderate)",Aphasia,Hypopotassemia,"Osteoarthrosis, unspecified whether generalize...",Other and unspecified hyperlipidemia,Secondary malignant neoplasm of other specifie...,"Acute kidney failure, unspecified",,Secondary malignant neoplasm of lung,Unspecified essential hypertension,"Chronic kidney disease, Stage III (moderate)",Aphasia,Hypopotassemia,"Osteoarthrosis, unspecified whether generalize...",Other and unspecified hyperlipidemia,Secondary malignant neoplasm of other specifie...,"Acute kidney failure, unspecified",


In [26]:
# Converting to Datetime format

#datetime.strptime(str(int(InpatientClaimsDF['CLM_FROM_DT'][100])), '%Y%m%d')
InpatientClaimsDF['CLM_FROM_DT'] = pd.to_datetime(InpatientClaimsDF['CLM_FROM_DT'], format='%Y%m%d', errors='coerce')
InpatientClaimsDF['CLM_THRU_DT'] = pd.to_datetime(InpatientClaimsDF['CLM_THRU_DT'], format='%Y%m%d', errors='coerce')

In [10]:
print(InpatientClaimsDF.shape)
InpatientClaimsDF.head(2)

(178561, 101)


Unnamed: 0,DESYNPUF_ID,CLM_ID,SEGMENT,CLM_FROM_DT,CLM_THRU_DT,PRVDR_NUM,CLM_PMT_AMT,NCH_PRMRY_PYR_CLM_PD_AMT,AT_PHYSN_NPI,OP_PHYSN_NPI,OT_PHYSN_NPI,CLM_ADMSN_DT,ADMTNG_ICD9_DGNS_CD,CLM_PASS_THRU_PER_DIEM_AMT,NCH_BENE_IP_DDCTBL_AMT,NCH_BENE_PTA_COINSRNC_LBLTY_AM,NCH_BENE_BLOOD_DDCTBL_LBLTY_AM,CLM_UTLZTN_DAY_CNT,NCH_BENE_DSCHRG_DT,CLM_DRG_CD,ICD9_DGNS_CD_1,ICD9_DGNS_CD_2,ICD9_DGNS_CD_3,ICD9_DGNS_CD_4,ICD9_DGNS_CD_5,ICD9_DGNS_CD_6,ICD9_DGNS_CD_7,ICD9_DGNS_CD_8,ICD9_DGNS_CD_9,ICD9_DGNS_CD_10,ICD9_PRCDR_CD_1,ICD9_PRCDR_CD_2,ICD9_PRCDR_CD_3,ICD9_PRCDR_CD_4,ICD9_PRCDR_CD_5,ICD9_PRCDR_CD_6,HCPCS_CD_1,HCPCS_CD_2,HCPCS_CD_3,HCPCS_CD_4,HCPCS_CD_5,HCPCS_CD_6,HCPCS_CD_7,HCPCS_CD_8,HCPCS_CD_9,HCPCS_CD_10,HCPCS_CD_11,HCPCS_CD_12,HCPCS_CD_13,HCPCS_CD_14,HCPCS_CD_15,HCPCS_CD_16,HCPCS_CD_17,HCPCS_CD_18,HCPCS_CD_19,HCPCS_CD_20,HCPCS_CD_21,HCPCS_CD_22,HCPCS_CD_23,HCPCS_CD_24,HCPCS_CD_25,HCPCS_CD_26,HCPCS_CD_27,HCPCS_CD_28,HCPCS_CD_29,HCPCS_CD_30,HCPCS_CD_31,HCPCS_CD_32,HCPCS_CD_33,HCPCS_CD_34,HCPCS_CD_35,HCPCS_CD_36,HCPCS_CD_37,HCPCS_CD_38,HCPCS_CD_39,HCPCS_CD_40,HCPCS_CD_41,HCPCS_CD_42,HCPCS_CD_43,HCPCS_CD_44,HCPCS_CD_45,dgns_cd_1_desc_x,dgns_cd_2_desc_x,dgns_cd_3_desc_x,dgns_cd_4_desc_x,dgns_cd_5_desc_x,dgns_cd_6_desc_x,dgns_cd_7_desc_x,dgns_cd_8_desc_x,dgns_cd_9_desc_x,dgns_cd_10_desc_x,dgns_cd_1_desc_y,dgns_cd_2_desc_y,dgns_cd_3_desc_y,dgns_cd_4_desc_y,dgns_cd_5_desc_y,dgns_cd_6_desc_y,dgns_cd_7_desc_y,dgns_cd_8_desc_y,dgns_cd_9_desc_y,dgns_cd_10_desc_y
0,00013D2EFD8E45D1,196661176988405,1,2010-03-12,2010-03-13,2600GD,4000.0,0.0,3139084000.0,,,20100312,4580,0.0,1100.0,0.0,0.0,1.0,20100313,217,7802,78820,V4501,4280,2720,4019,V4502,73300,E9330,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Syncope and collapse,"Retention of urine, unspecified",Cardiac pacemaker in situ,"Congestive heart failure, unspecified",Pure hypercholesterolemia,Unspecified essential hypertension,Automatic implantable cardiac defibrillator in...,"Osteoporosis, unspecified",Antiallergic and antiemetic drugs causing adve...,,Syncope and collapse,"Retention of urine, unspecified",Cardiac pacemaker in situ,"Congestive heart failure, unspecified",Pure hypercholesterolemia,Unspecified essential hypertension,Automatic implantable cardiac defibrillator in...,"Osteoporosis, unspecified",Antiallergic and antiemetic drugs causing adve...,
1,00016F745862898F,196201177000368,1,2009-04-12,2009-04-18,3900MB,26000.0,0.0,6476809000.0,,,20090412,7866,0.0,1068.0,0.0,0.0,6.0,20090418,201,1970,4019,5853,7843,2768,71590,2724,19889,5849,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Secondary malignant neoplasm of lung,Unspecified essential hypertension,"Chronic kidney disease, Stage III (moderate)",Aphasia,Hypopotassemia,"Osteoarthrosis, unspecified whether generalize...",Other and unspecified hyperlipidemia,Secondary malignant neoplasm of other specifie...,"Acute kidney failure, unspecified",,Secondary malignant neoplasm of lung,Unspecified essential hypertension,"Chronic kidney disease, Stage III (moderate)",Aphasia,Hypopotassemia,"Osteoarthrosis, unspecified whether generalize...",Other and unspecified hyperlipidemia,Secondary malignant neoplasm of other specifie...,"Acute kidney failure, unspecified",


#### Adding lookup for 10 Diagnostic codes for inpatient claims

In [None]:
datetime.now()

In [None]:
q = '''SELECT  
            IPD."DESYNPUF_ID", 
            ICD9D1.long_desc as DGNS_CD_1_desc,
            ICD9D2.long_desc as DGNS_CD_2_desc,
            ICD9D3.long_desc as DGNS_CD_3_desc,
            ICD9D4.long_desc as DGNS_CD_4_desc,
            ICD9D5.long_desc as DGNS_CD_5_desc,
            ICD9D6.long_desc as DGNS_CD_6_desc,
            ICD9D7.long_desc as DGNS_CD_7_desc,
            ICD9D8.long_desc as DGNS_CD_8_desc,
            ICD9D9.long_desc as DGNS_CD_9_desc,
            ICD9D10.long_desc as DGNS_CD_10_desc
                 
       FROM 
                      inpatient_claims as IPD
            LEFT JOIN icd9_diagonsis as ICD9D1 ON IPD."ICD9_DGNS_CD_1" = ICD9D1.diagnosis_cd
            LEFT JOIN icd9_diagonsis as ICD9D2 ON IPD."ICD9_DGNS_CD_2" = ICD9D2.diagnosis_cd
            LEFT JOIN icd9_diagonsis as ICD9D3 ON IPD."ICD9_DGNS_CD_3" = ICD9D3.diagnosis_cd
            LEFT JOIN icd9_diagonsis as ICD9D4 ON IPD."ICD9_DGNS_CD_4" = ICD9D4.diagnosis_cd
            LEFT JOIN icd9_diagonsis as ICD9D5 ON IPD."ICD9_DGNS_CD_5" = ICD9D5.diagnosis_cd
            LEFT JOIN icd9_diagonsis as ICD9D6 ON IPD."ICD9_DGNS_CD_6" = ICD9D6.diagnosis_cd
            LEFT JOIN icd9_diagonsis as ICD9D7 ON IPD."ICD9_DGNS_CD_7" = ICD9D7.diagnosis_cd
            LEFT JOIN icd9_diagonsis as ICD9D8 ON IPD."ICD9_DGNS_CD_8" = ICD9D8.diagnosis_cd
            LEFT JOIN icd9_diagonsis as ICD9D9 ON IPD."ICD9_DGNS_CD_9" = ICD9D9.diagnosis_cd
            LEFT JOIN icd9_diagonsis as ICD9D10 ON IPD."ICD9_DGNS_CD_10" = ICD9D10.diagnosis_cd
            ;
    '''

ICD_descDF_inpt = query_func(q, conn)
ICD_descDF_inpt.head(1)

In [None]:
datetime.now()

#### Procedure code lookup for inpatient claims

In [34]:
InpatientClaimsDF.ICD9_PRCDR_CD_1.dtype

dtype('float64')

In [39]:
q = '''SELECT  
            IPD."DESYNPUF_ID", 
            ICD9P1.long_desc as PRCDR_CD_1_desc,
            ICD9P2.long_desc as PRCDR_CD_2_desc,
            ICD9P3.long_desc as PRCDR_CD_3_desc,
            ICD9P4.long_desc as PRCDR_CD_4_desc,
            ICD9P5.long_desc as PRCDR_CD_5_desc,
            ICD9P6.long_desc as PRCDR_CD_6_desc
                 
       FROM 
                      inpatient_claims as IPD
            LEFT JOIN icd9_procedures as ICD9P1 ON CAST(IPD."ICD9_PRCDR_CD_1" as varchar) = CAST(ICD9P1.procedure_cd as varchar)
            LEFT JOIN icd9_procedures as ICD9P2 ON CAST(IPD."ICD9_PRCDR_CD_2" as varchar) = CAST(ICD9P2.procedure_cd as varchar)
            LEFT JOIN icd9_procedures as ICD9P3 ON CAST(IPD."ICD9_PRCDR_CD_3" as varchar) = CAST(ICD9P3.procedure_cd as varchar)
            LEFT JOIN icd9_procedures as ICD9P4 ON CAST(IPD."ICD9_PRCDR_CD_4" as varchar) = CAST(ICD9P4.procedure_cd as varchar)
            LEFT JOIN icd9_procedures as ICD9P5 ON CAST(IPD."ICD9_PRCDR_CD_5" as varchar) = CAST(ICD9P5.procedure_cd as varchar)
            LEFT JOIN icd9_procedures as ICD9P6 ON CAST(IPD."ICD9_PRCDR_CD_6" as varchar) = CAST(ICD9P6.procedure_cd as varchar)
            ;
    '''

ICDProc_descDF_inpt = query_func(q, conn)
ICDProc_descDF_inpt.head(2)

Unnamed: 0,DESYNPUF_ID,prcdr_cd_1_desc,prcdr_cd_2_desc,prcdr_cd_3_desc,prcdr_cd_4_desc,prcdr_cd_5_desc,prcdr_cd_6_desc
0,DAB0D77EFB966CD2,Cisternal puncture,Other diagnostic procedures on lymphatic struc...,,,,
1,DAB0D77EFB966CD2,Other incision of conjunctiva,Other diagnostic procedures on lymphatic struc...,,,,


#### HCPCS code lookup for inpatient claims

In [11]:
datetime.now()

datetime.datetime(2020, 4, 30, 15, 6, 53, 53551)

In [12]:
q = '''SELECT  
            IPD."DESYNPUF_ID", 
            h1."DESCRIPTION" as hcpcs_CD_1_desc,
            h2."DESCRIPTION" as hcpcs_CD_2_desc,
            h3."DESCRIPTION" as hcpcs_CD_3_desc,
            h4."DESCRIPTION" as hcpcs_CD_4_desc,
            h5."DESCRIPTION" as hcpcs_CD_5_desc,
            h6."DESCRIPTION" as hcpcs_CD_6_desc,
            h7."DESCRIPTION" as hcpcs_CD_7_desc,
            h8."DESCRIPTION" as hcpcs_CD_8_desc,
            h9."DESCRIPTION" as hcpcs_CD_9_desc,
            h10."DESCRIPTION" as hcpcs_CD_10_desc
       FROM 
                      inpatient_claims as IPD
            LEFT JOIN hcpcs as h1 ON IPD."HCPCS_CD_1" = h1."HCPCS"
            LEFT JOIN hcpcs as h2 ON IPD."HCPCS_CD_2" = h2."HCPCS"
            LEFT JOIN hcpcs as h3 ON IPD."HCPCS_CD_3" = h3."HCPCS"
            LEFT JOIN hcpcs as h4 ON IPD."HCPCS_CD_4" = h4."HCPCS"
            LEFT JOIN hcpcs as h5 ON IPD."HCPCS_CD_5" = h5."HCPCS"
            LEFT JOIN hcpcs as h6 ON IPD."HCPCS_CD_6" = h6."HCPCS"
            LEFT JOIN hcpcs as h7 ON IPD."HCPCS_CD_7" = h7."HCPCS"
            LEFT JOIN hcpcs as h8 ON IPD."HCPCS_CD_8" = h8."HCPCS"
            LEFT JOIN hcpcs as h9 ON IPD."HCPCS_CD_9" = h9."HCPCS"
            LEFT JOIN hcpcs as h10 ON IPD."HCPCS_CD_10" = h10."HCPCS"
            LEFT JOIN hcpcs as h11 ON IPD."HCPCS_CD_11" = h11."HCPCS"

            ;
    '''

hcpcsDF_inpt = query_func(q, conn)
hcpcsDF_inpt.head(5)

Unnamed: 0,DESYNPUF_ID,hcpcs_cd_1_desc,hcpcs_cd_2_desc,hcpcs_cd_3_desc,hcpcs_cd_4_desc,hcpcs_cd_5_desc,hcpcs_cd_6_desc,hcpcs_cd_7_desc,hcpcs_cd_8_desc,hcpcs_cd_9_desc,hcpcs_cd_10_desc
0,00013D2EFD8E45D1,,,,,,,,,,
1,00016F745862898F,,,,,,,,,,
2,00016F745862898F,,,,,,,,,,
3,00016F745862898F,,,,,,,,,,
4,00016F745862898F,,,,,,,,,,


In [None]:
print(hcpcsDF_inpt.shape)
hcpcsDF_inpt.head(1)

In [None]:
print(InpatientClaimsDF.shape)
InpatientClaimsDF.head(1)

#### Merging ICD diagnostic description to inpatient claims DF

In [None]:
datetime.now()

In [None]:
#InpatientClaimsDF = pd.merge(InpatientClaimsDF, ICD_descDF_inpt, on='DESYNPUF_ID')

In [None]:
InpatientClaimsDF = InpatientClaimsDF.merge(ICD_descDF_inpt, on='DESYNPUF_ID',
                                            how='inner', left_index=True, right_index=True)

In [None]:
datetime.now()

In [None]:
print(InpatientClaimsDF.shape)
InpatientClaimsDF.head(1)

### Data Cleaning for prescription_drug_events table (date time format)

In [14]:
q = '''SELECT * FROM prescription_drug_events '''

drug_eventsDF = query_func(q, conn)
print(drug_eventsDF.shape)
drug_eventsDF.head(1) 

(5552421, 8)


Unnamed: 0,DESYNPUF_ID,PDE_ID,SRVC_DT,PROD_SRVC_ID,QTY_DSPNSD_NUM,DAYS_SUPLY_NUM,PTNT_PAY_AMT,TOT_RX_CST_AMT
0,00013D2EFD8E45D1,233664490397622,2008-01-03,247037252,30.0,20,10.0,120.0


In [15]:
drug_eventsDF['SRVC_DT'] = pd.to_datetime(drug_eventsDF['SRVC_DT'], format='%Y%m%d', errors='coerce')

In [16]:
print(drug_eventsDF.shape)
drug_eventsDF.head(1) 

(5552421, 8)


Unnamed: 0,DESYNPUF_ID,PDE_ID,SRVC_DT,PROD_SRVC_ID,QTY_DSPNSD_NUM,DAYS_SUPLY_NUM,PTNT_PAY_AMT,TOT_RX_CST_AMT
0,00013D2EFD8E45D1,233664490397622,2008-01-03,247037252,30.0,20,10.0,120.0


### Adding these DataFrames to the Database (overwriting)

In [None]:
# create an sqlalchemy connection
conn_postgres = f'postgresql://{db_user}:{db_password}@{db_host}:{db_port}'
engine = sqlalchemy.engine.create_engine(conn_postgres)

In [None]:
# connect using sqlalchemy
connection = engine.connect()

In [None]:
# commit
connection.execute('commit')

In [None]:
# close connection
connection.close()

In [None]:
# open a new connection to the database that we created
conn_str = f'postgresql://{db_user}:{db_password}@{db_host}:{db_port}/{database}'
conn = psycopg2.connect(conn_str)

In [None]:
# connect using sqlalchemy
engine = sqlalchemy.engine.create_engine(conn_str)

#### Loading files to DataBase

##### Beneficiary files

In [None]:
datetime.now()

In [None]:
Beneficiary2008DF.to_sql(name = 'beneficiary2008', con = engine, if_exists = 'replace', index = False)
Beneficiary2009DF.to_sql(name = 'beneficiary2009', con = engine, if_exists = 'replace', index = False)
Beneficiary2010DF.to_sql(name = 'beneficiary2010', con = engine, if_exists = 'replace', index = False) 

In [None]:
datetime.now()

#### ICD lookup files

In [None]:
ICD9_DiagonsisDF.to_sql(name = 'icd9_diagonsis', con = engine, if_exists = 'replace', index = False)
ICD9_ProcedureDF.to_sql(name = 'icd9_procedures', con = engine, if_exists = 'replace', index = False)

In [None]:
datetime.now()

#### Inpatient files

In [None]:
InpatientClaimsDF.to_sql(name = 'inpatient_claims', con = engine, if_exists = 'replace', index = False)

In [None]:
datetime.now()

#### Deleting DataFrames and freeing up RAM

In [None]:
del [[InpatientClaimsDF, Beneficiary2008DF, Beneficiary2009DF, Beneficiary2010DF]]

import gc
gc.collect()

InpatientClaimsDF = pd.DataFrame()
Beneficiary2008DF = pd.DataFrame()
Beneficiary2009DF = pd.DataFrame()
Beneficiary2010DF = pd.DataFrame()

In [None]:
conn.commit()   #engine.commit()

#### Prescription Events files

In [None]:
datetime.now()

In [None]:
# Caution: large file (0.5GB) - it takes approx 15 mins for this file alone
drug_eventsDF.to_sql(name = 'prescription_drug_events', con = engine, if_exists = 'replace', index = False)

In [None]:
datetime.now()

In [None]:
#### Deleting DataFrames and freeing up RAM

In [None]:
del [drug_eventsDF]
gc.collect()

drug_eventsDF = pd.DataFrame()

#### Outpatient file

In [None]:
datetime.now()

In [None]:
# Should take approx 15 mins to load
OutpatientClaimsDF.to_sql(name = 'outpatient_claims', con = engine, if_exists = 'replace', index = False)

In [None]:
datetime.now()

In [None]:
del [OutpatientClaimsDF]
gc.collect()

OutpatientClaimsDF = pd.DataFrame()

In [None]:
conn.commit()   #engine.commit()

## ----------------------- To delete from here --------------------------------------------

#### Finding who died in 2010

In [None]:
aa for error and stopping 

In [None]:
died2010 = pd.DataFrame()
died2010[['DESYNPUF_ID', 'BENE_DEATH_DT']] = Beneficiary2010DF[Beneficiary2010DF.BENE_DEATH_DT.notnull()][['DESYNPUF_ID','BENE_DEATH_DT']]
print(died2010.shape)
died2010.head(2)

#### Finding who died in 2009

In [None]:
died2009 = pd.DataFrame()
died2009[['DESYNPUF_ID', 'BENE_DEATH_DT']] = Beneficiary2009DF[Beneficiary2009DF.BENE_DEATH_DT.notnull()][['DESYNPUF_ID','BENE_DEATH_DT']]
print(died2009.shape)
died2009.head(2)

#### Finding who died in 2008

In [None]:
died2008 = pd.DataFrame()
died2008[['DESYNPUF_ID', 'BENE_DEATH_DT']] = Beneficiary2008DF[Beneficiary2008DF.BENE_DEATH_DT.notnull()][['DESYNPUF_ID','BENE_DEATH_DT']]
print(died2008.shape)
died2008.head(2)

In [None]:
# Picking 1 dead patient from 2010 and looking for his/her history in inpatient and outpatient files

#died2010.DESYNPUF_ID

In [None]:
# Finding Inpatient and outpatuient activies of deads

In [None]:
set(InpatientClaimsDF.DESYNPUF_ID).intersection(set(died2010.DESYNPUF_ID))

#InpatientClaimsDF[InpatientClaimsDF.DESYNPUF_ID == '00016F745862898F'] #died2010.DESYNPUF_ID[InpatientClaimsDF.DESYNPUF_ID.index]]

In [None]:
set(OutpatientClaimsDF.DESYNPUF_ID).intersection(set(died2010.DESYNPUF_ID))

#OutpatientClaimsDF[OutpatientClaimsDF.DESYNPUF_ID == '00016F745862898F']#died2010.DESYNPUF_ID[OutpatientClaimsDF.DESYNPUF_ID.index]]

In [None]:
len(set(InpatientClaimsDF.DESYNPUF_ID).intersection(set(died2009.DESYNPUF_ID)))

#InpatientClaimsDF[InpatientClaimsDF.DESYNPUF_ID == died2009.DESYNPUF_ID[InpatientClaimsDF.DESYNPUF_ID.index]]

In [None]:
len(set(OutpatientClaimsDF.DESYNPUF_ID).intersection(set(died2009.DESYNPUF_ID)))

#OutpatientClaimsDF[OutpatientClaimsDF.DESYNPUF_ID == died2009.DESYNPUF_ID[OutpatientClaimsDF.DESYNPUF_ID.index]]

In [None]:
len(set(InpatientClaimsDF.DESYNPUF_ID).intersection(set(died2008.DESYNPUF_ID)))

#InpatientClaimsDF[InpatientClaimsDF.DESYNPUF_ID == died2008.DESYNPUF_ID[InpatientClaimsDF.DESYNPUF_ID.index]]

In [None]:
len(set(OutpatientClaimsDF.DESYNPUF_ID).intersection(set(died2008.DESYNPUF_ID)))

#OutpatientClaimsDF[OutpatientClaimsDF.DESYNPUF_ID == died2008.DESYNPUF_ID[OutpatientClaimsDF.index]]

In [None]:
# checking if patients who died in 2009 are also in 2008 beneficiary list

print(len(set(Beneficiary2008DF.DESYNPUF_ID).intersection(set(died2009.DESYNPUF_ID))))
bene2008_died2009 = set(Beneficiary2008DF.DESYNPUF_ID).intersection(set(died2009.DESYNPUF_ID))

In [None]:
list(bene2008_died2009)[0:5]

In [None]:
# NOW FIND NUMBER OF INPTS & OUTPTS VISITS AND DIAG CODE IN 2008 AND 2009 FOR THESE DEAD PTS IN 2009

In [None]:
# LATER USE THESE TO PREDICT PATIENTS IN 2008 OR 2009 OR 2010, IF THEY WILL DIE OR WILL BE READMITTED OR WILL 
# BE EXPENSIVE IN YEARS TO COME

#### Now checking if patients in 2010 are also in 2009 and 2008 etc

In [None]:
# Any common patients in 2008 and 2009

common2008_2009 = set(Beneficiary2009DF.DESYNPUF_ID).intersection(set(Beneficiary2008DF.DESYNPUF_ID))
len(common2008_2009)

In [None]:
list(common2008_2009)[0:5]

In [None]:
# common patients in 2008 and 2010

set(Beneficiary2010DF.DESYNPUF_ID).intersection(set(Beneficiary2008DF.DESYNPUF_ID))

In [None]:
# common patients in 2009 and 2010

set(Beneficiary2009DF.DESYNPUF_ID).intersection(set(Beneficiary2010DF.DESYNPUF_ID))

In [None]:
# check visits in each year

#### Chekcing Inpatient and outpatient visits of common patients in Year 2008-2009

In [None]:
opd2008 = datetime.strptime(str(OutpatientClaimsDF.CLM_FROM_DT).split('.')[0], '%Y%m%d')
#opd2008.shape

In [None]:
str(OutpatientClaimsDF.CLM_FROM_DT)[5]

In [None]:
# Pt IDs for OPD in 2008
OutpatientID2008 = OutpatientClaimsDF[str(OutpatientClaimsDF.CLM_FROM_DT).split('.')[0], '%Y%m%d').year == 2008][DESYNPUF_ID] 
OutpatientID2008.shape

In [None]:
# Pt IDs for OPD in 2009
OutpatientID2009 OutpatientClaimsDF.DESYNPUF_ID for year 2008

In [None]:
OutpatientID2010

In [None]:
InpatientID2008

In [None]:
InpatientID2009

In [None]:
InpatientID2010

In [None]:
len(set(OutpatientClaimsDF.DESYNPUF_ID).intersection(set(common2008_2009)))

In [None]:
len(set(InpatientClaimsDF.DESYNPUF_ID).intersection(set(common2008_2009)))

#### Trying to stitch history for few patienst who died in 2009 but were also beneficiary in 2008

In [None]:
# pt IDs
list(bene2008_died2009)[0:1]

In [None]:
# look for ICD9 etc from beneficiary in 2008 and 2009
# look for inpatient claims in 2008, opd in 2008, inp in 2009, opd in 2009

In [None]:
Beneficiary2008DF.columns

In [None]:
# Looking for patient's diagnosis details in Beneficiary2008DF for pts who died in 2009

In [None]:
Beneficiary2008DF[Beneficiary2008DF.DESYNPUF_ID.isin (list(bene2008_died2009))]['SP_DIABETES']

In [None]:
# Looking for patient's diagnosis details in Beneficiary2009DF for pts who died in 2009

In [None]:
,'SP_CHF', 'SP_ALZHDMTA'
                                                                                  'SP_CHRNKIDN', 'SP_CNCR', 'SP_COPD', 'SP_DEPRESSN',
       , 'SP_ISCHMCHT', 'SP_OSTEOPRS', 'SP_RA_OA', 'SP_STRKETIA']]

In [None]:
# Looking for patient's diagnosis details in Beneficiary2010DF for pts who died in 2009 - unlikey but checking

In [None]:
InpatientClaimsDF.head(1)

In [None]:
InpatientClaimsDF.CLM_FROM_DT.year

In [None]:
# claims data by year

In [None]:
Inpatientclaims2008 = InpatientClaimsDF[InpatientClaimsDF.CLM_FROM_DT.year]

In [None]:
print(len(Beneficiary1_2010DF.columns))
Beneficiary1_2010DF.columns

In [None]:
# checking if all the columns in 3 files matches

print(len(set(Beneficiary1_2008DF.columns).intersection(set(Beneficiary1_2009DF.columns))))
set(Beneficiary1_2008DF.columns).intersection(set(Beneficiary1_2009DF.columns))

In [None]:
# Combining all benefiicary DataFrames

Beneficiary1_DF= pd.concat([Beneficiary1_2008DF, Beneficiary1_2009DF, Beneficiary1_2010DF], axis=0)  # on rows
Beneficiary1_DF.shape

In [None]:
Beneficiary1_DF.head(2)

In [None]:
# unique beneficiaries

Beneficiary1_DF.DESYNPUF_ID.nunique()

In [None]:
print(len(Inpatient1_DF.columns))
Inpatient1_DF.columns

In [None]:
print(len(Outpatient1_DF.columns))
Outpatient1_DF.columns

In [None]:
# Checking how many columns in inpateint and outpatient matches

print(len(set(Inpatient1_DF.columns).intersection(set(Outpatient1_DF.columns))))
set(Inpatient1_DF.columns).intersection(set(Outpatient1_DF.columns))

In [None]:
# ASSUMING DESYNPUF_ID FOR INPATIENT AND OUTPATUIENTS ARE UNIQUE - CONFIRM

In [None]:
#check if any DESYNPUF_ID are repeated 

In [None]:
Inpatient1_DF.DESYNPUF_ID.value_counts()

In [None]:
Outpatient1_DF.DESYNPUF_ID.value_counts()

In [None]:
#check if DESYNPUF_ID in inpatient matches in outpatient

In [None]:
print(len(Inpatient1_DF.DESYNPUF_ID))
print(len(Outpatient1_DF.DESYNPUF_ID))
print('There are {} common Ids in inpatient & outpatient'.format(len(set(Outpatient1_DF['DESYNPUF_ID']).intersection(set(Inpatient1_DF['DESYNPUF_ID'])))))

In [None]:
# Combining all DataFrames  # CHECK with Postgres or Tableau if there is data loss
#pd.merge(df_a, df_b, on='subject_id', how='inner')

InpatientDF = pd.merge(Beneficiary1_DF,Inpatient1_DF, on= 'DESYNPUF_ID', how='inner') 
OutpatientDF = pd.merge(Beneficiary1_DF, Outpatient1_DF, on= 'DESYNPUF_ID', how='inner') 

# This is wrong
#Inpatient_temp1 = Beneficiary1_DF.merge(Inpatient1_DF, left_index = True, right_index = True) # Merge is inner join by default
#Outpatient_temp1 = Beneficiary1_DF.merge(Outpatient1_DF, left_index = True, right_index = True) # Merge is inner join by default

print(InpatientDF.shape)
print(OutpatientDF.shape)

In [None]:
# Checking how many columns in inpatient and outpatient matches

#print(len(set(Inpatient_temp1.columns).intersection(set(Outpatient_temp1.columns))))
#set(Inpatient_temp1.columns).intersection(set(Outpatient_temp1.columns))

In [None]:
InpatientDF.dtypes

In [None]:
Inpatient_corr = InpatientDF.corr()
Inpatient_corr

In [None]:
fig, ax = plt.subplots(figsize=(60,36))
matrix = np.triu(Inpatient_corr)
ax = sns.heatmap(Inpatient_corr, annot = True,   center= 0 , mask=matrix,  cmap="BuPu" ,fmt='.1g');#cmap= 'coolwarm',

In [None]:
OutpatientDF.dtypes

In [None]:
Outpatient_corr = OutpatientDF.corr()
Outpatient_corr

In [None]:
fig, ax = plt.subplots(figsize=(60,36))
matrix = np.triu(Outpatient_corr)
ax = sns.heatmap(Outpatient_corr, annot = True,   center= 0 , mask=matrix,  cmap="BuPu" ,fmt='.1g');#cmap= 'coolwarm',

In [None]:
# Save the corr in png to analyse in bigger

In [None]:
# Merging inpatient and outpatient DFs but merging the data on same column name while add remaining columns

In [None]:
InpatientDF.shape

In [None]:
Beneficiary1_DF.shape 

In [None]:
Beneficiary1_DF.DESYNPUF_ID.nunique() # some are both inpatients and outpatients (343701-229163)

In [None]:
# how many deaths

Beneficiary1_DF[Beneficiary1_DF.BENE_DEATH_DT.notnull()]['DESYNPUF_ID'].nunique() # for both inpatients and outpatients

In [None]:
Beneficiary_dead_ptsID = Beneficiary1_DF[Beneficiary1_DF.BENE_DEATH_DT.notnull()]['DESYNPUF_ID']

#Beneficiary1_DF[Beneficiary1_DF.BENE_DEATH_DT.notnull()]['BENE_DEATH_DT']

In [None]:
# how many deaths

#InpatientDF[
print(InpatientDF[InpatientDF.BENE_DEATH_DT.notnull()]['DESYNPUF_ID'].nunique())
Inpatient_dead_ptsID = InpatientDF[InpatientDF.BENE_DEATH_DT.notnull()]['DESYNPUF_ID']

In [None]:
# how many deaths 

#OutpatientDF[
print(OutpatientDF[OutpatientDF.BENE_DEATH_DT.notnull()]['DESYNPUF_ID'].nunique())
Outpatient_dead_ptsID = OutpatientDF[OutpatientDF.BENE_DEATH_DT.notnull()]['DESYNPUF_ID']

In [None]:

Beneficiary1_DF.columns

In [None]:
OutpatientDF.head(2)