In [1]:
import pandas as pd
import numpy as np

import time

import sqlalchemy
import psycopg2

from tqdm import tqdm
from datetime import datetime

pd.set_option('display.max_columns', 500)   # to display 500 columns
pd.set_option('display.max_rows', 500) # to display 500 rows

In [2]:
#![](FileStructure.png)
#from IPython.display import Image
#Image(filename='FileStructure.png')

In [3]:
#import psycopg2

# DSN (data source name) format for database connections:  
# [protocol / database  name]://[username]:[password]@[hostname / ip]:[port]/[database name here]

# on your computer you are the user postgres (full administrative access)
db_user = 'postgres'
# if you need a password to access a database, put it here
db_password = ''
# on your computer, use localhost
db_host = 'localhost'
# the default port for postgres is 5432
db_port = 5432
# we want to connect to the northwind database
database =   'cms_medicare_claims'  # 'cms_claims' #

conn_str = f'postgresql://{db_user}:{db_password}@{db_host}:{db_port}/{database}'
conn = psycopg2.connect(conn_str)

In [4]:
# Check Tables in the Database
query = """
SELECT tablename 
FROM pg_catalog.pg_tables 
WHERE schemaname='public'
"""

pd.read_sql(query, con=conn)

Unnamed: 0,tablename
0,carrier_claims
1,icd9_diagonsis
2,icd9_procedures
3,hcpcs
4,beneficiary2008
5,beneficiary2009
6,beneficiary2010
7,inpatient_claims
8,outpatient_claims
9,prescription_drug_events


In [5]:
# to view All tables and columns
''' query = """
SELECT table_name, column_name, data_type, table_schema
FROM information_schema.columns
WHERE table_schema = 'public'
order by table_name
"""
pd.read_sql(query, con=conn) '''

' query = """\nSELECT table_name, column_name, data_type, table_schema\nFROM information_schema.columns\nWHERE table_schema = \'public\'\norder by table_name\n"""\npd.read_sql(query, con=conn) '

#### Function

In [6]:
def query_func(query, conn):
    df = pd.read_sql(query , con=conn)
    return df

### Data processing & cleaning (Datetime conversion) for Claims tables

In [7]:
q = '''SELECT * FROM carrier_claims '''

carrier_claimsDF = query_func(q, conn)
print(carrier_claimsDF.shape)
carrier_claimsDF.head(1) 

(599999, 142)


Unnamed: 0,DESYNPUF_ID,CLM_ID,CLM_FROM_DT,CLM_THRU_DT,ICD9_DGNS_CD_1,ICD9_DGNS_CD_2,ICD9_DGNS_CD_3,ICD9_DGNS_CD_4,ICD9_DGNS_CD_5,ICD9_DGNS_CD_6,ICD9_DGNS_CD_7,ICD9_DGNS_CD_8,PRF_PHYSN_NPI_1,PRF_PHYSN_NPI_2,PRF_PHYSN_NPI_3,PRF_PHYSN_NPI_4,PRF_PHYSN_NPI_5,PRF_PHYSN_NPI_6,PRF_PHYSN_NPI_7,PRF_PHYSN_NPI_8,PRF_PHYSN_NPI_9,PRF_PHYSN_NPI_10,PRF_PHYSN_NPI_11,PRF_PHYSN_NPI_12,PRF_PHYSN_NPI_13,TAX_NUM_1,TAX_NUM_2,TAX_NUM_3,TAX_NUM_4,TAX_NUM_5,TAX_NUM_6,TAX_NUM_7,TAX_NUM_8,TAX_NUM_9,TAX_NUM_10,TAX_NUM_11,TAX_NUM_12,TAX_NUM_13,HCPCS_CD_1,HCPCS_CD_2,HCPCS_CD_3,HCPCS_CD_4,HCPCS_CD_5,HCPCS_CD_6,HCPCS_CD_7,HCPCS_CD_8,HCPCS_CD_9,HCPCS_CD_10,HCPCS_CD_11,HCPCS_CD_12,HCPCS_CD_13,LINE_NCH_PMT_AMT_1,LINE_NCH_PMT_AMT_2,LINE_NCH_PMT_AMT_3,LINE_NCH_PMT_AMT_4,LINE_NCH_PMT_AMT_5,LINE_NCH_PMT_AMT_6,LINE_NCH_PMT_AMT_7,LINE_NCH_PMT_AMT_8,LINE_NCH_PMT_AMT_9,LINE_NCH_PMT_AMT_10,LINE_NCH_PMT_AMT_11,LINE_NCH_PMT_AMT_12,LINE_NCH_PMT_AMT_13,LINE_BENE_PTB_DDCTBL_AMT_1,LINE_BENE_PTB_DDCTBL_AMT_2,LINE_BENE_PTB_DDCTBL_AMT_3,LINE_BENE_PTB_DDCTBL_AMT_4,LINE_BENE_PTB_DDCTBL_AMT_5,LINE_BENE_PTB_DDCTBL_AMT_6,LINE_BENE_PTB_DDCTBL_AMT_7,LINE_BENE_PTB_DDCTBL_AMT_8,LINE_BENE_PTB_DDCTBL_AMT_9,LINE_BENE_PTB_DDCTBL_AMT_10,LINE_BENE_PTB_DDCTBL_AMT_11,LINE_BENE_PTB_DDCTBL_AMT_12,LINE_BENE_PTB_DDCTBL_AMT_13,LINE_BENE_PRMRY_PYR_PD_AMT_1,LINE_BENE_PRMRY_PYR_PD_AMT_2,LINE_BENE_PRMRY_PYR_PD_AMT_3,LINE_BENE_PRMRY_PYR_PD_AMT_4,LINE_BENE_PRMRY_PYR_PD_AMT_5,LINE_BENE_PRMRY_PYR_PD_AMT_6,LINE_BENE_PRMRY_PYR_PD_AMT_7,LINE_BENE_PRMRY_PYR_PD_AMT_8,LINE_BENE_PRMRY_PYR_PD_AMT_9,LINE_BENE_PRMRY_PYR_PD_AMT_10,LINE_BENE_PRMRY_PYR_PD_AMT_11,LINE_BENE_PRMRY_PYR_PD_AMT_12,LINE_BENE_PRMRY_PYR_PD_AMT_13,LINE_COINSRNC_AMT_1,LINE_COINSRNC_AMT_2,LINE_COINSRNC_AMT_3,LINE_COINSRNC_AMT_4,LINE_COINSRNC_AMT_5,LINE_COINSRNC_AMT_6,LINE_COINSRNC_AMT_7,LINE_COINSRNC_AMT_8,LINE_COINSRNC_AMT_9,LINE_COINSRNC_AMT_10,LINE_COINSRNC_AMT_11,LINE_COINSRNC_AMT_12,LINE_COINSRNC_AMT_13,LINE_ALOWD_CHRG_AMT_1,LINE_ALOWD_CHRG_AMT_2,LINE_ALOWD_CHRG_AMT_3,LINE_ALOWD_CHRG_AMT_4,LINE_ALOWD_CHRG_AMT_5,LINE_ALOWD_CHRG_AMT_6,LINE_ALOWD_CHRG_AMT_7,LINE_ALOWD_CHRG_AMT_8,LINE_ALOWD_CHRG_AMT_9,LINE_ALOWD_CHRG_AMT_10,LINE_ALOWD_CHRG_AMT_11,LINE_ALOWD_CHRG_AMT_12,LINE_ALOWD_CHRG_AMT_13,LINE_PRCSG_IND_CD_1,LINE_PRCSG_IND_CD_2,LINE_PRCSG_IND_CD_3,LINE_PRCSG_IND_CD_4,LINE_PRCSG_IND_CD_5,LINE_PRCSG_IND_CD_6,LINE_PRCSG_IND_CD_7,LINE_PRCSG_IND_CD_8,LINE_PRCSG_IND_CD_9,LINE_PRCSG_IND_CD_10,LINE_PRCSG_IND_CD_11,LINE_PRCSG_IND_CD_12,LINE_PRCSG_IND_CD_13,LINE_ICD9_DGNS_CD_1,LINE_ICD9_DGNS_CD_2,LINE_ICD9_DGNS_CD_3,LINE_ICD9_DGNS_CD_4,LINE_ICD9_DGNS_CD_5,LINE_ICD9_DGNS_CD_6,LINE_ICD9_DGNS_CD_7,LINE_ICD9_DGNS_CD_8,LINE_ICD9_DGNS_CD_9,LINE_ICD9_DGNS_CD_10,LINE_ICD9_DGNS_CD_11,LINE_ICD9_DGNS_CD_12,LINE_ICD9_DGNS_CD_13
0,00013D2EFD8E45D1,887733386680966,20090725,20090725,7245,7244,6272,,,,,,7128675000.0,,,,,,,,,,,,,396635013,,,,,,,,,,,,,97001,,,,,,,,,,,,,50.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,60.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,A,,,,,,,,,,,,,7242,,,,,,,,,,,,


In [8]:
# Datetime conversion

carrier_claimsDF['CLM_FROM_DT'] = pd.to_datetime(carrier_claimsDF['CLM_FROM_DT'], format='%Y%m%d', errors='coerce')
carrier_claimsDF['CLM_THRU_DT'] = pd.to_datetime(carrier_claimsDF['CLM_THRU_DT'], format='%Y%m%d', errors='coerce')

#### Adding lookup for 10 Diagnostic codes for carrier  claims

In [9]:
q = '''SELECT  
            CC."DESYNPUF_ID", 
            ICD9D1.long_desc as DGNS_CD_1_desc,
            ICD9D2.long_desc as DGNS_CD_2_desc,
            ICD9D3.long_desc as DGNS_CD_3_desc,
            ICD9D4.long_desc as DGNS_CD_4_desc,
            ICD9D5.long_desc as DGNS_CD_5_desc,
            ICD9D6.long_desc as DGNS_CD_6_desc,
            ICD9D7.long_desc as DGNS_CD_7_desc,
            ICD9D8.long_desc as DGNS_CD_8_desc
                 
       FROM 
                      carrier_claims as CC
            LEFT JOIN icd9_diagonsis as ICD9D1 ON CC."ICD9_DGNS_CD_1" = ICD9D1.diagnosis_cd
            LEFT JOIN icd9_diagonsis as ICD9D2 ON CC."ICD9_DGNS_CD_2" = ICD9D2.diagnosis_cd
            LEFT JOIN icd9_diagonsis as ICD9D3 ON CC."ICD9_DGNS_CD_3" = ICD9D3.diagnosis_cd
            LEFT JOIN icd9_diagonsis as ICD9D4 ON CC."ICD9_DGNS_CD_4" = ICD9D4.diagnosis_cd
            LEFT JOIN icd9_diagonsis as ICD9D5 ON CC."ICD9_DGNS_CD_5" = ICD9D5.diagnosis_cd
            LEFT JOIN icd9_diagonsis as ICD9D6 ON CC."ICD9_DGNS_CD_6" = ICD9D6.diagnosis_cd
            LEFT JOIN icd9_diagonsis as ICD9D7 ON CC."ICD9_DGNS_CD_7" = ICD9D7.diagnosis_cd
            LEFT JOIN icd9_diagonsis as ICD9D8 ON CC."ICD9_DGNS_CD_8" = ICD9D8.diagnosis_cd

            ;
    '''

ICD_descDF = query_func(q, conn)
ICD_descDF.head(1)

Unnamed: 0,DESYNPUF_ID,dgns_cd_1_desc,dgns_cd_2_desc,dgns_cd_3_desc,dgns_cd_4_desc,dgns_cd_5_desc,dgns_cd_6_desc,dgns_cd_7_desc,dgns_cd_8_desc
0,00013D2EFD8E45D1,"Backache, unspecified",Thoracic or lumbosacral neuritis or radiculiti...,Symptomatic menopausal or female climacteric s...,,,,,


In [10]:
print(ICD_descDF.shape)
ICD_descDF.head(1)

(599999, 9)


Unnamed: 0,DESYNPUF_ID,dgns_cd_1_desc,dgns_cd_2_desc,dgns_cd_3_desc,dgns_cd_4_desc,dgns_cd_5_desc,dgns_cd_6_desc,dgns_cd_7_desc,dgns_cd_8_desc
0,00013D2EFD8E45D1,"Backache, unspecified",Thoracic or lumbosacral neuritis or radiculiti...,Symptomatic menopausal or female climacteric s...,,,,,


In [11]:
print(carrier_claimsDF.shape)
carrier_claimsDF.head(1)

(599999, 142)


Unnamed: 0,DESYNPUF_ID,CLM_ID,CLM_FROM_DT,CLM_THRU_DT,ICD9_DGNS_CD_1,ICD9_DGNS_CD_2,ICD9_DGNS_CD_3,ICD9_DGNS_CD_4,ICD9_DGNS_CD_5,ICD9_DGNS_CD_6,ICD9_DGNS_CD_7,ICD9_DGNS_CD_8,PRF_PHYSN_NPI_1,PRF_PHYSN_NPI_2,PRF_PHYSN_NPI_3,PRF_PHYSN_NPI_4,PRF_PHYSN_NPI_5,PRF_PHYSN_NPI_6,PRF_PHYSN_NPI_7,PRF_PHYSN_NPI_8,PRF_PHYSN_NPI_9,PRF_PHYSN_NPI_10,PRF_PHYSN_NPI_11,PRF_PHYSN_NPI_12,PRF_PHYSN_NPI_13,TAX_NUM_1,TAX_NUM_2,TAX_NUM_3,TAX_NUM_4,TAX_NUM_5,TAX_NUM_6,TAX_NUM_7,TAX_NUM_8,TAX_NUM_9,TAX_NUM_10,TAX_NUM_11,TAX_NUM_12,TAX_NUM_13,HCPCS_CD_1,HCPCS_CD_2,HCPCS_CD_3,HCPCS_CD_4,HCPCS_CD_5,HCPCS_CD_6,HCPCS_CD_7,HCPCS_CD_8,HCPCS_CD_9,HCPCS_CD_10,HCPCS_CD_11,HCPCS_CD_12,HCPCS_CD_13,LINE_NCH_PMT_AMT_1,LINE_NCH_PMT_AMT_2,LINE_NCH_PMT_AMT_3,LINE_NCH_PMT_AMT_4,LINE_NCH_PMT_AMT_5,LINE_NCH_PMT_AMT_6,LINE_NCH_PMT_AMT_7,LINE_NCH_PMT_AMT_8,LINE_NCH_PMT_AMT_9,LINE_NCH_PMT_AMT_10,LINE_NCH_PMT_AMT_11,LINE_NCH_PMT_AMT_12,LINE_NCH_PMT_AMT_13,LINE_BENE_PTB_DDCTBL_AMT_1,LINE_BENE_PTB_DDCTBL_AMT_2,LINE_BENE_PTB_DDCTBL_AMT_3,LINE_BENE_PTB_DDCTBL_AMT_4,LINE_BENE_PTB_DDCTBL_AMT_5,LINE_BENE_PTB_DDCTBL_AMT_6,LINE_BENE_PTB_DDCTBL_AMT_7,LINE_BENE_PTB_DDCTBL_AMT_8,LINE_BENE_PTB_DDCTBL_AMT_9,LINE_BENE_PTB_DDCTBL_AMT_10,LINE_BENE_PTB_DDCTBL_AMT_11,LINE_BENE_PTB_DDCTBL_AMT_12,LINE_BENE_PTB_DDCTBL_AMT_13,LINE_BENE_PRMRY_PYR_PD_AMT_1,LINE_BENE_PRMRY_PYR_PD_AMT_2,LINE_BENE_PRMRY_PYR_PD_AMT_3,LINE_BENE_PRMRY_PYR_PD_AMT_4,LINE_BENE_PRMRY_PYR_PD_AMT_5,LINE_BENE_PRMRY_PYR_PD_AMT_6,LINE_BENE_PRMRY_PYR_PD_AMT_7,LINE_BENE_PRMRY_PYR_PD_AMT_8,LINE_BENE_PRMRY_PYR_PD_AMT_9,LINE_BENE_PRMRY_PYR_PD_AMT_10,LINE_BENE_PRMRY_PYR_PD_AMT_11,LINE_BENE_PRMRY_PYR_PD_AMT_12,LINE_BENE_PRMRY_PYR_PD_AMT_13,LINE_COINSRNC_AMT_1,LINE_COINSRNC_AMT_2,LINE_COINSRNC_AMT_3,LINE_COINSRNC_AMT_4,LINE_COINSRNC_AMT_5,LINE_COINSRNC_AMT_6,LINE_COINSRNC_AMT_7,LINE_COINSRNC_AMT_8,LINE_COINSRNC_AMT_9,LINE_COINSRNC_AMT_10,LINE_COINSRNC_AMT_11,LINE_COINSRNC_AMT_12,LINE_COINSRNC_AMT_13,LINE_ALOWD_CHRG_AMT_1,LINE_ALOWD_CHRG_AMT_2,LINE_ALOWD_CHRG_AMT_3,LINE_ALOWD_CHRG_AMT_4,LINE_ALOWD_CHRG_AMT_5,LINE_ALOWD_CHRG_AMT_6,LINE_ALOWD_CHRG_AMT_7,LINE_ALOWD_CHRG_AMT_8,LINE_ALOWD_CHRG_AMT_9,LINE_ALOWD_CHRG_AMT_10,LINE_ALOWD_CHRG_AMT_11,LINE_ALOWD_CHRG_AMT_12,LINE_ALOWD_CHRG_AMT_13,LINE_PRCSG_IND_CD_1,LINE_PRCSG_IND_CD_2,LINE_PRCSG_IND_CD_3,LINE_PRCSG_IND_CD_4,LINE_PRCSG_IND_CD_5,LINE_PRCSG_IND_CD_6,LINE_PRCSG_IND_CD_7,LINE_PRCSG_IND_CD_8,LINE_PRCSG_IND_CD_9,LINE_PRCSG_IND_CD_10,LINE_PRCSG_IND_CD_11,LINE_PRCSG_IND_CD_12,LINE_PRCSG_IND_CD_13,LINE_ICD9_DGNS_CD_1,LINE_ICD9_DGNS_CD_2,LINE_ICD9_DGNS_CD_3,LINE_ICD9_DGNS_CD_4,LINE_ICD9_DGNS_CD_5,LINE_ICD9_DGNS_CD_6,LINE_ICD9_DGNS_CD_7,LINE_ICD9_DGNS_CD_8,LINE_ICD9_DGNS_CD_9,LINE_ICD9_DGNS_CD_10,LINE_ICD9_DGNS_CD_11,LINE_ICD9_DGNS_CD_12,LINE_ICD9_DGNS_CD_13
0,00013D2EFD8E45D1,887733386680966,2009-07-25,2009-07-25,7245,7244,6272,,,,,,7128675000.0,,,,,,,,,,,,,396635013,,,,,,,,,,,,,97001,,,,,,,,,,,,,50.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,60.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,A,,,,,,,,,,,,,7242,,,,,,,,,,,,


In [12]:
datetime.now()

datetime.datetime(2020, 4, 30, 15, 33, 57, 648538)

#### Adding lookup for Line Diagnosis codes for carrier  claims

In [13]:
q = '''SELECT  
            CC."DESYNPUF_ID", 
            ICD9D1.long_desc as DGNS_CD_1_desc,
            ICD9D2.long_desc as DGNS_CD_2_desc,
            ICD9D3.long_desc as DGNS_CD_3_desc,
            ICD9D4.long_desc as DGNS_CD_4_desc,
            ICD9D5.long_desc as DGNS_CD_5_desc,
            ICD9D6.long_desc as DGNS_CD_6_desc,
            ICD9D7.long_desc as DGNS_CD_7_desc,
            ICD9D8.long_desc as DGNS_CD_8_desc,
            ICD9D9.long_desc as DGNS_CD_8_desc,
            ICD9D10.long_desc as DGNS_CD_8_desc,
            ICD9D11.long_desc as DGNS_CD_8_desc,
            ICD9D12.long_desc as DGNS_CD_8_desc,
            ICD9D13.long_desc as DGNS_CD_8_desc
            
                 
       FROM 
                      carrier_claims as CC
            LEFT JOIN icd9_diagonsis as ICD9D1 ON CC."LINE_ICD9_DGNS_CD_1" = ICD9D1.diagnosis_cd
            LEFT JOIN icd9_diagonsis as ICD9D2 ON CC."LINE_ICD9_DGNS_CD_2" = ICD9D2.diagnosis_cd
            LEFT JOIN icd9_diagonsis as ICD9D3 ON CC."LINE_ICD9_DGNS_CD_3" = ICD9D3.diagnosis_cd
            LEFT JOIN icd9_diagonsis as ICD9D4 ON CC."LINE_ICD9_DGNS_CD_4" = ICD9D4.diagnosis_cd
            LEFT JOIN icd9_diagonsis as ICD9D5 ON CC."LINE_ICD9_DGNS_CD_5" = ICD9D5.diagnosis_cd
            LEFT JOIN icd9_diagonsis as ICD9D6 ON CC."LINE_ICD9_DGNS_CD_6" = ICD9D6.diagnosis_cd
            LEFT JOIN icd9_diagonsis as ICD9D7 ON CC."LINE_ICD9_DGNS_CD_7" = ICD9D7.diagnosis_cd
            LEFT JOIN icd9_diagonsis as ICD9D8 ON CC."LINE_ICD9_DGNS_CD_8" = ICD9D8.diagnosis_cd
            LEFT JOIN icd9_diagonsis as ICD9D9 ON CC."LINE_ICD9_DGNS_CD_8" = ICD9D9.diagnosis_cd
            LEFT JOIN icd9_diagonsis as ICD9D10 ON CC."LINE_ICD9_DGNS_CD_8" = ICD9D10.diagnosis_cd
            LEFT JOIN icd9_diagonsis as ICD9D11 ON CC."LINE_ICD9_DGNS_CD_8" = ICD9D11.diagnosis_cd
            LEFT JOIN icd9_diagonsis as ICD9D12 ON CC."LINE_ICD9_DGNS_CD_8" = ICD9D12.diagnosis_cd
            LEFT JOIN icd9_diagonsis as ICD9D13 ON CC."LINE_ICD9_DGNS_CD_8" = ICD9D13.diagnosis_cd
            
            ;
    '''

LineICD_descDF = query_func(q, conn)
LineICD_descDF.head(5)

Unnamed: 0,DESYNPUF_ID,dgns_cd_1_desc,dgns_cd_2_desc,dgns_cd_3_desc,dgns_cd_4_desc,dgns_cd_5_desc,dgns_cd_6_desc,dgns_cd_7_desc,dgns_cd_8_desc,dgns_cd_8_desc.1,dgns_cd_8_desc.2,dgns_cd_8_desc.3,dgns_cd_8_desc.4,dgns_cd_8_desc.5
0,00013D2EFD8E45D1,Lumbago,,,,,,,,,,,,
1,00013D2EFD8E45D1,Other acute postoperative pain,,,,,,,,,,,,
2,00013D2EFD8E45D1,"Bipolar disorder, unspecified",,,,,,,,,,,,
3,00013D2EFD8E45D1,Sprain of neck,,,,,,,,,,,,
4,00013D2EFD8E45D1,"Unspecified schizophrenia, unspecified",,,,,,,,,,,,


In [14]:
datetime.now()

datetime.datetime(2020, 4, 30, 15, 34, 16, 659594)

#### Adding lookup for HCPCS codes for carrier  claims

In [15]:
q = '''SELECT  
            CC."DESYNPUF_ID", 
            h1."DESCRIPTION" as hcpcs_CD_1_desc,
            h2."DESCRIPTION" as hcpcs_CD_2_desc,
            h3."DESCRIPTION" as hcpcs_CD_3_desc,
            h4."DESCRIPTION" as hcpcs_CD_4_desc,
            h5."DESCRIPTION" as hcpcs_CD_5_desc,
            h6."DESCRIPTION" as hcpcs_CD_6_desc,
            h7."DESCRIPTION" as hcpcs_CD_7_desc,
            h8."DESCRIPTION" as hcpcs_CD_8_desc,
            h9."DESCRIPTION" as hcpcs_CD_9_desc,
            h10."DESCRIPTION" as hcpcs_CD_10_desc,
            h11."DESCRIPTION" as hcpcs_CD_11_desc
            
       FROM 
                      carrier_claims as CC
            LEFT JOIN hcpcs as h1 ON CC."HCPCS_CD_1" = h1."HCPCS"
            LEFT JOIN hcpcs as h2 ON CC."HCPCS_CD_2" = h2."HCPCS"
            LEFT JOIN hcpcs as h3 ON CC."HCPCS_CD_3" = h3."HCPCS"
            LEFT JOIN hcpcs as h4 ON CC."HCPCS_CD_4" = h4."HCPCS"
            LEFT JOIN hcpcs as h5 ON CC."HCPCS_CD_5" = h5."HCPCS"
            LEFT JOIN hcpcs as h6 ON CC."HCPCS_CD_6" = h6."HCPCS"
            LEFT JOIN hcpcs as h7 ON CC."HCPCS_CD_7" = h7."HCPCS"
            LEFT JOIN hcpcs as h8 ON CC."HCPCS_CD_8" = h8."HCPCS"
            LEFT JOIN hcpcs as h9 ON CC."HCPCS_CD_9" = h9."HCPCS"
            LEFT JOIN hcpcs as h10 ON CC."HCPCS_CD_10" = h10."HCPCS"
            LEFT JOIN hcpcs as h11 ON CC."HCPCS_CD_11" = h11."HCPCS"

            ;
    '''

hcpcsDF_carrier = query_func(q, conn)
hcpcsDF_carrier.head(5)

Unnamed: 0,DESYNPUF_ID,hcpcs_cd_1_desc,hcpcs_cd_2_desc,hcpcs_cd_3_desc,hcpcs_cd_4_desc,hcpcs_cd_5_desc,hcpcs_cd_6_desc,hcpcs_cd_7_desc,hcpcs_cd_8_desc,hcpcs_cd_9_desc,hcpcs_cd_10_desc,hcpcs_cd_11_desc
0,00013D2EFD8E45D1,Pt evaluation,,,,,,,,,,
1,00013D2EFD8E45D1,Hosp manage cont drug admin,,,,,,,,,,
2,00013D2EFD8E45D1,Medication management,,,,,,,,,,
3,00013D2EFD8E45D1,Self care mngment training,,,,,,,,,,
4,00013D2EFD8E45D1,Complete cbc w/auto diff wbc,,,,,,,,,,


In [16]:
datetime.now()

datetime.datetime(2020, 4, 30, 15, 34, 46, 279475)

#### Merging ICD diagnostic description to Carrier claims DF

In [17]:
carrier_claimsDF.index

RangeIndex(start=0, stop=599999, step=1)

In [18]:
ICD_descDF.index

RangeIndex(start=0, stop=599999, step=1)

In [19]:
datetime.now()

datetime.datetime(2020, 4, 30, 15, 34, 59, 397845)

In [20]:
carrier_claimsDF = carrier_claimsDF.merge(ICD_descDF, on='DESYNPUF_ID', 
                                              how='inner',  left_index=True, right_index=True)

In [21]:
datetime.now()  

datetime.datetime(2020, 4, 30, 15, 35, 3, 124859)

In [22]:
print(carrier_claimsDF.shape)
carrier_claimsDF.head(2)

(599999, 150)


Unnamed: 0,DESYNPUF_ID,CLM_ID,CLM_FROM_DT,CLM_THRU_DT,ICD9_DGNS_CD_1,ICD9_DGNS_CD_2,ICD9_DGNS_CD_3,ICD9_DGNS_CD_4,ICD9_DGNS_CD_5,ICD9_DGNS_CD_6,ICD9_DGNS_CD_7,ICD9_DGNS_CD_8,PRF_PHYSN_NPI_1,PRF_PHYSN_NPI_2,PRF_PHYSN_NPI_3,PRF_PHYSN_NPI_4,PRF_PHYSN_NPI_5,PRF_PHYSN_NPI_6,PRF_PHYSN_NPI_7,PRF_PHYSN_NPI_8,PRF_PHYSN_NPI_9,PRF_PHYSN_NPI_10,PRF_PHYSN_NPI_11,PRF_PHYSN_NPI_12,PRF_PHYSN_NPI_13,TAX_NUM_1,TAX_NUM_2,TAX_NUM_3,TAX_NUM_4,TAX_NUM_5,TAX_NUM_6,TAX_NUM_7,TAX_NUM_8,TAX_NUM_9,TAX_NUM_10,TAX_NUM_11,TAX_NUM_12,TAX_NUM_13,HCPCS_CD_1,HCPCS_CD_2,HCPCS_CD_3,HCPCS_CD_4,HCPCS_CD_5,HCPCS_CD_6,HCPCS_CD_7,HCPCS_CD_8,HCPCS_CD_9,HCPCS_CD_10,HCPCS_CD_11,HCPCS_CD_12,HCPCS_CD_13,LINE_NCH_PMT_AMT_1,LINE_NCH_PMT_AMT_2,LINE_NCH_PMT_AMT_3,LINE_NCH_PMT_AMT_4,LINE_NCH_PMT_AMT_5,LINE_NCH_PMT_AMT_6,LINE_NCH_PMT_AMT_7,LINE_NCH_PMT_AMT_8,LINE_NCH_PMT_AMT_9,LINE_NCH_PMT_AMT_10,LINE_NCH_PMT_AMT_11,LINE_NCH_PMT_AMT_12,LINE_NCH_PMT_AMT_13,LINE_BENE_PTB_DDCTBL_AMT_1,LINE_BENE_PTB_DDCTBL_AMT_2,LINE_BENE_PTB_DDCTBL_AMT_3,LINE_BENE_PTB_DDCTBL_AMT_4,LINE_BENE_PTB_DDCTBL_AMT_5,LINE_BENE_PTB_DDCTBL_AMT_6,LINE_BENE_PTB_DDCTBL_AMT_7,LINE_BENE_PTB_DDCTBL_AMT_8,LINE_BENE_PTB_DDCTBL_AMT_9,LINE_BENE_PTB_DDCTBL_AMT_10,LINE_BENE_PTB_DDCTBL_AMT_11,LINE_BENE_PTB_DDCTBL_AMT_12,LINE_BENE_PTB_DDCTBL_AMT_13,LINE_BENE_PRMRY_PYR_PD_AMT_1,LINE_BENE_PRMRY_PYR_PD_AMT_2,LINE_BENE_PRMRY_PYR_PD_AMT_3,LINE_BENE_PRMRY_PYR_PD_AMT_4,LINE_BENE_PRMRY_PYR_PD_AMT_5,LINE_BENE_PRMRY_PYR_PD_AMT_6,LINE_BENE_PRMRY_PYR_PD_AMT_7,LINE_BENE_PRMRY_PYR_PD_AMT_8,LINE_BENE_PRMRY_PYR_PD_AMT_9,LINE_BENE_PRMRY_PYR_PD_AMT_10,LINE_BENE_PRMRY_PYR_PD_AMT_11,LINE_BENE_PRMRY_PYR_PD_AMT_12,LINE_BENE_PRMRY_PYR_PD_AMT_13,LINE_COINSRNC_AMT_1,LINE_COINSRNC_AMT_2,LINE_COINSRNC_AMT_3,LINE_COINSRNC_AMT_4,LINE_COINSRNC_AMT_5,LINE_COINSRNC_AMT_6,LINE_COINSRNC_AMT_7,LINE_COINSRNC_AMT_8,LINE_COINSRNC_AMT_9,LINE_COINSRNC_AMT_10,LINE_COINSRNC_AMT_11,LINE_COINSRNC_AMT_12,LINE_COINSRNC_AMT_13,LINE_ALOWD_CHRG_AMT_1,LINE_ALOWD_CHRG_AMT_2,LINE_ALOWD_CHRG_AMT_3,LINE_ALOWD_CHRG_AMT_4,LINE_ALOWD_CHRG_AMT_5,LINE_ALOWD_CHRG_AMT_6,LINE_ALOWD_CHRG_AMT_7,LINE_ALOWD_CHRG_AMT_8,LINE_ALOWD_CHRG_AMT_9,LINE_ALOWD_CHRG_AMT_10,LINE_ALOWD_CHRG_AMT_11,LINE_ALOWD_CHRG_AMT_12,LINE_ALOWD_CHRG_AMT_13,LINE_PRCSG_IND_CD_1,LINE_PRCSG_IND_CD_2,LINE_PRCSG_IND_CD_3,LINE_PRCSG_IND_CD_4,LINE_PRCSG_IND_CD_5,LINE_PRCSG_IND_CD_6,LINE_PRCSG_IND_CD_7,LINE_PRCSG_IND_CD_8,LINE_PRCSG_IND_CD_9,LINE_PRCSG_IND_CD_10,LINE_PRCSG_IND_CD_11,LINE_PRCSG_IND_CD_12,LINE_PRCSG_IND_CD_13,LINE_ICD9_DGNS_CD_1,LINE_ICD9_DGNS_CD_2,LINE_ICD9_DGNS_CD_3,LINE_ICD9_DGNS_CD_4,LINE_ICD9_DGNS_CD_5,LINE_ICD9_DGNS_CD_6,LINE_ICD9_DGNS_CD_7,LINE_ICD9_DGNS_CD_8,LINE_ICD9_DGNS_CD_9,LINE_ICD9_DGNS_CD_10,LINE_ICD9_DGNS_CD_11,LINE_ICD9_DGNS_CD_12,LINE_ICD9_DGNS_CD_13,dgns_cd_1_desc,dgns_cd_2_desc,dgns_cd_3_desc,dgns_cd_4_desc,dgns_cd_5_desc,dgns_cd_6_desc,dgns_cd_7_desc,dgns_cd_8_desc
0,00013D2EFD8E45D1,887733386680966,2009-07-25,2009-07-25,7245,7244,6272.0,,,,,,7128675000.0,,,,,,,,,,,,,396635013,,,,,,,,,,,,,97001,,,,,,,,,,,,,50.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,60.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,A,,,,,,,,,,,,,7242,,,,,,,,,,,,,"Backache, unspecified",Thoracic or lumbosacral neuritis or radiculiti...,Symptomatic menopausal or female climacteric s...,,,,,
1,00013D2EFD8E45D1,887213386947664,2009-10-14,2009-10-14,3598,27541,,,,,,,9382130000.0,,,,,,,,,,,,,815501822,,,,,,,,,,,,,1996,,,,,,,,,,,,,50.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,60.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,A,,,,,,,,,,,,,33818,,,,,,,,,,,,,,Hypocalcemia,,,,,,


#### Merging Line ICD diagnostic description to Carrier claims DF

In [23]:
datetime.now()  

datetime.datetime(2020, 4, 30, 15, 35, 8, 29910)

In [24]:
carrier_claimsDF = carrier_claimsDF.merge(LineICD_descDF, on='DESYNPUF_ID', 
                                              how='inner',  left_index=True, right_index=True)

In [25]:
datetime.now()  

datetime.datetime(2020, 4, 30, 15, 35, 11, 133082)

In [26]:
print(carrier_claimsDF.shape)
carrier_claimsDF.head(2)

(599999, 163)


Unnamed: 0,DESYNPUF_ID,CLM_ID,CLM_FROM_DT,CLM_THRU_DT,ICD9_DGNS_CD_1,ICD9_DGNS_CD_2,ICD9_DGNS_CD_3,ICD9_DGNS_CD_4,ICD9_DGNS_CD_5,ICD9_DGNS_CD_6,ICD9_DGNS_CD_7,ICD9_DGNS_CD_8,PRF_PHYSN_NPI_1,PRF_PHYSN_NPI_2,PRF_PHYSN_NPI_3,PRF_PHYSN_NPI_4,PRF_PHYSN_NPI_5,PRF_PHYSN_NPI_6,PRF_PHYSN_NPI_7,PRF_PHYSN_NPI_8,PRF_PHYSN_NPI_9,PRF_PHYSN_NPI_10,PRF_PHYSN_NPI_11,PRF_PHYSN_NPI_12,PRF_PHYSN_NPI_13,TAX_NUM_1,TAX_NUM_2,TAX_NUM_3,TAX_NUM_4,TAX_NUM_5,TAX_NUM_6,TAX_NUM_7,TAX_NUM_8,TAX_NUM_9,TAX_NUM_10,TAX_NUM_11,TAX_NUM_12,TAX_NUM_13,HCPCS_CD_1,HCPCS_CD_2,HCPCS_CD_3,HCPCS_CD_4,HCPCS_CD_5,HCPCS_CD_6,HCPCS_CD_7,HCPCS_CD_8,HCPCS_CD_9,HCPCS_CD_10,HCPCS_CD_11,HCPCS_CD_12,HCPCS_CD_13,LINE_NCH_PMT_AMT_1,LINE_NCH_PMT_AMT_2,LINE_NCH_PMT_AMT_3,LINE_NCH_PMT_AMT_4,LINE_NCH_PMT_AMT_5,LINE_NCH_PMT_AMT_6,LINE_NCH_PMT_AMT_7,LINE_NCH_PMT_AMT_8,LINE_NCH_PMT_AMT_9,LINE_NCH_PMT_AMT_10,LINE_NCH_PMT_AMT_11,LINE_NCH_PMT_AMT_12,LINE_NCH_PMT_AMT_13,LINE_BENE_PTB_DDCTBL_AMT_1,LINE_BENE_PTB_DDCTBL_AMT_2,LINE_BENE_PTB_DDCTBL_AMT_3,LINE_BENE_PTB_DDCTBL_AMT_4,LINE_BENE_PTB_DDCTBL_AMT_5,LINE_BENE_PTB_DDCTBL_AMT_6,LINE_BENE_PTB_DDCTBL_AMT_7,LINE_BENE_PTB_DDCTBL_AMT_8,LINE_BENE_PTB_DDCTBL_AMT_9,LINE_BENE_PTB_DDCTBL_AMT_10,LINE_BENE_PTB_DDCTBL_AMT_11,LINE_BENE_PTB_DDCTBL_AMT_12,LINE_BENE_PTB_DDCTBL_AMT_13,LINE_BENE_PRMRY_PYR_PD_AMT_1,LINE_BENE_PRMRY_PYR_PD_AMT_2,LINE_BENE_PRMRY_PYR_PD_AMT_3,LINE_BENE_PRMRY_PYR_PD_AMT_4,LINE_BENE_PRMRY_PYR_PD_AMT_5,LINE_BENE_PRMRY_PYR_PD_AMT_6,LINE_BENE_PRMRY_PYR_PD_AMT_7,LINE_BENE_PRMRY_PYR_PD_AMT_8,LINE_BENE_PRMRY_PYR_PD_AMT_9,LINE_BENE_PRMRY_PYR_PD_AMT_10,LINE_BENE_PRMRY_PYR_PD_AMT_11,LINE_BENE_PRMRY_PYR_PD_AMT_12,LINE_BENE_PRMRY_PYR_PD_AMT_13,LINE_COINSRNC_AMT_1,LINE_COINSRNC_AMT_2,LINE_COINSRNC_AMT_3,LINE_COINSRNC_AMT_4,LINE_COINSRNC_AMT_5,LINE_COINSRNC_AMT_6,LINE_COINSRNC_AMT_7,LINE_COINSRNC_AMT_8,LINE_COINSRNC_AMT_9,LINE_COINSRNC_AMT_10,LINE_COINSRNC_AMT_11,LINE_COINSRNC_AMT_12,LINE_COINSRNC_AMT_13,LINE_ALOWD_CHRG_AMT_1,LINE_ALOWD_CHRG_AMT_2,LINE_ALOWD_CHRG_AMT_3,LINE_ALOWD_CHRG_AMT_4,LINE_ALOWD_CHRG_AMT_5,LINE_ALOWD_CHRG_AMT_6,LINE_ALOWD_CHRG_AMT_7,LINE_ALOWD_CHRG_AMT_8,LINE_ALOWD_CHRG_AMT_9,LINE_ALOWD_CHRG_AMT_10,LINE_ALOWD_CHRG_AMT_11,LINE_ALOWD_CHRG_AMT_12,LINE_ALOWD_CHRG_AMT_13,LINE_PRCSG_IND_CD_1,LINE_PRCSG_IND_CD_2,LINE_PRCSG_IND_CD_3,LINE_PRCSG_IND_CD_4,LINE_PRCSG_IND_CD_5,LINE_PRCSG_IND_CD_6,LINE_PRCSG_IND_CD_7,LINE_PRCSG_IND_CD_8,LINE_PRCSG_IND_CD_9,LINE_PRCSG_IND_CD_10,LINE_PRCSG_IND_CD_11,LINE_PRCSG_IND_CD_12,LINE_PRCSG_IND_CD_13,LINE_ICD9_DGNS_CD_1,LINE_ICD9_DGNS_CD_2,LINE_ICD9_DGNS_CD_3,LINE_ICD9_DGNS_CD_4,LINE_ICD9_DGNS_CD_5,LINE_ICD9_DGNS_CD_6,LINE_ICD9_DGNS_CD_7,LINE_ICD9_DGNS_CD_8,LINE_ICD9_DGNS_CD_9,LINE_ICD9_DGNS_CD_10,LINE_ICD9_DGNS_CD_11,LINE_ICD9_DGNS_CD_12,LINE_ICD9_DGNS_CD_13,dgns_cd_1_desc_x,dgns_cd_2_desc_x,dgns_cd_3_desc_x,dgns_cd_4_desc_x,dgns_cd_5_desc_x,dgns_cd_6_desc_x,dgns_cd_7_desc_x,dgns_cd_8_desc_x,dgns_cd_1_desc_y,dgns_cd_2_desc_y,dgns_cd_3_desc_y,dgns_cd_4_desc_y,dgns_cd_5_desc_y,dgns_cd_6_desc_y,dgns_cd_7_desc_y,dgns_cd_8_desc_y,dgns_cd_8_desc_y.1,dgns_cd_8_desc_y.2,dgns_cd_8_desc_y.3,dgns_cd_8_desc_y.4,dgns_cd_8_desc_y.5
0,00013D2EFD8E45D1,887733386680966,2009-07-25,2009-07-25,7245,7244,6272.0,,,,,,7128675000.0,,,,,,,,,,,,,396635013,,,,,,,,,,,,,97001,,,,,,,,,,,,,50.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,60.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,A,,,,,,,,,,,,,7242,,,,,,,,,,,,,"Backache, unspecified",Thoracic or lumbosacral neuritis or radiculiti...,Symptomatic menopausal or female climacteric s...,,,,,,Lumbago,,,,,,,,,,,,
1,00013D2EFD8E45D1,887213386947664,2009-10-14,2009-10-14,3598,27541,,,,,,,9382130000.0,,,,,,,,,,,,,815501822,,,,,,,,,,,,,1996,,,,,,,,,,,,,50.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,60.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,A,,,,,,,,,,,,,33818,,,,,,,,,,,,,,Hypocalcemia,,,,,,,Other acute postoperative pain,,,,,,,,,,,,


In [None]:
datetime.now()

#### Merging HCPCS description to carrier claims DF

In [27]:
datetime.now()  

datetime.datetime(2020, 4, 30, 15, 35, 18, 993227)

In [28]:
carrier_claimsDF = carrier_claimsDF.merge(hcpcsDF_carrier, on='DESYNPUF_ID', 
                                              how='inner',  left_index=True, right_index=True)

In [29]:
datetime.now()  

datetime.datetime(2020, 4, 30, 15, 35, 43, 837357)

In [30]:
print(carrier_claimsDF.shape)
carrier_claimsDF.head(2)

(599999, 174)


Unnamed: 0,DESYNPUF_ID,CLM_ID,CLM_FROM_DT,CLM_THRU_DT,ICD9_DGNS_CD_1,ICD9_DGNS_CD_2,ICD9_DGNS_CD_3,ICD9_DGNS_CD_4,ICD9_DGNS_CD_5,ICD9_DGNS_CD_6,ICD9_DGNS_CD_7,ICD9_DGNS_CD_8,PRF_PHYSN_NPI_1,PRF_PHYSN_NPI_2,PRF_PHYSN_NPI_3,PRF_PHYSN_NPI_4,PRF_PHYSN_NPI_5,PRF_PHYSN_NPI_6,PRF_PHYSN_NPI_7,PRF_PHYSN_NPI_8,PRF_PHYSN_NPI_9,PRF_PHYSN_NPI_10,PRF_PHYSN_NPI_11,PRF_PHYSN_NPI_12,PRF_PHYSN_NPI_13,TAX_NUM_1,TAX_NUM_2,TAX_NUM_3,TAX_NUM_4,TAX_NUM_5,TAX_NUM_6,TAX_NUM_7,TAX_NUM_8,TAX_NUM_9,TAX_NUM_10,TAX_NUM_11,TAX_NUM_12,TAX_NUM_13,HCPCS_CD_1,HCPCS_CD_2,HCPCS_CD_3,HCPCS_CD_4,HCPCS_CD_5,HCPCS_CD_6,HCPCS_CD_7,HCPCS_CD_8,HCPCS_CD_9,HCPCS_CD_10,HCPCS_CD_11,HCPCS_CD_12,HCPCS_CD_13,LINE_NCH_PMT_AMT_1,LINE_NCH_PMT_AMT_2,LINE_NCH_PMT_AMT_3,LINE_NCH_PMT_AMT_4,LINE_NCH_PMT_AMT_5,LINE_NCH_PMT_AMT_6,LINE_NCH_PMT_AMT_7,LINE_NCH_PMT_AMT_8,LINE_NCH_PMT_AMT_9,LINE_NCH_PMT_AMT_10,LINE_NCH_PMT_AMT_11,LINE_NCH_PMT_AMT_12,LINE_NCH_PMT_AMT_13,LINE_BENE_PTB_DDCTBL_AMT_1,LINE_BENE_PTB_DDCTBL_AMT_2,LINE_BENE_PTB_DDCTBL_AMT_3,LINE_BENE_PTB_DDCTBL_AMT_4,LINE_BENE_PTB_DDCTBL_AMT_5,LINE_BENE_PTB_DDCTBL_AMT_6,LINE_BENE_PTB_DDCTBL_AMT_7,LINE_BENE_PTB_DDCTBL_AMT_8,LINE_BENE_PTB_DDCTBL_AMT_9,LINE_BENE_PTB_DDCTBL_AMT_10,LINE_BENE_PTB_DDCTBL_AMT_11,LINE_BENE_PTB_DDCTBL_AMT_12,LINE_BENE_PTB_DDCTBL_AMT_13,LINE_BENE_PRMRY_PYR_PD_AMT_1,LINE_BENE_PRMRY_PYR_PD_AMT_2,LINE_BENE_PRMRY_PYR_PD_AMT_3,LINE_BENE_PRMRY_PYR_PD_AMT_4,LINE_BENE_PRMRY_PYR_PD_AMT_5,LINE_BENE_PRMRY_PYR_PD_AMT_6,LINE_BENE_PRMRY_PYR_PD_AMT_7,LINE_BENE_PRMRY_PYR_PD_AMT_8,LINE_BENE_PRMRY_PYR_PD_AMT_9,LINE_BENE_PRMRY_PYR_PD_AMT_10,LINE_BENE_PRMRY_PYR_PD_AMT_11,LINE_BENE_PRMRY_PYR_PD_AMT_12,LINE_BENE_PRMRY_PYR_PD_AMT_13,LINE_COINSRNC_AMT_1,LINE_COINSRNC_AMT_2,LINE_COINSRNC_AMT_3,LINE_COINSRNC_AMT_4,LINE_COINSRNC_AMT_5,LINE_COINSRNC_AMT_6,LINE_COINSRNC_AMT_7,LINE_COINSRNC_AMT_8,LINE_COINSRNC_AMT_9,LINE_COINSRNC_AMT_10,LINE_COINSRNC_AMT_11,LINE_COINSRNC_AMT_12,LINE_COINSRNC_AMT_13,LINE_ALOWD_CHRG_AMT_1,LINE_ALOWD_CHRG_AMT_2,LINE_ALOWD_CHRG_AMT_3,LINE_ALOWD_CHRG_AMT_4,LINE_ALOWD_CHRG_AMT_5,LINE_ALOWD_CHRG_AMT_6,LINE_ALOWD_CHRG_AMT_7,LINE_ALOWD_CHRG_AMT_8,LINE_ALOWD_CHRG_AMT_9,LINE_ALOWD_CHRG_AMT_10,LINE_ALOWD_CHRG_AMT_11,LINE_ALOWD_CHRG_AMT_12,LINE_ALOWD_CHRG_AMT_13,LINE_PRCSG_IND_CD_1,LINE_PRCSG_IND_CD_2,LINE_PRCSG_IND_CD_3,LINE_PRCSG_IND_CD_4,LINE_PRCSG_IND_CD_5,LINE_PRCSG_IND_CD_6,LINE_PRCSG_IND_CD_7,LINE_PRCSG_IND_CD_8,LINE_PRCSG_IND_CD_9,LINE_PRCSG_IND_CD_10,LINE_PRCSG_IND_CD_11,LINE_PRCSG_IND_CD_12,LINE_PRCSG_IND_CD_13,LINE_ICD9_DGNS_CD_1,LINE_ICD9_DGNS_CD_2,LINE_ICD9_DGNS_CD_3,LINE_ICD9_DGNS_CD_4,LINE_ICD9_DGNS_CD_5,LINE_ICD9_DGNS_CD_6,LINE_ICD9_DGNS_CD_7,LINE_ICD9_DGNS_CD_8,LINE_ICD9_DGNS_CD_9,LINE_ICD9_DGNS_CD_10,LINE_ICD9_DGNS_CD_11,LINE_ICD9_DGNS_CD_12,LINE_ICD9_DGNS_CD_13,dgns_cd_1_desc_x,dgns_cd_2_desc_x,dgns_cd_3_desc_x,dgns_cd_4_desc_x,dgns_cd_5_desc_x,dgns_cd_6_desc_x,dgns_cd_7_desc_x,dgns_cd_8_desc_x,dgns_cd_1_desc_y,dgns_cd_2_desc_y,dgns_cd_3_desc_y,dgns_cd_4_desc_y,dgns_cd_5_desc_y,dgns_cd_6_desc_y,dgns_cd_7_desc_y,dgns_cd_8_desc_y,dgns_cd_8_desc_y.1,dgns_cd_8_desc_y.2,dgns_cd_8_desc_y.3,dgns_cd_8_desc_y.4,dgns_cd_8_desc_y.5,hcpcs_cd_1_desc,hcpcs_cd_2_desc,hcpcs_cd_3_desc,hcpcs_cd_4_desc,hcpcs_cd_5_desc,hcpcs_cd_6_desc,hcpcs_cd_7_desc,hcpcs_cd_8_desc,hcpcs_cd_9_desc,hcpcs_cd_10_desc,hcpcs_cd_11_desc
0,00013D2EFD8E45D1,887733386680966,2009-07-25,2009-07-25,7245,7244,6272.0,,,,,,7128675000.0,,,,,,,,,,,,,396635013,,,,,,,,,,,,,97001,,,,,,,,,,,,,50.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,60.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,A,,,,,,,,,,,,,7242,,,,,,,,,,,,,"Backache, unspecified",Thoracic or lumbosacral neuritis or radiculiti...,Symptomatic menopausal or female climacteric s...,,,,,,Lumbago,,,,,,,,,,,,,Pt evaluation,,,,,,,,,,
1,00013D2EFD8E45D1,887213386947664,2009-10-14,2009-10-14,3598,27541,,,,,,,9382130000.0,,,,,,,,,,,,,815501822,,,,,,,,,,,,,1996,,,,,,,,,,,,,50.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,60.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,A,,,,,,,,,,,,,33818,,,,,,,,,,,,,,Hypocalcemia,,,,,,,Other acute postoperative pain,,,,,,,,,,,,,Hosp manage cont drug admin,,,,,,,,,,


### Adding these DataFrames to the Database (overwriting)

In [31]:
# create an sqlalchemy connection
conn_postgres = f'postgresql://{db_user}:{db_password}@{db_host}:{db_port}'
engine = sqlalchemy.engine.create_engine(conn_postgres)

In [32]:
# connect using sqlalchemy
connection = engine.connect()

In [33]:
# commit
connection.execute('commit')

<sqlalchemy.engine.result.ResultProxy at 0x17457fa90>

In [34]:
# close connection
connection.close()

In [35]:
# open a new connection to the database that we created
conn_str = f'postgresql://{db_user}:{db_password}@{db_host}:{db_port}/{database}'
conn = psycopg2.connect(conn_str)

In [36]:
# connect using sqlalchemy
engine = sqlalchemy.engine.create_engine(conn_str)

#### Loading files to DataBase

##### Carrier_claims

In [37]:
datetime.now()

datetime.datetime(2020, 4, 30, 15, 35, 57, 507120)

In [38]:
carrier_claimsDF.to_sql(name = 'carrier_claims', con = engine, if_exists = 'replace', index = False) 

In [39]:
datetime.now()

datetime.datetime(2020, 4, 30, 15, 56, 24, 12305)

In [41]:
conn.commit() 

In [42]:
del [carrier_claimsDF]
#gc.collect()

carrier_claimsDF = pd.DataFrame()

## ----------------------- To delete from here --------------------------------------------

#### Finding who died in 2010

In [None]:
aa for error and stopping 

In [None]:
died2010 = pd.DataFrame()
died2010[['DESYNPUF_ID', 'BENE_DEATH_DT']] = Beneficiary2010DF[Beneficiary2010DF.BENE_DEATH_DT.notnull()][['DESYNPUF_ID','BENE_DEATH_DT']]
print(died2010.shape)
died2010.head(2)

#### Finding who died in 2009

In [None]:
died2009 = pd.DataFrame()
died2009[['DESYNPUF_ID', 'BENE_DEATH_DT']] = Beneficiary2009DF[Beneficiary2009DF.BENE_DEATH_DT.notnull()][['DESYNPUF_ID','BENE_DEATH_DT']]
print(died2009.shape)
died2009.head(2)

#### Finding who died in 2008

In [None]:
died2008 = pd.DataFrame()
died2008[['DESYNPUF_ID', 'BENE_DEATH_DT']] = Beneficiary2008DF[Beneficiary2008DF.BENE_DEATH_DT.notnull()][['DESYNPUF_ID','BENE_DEATH_DT']]
print(died2008.shape)
died2008.head(2)

In [None]:
# Picking 1 dead patient from 2010 and looking for his/her history in inpatient and outpatient files

#died2010.DESYNPUF_ID

In [None]:
# Finding Inpatient and outpatuient activies of deads

In [None]:
set(InpatientClaimsDF.DESYNPUF_ID).intersection(set(died2010.DESYNPUF_ID))

#InpatientClaimsDF[InpatientClaimsDF.DESYNPUF_ID == '00016F745862898F'] #died2010.DESYNPUF_ID[InpatientClaimsDF.DESYNPUF_ID.index]]

In [None]:
set(OutpatientClaimsDF.DESYNPUF_ID).intersection(set(died2010.DESYNPUF_ID))

#OutpatientClaimsDF[OutpatientClaimsDF.DESYNPUF_ID == '00016F745862898F']#died2010.DESYNPUF_ID[OutpatientClaimsDF.DESYNPUF_ID.index]]

In [None]:
len(set(InpatientClaimsDF.DESYNPUF_ID).intersection(set(died2009.DESYNPUF_ID)))

#InpatientClaimsDF[InpatientClaimsDF.DESYNPUF_ID == died2009.DESYNPUF_ID[InpatientClaimsDF.DESYNPUF_ID.index]]

In [None]:
len(set(OutpatientClaimsDF.DESYNPUF_ID).intersection(set(died2009.DESYNPUF_ID)))

#OutpatientClaimsDF[OutpatientClaimsDF.DESYNPUF_ID == died2009.DESYNPUF_ID[OutpatientClaimsDF.DESYNPUF_ID.index]]

In [None]:
len(set(InpatientClaimsDF.DESYNPUF_ID).intersection(set(died2008.DESYNPUF_ID)))

#InpatientClaimsDF[InpatientClaimsDF.DESYNPUF_ID == died2008.DESYNPUF_ID[InpatientClaimsDF.DESYNPUF_ID.index]]

In [None]:
len(set(OutpatientClaimsDF.DESYNPUF_ID).intersection(set(died2008.DESYNPUF_ID)))

#OutpatientClaimsDF[OutpatientClaimsDF.DESYNPUF_ID == died2008.DESYNPUF_ID[OutpatientClaimsDF.index]]

In [None]:
# checking if patients who died in 2009 are also in 2008 beneficiary list

print(len(set(Beneficiary2008DF.DESYNPUF_ID).intersection(set(died2009.DESYNPUF_ID))))
bene2008_died2009 = set(Beneficiary2008DF.DESYNPUF_ID).intersection(set(died2009.DESYNPUF_ID))

In [None]:
list(bene2008_died2009)[0:5]

In [None]:
# NOW FIND NUMBER OF INPTS & OUTPTS VISITS AND DIAG CODE IN 2008 AND 2009 FOR THESE DEAD PTS IN 2009

In [None]:
# LATER USE THESE TO PREDICT PATIENTS IN 2008 OR 2009 OR 2010, IF THEY WILL DIE OR WILL BE READMITTED OR WILL 
# BE EXPENSIVE IN YEARS TO COME

#### Now checking if patients in 2010 are also in 2009 and 2008 etc

In [None]:
# Any common patients in 2008 and 2009

common2008_2009 = set(Beneficiary2009DF.DESYNPUF_ID).intersection(set(Beneficiary2008DF.DESYNPUF_ID))
len(common2008_2009)

In [None]:
list(common2008_2009)[0:5]

In [None]:
# common patients in 2008 and 2010

set(Beneficiary2010DF.DESYNPUF_ID).intersection(set(Beneficiary2008DF.DESYNPUF_ID))

In [None]:
# common patients in 2009 and 2010

set(Beneficiary2009DF.DESYNPUF_ID).intersection(set(Beneficiary2010DF.DESYNPUF_ID))

In [None]:
# check visits in each year

#### Chekcing Inpatient and outpatient visits of common patients in Year 2008-2009

In [None]:
opd2008 = datetime.strptime(str(OutpatientClaimsDF.CLM_FROM_DT).split('.')[0], '%Y%m%d')
#opd2008.shape

In [None]:
str(OutpatientClaimsDF.CLM_FROM_DT)[5]

In [None]:
# Pt IDs for OPD in 2008
OutpatientID2008 = OutpatientClaimsDF[str(OutpatientClaimsDF.CLM_FROM_DT).split('.')[0], '%Y%m%d').year == 2008][DESYNPUF_ID] 
OutpatientID2008.shape

In [None]:
# Pt IDs for OPD in 2009
OutpatientID2009 OutpatientClaimsDF.DESYNPUF_ID for year 2008

In [None]:
OutpatientID2010

In [None]:
InpatientID2008

In [None]:
InpatientID2009

In [None]:
InpatientID2010

In [None]:
len(set(OutpatientClaimsDF.DESYNPUF_ID).intersection(set(common2008_2009)))

In [None]:
len(set(InpatientClaimsDF.DESYNPUF_ID).intersection(set(common2008_2009)))

#### Trying to stitch history for few patienst who died in 2009 but were also beneficiary in 2008

In [None]:
# pt IDs
list(bene2008_died2009)[0:1]

In [None]:
# look for ICD9 etc from beneficiary in 2008 and 2009
# look for inpatient claims in 2008, opd in 2008, inp in 2009, opd in 2009

In [None]:
Beneficiary2008DF.columns

In [None]:
# Looking for patient's diagnosis details in Beneficiary2008DF for pts who died in 2009

In [None]:
Beneficiary2008DF[Beneficiary2008DF.DESYNPUF_ID.isin (list(bene2008_died2009))]['SP_DIABETES']

In [None]:
# Looking for patient's diagnosis details in Beneficiary2009DF for pts who died in 2009

In [None]:
,'SP_CHF', 'SP_ALZHDMTA'
                                                                                  'SP_CHRNKIDN', 'SP_CNCR', 'SP_COPD', 'SP_DEPRESSN',
       , 'SP_ISCHMCHT', 'SP_OSTEOPRS', 'SP_RA_OA', 'SP_STRKETIA']]

In [None]:
# Looking for patient's diagnosis details in Beneficiary2010DF for pts who died in 2009 - unlikey but checking

In [None]:
InpatientClaimsDF.head(1)

In [None]:
InpatientClaimsDF.CLM_FROM_DT.year

In [None]:
# claims data by year

In [None]:
Inpatientclaims2008 = InpatientClaimsDF[InpatientClaimsDF.CLM_FROM_DT.year]

In [None]:
print(len(Beneficiary1_2010DF.columns))
Beneficiary1_2010DF.columns

In [None]:
# checking if all the columns in 3 files matches

print(len(set(Beneficiary1_2008DF.columns).intersection(set(Beneficiary1_2009DF.columns))))
set(Beneficiary1_2008DF.columns).intersection(set(Beneficiary1_2009DF.columns))

In [None]:
# Combining all benefiicary DataFrames

Beneficiary1_DF= pd.concat([Beneficiary1_2008DF, Beneficiary1_2009DF, Beneficiary1_2010DF], axis=0)  # on rows
Beneficiary1_DF.shape

In [None]:
Beneficiary1_DF.head(2)

In [None]:
# unique beneficiaries

Beneficiary1_DF.DESYNPUF_ID.nunique()

In [None]:
print(len(Inpatient1_DF.columns))
Inpatient1_DF.columns

In [None]:
print(len(Outpatient1_DF.columns))
Outpatient1_DF.columns

In [None]:
# Checking how many columns in inpateint and outpatient matches

print(len(set(Inpatient1_DF.columns).intersection(set(Outpatient1_DF.columns))))
set(Inpatient1_DF.columns).intersection(set(Outpatient1_DF.columns))

In [None]:
# ASSUMING DESYNPUF_ID FOR INPATIENT AND OUTPATUIENTS ARE UNIQUE - CONFIRM

In [None]:
#check if any DESYNPUF_ID are repeated 

In [None]:
Inpatient1_DF.DESYNPUF_ID.value_counts()

In [None]:
Outpatient1_DF.DESYNPUF_ID.value_counts()

In [None]:
#check if DESYNPUF_ID in inpatient matches in outpatient

In [None]:
print(len(Inpatient1_DF.DESYNPUF_ID))
print(len(Outpatient1_DF.DESYNPUF_ID))
print('There are {} common Ids in inpatient & outpatient'.format(len(set(Outpatient1_DF['DESYNPUF_ID']).intersection(set(Inpatient1_DF['DESYNPUF_ID'])))))

In [None]:
# Combining all DataFrames  # CHECK with Postgres or Tableau if there is data loss
#pd.merge(df_a, df_b, on='subject_id', how='inner')

InpatientDF = pd.merge(Beneficiary1_DF,Inpatient1_DF, on= 'DESYNPUF_ID', how='inner') 
OutpatientDF = pd.merge(Beneficiary1_DF, Outpatient1_DF, on= 'DESYNPUF_ID', how='inner') 

# This is wrong
#Inpatient_temp1 = Beneficiary1_DF.merge(Inpatient1_DF, left_index = True, right_index = True) # Merge is inner join by default
#Outpatient_temp1 = Beneficiary1_DF.merge(Outpatient1_DF, left_index = True, right_index = True) # Merge is inner join by default

print(InpatientDF.shape)
print(OutpatientDF.shape)

In [None]:
# Checking how many columns in inpatient and outpatient matches

#print(len(set(Inpatient_temp1.columns).intersection(set(Outpatient_temp1.columns))))
#set(Inpatient_temp1.columns).intersection(set(Outpatient_temp1.columns))

In [None]:
InpatientDF.dtypes

In [None]:
Inpatient_corr = InpatientDF.corr()
Inpatient_corr

In [None]:
fig, ax = plt.subplots(figsize=(60,36))
matrix = np.triu(Inpatient_corr)
ax = sns.heatmap(Inpatient_corr, annot = True,   center= 0 , mask=matrix,  cmap="BuPu" ,fmt='.1g');#cmap= 'coolwarm',

In [None]:
OutpatientDF.dtypes

In [None]:
Outpatient_corr = OutpatientDF.corr()
Outpatient_corr

In [None]:
fig, ax = plt.subplots(figsize=(60,36))
matrix = np.triu(Outpatient_corr)
ax = sns.heatmap(Outpatient_corr, annot = True,   center= 0 , mask=matrix,  cmap="BuPu" ,fmt='.1g');#cmap= 'coolwarm',

In [None]:
# Save the corr in png to analyse in bigger

In [None]:
# Merging inpatient and outpatient DFs but merging the data on same column name while add remaining columns

In [None]:
InpatientDF.shape

In [None]:
Beneficiary1_DF.shape 

In [None]:
Beneficiary1_DF.DESYNPUF_ID.nunique() # some are both inpatients and outpatients (343701-229163)

In [None]:
# how many deaths

Beneficiary1_DF[Beneficiary1_DF.BENE_DEATH_DT.notnull()]['DESYNPUF_ID'].nunique() # for both inpatients and outpatients

In [None]:
Beneficiary_dead_ptsID = Beneficiary1_DF[Beneficiary1_DF.BENE_DEATH_DT.notnull()]['DESYNPUF_ID']

#Beneficiary1_DF[Beneficiary1_DF.BENE_DEATH_DT.notnull()]['BENE_DEATH_DT']

In [None]:
# how many deaths

#InpatientDF[
print(InpatientDF[InpatientDF.BENE_DEATH_DT.notnull()]['DESYNPUF_ID'].nunique())
Inpatient_dead_ptsID = InpatientDF[InpatientDF.BENE_DEATH_DT.notnull()]['DESYNPUF_ID']

In [None]:
# how many deaths 

#OutpatientDF[
print(OutpatientDF[OutpatientDF.BENE_DEATH_DT.notnull()]['DESYNPUF_ID'].nunique())
Outpatient_dead_ptsID = OutpatientDF[OutpatientDF.BENE_DEATH_DT.notnull()]['DESYNPUF_ID']

In [None]:

Beneficiary1_DF.columns

In [None]:
OutpatientDF.head(2)