In [3]:
from __future__ import print_function
import pandas as pd
import numpy as np
import sys
import warnings
warnings.filterwarnings('ignore')


In [4]:
def join_with_hfref(patient_cohort,inpatient_claims):
    """Attach cohort attributes to the inpatient claims.

    Performs a left join of ``patient_cohort`` onto ``inpatient_claims`` using
    the shared ``enrolid`` column, so claims without a matching cohort row are
    kept and receive missing values in the cohort columns.

    Parameters
    ----------
    patient_cohort : pandas.DataFrame
        Cohort table; must contain an ``enrolid`` column.
    inpatient_claims : pandas.DataFrame
        Claims table; must contain an ``enrolid`` column.

    Returns
    -------
    pandas.DataFrame
        The claims columns followed by the cohort's non-key columns.
    """
    return inpatient_claims.merge(patient_cohort, how="left", on="enrolid")

In [5]:
def preprocess(inpatient_claims_joined):
    """Reshape joined claims into one row per (claim, diagnosis slot).

    For each of the diagnosis columns ``dx1``, ``dx2``, ``dx3``, ``dx4`` and
    ``pdx`` a copy of the claim's carried-over columns is produced with the
    code shortened to its first three characters in ``dx_truncated``. The five
    per-slot tables are stacked and rows without a diagnosis code are removed.

    Parameters
    ----------
    inpatient_claims_joined : pandas.DataFrame
        Must contain ``dx1``..``dx4``, ``pdx`` and the columns ``enrolid``,
        ``svcdate``, ``indexdate``, ``sex``, ``age``, ``region``, ``plantyp``,
        ``datatyp``, ``rx``. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        Columns ``enrolid, svcdate, indexdate, dx_truncated, sex, age, region,
        plantyp, datatyp, rx``. The index comes from the stacked table and is
        not reset after filtering, so it may contain gaps.

    Raises
    ------
    KeyError
        If any of the required columns is missing.
    """
    dx_columns = ['dx1', 'dx2', 'dx3', 'dx4', 'pdx']
    carried_columns = ['enrolid', 'svcdate', 'indexdate', 'sex', 'age', 'region', 'plantyp', 'datatyp', 'rx']
    output_columns = ['enrolid', 'svcdate', 'indexdate', 'dx_truncated', 'sex', 'age', 'region', 'plantyp', 'datatyp', 'rx']

    per_slot_frames = []
    for dx_column in dx_columns:
        slot_frame = inpatient_claims_joined[carried_columns].copy()
        # na_action='ignore' keeps missing codes missing instead of turning
        # them into text ('nan' for NaN, 'Non' for None) that would then be
        # counted as a diagnosis concept.
        slot_frame['dx_truncated'] = inpatient_claims_joined[dx_column].map(
            lambda code: str(code)[0:3], na_action='ignore')
        per_slot_frames.append(slot_frame[output_columns])

    inpatient_concat = pd.concat(per_slot_frames, ignore_index=True)
    # The literal 'nan' check is kept so input whose missing codes were already
    # rendered as text is still filtered the way it was before.
    has_code = inpatient_concat['dx_truncated'].notna() & (inpatient_concat['dx_truncated'] != 'nan')

    # TODO (placeholder carried over from the notebook): space for
    # incorporating PROC & NDCNUM.
    return inpatient_concat[has_code]
    
    
def truncate_code(dx_code):
    """Return the first three characters of ``dx_code`` rendered as text.

    The value is converted with ``str()`` first, so non-string inputs are
    accepted; a float NaN therefore yields ``'nan'``.
    """
    return str(dx_code)[:3]

In [6]:
def calculate_clusters(inpatient_concat):
    """Count claim rows per truncated diagnosis code.

    Groups ``inpatient_concat`` by ``dx_truncated``, counts the non-null
    ``enrolid`` values in each group, and orders the codes from most to least
    frequent. The first rows of the result are printed.

    Parameters
    ----------
    inpatient_concat : pandas.DataFrame
        Must contain ``enrolid`` and ``dx_truncated`` columns.

    Returns
    -------
    pandas.DataFrame
        Columns ``dx_truncated`` and ``enrolid_count``, sorted by
        ``enrolid_count`` descending.
    """
    code_counts = (
        inpatient_concat[['enrolid', 'dx_truncated']]
        .groupby(['dx_truncated'])
        .count()
        .reset_index()
        .sort_values(by='enrolid', ascending=False)
        .rename(columns={"enrolid": "enrolid_count"})
    )
    print (code_counts.head())

    return code_counts
    

In [7]:
def remove_low_frequency(inpatient_concat,inpatient_clustered,min_quantile=.30):
    """Drop claim rows whose diagnosis concept is among the least frequent.

    A concept is kept when its ``enrolid_count`` is strictly greater than the
    ``min_quantile`` quantile of all concept counts. Progress information is
    printed along the way.

    Parameters
    ----------
    inpatient_concat : pandas.DataFrame
        Claim rows with a ``dx_truncated`` column.
    inpatient_clustered : pandas.DataFrame
        One row per concept with ``dx_truncated`` and ``enrolid_count``
        columns. It is not modified.
    min_quantile : float, optional
        Quantile of ``enrolid_count`` used as the cut-off (default 0.30).

    Returns
    -------
    pandas.DataFrame
        The rows of ``inpatient_concat`` whose ``dx_truncated`` survived.
    """
    # Work on a copy so the caller's concept table does not silently gain an
    # 'occur_rate' column.
    inpatient_clustered = inpatient_clustered.copy()
    total_occur = inpatient_clustered['enrolid_count'].sum()
    print ("Total: No of Concepts ", inpatient_clustered.shape[0])
    print ("Total: code occurences:", total_occur)
    # Convert to float before dividing so the rate is a true ratio even where
    # integer division would otherwise truncate it to 0.
    inpatient_clustered['occur_rate'] = inpatient_clustered['enrolid_count'].astype(float) / total_occur
    print (inpatient_clustered.head())

    ## Remove least frequently occurring codes
    count_cutoff = inpatient_clustered['enrolid_count'].quantile(min_quantile)
    inpatient_clustered_final = inpatient_clustered[inpatient_clustered['enrolid_count'] > count_cutoff]
    print (inpatient_clustered_final.head())
    print ("Final total codes: ", inpatient_clustered_final.shape)

    ## Subset original claims df by leftover codes (inpatient concat)
    kept_codes = inpatient_clustered_final['dx_truncated']
    inpatient_concat_final = inpatient_concat[inpatient_concat['dx_truncated'].isin(kept_codes)]
    print(inpatient_concat_final.head())

    return inpatient_concat_final

In [8]:
def remove_low_variance(inpatient_concat_final, threshold=0.5):
    """Drop claim rows whose diagnosis concept barely varies across visits.

    Each row gets a ``count`` column holding the number of non-null ``age``
    values sharing its (``enrolid``, ``svcdate``, ``dx_truncated``). The counts
    are pivoted to a visit-by-concept table (missing cells filled with 0), and
    concepts whose column standard deviation is below ``threshold`` are
    removed. Concept counts before and after are printed.

    Parameters
    ----------
    inpatient_concat_final : pandas.DataFrame
        Must contain ``enrolid``, ``svcdate``, ``dx_truncated`` and ``age``.
        It is not modified.
    threshold : float, optional
        Minimum standard deviation a concept needs to be kept (default 0.5).

    Returns
    -------
    pandas.DataFrame
        Rows of the input for the surviving concepts, with the added ``count``
        column, sorted by ``enrolid``.
    """
    # Copy first: the input is typically a filtered view of a larger table,
    # and adding a column to it would both warn and leak into the caller.
    claims = inpatient_concat_final.copy()
    claims['count'] = claims.groupby(['enrolid','svcdate','dx_truncated'])['age'].transform('count')
    print('Before removing: No of Concepts ',claims['dx_truncated'].nunique())

    inpatient_pivot = claims.pivot_table(index=['enrolid', 'svcdate'], columns='dx_truncated', values='count').fillna(0)
    concept_std = inpatient_pivot.std()
    # Written as "not below threshold" so a concept whose deviation is NaN is
    # kept, exactly as when only the below-threshold columns are dropped.
    kept_concepts = concept_std[~(concept_std < threshold)].index

    inpatient_concat_final_2 = claims[claims['dx_truncated'].isin(kept_concepts)].sort_values('enrolid')
    print('After removing: No of Concepts ',inpatient_concat_final_2['dx_truncated'].nunique())

    return inpatient_concat_final_2

    

In [9]:
def final_table(inpatient_concat_final_2):
    """Add a ``StaticVar`` column summarising the per-patient static fields.

    ``StaticVar`` is the text of ``sex``, ``age``, ``region``, ``plantyp``,
    ``datatyp`` and ``rx`` (in that order) joined with ``', '``. The column is
    written onto the frame that was passed in, and that same frame is returned.
    """
    static_fields = ('sex', 'age', 'region', 'plantyp', 'datatyp', 'rx')
    text_columns = [inpatient_concat_final_2[field].astype(str) for field in static_fields]

    static_var = text_columns[0]
    for text_column in text_columns[1:]:
        static_var = static_var + ', ' + text_column

    inpatient_concat_final_2['StaticVar'] = static_var
    return inpatient_concat_final_2



In [10]:
def call_to_function(patient_cohort,inpatient_claims,output_file='Inpatient_dxcodes_final.csv'):
    """Run the diagnosis-code pipeline end to end and save the result.

    Steps, in order: join claims with the cohort, reshape to one row per
    diagnosis code, count concepts, drop low-frequency concepts, drop
    low-variance concepts, build ``StaticVar``, remove the columns that were
    folded into it, and write the table as CSV.

    Parameters
    ----------
    patient_cohort : pandas.DataFrame
        Cohort table (joined on ``enrolid``).
    inpatient_claims : pandas.DataFrame
        Inpatient claims table.
    output_file : str, optional
        Destination CSV path. Defaults to ``'Inpatient_dxcodes_final.csv'``.

    Returns
    -------
    pandas.DataFrame
        The table that was written to ``output_file``.
    """
    inpatient_claims_joined=join_with_hfref(patient_cohort,inpatient_claims)
    inpatient_concat=preprocess(inpatient_claims_joined)
    inpatient_clustered=calculate_clusters(inpatient_concat)
    inpatient_concat_final=remove_low_frequency(inpatient_concat,inpatient_clustered)
    inpatient_concat_final_2=remove_low_variance(inpatient_concat_final)
    inpatient_concat_final_2=final_table(inpatient_concat_final_2)

    # These columns are either summarised in StaticVar or were only needed as
    # intermediates; drop them without mutating the frame in place.
    folded_columns = ['indexdate','sex' , 'age',  'region'  ,'plantyp','datatyp',  'rx',  'count']
    inpatient_concat_final_2 = inpatient_concat_final_2.drop(columns=folded_columns)

    print(inpatient_concat_final_2.dx_truncated.unique())
    inpatient_concat_final_2.to_csv(output_file, sep=",",index=False)
    print(inpatient_concat_final_2.head())

    return inpatient_concat_final_2
    

In [12]:
def main():
    """Load the cohort and inpatient-claims CSVs and run the pipeline.

    Cohort column names are lower-cased before the tables are handed to
    ``call_to_function``.
    """
    cohort = pd.read_csv("hfref_pt_index.csv", low_memory=False)
    cohort.columns = [column_name.lower() for column_name in cohort.columns]
    claims = pd.read_csv("concepts_data/patient_cohort_inpatient_claims.csv", low_memory=False)
    call_to_function(cohort, claims)


if __name__ == "__main__":
    main()

     dx_truncated  enrolid_count
1454          I50         990473
355           428         711604
354           427         474646
1162          E11         455556
343           414         438274
Total: No of Concepts  2446
Total: code occurences: 18822202
     dx_truncated  enrolid_count  occur_rate
1454          I50         990473    0.052623
355           428         711604    0.037807
354           427         474646    0.025217
1162          E11         455556    0.024203
343           414         438274    0.023285
     dx_truncated  enrolid_count  occur_rate
1454          I50         990473    0.052623
355           428         711604    0.037807
354           427         474646    0.025217
1162          E11         455556    0.024203
343           414         438274    0.023285
Final total codes:  (1712, 3)
       enrolid     svcdate indexdate dx_truncated  sex  age  region  plantyp  \
0    622513601  2015-03-20   9/27/16          427    2   67       2      2.0   
1   2573874

If the cell above fails with an IOPub error, run the cell below to check the concepts.

In [None]:
# Reload the diagnosis-code table saved by call_to_function(). The file name
# must match the one written there exactly ('Inpatient_dxcodes_final.csv'):
# a different capitalisation is not found on case-sensitive filesystems.
medical_codes=pd.read_csv('Inpatient_dxcodes_final.csv')
medical_codes.dx_truncated.unique()

## Procedure decodes for PROCTYP 1 and 7

In [13]:
# Load the cohort (with lower-cased column names) and the inpatient claims,
# then left-join the cohort onto the claims by enrolid.
patient_cohort = pd.read_csv("hfref_pt_index.csv", low_memory=False)
patient_cohort.columns = [column_name.lower() for column_name in patient_cohort.columns]
inpatient_claims = pd.read_csv("concepts_data/patient_cohort_inpatient_claims.csv", low_memory=False)
inpatient_claims_joined = inpatient_claims.merge(patient_cohort, how="left", on="enrolid")


In [14]:
# Claims with proctyp '1' that carry a procedure code.
subset = inpatient_claims_joined.loc[inpatient_claims_joined['proctyp'] == '1']
subset = subset.loc[subset['proc1'].notna()]

# Split by code shape: codes without an 'F' or 'T' go to subset_cat1 and are
# converted to integers; codes containing an 'F' go to subset_cat2.
# NOTE(review): presumably 'F'/'T' mark CPT Category II/III codes; codes that
# contain only a 'T' end up in neither subset - confirm that is intended.
subset_cat1 = subset[np.logical_not(subset['proc1'].str.contains('F|T'))]
subset_cat2 = subset[subset['proc1'].str.contains('F')]
subset_cat1['proc1'] = subset_cat1['proc1'].astype(int)

In [15]:
# Label each numeric procedure code by the 10,000-wide band it falls in.
# Codes outside 0-99999 keep the empty label.
_numeric_code_bands = [
    (0, 9999, 'Anesthesia Services'),
    (10000, 19999, 'Integumentary System'),
    (20000, 29999, 'Musculoskeletal System'),
    (30000, 39999, 'Respiratory, Cardiovascular, Hemic, and Lymphatic System'),
    (40000, 49999, 'Digestive System'),
    (50000, 59999, 'Urinary, Male Genital, Female Genital, Maternity Care, and Delivery System'),
    (60000, 69999, 'Endocrine, Nervous, Eye and Ocular Adnexa, Auditory System'),
    (70000, 79999, 'Radiology Services'),
    (80000, 89999, 'Pathology and Laboratory Services'),
    (90000, 99999, 'Evaluation & Management Services'),
]

subset_cat1['Proc1_decodes'] = ''
for _band_low, _band_high, _band_label in _numeric_code_bands:
    _in_band = (subset_cat1['proc1'] >= _band_low) & (subset_cat1['proc1'] <= _band_high)
    subset_cat1.loc[_in_band, 'Proc1_decodes'] = _band_label

In [16]:
# This helper deliberately has its own name. Defining a second truncate_code()
# here would shadow the 3-character diagnosis-code helper defined earlier in
# the notebook for any code that calls it after this cell has run.
def truncate_proc_code(proc_code):
    """Return the first four characters of ``proc_code`` rendered as text."""
    return str(proc_code)[0:4]

# Integer value of the leading four characters of each 'F' code.
subset_cat2['proc1_int_part'] = subset_cat2["proc1"].apply(truncate_proc_code).astype(int)

In [17]:
# Label each 'F' code by the range its leading four digits fall in. Values
# outside every range keep the empty label.
_f_code_ranges = [
    (1, 15, 'Composite measures'),
    (500, 575, 'Patient management'),
    (1000, 1220, 'Patient history'),
    (2000, 2050, 'Physical examination'),
    (3006, 3573, 'Diagnostic/screening processes or results'),
    (4000, 4306, 'Therapeutic, preventive or other interventions'),
    (5005, 5100, 'Follow-up or other outcomes'),
    (6005, 6045, 'Patient safety'),
    (7010, 7025, 'Structural Measures'),
]

subset_cat2['Proc1_decodes'] = ''
for _range_low, _range_high, _range_label in _f_code_ranges:
    _in_range = (subset_cat2['proc1_int_part'] >= _range_low) & (subset_cat2['proc1_int_part'] <= _range_high)
    subset_cat2.loc[_in_range, 'Proc1_decodes'] = _range_label

In [18]:
# Claims with proctyp '7' that carry a procedure code.
subset_hcpc = inpatient_claims_joined[
    (inpatient_claims_joined['proctyp'] == '7') & inpatient_claims_joined['proc1'].notna()
]

In [19]:
# Label each proctyp '7' code by its leading letter. Codes starting with any
# other character keep the empty label.
_leading_letter_labels = [
    ('A', 'Transportation Services Including Ambulance, Medical & Surgical Supplies'),
    ('B', 'Enteral and Parenteral Therapy'),
    ('C', 'Temporary Codes for Use with Outpatient Prospective Payment System'),
    ('E', 'Durable Medical Equipment (DME)'),
    ('G', 'Procedures/Professional Services (Temporary Codes)'),
    ('H', 'Alcohol and Drug Abuse Treatment Services / Rehabilitative Services'),
    ('J', 'Drugs administered other than oral method, chemotherapy drugs'),
    ('K', "Durable Medical Equipment for Medicare Administrative Contractors (DME MACs)"),
    ('L', 'Orthotic and Prosthetic Procedures, Devices'),
    ('M', 'Medical services'),
    ('P', 'Pathology and Laboratory Services'),
    ('Q', 'Miscellaneous Services (Temporary Codes)'),
    ('R', 'Diagnostic Radiology Services'),
    ('S', 'Commercial Payers (Temporary Codes)'),
    ('T', 'Established for State Medical Agencies'),
    ('V', 'Vision, Hearing and Speech-Language Pathology Services'),
]

subset_hcpc['Proc1_decodes'] = ''
for _leading_letter, _letter_label in _leading_letter_labels:
    _starts_with_letter = subset_hcpc['proc1'].str.startswith(_leading_letter)
    subset_hcpc.loc[_starts_with_letter, 'Proc1_decodes'] = _letter_label

In [20]:
# Stack the three decoded subsets and keep only rows that received a label.
final_subset = pd.concat([subset_cat1, subset_cat2, subset_hcpc])
final_df = final_subset.loc[final_subset['Proc1_decodes'] != '']

# Count rows per procedure concept, most frequent first.
inpatient_clustered_procs = (
    final_df[['enrolid', 'Proc1_decodes']]
    .groupby(['Proc1_decodes'])
    .count()
    .reset_index()
    .sort_values(by='enrolid', ascending=False)
    .rename(columns={"enrolid": "enrolid_count"})
)

In [21]:
# Share of all labelled procedure rows that each concept accounts for.
total_occur = inpatient_clustered_procs['enrolid_count'].sum()
print ("Total: No of Concepts ", inpatient_clustered_procs.shape[0])
print ("Total: code occurences:", total_occur)
inpatient_clustered_procs['occur_rate'] = inpatient_clustered_procs['enrolid_count'] / total_occur
print (inpatient_clustered_procs.head())

# Keep concepts whose count is strictly above the 30th percentile of counts.
_count_cutoff = inpatient_clustered_procs['enrolid_count'].quantile(.30)
_frequent_proc_concepts = inpatient_clustered_procs[inpatient_clustered_procs['enrolid_count'] > _count_cutoff]
print (_frequent_proc_concepts.head())
print ("Final total codes: ", _frequent_proc_concepts.shape)

# Claim rows that belong to the surviving concepts.
inpatient_clustered_final_procs = final_df[final_df['Proc1_decodes'].isin(_frequent_proc_concepts['Proc1_decodes'])]
print(inpatient_clustered_final_procs.head())
    

Total: No of Concepts  31
Total: code occurences: 3916781
                                        Proc1_decodes  enrolid_count  \
12                   Evaluation & Management Services        2634870   
24                                 Radiology Services         537761   
18                  Pathology and Laboratory Services         387555   
25  Respiratory, Cardiovascular, Hemic, and Lympha...         106875   
1                                 Anesthesia Services          68633   

    occur_rate  
12    0.672713  
24    0.137297  
18    0.098947  
25    0.027286  
1     0.017523  
                                        Proc1_decodes  enrolid_count  \
12                   Evaluation & Management Services        2634870   
24                                 Radiology Services         537761   
18                  Pathology and Laboratory Services         387555   
25  Respiratory, Cardiovascular, Hemic, and Lympha...         106875   
1                                 Anesthesia Se

In [22]:
# Per-row count of non-null 'age' values sharing the same patient, service
# date and procedure concept.
inpatient_clustered_final_procs['count'] = (
    inpatient_clustered_final_procs
    .groupby(['enrolid','svcdate','Proc1_decodes'])['age']
    .transform('count')
)
print('Before removing: No of Concepts ',inpatient_clustered_final_procs['Proc1_decodes'].nunique())

# Standard-deviation cut-off: concepts that vary less than this across visits
# are removed.
threshold = 0.5
inpatient_pivot_procs = inpatient_clustered_final_procs.pivot_table(
    index=['enrolid', 'svcdate'], columns='Proc1_decodes', values='count'
).fillna(0)
_concept_std = inpatient_pivot_procs.std()
inpatient_pivot_procs = inpatient_pivot_procs.drop(columns=_concept_std[_concept_std < threshold].index.values)
concepts_final_procs = inpatient_pivot_procs.columns.to_frame()

_is_kept_concept = inpatient_clustered_final_procs['Proc1_decodes'].isin(concepts_final_procs['Proc1_decodes'])
inpatient_clustered_final_procs_2 = inpatient_clustered_final_procs[_is_kept_concept].sort_values('enrolid')
print('After removing: No of Concepts ',inpatient_clustered_final_procs_2['Proc1_decodes'].nunique())
    

Before removing: No of Concepts  21
After removing: No of Concepts  5


In [23]:
# StaticVar: the text of sex, age, region, plantyp, datatyp and rx (in that
# order) joined with ', '.
_static_fields = ('sex', 'age', 'region', 'plantyp', 'datatyp', 'rx')
_static_text = inpatient_clustered_final_procs_2[_static_fields[0]].astype(str)
for _static_field in _static_fields[1:]:
    _static_text = _static_text + ', ' + inpatient_clustered_final_procs_2[_static_field].astype(str)
inpatient_clustered_final_procs_2['StaticVar'] = _static_text


In [24]:
#inpatient_clustered_final_procs_2.drop(['coins','copay','deduct','dx1','dx2','dx3','dx4','pdx','pay','proc1','proc1_int_part','indexdate','sex' , 'age',  'region' ,'proctyp' ,'plantyp','datatyp',  'rx', 'pproc', 'count','worsen','worsening_dt','stdplac'],axis=1,inplace=True)

In [None]:
# Keep only the columns that go into the exported procedure table.
inpatient_clustered_final_procs_2 = inpatient_clustered_final_procs_2.loc[
    :, ['enrolid','svcdate','Proc1_decodes','StaticVar']
]



In [25]:
# Save the procedure table. The row index is written as the first column.
# NOTE(review): the diagnosis export uses index=False; confirm whether the
# index is wanted here.
inpatient_clustered_final_procs_2.to_csv(path_or_buf='inpatient_proc_codes_final.csv')

In [None]:
# Number of exported rows per procedure concept.
inpatient_clustered_final_procs_2['Proc1_decodes'].value_counts()
