In [2]:
from __future__ import print_function
import pandas as pd
import dask.dataframe as dd
import numpy as np
import sys
import gc
from multiprocessing.pool import Pool
from tqdm import tqdm_notebook as tqdm

import warnings
warnings.filterwarnings('ignore')

In [2]:
def join_with_hfref(patient_cohort,outpatient_claims):
    """Attach cohort columns to every outpatient claim row.

    Performs a left merge keyed on ``enrolid`` with the claims on the left,
    so each claim row is kept and cohort columns are missing where no cohort
    row shares the claim's ``enrolid``.

    Parameters
    ----------
    patient_cohort : pandas.DataFrame
        Must contain an ``enrolid`` column.
    outpatient_claims : pandas.DataFrame
        Must contain an ``enrolid`` column.

    Returns
    -------
    pandas.DataFrame
        Claims columns followed by the cohort's non-key columns.
    """
    gc.collect()
    joined = outpatient_claims.merge(patient_cohort, how="left", on="enrolid")
    print('Join Completed')
    return joined

In [3]:
def preprocess(outpatient_claims_joined, dx_columns=('dx1', 'dx2', 'dx3', 'dx4'), code_length=3):
    """Stack the diagnosis columns into one long table of truncated codes.

    For each column in ``dx_columns`` the claim's static columns are paired
    with the first ``code_length`` characters of ``str(code)``; the per-column
    tables are concatenated in the order given (with a fresh 0..n-1 index) and
    rows without a diagnosis code are then removed, so the surviving index
    labels are positions in the stacked table.

    Missing codes are detected with ``notna()`` on the raw value instead of
    comparing the truncated text to ``'nan'``, so ``None``/``NA`` entries are
    dropped as well. The truncation is done locally rather than through the
    module-level ``truncate_code`` helper, so rebinding that name elsewhere in
    the notebook cannot change the prefix length used here.

    Parameters
    ----------
    outpatient_claims_joined : pandas.DataFrame
        Must contain ``enrolid``, ``svcdate``, ``indexdate``, ``sex``, ``age``,
        ``region``, ``plantyp``, ``datatyp``, ``rx`` and every column named in
        ``dx_columns``.
    dx_columns : sequence of str, optional
        Diagnosis columns to stack, in output order.
    code_length : int, optional
        Number of leading characters of each code to keep.

    Returns
    -------
    pandas.DataFrame
        Columns ``enrolid, svcdate, indexdate, dx_truncated, sex, age, region,
        plantyp, datatyp, rx``.
    """
    gc.collect()
    static_columns = ['enrolid', 'svcdate', 'indexdate', 'sex', 'age',
                      'region', 'plantyp', 'datatyp', 'rx']
    output_columns = ['enrolid', 'svcdate', 'indexdate', 'dx_truncated', 'sex', 'age',
                      'region', 'plantyp', 'datatyp', 'rx']

    def _shorten(code):
        return str(code)[:code_length]

    # Only the needed columns are copied per diagnosis column; truncating
    # before stacking keeps each column's own dtype when codes become text.
    frames = [
        outpatient_claims_joined[static_columns].assign(
            dx_truncated=outpatient_claims_joined[dx_column].map(_shorten, na_action='ignore')
        )
        for dx_column in dx_columns
    ]

    stacked = pd.concat(frames, ignore_index=True)
    outpatient_concat = stacked.loc[stacked['dx_truncated'].notna(), output_columns]

    del frames, stacked
    gc.collect()

    print('Column PreProcessing Completed')
    return outpatient_concat
    
    
def truncate_code(dx_code):
    """Return the first three characters of ``str(dx_code)``.

    The value is converted with ``str`` first, so non-string inputs are
    accepted; inputs shorter than three characters are returned whole.
    """
    return str(dx_code)[:3]

In [4]:
def calculate_clusters(outpatient_concat):
    """Count claim rows per truncated diagnosis code, most frequent first.

    Parameters
    ----------
    outpatient_concat : pandas.DataFrame
        Must contain ``enrolid`` and ``dx_truncated``.

    Returns
    -------
    pandas.DataFrame
        One row per ``dx_truncated`` value with ``enrolid_count`` holding the
        number of non-null ``enrolid`` entries for that code, sorted by that
        count in descending order. The first rows are also printed.
    """
    gc.collect()
    code_counts = (
        outpatient_concat[['enrolid', 'dx_truncated']]
        .groupby(['dx_truncated'])
        .count()
        .reset_index()
        .sort_values(by='enrolid', ascending=False)
        .rename(columns={'enrolid': 'enrolid_count'})
    )
    print (code_counts.head())

    gc.collect()

    print('Clusters Computed')
    return code_counts

In [5]:
def remove_low_frequency(outpatient_concat,outpatient_clustered, quantile=0.30):
    """Keep only claim rows whose diagnosis code is frequent enough.

    A code is kept when its ``enrolid_count`` is strictly greater than the
    ``quantile``-th quantile of all ``enrolid_count`` values. Summary
    information (totals, per-code occurrence rate, surviving codes) is
    printed along the way.

    The occurrence rate is computed on a copy, so the caller's
    ``outpatient_clustered`` frame is left without an ``occur_rate`` column.

    Parameters
    ----------
    outpatient_concat : pandas.DataFrame
        Long claims table with a ``dx_truncated`` column.
    outpatient_clustered : pandas.DataFrame
        Per-code counts with ``dx_truncated`` and ``enrolid_count`` columns.
    quantile : float, optional
        Quantile of ``enrolid_count`` used as the exclusive lower cut-off.

    Returns
    -------
    pandas.DataFrame
        Rows of ``outpatient_concat`` whose code survived the cut-off, with
        their original index labels.
    """
    gc.collect()
    total_occur = outpatient_clustered['enrolid_count'].sum()
    print ("Total: No of Concepts ", outpatient_clustered.shape[0])
    print ("Total: code occurences:", total_occur)

    # assign() returns a new frame, leaving the argument untouched.
    code_stats = outpatient_clustered.assign(
        occur_rate=outpatient_clustered['enrolid_count'] / total_occur
    )
    print (code_stats.head())

    cutoff = code_stats['enrolid_count'].quantile(quantile)
    frequent_codes = code_stats[code_stats['enrolid_count'] > cutoff]
    print (frequent_codes.head())
    print ("Final total codes: ", frequent_codes.shape)

    keep_row = outpatient_concat['dx_truncated'].isin(frequent_codes['dx_truncated'])
    outpatient_concat_final = outpatient_concat[keep_row]
    print(outpatient_concat_final.head())

    gc.collect()

    print('Removed Low frequency occurences')
    return outpatient_concat_final

In [6]:
def pivot_outpatient_remove_low_variance(input_path='outpatient_concat_final_bigfile.csv',
                                         output_path='outpatient_dxcodes_final.csv'):
    """Filter low-variance diagnosis codes year by year and write the result.

    Reads ``input_path``, derives a calendar ``year`` from ``svcdate``, runs
    ``remove_low_variance_new`` on each year's rows, concatenates the
    per-year results, collapses the static patient columns into a single
    comma-separated ``StaticVar`` string, drops the columns that are no
    longer needed and writes the table to ``output_path``.

    Per-year results are held in a local list and concatenated once, rather
    than being stored as ``df_pivot_<year>`` entries in ``globals()`` and
    prepended to a growing frame inside a loop.

    Parameters
    ----------
    input_path : str, optional
        CSV to read; it must contain an ``Unnamed: 0`` column (dropped here)
        plus ``svcdate`` and the columns used below.
    output_path : str, optional
        Destination CSV.

    Returns
    -------
    None
    """
    df = pd.read_csv(input_path)
    df = df.drop(columns=['Unnamed: 0'])
    df['svcdate'] = pd.to_datetime(df['svcdate'])
    df['year'] = df['svcdate'].dt.year
    years = list(df['year'].unique())

    # Slice one year at a time so only one per-year input copy exists at once.
    yearly_frames = []
    for year in tqdm(years, 'Removing Low Variance step by step'):
        print ('For Year: ', year)
        yearly_frames.append(remove_low_variance_new(df[df['year'] == year]))

    print('Starting Concatenation: ')
    # Last-processed year first: the same row order that prepending each
    # year's frame to the running result used to give.
    new_df = pd.concat(yearly_frames[::-1])
    print('Ended Concatenation: ')

    static_columns = ['sex', 'age', 'region', 'plantyp', 'datatyp', 'rx']
    static_var = new_df[static_columns[0]].astype(str)
    for column in static_columns[1:]:
        static_var = static_var + ', ' + new_df[column].astype(str)
    new_df['StaticVar'] = static_var

    new_df = new_df.drop(
        columns=['indexdate', 'sex', 'age', 'region', 'plantyp', 'datatyp', 'rx', 'count', 'year']
    )
    new_df.to_csv(output_path)
    print(new_df.head())

In [7]:
def remove_low_variance_new(outpatient_concat_final, threshold=0.5):
    """Drop claim rows whose diagnosis code varies too little across visits.

    The rows are pivoted to one row per ``(enrolid, svcdate)`` and one column
    per ``dx_truncated`` code (``pivot_table`` default aggregation of
    ``count``, missing cells filled with 0). Codes whose column standard
    deviation is below ``threshold`` are discarded; a code whose standard
    deviation is NaN is kept, because only ``std < threshold`` removes a code.

    The standard deviation is computed once, and the input frame is not
    modified: the result is a new frame sorted by ``enrolid``.

    Parameters
    ----------
    outpatient_concat_final : pandas.DataFrame
        Must contain ``enrolid``, ``svcdate``, ``dx_truncated`` and a numeric
        ``count`` column.
    threshold : float, optional
        Exclusive lower bound on the per-code standard deviation.

    Returns
    -------
    pandas.DataFrame
        The surviving rows, sorted by ``enrolid`` and keeping their original
        index labels.
    """
    print('Before removing: No of Concepts ',outpatient_concat_final['dx_truncated'].nunique())
    outpatient_pivot = outpatient_concat_final.pivot_table(
        index=['enrolid', 'svcdate'], columns='dx_truncated', values='count'
    ).fillna(0)

    code_std = outpatient_pivot.std()
    kept_codes = code_std[~(code_std < threshold)].index

    keep_row = outpatient_concat_final['dx_truncated'].isin(kept_codes)
    outpatient_concat_final_2 = outpatient_concat_final[keep_row].sort_values('enrolid')
    print('After removing: No of Concepts ',outpatient_concat_final_2['dx_truncated'].nunique())

    return outpatient_concat_final_2

In [8]:
def call_to_function(patient_cohort,outpatient_claims):
    """Run the diagnosis-code stages and write the intermediate CSV.

    Steps, in order: join claims with the cohort, stack and truncate the
    diagnosis columns, count codes, drop low-frequency codes, add a ``count``
    column holding the number of non-null ``age`` values per
    ``(enrolid, svcdate, dx_truncated)`` group, and write the result to
    ``outpatient_concat_final_bigfile.csv``. Nothing is returned.
    """
    gc.collect()
    claims_joined = join_with_hfref(patient_cohort, outpatient_claims)
    claims_long = preprocess(claims_joined)
    gc.collect()
    code_counts = calculate_clusters(claims_long)
    frequent_claims = remove_low_frequency(claims_long, code_counts)
    gc.collect()

    group_keys = ['enrolid', 'svcdate', 'dx_truncated']
    frequent_claims['count'] = frequent_claims.groupby(group_keys)['age'].transform('count')
    frequent_claims.to_csv('outpatient_concat_final_bigfile.csv', sep=',')

    del frequent_claims
    gc.collect()

In [9]:
def main():
    """Load the cohort and claims CSVs, then run both pipeline stages.

    The cohort's column names are lower-cased before use.
    """
    cohort = pd.read_csv("hfref_pt_index.csv", low_memory=False)
    cohort.columns = [str.lower(name) for name in cohort.columns]
    claims = pd.read_csv("concepts_data/patient_cohort_outpatient_claims.csv", low_memory=False)
    gc.collect()
    call_to_function(cohort, claims)
    pivot_outpatient_remove_low_variance()


if __name__ == "__main__":
    main()

     dx_truncated  enrolid_count
224           250        2204069
1251          E11        1999938
1570          I50        1917985
1909          N18        1890688
371           401        1884651
Total: No of Concepts  2709
Total: code occurences: 72323940
     dx_truncated  enrolid_count  occur_rate
224           250        2204069    0.030475
1251          E11        1999938    0.027653
1570          I50        1917985    0.026519
1909          N18        1890688    0.026142
371           401        1884651    0.026058
     dx_truncated  enrolid_count  occur_rate
224           250        2204069    0.030475
1251          E11        1999938    0.027653
1570          I50        1917985    0.026519
1909          N18        1890688    0.026142
371           401        1884651    0.026058
Final total codes:  (1896, 3)
      enrolid     svcdate indexdate dx_truncated  sex  age  region  plantyp  \
0  2845932202  2016-03-11    4/5/16          L89    1   63       2      5.0   
1   607591401

HBox(children=(IntProgress(value=0, description='Removing Low Variance step by step', max=24, style=ProgressSt…

For Year:  2016
Before removing: No of Concepts  1162
After removing: No of Concepts  41
For Year:  2013
Before removing: No of Concepts  793
After removing: No of Concepts  33
For Year:  2015
Before removing: No of Concepts  1885
After removing: No of Concepts  37
For Year:  2011
Before removing: No of Concepts  783
After removing: No of Concepts  18
For Year:  2017
Before removing: No of Concepts  1143
After removing: No of Concepts  39
For Year:  2014
Before removing: No of Concepts  794
After removing: No of Concepts  38
For Year:  2008
Before removing: No of Concepts  772
After removing: No of Concepts  11
For Year:  2005
Before removing: No of Concepts  751
After removing: No of Concepts  9
For Year:  2012
Before removing: No of Concepts  788
After removing: No of Concepts  33
For Year:  2010
Before removing: No of Concepts  780
After removing: No of Concepts  15
For Year:  2004
Before removing: No of Concepts  744
After removing: No of Concepts  8
For Year:  2002
Before removing

HBox(children=(IntProgress(value=0, description='Concatenating Tables', max=24, style=ProgressStyle(descriptio…


Ended Concatenation: 
          enrolid    svcdate dx_truncated            StaticVar
27751237   927701 2018-06-20          I50  1, 79, 2, 6.0, 3, 1
60745859   927701 2018-03-06          I10  1, 79, 2, 6.0, 3, 1
45774794   927701 2018-03-06          E11  1, 79, 2, 6.0, 3, 1
58588274   927701 2018-02-16          J44  1, 79, 2, 6.0, 3, 1
60685546   927701 2018-05-01          I25  1, 79, 2, 6.0, 3, 1


## Procedure-code decodes for `proctyp` 1 and 7

In [30]:
# Load both inputs, lower-case the cohort's column names, and left-merge the
# cohort onto the claims by `enrolid` so every claim row is kept.
outpatient_claims = pd.read_csv("concepts_data/patient_cohort_outpatient_claims.csv", low_memory=False)
patient_cohort = pd.read_csv("hfref_pt_index.csv", low_memory=False)
patient_cohort.columns = [str.lower(name) for name in patient_cohort.columns]
outpatient_claims_joined = outpatient_claims.merge(patient_cohort, how="left", on="enrolid")


In [4]:
# `outpatient_claims_joined` is already built by the load/merge cell directly
# above. This cell used to repeat the same two CSV reads and the same merge
# verbatim, which only doubled load time and peak memory on a full run, so it
# now just inspects the merged frame.
outpatient_claims_joined.shape


(33408115, 22)

In [None]:
# NOTE(review): bare tuple literal with no effect on any variable. It looks
# like a pasted `.shape` result and disagrees with the (33408115, 22) displayed
# for the merged frame above — TODO confirm and remove this cell.
(33408115, 19)
In [31]:


In [31]:
# Rows with proctyp '1' and a non-missing proc1, selected with one combined
# mask so no intermediate frame is built.
subset = outpatient_claims_joined[
    (outpatient_claims_joined['proctyp'] == '1') & outpatient_claims_joined['proc1'].notna()
]

# Split by the letters found in proc1: codes containing none of F/T/U are
# converted to integers below; codes containing F are handled separately.
# Explicit copies are taken because both frames get new columns assigned in
# later cells, which would otherwise be writes onto filtered slices.
subset_cat1 = subset[~subset['proc1'].str.contains('F|T|U')].copy()
subset_cat2 = subset[subset['proc1'].str.contains('F')].copy()
subset_cat1['proc1'] = subset_cat1['proc1'].astype(int)

In [32]:
# (low, high, label) with both ends inclusive; a proc1 value outside every
# range keeps the empty-string default.
PROC1_CAT1_RANGES = [
    (0, 9999, 'Anesthesia Services'),
    (10000, 19999, 'Integumentary System'),
    (20000, 29999, 'Musculoskeletal System'),
    (30000, 39999, 'Respiratory, Cardiovascular, Hemic, and Lymphatic System'),
    (40000, 49999, 'Digestive System'),
    (50000, 59999, 'Urinary, Male Genital, Female Genital, Maternity Care, and Delivery System'),
    (60000, 69999, 'Endocrine, Nervous, Eye and Ocular Adnexa, Auditory System'),
    (70000, 79999, 'Radiology Services'),
    (80000, 89999, 'Pathology and Laboratory Services'),
    (90000, 99999, 'Evaluation & Management Services'),
]

subset_cat1['Proc1_decodes'] = ''
for _low, _high, _label in PROC1_CAT1_RANGES:
    _in_range = (subset_cat1['proc1'] >= _low) & (subset_cat1['proc1'] <= _high)
    subset_cat1.loc[_in_range, 'Proc1_decodes'] = _label

In [33]:
def truncate_proc_code(proc_code, length=4):
    """Return the first ``length`` characters of ``str(proc_code)``.

    Deliberately named differently from ``truncate_code`` (the 3-character
    helper defined earlier for diagnosis codes): redefining that name here
    with a 4-character width changed what ``preprocess`` produced whenever it
    was run after this cell.
    """
    return str(proc_code)[:length]


# Leading four characters of each proc1 code, converted to an integer.
subset_cat2['proc1_int_part'] = subset_cat2['proc1'].map(truncate_proc_code).astype(int)

In [34]:
# (low, high, label) with both ends inclusive, matched against the integer
# prefix in `proc1_int_part`; values outside every range keep ''.
PROC1_CAT2_RANGES = [
    (1, 15, 'Composite measures'),
    (500, 575, 'Patient management'),
    (1000, 1220, 'Patient history'),
    (2000, 2050, 'Physical examination'),
    (3006, 3573, 'Diagnostic/screening processes or results'),
    (4000, 4306, 'Therapeutic, preventive or other interventions'),
    (5005, 5100, 'Follow-up or other outcomes'),
    (6005, 6045, 'Patient safety'),
    (7010, 7025, 'Structural Measures'),
]

subset_cat2['Proc1_decodes'] = ''
for _low, _high, _label in PROC1_CAT2_RANGES:
    _in_range = (subset_cat2['proc1_int_part'] >= _low) & (subset_cat2['proc1_int_part'] <= _high)
    subset_cat2.loc[_in_range, 'Proc1_decodes'] = _label

In [35]:
# Rows with proctyp '7' and a non-missing proc1, selected with one combined
# mask. An explicit copy is taken because the next cell assigns a new column
# to this frame, which would otherwise be a write onto a filtered slice.
subset_hcpc = outpatient_claims_joined[
    (outpatient_claims_joined['proctyp'] == '7') & outpatient_claims_joined['proc1'].notna()
].copy()

In [36]:
# (leading letter, label); a proc1 code starting with any other character
# keeps the empty-string default.
HCPC_PREFIX_LABELS = [
    ('A', 'Transportation Services Including Ambulance, Medical & Surgical Supplies'),
    ('B', 'Enteral and Parenteral Therapy'),
    ('C', 'Temporary Codes for Use with Outpatient Prospective Payment System'),
    ('E', 'Durable Medical Equipment (DME)'),
    ('G', 'Procedures/Professional Services (Temporary Codes)'),
    ('H', 'Alcohol and Drug Abuse Treatment Services / Rehabilitative Services'),
    ('J', 'Drugs administered other than oral method, chemotherapy drugs'),
    ('K', "Durable Medical Equipment for Medicare Administrative Contractors (DME MACs)"),
    ('L', 'Orthotic and Prosthetic Procedures, Devices'),
    ('M', 'Medical services'),
    ('P', 'Pathology and Laboratory Services'),
    ('Q', 'Miscellaneous Services (Temporary Codes)'),
    ('R', 'Diagnostic Radiology Services'),
    ('S', 'Commercial Payers (Temporary Codes)'),
    ('T', 'Established for State Medical Agencies'),
    ('V', 'Vision, Hearing and Speech-Language Pathology Services'),
]

subset_hcpc['Proc1_decodes'] = ''
for _prefix, _label in HCPC_PREFIX_LABELS:
    _starts_with_prefix = subset_hcpc['proc1'].str.startswith(_prefix)
    subset_hcpc.loc[_starts_with_prefix, 'Proc1_decodes'] = _label

In [37]:
# Stack the three decoded subsets and keep only rows that received a label.
final_subset = pd.concat([subset_cat1, subset_cat2, subset_hcpc])
final_df = final_subset[final_subset['Proc1_decodes'].ne('')]

# Rows per decoded procedure group, most frequent first.
outpatient_clustered_procs = (
    final_df[['enrolid', 'Proc1_decodes']]
    .groupby(['Proc1_decodes'])
    .count()
    .reset_index()
    .sort_values(by='enrolid', ascending=False)
    .rename(columns={"enrolid": "enrolid_count"})
)

In [38]:
# Share of all labelled rows that each procedure group accounts for.
total_occur = outpatient_clustered_procs['enrolid_count'].sum()
print ("Total: No of Concepts ", outpatient_clustered_procs.shape[0])
print ("Total: code occurences:", total_occur)
outpatient_clustered_procs['occur_rate'] = outpatient_clustered_procs['enrolid_count'].div(total_occur)
print (outpatient_clustered_procs.head())

# Groups whose row count is strictly above the 30th percentile of all counts.
_count_cutoff = outpatient_clustered_procs['enrolid_count'].quantile(.30)
frequent_proc_groups = outpatient_clustered_procs[outpatient_clustered_procs['enrolid_count'] > _count_cutoff]
print (frequent_proc_groups.head())
print ("Final total codes: ", frequent_proc_groups.shape)

# Claim rows belonging to one of the surviving groups.
outpatient_clustered_final_procs = final_df[final_df['Proc1_decodes'].isin(frequent_proc_groups['Proc1_decodes'])]
print(outpatient_clustered_final_procs.head())
    

Total: No of Concepts  34
Total: code occurences: 30927641
                                        Proc1_decodes  enrolid_count  \
13                   Evaluation & Management Services       13391911   
20                  Pathology and Laboratory Services        6067066   
26                                 Radiology Services        2306562   
31  Transportation Services Including Ambulance, M...        1547134   
25  Procedures/Professional Services (Temporary Co...        1529268   

    occur_rate  
13    0.433008  
20    0.196170  
26    0.074579  
31    0.050024  
25    0.049447  
                                        Proc1_decodes  enrolid_count  \
13                   Evaluation & Management Services       13391911   
20                  Pathology and Laboratory Services        6067066   
26                                 Radiology Services        2306562   
31  Transportation Services Including Ambulance, M...        1547134   
25  Procedures/Professional Services (Temporar

In [39]:
# Number of non-null `age` values per (enrolid, svcdate, Proc1_decodes) group.
outpatient_clustered_final_procs['count'] = outpatient_clustered_final_procs.groupby(['enrolid','svcdate','Proc1_decodes'])['age'].transform('count')
print('Before removing: No of Concepts ',outpatient_clustered_final_procs['Proc1_decodes'].nunique())
threshold = 0.5

# One row per (enrolid, svcdate), one column per procedure group; missing
# combinations are filled with 0.
outpatient_pivot_procs = outpatient_clustered_final_procs.pivot_table(index=['enrolid', 'svcdate'], columns='Proc1_decodes', values='count').fillna(0)

# Compute the per-group standard deviation once and drop groups below the
# threshold (a NaN deviation is not below it, so such a group is kept).
_proc_std = outpatient_pivot_procs.std()
outpatient_pivot_procs = outpatient_pivot_procs.drop(columns=_proc_std[_proc_std < threshold].index)
concepts_final_procs = outpatient_pivot_procs.columns.to_frame()

# Keep claim rows of the surviving groups; sort into a new frame instead of
# sorting a filtered slice in place.
outpatient_clustered_final_procs_2 = outpatient_clustered_final_procs[
    outpatient_clustered_final_procs['Proc1_decodes'].isin(concepts_final_procs['Proc1_decodes'])
].sort_values('enrolid')
print('After removing: No of Concepts ',outpatient_clustered_final_procs_2['Proc1_decodes'].nunique())
    

Before removing: No of Concepts  24
After removing: No of Concepts  6


In [40]:
# Collapse the static patient columns into one ', '-separated string per row,
# in the order sex, age, region, plantyp, datatyp, rx.
_static_columns = ['sex', 'age', 'region', 'plantyp', 'datatyp', 'rx']
_static_var = outpatient_clustered_final_procs_2[_static_columns[0]].astype(str)
for _column in _static_columns[1:]:
    _static_var = _static_var + ', ' + outpatient_clustered_final_procs_2[_column].astype(str)
outpatient_clustered_final_procs_2['StaticVar'] = _static_var


In [41]:
#outpatient_clustered_final_procs_2.drop(['coins','copay','deduct','dx1','dx2','dx3','dx4','pay','proc1','proc1_int_part','indexdate','sex' , 'age',  'region' ,'proctyp' ,'plantyp','datatyp',  'rx',  'count','worsen','worsening_dt','stdplac'],axis=1,inplace=True)

In [58]:
# Row count per surviving procedure group.
outpatient_clustered_final_procs_2['Proc1_decodes'].value_counts()

Evaluation & Management Services                                            13391911
Pathology and Laboratory Services                                            6067066
Radiology Services                                                           2306562
Transportation Services Including Ambulance, Medical & Surgical Supplies     1547134
Procedures/Professional Services (Temporary Codes)                           1529268
Drugs administered other than oral method, chemotherapy drugs                1508488
Name: Proc1_decodes, dtype: int64

In [None]:
# Keep only the columns that go into the exported table, then preview it.
_final_columns = ['enrolid', 'svcdate', 'Proc1_decodes', 'StaticVar']
outpatient_clustered_final_procs_2 = outpatient_clustered_final_procs_2[_final_columns]
outpatient_clustered_final_procs_2.head()


In [None]:
# Write the procedure table to disk. `to_csv` is called with its defaults, so
# the DataFrame index is written as an unnamed first column.
outpatient_clustered_final_procs_2.to_csv('outpatient_proc_codes_final.csv')