In [34]:
import pandas as pd
import numpy as np
# Load the raw encounter table. The In [35] cell works on a filtered copy of it
# (`datac`); the In [7] cell at the bottom re-reads the same file.
# NOTE(review): later cells compare columns against the literal string 'None'
# (e.g. `datac[glu] == 'None'`). Newer pandas releases reportedly treat 'None'
# as a missing-value marker in read_csv by default -- confirm against the
# installed pandas version before relying on those comparisons.
data = pd.read_csv('data/diabetis/diabetic_data.csv')

def get_vals_to_encode(in_series, freq_thresh):
    """Return the values of ``in_series`` that deserve their own dummy column.

    A value is kept when its number of occurrences is strictly greater than
    ``freq_thresh`` and strictly smaller than ``n_rows - freq_thresh``, i.e.
    when neither the value nor its complement is rare.

    Parameters
    ----------
    in_series : pandas.Series
        Column to inspect.
    freq_thresh : int
        Minimum count required both for the value and for "everything else".

    Returns
    -------
    list
        The kept values, in ``value_counts()`` order (most frequent first).
    """
    n_rows = in_series.shape[0]
    # An earlier variant special-cased columns with <= 4 distinct values (keeping
    # only the most frequent one); that branch was disabled, so every column
    # goes through the same two-sided frequency filter.
    # Series.iteritems() was removed in pandas 2.0; items() yields the same pairs.
    return [val for val, val_count in in_series.value_counts().items()
            if freq_thresh < val_count < (n_rows - freq_thresh)]

def my_encoder(in_df, freq_thresh=1000):
    """Dummy-encode the sufficiently frequent values of every column of ``in_df``.

    For each column, the values selected by ``get_vals_to_encode`` each become
    one 0/1 integer column named ``<column>_<value>``.

    Parameters
    ----------
    in_df : pandas.DataFrame
        Columns to encode.
    freq_thresh : int, default 1000
        Frequency threshold forwarded to ``get_vals_to_encode``.

    Returns
    -------
    pandas.DataFrame
        One indicator column per kept (column, value) pair; an empty frame
        when no value passes the threshold.
    """
    out_df = pd.DataFrame()
    # DataFrame.iteritems() was removed in pandas 2.0; items() yields the same pairs.
    for cname, cdata in in_df.items():
        for val in get_vals_to_encode(cdata, freq_thresh):
            # str() keeps name building working for non-string labels/values
            # (plain `cname + '_' + val` raises TypeError for e.g. integers).
            new_name = str(cname) + '_' + str(val)
            out_df[new_name] = (cdata == val).astype(int)
    return out_df


In [35]:
sample_size = 10000

# Encounters with one of these discharge dispositions cannot be re-admitted,
# so they are excluded; '?' placeholders become NaN afterwards.
no_readmission_dispositions = [11,13,14,19,20,21]
can_be_readmitted = ~data.discharge_disposition_id.isin(no_readmission_dispositions)
datac = data.copy().loc[can_be_readmitted].replace('?',np.nan)

# These columns are cast to strings so they are not mixed up with the numerical ones.
# Note that astype('str') also turns NaN into the literal string 'nan'.
columns_convert_to_categorical = ['encounter_id', 'patient_nbr', 'payer_code','medical_specialty','admission_type_id',
                   'discharge_disposition_id','admission_source_id']
datac = datac.astype({column: 'str' for column in columns_convert_to_categorical})

# Fixed-seed random subset of the filtered table.
datac_sample = datac.sample(n=sample_size, random_state=2)

id_column = datac['encounter_id']
id_column_sample = datac_sample['encounter_id']

# The outcome variable is encoded as ordinal: NO -> 0, >30 -> 1, <30 -> 2.
outcome_labels = ['NO','>30','<30']
outcome_ranks = [0,1,2]
readmitted = datac['readmitted'].replace(to_replace=outcome_labels,value=outcome_ranks)
readmitted_sample = datac_sample['readmitted'].replace(to_replace=outcome_labels,value=outcome_ranks)

# One outcome value per line, without header or index.
readmitted.to_csv('data/diabetis/class_sequence.txt',sep='\t',index=False,header=False)
readmitted_sample.to_csv('data/diabetis/class_sequence_sample.txt',sep='\t',index=False,header=False)

# Accumulators for the human-readable annotation columns filled in by later cells.
annotation_data, annotation_data_sample = pd.DataFrame(), pd.DataFrame()

In [36]:
# Original numerical columns + age

# Columns with a numeric dtype, for the full table and for the sample.
original_numeric, original_numeric_sample = (
    frame.select_dtypes(include='number') for frame in (datac, datac_sample)
)

# Age brackets '[0-10)', '[10-20)', ..., '[90-100)' are replaced by their
# decade index 0..9.
age_id = {'[%d-%d)' % (10 * decade, 10 * decade + 10): decade for decade in range(10)}
age = datac['age'].replace(age_id)
age_sample = datac_sample['age'].replace(age_id)

In [37]:
# Simple categorical columns

simple_categorical = ['change','diabetesMed']
# The 'No' indicator of each column is removed after encoding.
baseline_columns = ['change_No', 'diabetesMed_No']

simple_cats = pd.DataFrame()
simple_cats_sample = pd.DataFrame()

for cat in simple_categorical:
    # The sample uses a threshold ten times smaller than the full table.
    full_dummies = my_encoder(datac[[cat]],freq_thresh=1000)
    sample_dummies = my_encoder(datac_sample[[cat]],freq_thresh=100)
    simple_cats = pd.concat([simple_cats, full_dummies],axis=1)
    simple_cats_sample = pd.concat([simple_cats_sample, sample_dummies],axis=1)

simple_cats = simple_cats.drop(columns=baseline_columns)
simple_cats_sample = simple_cats_sample.drop(columns=baseline_columns)

In [38]:
# Admission/disposition categoricals:
# 'medical_specialty', 'admission_type_id', 'discharge_disposition_id', 'admission_source_id'.
# Each one is dummy-encoded with its own frequency threshold; the thresholds
# used for the sample are one tenth of those used for the full table.

admtype = my_encoder(datac[['admission_type_id']].astype('str'),freq_thresh=3000)
med_spec = my_encoder(datac[['medical_specialty']].astype('str'),freq_thresh=3000)
dischdisp = my_encoder(datac[['discharge_disposition_id']].astype('str'),freq_thresh=10000)
admsource = my_encoder(datac[['admission_source_id']].astype('str'),freq_thresh=10000)

admtype_sample = my_encoder(datac_sample[['admission_type_id']].astype('str'),freq_thresh=300)
med_spec_sample = my_encoder(datac_sample[['medical_specialty']].astype('str'),freq_thresh=300)
dischdisp_sample = my_encoder(datac_sample[['discharge_disposition_id']].astype('str'),freq_thresh=1000)
admsource_sample = my_encoder(datac_sample[['admission_source_id']].astype('str'),freq_thresh=1000)

# The former `admdisp = pd.DataFrame()` placeholders were overwritten here
# unconditionally, so the frames are simply built in one step.
admdisp = pd.concat([med_spec,admtype,dischdisp,admsource], axis=1)
admdisp_sample = pd.concat([med_spec_sample,admtype_sample,dischdisp_sample,admsource_sample], axis=1)

# Readable labels for the admission type codes (stored as strings in datac).
# This mapping used to be bound to the name `dict`, which shadowed the builtin
# for every cell executed afterwards.
admission_type_labels = {'1':'Emergency','2':'Urgent','3':'Elective','4':'Newborn',
                         '5':'Other','6':'Other','7':'Trauma Center','8':'Other'}
admtype_annot = datac['admission_type_id'].replace(admission_type_labels)
admtype_annot_sample = datac_sample['admission_type_id'].replace(admission_type_labels)

annotation_data = pd.concat([annotation_data,admtype_annot],axis=1)
annotation_data_sample = pd.concat([annotation_data_sample,admtype_annot_sample],axis=1)

In [39]:
# Diagnosis columns: every raw code is collapsed into a coarse group label and
# the grouped columns are appended to the annotation tables.

diag_columns = ['diag_1','diag_2','diag_3']

diags = pd.DataFrame()
diags_sample = pd.DataFrame()


def _diag_group(code):
    """Map one raw diagnosis code (string or NaN) to its group label.

    Codes starting with 'V' or 'E' and missing values give 'Other'. Numeric
    codes are grouped by their integer part, so a fractional code such as
    '250.01' lands in the same group as '250'.

    Raises ValueError for a code that is neither V/E-prefixed, missing, nor
    parseable as a number.
    """
    code = str(code)
    # NaN stringifies to 'nan'
    if code.startswith(('V', 'E')) or code == 'nan':
        return 'Other'
    # The integer part is used for every comparison. Previously only the 250
    # check did so, which sent fractional codes like 459.x or 785.x to 'Other'.
    category = int(float(code))
    if category == 250:
        return 'Diabetis'
    if 390 <= category <= 459 or category == 785:
        return 'Circulatory'
    if 460 <= category <= 519 or category == 786:
        return 'Respiratory'
    if 520 <= category <= 579 or category == 787:
        return 'Digestive'
    if 800 <= category <= 999:
        return 'Injury'
    return 'Other'


for diag in diag_columns:
    # Series.iteritems() no longer exists in pandas 2.0+, and the old loop
    # rebuilt a replacement-dict entry for every row; mapping each value
    # through the helper gives the same labels directly.
    ddiag = datac[diag].map(_diag_group)
    ddiag_sample = datac_sample[diag].map(_diag_group)
    annotation_data = pd.concat([annotation_data,ddiag],axis=1)
    annotation_data_sample = pd.concat([annotation_data_sample,ddiag_sample],axis=1)

In [40]:
# Glucose measurements, encoded in two alternative ways

glucose_measurements = ['max_glu_serum','A1Cresult']

glucose_encoded = pd.DataFrame()
glucose_encoded_sample = pd.DataFrame()
glucose_rankencoded = pd.DataFrame()
glucose_rankencoded_sample = pd.DataFrame()
A1C_rankencoded = pd.DataFrame()
A1C_rankencoded_sample = pd.DataFrame()


# Variant 'DUMMY': one indicator column per frequent level.
for glu in glucose_measurements:
    encoded_columns = my_encoder(datac[[glu]],freq_thresh=1000)
    glucose_encoded = pd.concat([glucose_encoded, encoded_columns],axis=1)
    encoded_columns = my_encoder(datac_sample[[glu]],freq_thresh=100)
    glucose_encoded_sample = pd.concat([glucose_encoded_sample, encoded_columns],axis=1)

# Make the indicators cumulative: the higher level also sets the lower one.
for dummies in (glucose_encoded, glucose_encoded_sample):
    dummies.loc[(dummies['max_glu_serum_>300']==1),'max_glu_serum_>200']=1
    dummies.loc[(dummies['A1Cresult_>8']==1),'A1Cresult_>7']=1


# Variant 'USE_RANK_AND_MISSING': a 0/1 "not measured" flag plus an ordinal
# value column (NaN where the test result is 'None').
rank_of_level = {'None':np.nan,'Norm':0,'>200':1,'>300':2,'>7':1,'>8':2}
rank_targets = [
    ('A1Cresult', A1C_rankencoded, A1C_rankencoded_sample),
    ('max_glu_serum', glucose_rankencoded, glucose_rankencoded_sample),
]
for glu, ranked, ranked_sample in rank_targets:
    for target, source in ((ranked, datac), (ranked_sample, datac_sample)):
        target[glu+'_None'] = (source[glu] == 'None').astype(int)
        target[glu+'_val'] = source[glu].replace(rank_of_level)


In [41]:
# Medications: keep only the drugs for which more than 10% of the encounters
# fall outside the most frequent level, and derive two features per kept drug.

medication_columns = ['metformin', 'repaglinide', 'nateglinide',
       'chlorpropamide', 'glimepiride', 'acetohexamide', 'glipizide',
       'glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose',
       'miglitol', 'troglitazone', 'tolazamide', 'examide', 'citoglipton',
       'insulin', 'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone']

medications = pd.DataFrame()
medications_sample = pd.DataFrame()
medications_enc = pd.DataFrame()
medications_enc_sample = pd.DataFrame()

# <drug>_treatment: 1 whenever the drug is prescribed, whatever the dose change.
treatment_codes = {'No':0,'Steady':1,'Up':1,'Down':1}
# <drug>_treatment_change: direction of the dose change (-1 down, 0 none, +1 up).
treatment_change_codes = {'No':0,'Steady':0,'Up':1,'Down':-1}

for med in medication_columns:
    meds = datac[med].value_counts()
    # value_counts() is sorted by frequency, so .iloc[0] is the count of the most
    # frequent level. The former `meds[0]` relied on integer keys being treated
    # as positions on a string-labelled Series, which pandas has deprecated.
    # NOTE(review): this is only the "untreated" share if 'No' is the most
    # frequent level, as it is in the output recorded below.
    non_modal_share = 1-meds.iloc[0]/datac.shape[0]
    if non_modal_share>0.10:
        print(med,non_modal_share,'\n',meds,'\n')
        medications[med+'_treatment'] = datac[med].replace(treatment_codes)
        medications[med+'_treatment_change'] = datac[med].replace(treatment_change_codes)
        medications_sample[med+'_treatment'] = datac_sample[med].replace(treatment_codes)
        medications_sample[med+'_treatment_change'] = datac_sample[med].replace(treatment_change_codes)

        medications_enc = pd.concat([medications_enc,my_encoder(datac[[med]],freq_thresh=1000)],axis=1)
        # The sample threshold is scaled down by 10 like every other sample
        # encoding in this notebook; it previously fell back to the
        # full-table default of 1000.
        medications_enc_sample = pd.concat([medications_enc_sample,
                                            my_encoder(datac_sample[[med]],freq_thresh=100)],axis=1)

        annotation_data = pd.concat([annotation_data,datac[med]],axis=1)
        annotation_data_sample = pd.concat([annotation_data_sample,datac_sample[med]],axis=1)


metformin 0.1997523730912092 
 No        79499
Steady    18207
Up         1063
Down        574
Name: metformin, dtype: int64 

glipizide 0.12613873146572985 
 No        86812
Steady    11219
Up          764
Down        548
Name: glipizide, dtype: int64 

glyburide 0.10592593338232181 
 No        88820
Steady     9162
Up          801
Down        560
Name: glyburide, dtype: int64 

insulin 0.5331427478533968 
 No        46379
Steady    30069
Down      11908
Up        10987
Name: insulin, dtype: int64 



In [42]:
# Save the assembled datasets (all tab-separated, without the index column)

tsv_options = {'sep': '\t', 'index': False}

# Column groups shared by several of the exported tables.
leading = [id_column, original_numeric, age]
leading_sample = [id_column_sample, original_numeric_sample, age_sample]
trailing = [simple_cats, medications, readmitted]
trailing_sample = [simple_cats_sample, medications_sample, readmitted_sample]

### First, only numerical and age
print('Numerical columns:',original_numeric.columns)
numerical_df = pd.concat(leading + [readmitted], axis=1)
numerical_df.to_csv('data/diabetis/diabetic_numerical.txt', **tsv_options)
numerical_df_sample = pd.concat(leading_sample + [readmitted_sample], axis=1)
numerical_df_sample.to_csv('data/diabetis/diabetic_numerical_sample1.txt', **tsv_options)

### Only the encounters where A1C was measured (flag column removed afterwards)
df = pd.concat(leading + [A1C_rankencoded] + trailing, axis=1)
df_A1Cmeasured = df.loc[df['A1Cresult_None']==0].drop(columns=['A1Cresult_None'])
df_A1Cmeasured.to_csv('data/diabetis/diabete_A1C.txt', **tsv_options)
df_A1Cmeasured['readmitted'].to_csv('data/diabetis/class_sequence_A1C.txt', header=False, **tsv_options)

### Complete set of columns, glucose measurements as dummy variables
df = pd.concat(leading + [glucose_encoded] + trailing, axis=1)
df_sample = pd.concat(leading_sample + [glucose_encoded_sample] + trailing_sample, axis=1)
df.to_csv('data/diabetis/diabete.txt', **tsv_options)
df_sample.to_csv('data/diabetis/diabete_sample.txt', **tsv_options)

### Annotation files
adf = pd.concat([id_column, annotation_data], axis=1)
adf_sample = pd.concat([id_column_sample, annotation_data_sample], axis=1)
adf.to_csv('data/diabetis/diabete_annot.txt', **tsv_options)
adf_sample.to_csv('data/diabetis/diabete_annot_sample.txt', **tsv_options)

### Complete set of columns, with A1C encoded as ordinal variable
### (the rank-encoded max_glu_serum frames are not included)
df = pd.concat(leading + [A1C_rankencoded] + trailing, axis=1)
df_sample = pd.concat(leading_sample + [A1C_rankencoded_sample] + trailing_sample, axis=1)
df.to_csv('data/diabetis/diabete_glr.txt', **tsv_options)
df_sample.to_csv('data/diabetis/diabete_glr_sample.txt', **tsv_options)



Numerical columns: Index(['time_in_hospital', 'num_lab_procedures', 'num_procedures',
       'num_medications', 'number_outpatient', 'number_emergency',
       'number_inpatient', 'number_diagnoses'],
      dtype='object')


In [7]:
# code reused from https://www.kaggle.com/dansbecker/set-up-diabetes-data-for-modeling

import pandas as pd
data = pd.read_csv('data/diabetis/diabetic_data.csv')

# Outcome column, kept aside so it can be saved separately from the features.
readmited_col = data['readmitted']

# The outcome is excluded from the encoder input. It used to be encoded along
# with the other object columns and only `readmitted_NO` was removed afterwards,
# so any other outcome level frequent enough to be encoded (e.g. '>30', '<30')
# stayed in the saved feature table.
categoricals = data.select_dtypes(include=object).drop(columns=['readmitted'])
encoded_data = my_encoder(categoricals, 2500)
# NOTE(review): this rebinds `data` and `original_numeric`, which the dataset
# assembly cells above also use (there `original_numeric` comes from the
# filtered `datac`); re-running those cells after this one would pick up this
# unfiltered version.
original_numeric = data.select_dtypes(include='number')
output = pd.concat([original_numeric, encoded_data, readmited_col], axis=1)

# Binary view of the target: 1 for any readmission, 0 for 'NO'
# (same values as the former `1 - readmitted_NO`).
output['readmitted_binary'] = (output['readmitted'] != 'NO').astype(int)

# remove ID variables, and others that are hard to interpret because we don't have good labels
output = output.drop(columns=['encounter_id', 'patient_nbr', 'admission_type_id',
                              'discharge_disposition_id', 'admission_source_id'])

# Fixed-seed sample of 10000 rows; the classes file and the feature file below
# are both written from this sample, in the same row order.
output = output.sample(10000, random_state=1)
readmited = output['readmitted'].copy()
readmited.to_csv('data/diabetis/classes.csv', index=False)

# Neither form of the target belongs in the feature table.
output = output.drop(columns=['readmitted', 'readmitted_binary'])

output.to_csv('data/diabetis/diabetis.csv', index=True)