In [1]:
#import libraries
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

In [2]:
# load data
raw = pd.read_csv('diabetic_data.csv')

In [3]:
# using top raw features and target variable
top_features = ['num_lab_procedures', 'num_medications', 'time_in_hospital', 'age',
           'number_diagnoses', 'num_procedures', 'number_inpatient', 'number_outpatient',
            'number_emergency','gender','change','insulin','metformin', 'not_diabetes_related', 
                'diag_2','circulatory','readmitted']

raw = raw.loc[:, top_features]

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)


In [4]:
raw

Unnamed: 0,num_lab_procedures,num_medications,time_in_hospital,age,number_diagnoses,num_procedures,number_inpatient,number_outpatient,number_emergency,gender,change,insulin,metformin,not_diabetes_related,diag_2,circulatory,readmitted
0,41,1,1,[0-10),1,0,0,0,0,Female,No,No,No,,?,,NO
1,59,18,3,[10-20),9,0,0,0,0,Female,Ch,Up,No,,250.01,,>30
2,11,13,2,[20-30),6,5,1,2,0,Female,No,No,No,,250,,NO
3,44,16,2,[30-40),7,1,0,0,0,Male,Ch,Up,No,,250.43,,NO
4,51,8,1,[40-50),5,0,0,0,0,Male,Ch,Steady,No,,157,,NO
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101761,51,16,3,[70-80),9,0,0,0,0,Male,Ch,Down,Steady,,291,,>30
101762,33,18,5,[80-90),9,3,1,0,0,Female,No,Steady,No,,276,,NO
101763,53,9,1,[70-80),13,0,0,1,0,Male,Ch,Down,Steady,,590,,NO
101764,45,21,10,[80-90),9,2,1,0,0,Female,Ch,Up,No,,285,,NO


In [5]:
# leaving 10% of data in holdout set
hold = raw.sample(frac=0.1, replace=True, random_state=1)
df = raw.sample(frac=0.9, replace=True, random_state=1)

In [6]:
# preprocessing raw data
df['insulin'] = df['insulin'].replace(['Steady','Up','Down'],1)
df['insulin'] = df['insulin'].replace('No', 0)

df['change']=df.loc[:,('change')].replace('No', 0)
df['change']=df.loc[:,('change')].replace('Ch', 1)

for i in range(0,10):
    df['age'] = df['age'].replace('[' + str(i*10) + '-' + str(10*(i+1)) + ')', i+1)

df = df[~df['gender'].str.contains('Unknown/Invalid')]
gender_d=pd.get_dummies(df['gender'],prefix='gender')
df=pd.concat([df,gender_d],axis=1)
df['Female'] = df.gender_Female
df['gender']=df['gender'].replace('Male',0)
df['gender']=df['gender'].replace('Female',1)
df.drop(['gender','gender_Male','gender_Female'],axis=1,inplace=True)

df['metformin'] = df['metformin'].replace(['Steady','Up','Down'],1)
df['metformin'] = df['metformin'].replace('No', 0)

def map_diag(diag_code):
    """ 
    Mapping diagnosis ID code to disease/disorder
    :param diag_code: number
    :return: category for a diagnosis
    """
    if "V" in str(diag_code) or "E" in str(diag_code):
        diag_category='external injury and supplemental'
    elif float(diag_code) is 0:
        diag_category='N/A'
    elif float(diag_code) < 140:
        diag_category='infectious and parasitic'
    elif float(diag_code) >= 140 and float(diag_code) < 240:
        diag_category='neoplasms'
    elif float(diag_code) >= 240 and float(diag_code) < 249:
        diag_category='thyroid'
    elif float(diag_code) >= 249 and float(diag_code) < 260:
        diag_category='diabetes'
    elif float(diag_code) >= 260 and float(diag_code) < 280:
        diag_category='nutritional, metabolic, immunity'
    elif float(diag_code) >= 280 and float(diag_code) < 290:
        diag_category='blood'
    elif float(diag_code) >= 290 and float(diag_code) < 320:
        diag_category='mental'
    elif float(diag_code) >= 320 and float(diag_code) <390:
        diag_category='nervous'
    elif float(diag_code) >= 390 and float(diag_code) < 460:
        diag_category='circulatory'
    elif float(diag_code) >= 460 and float(diag_code) < 520:
        diag_category='respiratory'
    elif float(diag_code) >= 520 and float(diag_code) < 580:
        diag_category='digestive'
    elif float(diag_code) >= 580 and float(diag_code) < 630:
        diag_category='genitourinary'
    elif float(diag_code) >= 630 and float(diag_code) < 680:
        diag_category='pregnancy'
    elif float(diag_code) >= 680 and float(diag_code) < 710:
        diag_category='skin'
    elif float(diag_code) >= 710 and float(diag_code) < 740:
        diag_category='musculoskeletal'
    elif float(diag_code) >= 740 and float(diag_code) < 760:
        diag_category='congenital'
    elif float(diag_code) >= 760 and float(diag_code) < 780:
        diag_category='perinatal'
    elif float(diag_code) >= 780 and float(diag_code) < 800:
        diag_category='symptoms'
    else: 
        diag_category='injury and poisoning'
    return diag_category

df['diag_2'] = df['diag_2'].replace('?', 0).apply(map_diag)

def function_circulatory(x):
    if x == 'circulatory':
        return 1
    else:
        return 0

df['circulatory'] = df['diag_2'].apply(function_circulatory)
diag_2_d=pd.get_dummies(df['diag_2'],prefix='diag2')
df=pd.concat([df,diag_2_d],axis=1)

df['readmitted'] = df.loc[:,('readmitted')].replace('NO', 0)
df['readmitted'] = df.loc[:,('readmitted')].replace(['>30','<30'], 1)

#creating new column that groups diseases related to diabetes
def not_related (x):
    if x in related_diseases:
        return 0
    else:
        return 1

related_diseases = ['diabetes','musculoskeletal',
                   'thyroid','skin','nutritional, metabolic, immunity']

df['not_diabetes_related'] = df['diag_2'].apply(not_related)

df.drop(['diag2_blood', 'diag2_circulatory', 'diag2_congenital', 'diag2_diabetes', 'diag2_digestive',
       'diag2_external injury and supplemental', 'diag2_genitourinary','diag2_infectious and parasitic', 
        'diag2_injury and poisoning', 'diag2_mental', 'diag2_musculoskeletal', 'diag2_neoplasms',
       'diag2_nervous', 'diag2_nutritional, metabolic, immunity','diag2_pregnancy', 'diag2_respiratory', 
        'diag2_skin', 'diag2_symptoms','diag2_thyroid'],axis=1,inplace=True)

df.drop(['diag_2'],axis=1,inplace=True)

In [7]:
df

Unnamed: 0,num_lab_procedures,num_medications,time_in_hospital,age,number_diagnoses,num_procedures,number_inpatient,number_outpatient,number_emergency,change,insulin,metformin,not_diabetes_related,circulatory,readmitted,Female
98539,2,13,2,9,9,0,1,0,0,0,1,0,0,0,1,1
77708,49,9,1,9,5,0,0,0,0,1,0,1,1,0,0,1
5192,2,20,4,4,5,0,2,0,0,0,1,0,1,0,1,1
98047,32,22,2,7,9,0,0,0,1,1,1,1,1,1,1,0
50057,31,15,1,6,4,5,0,0,0,0,0,0,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2823,62,7,2,5,3,0,0,0,0,0,1,0,0,0,1,1
6356,83,22,9,9,9,4,1,0,0,0,1,0,1,0,1,1
31695,30,20,3,7,4,2,0,2,0,0,0,1,0,0,0,1
553,36,5,4,8,5,0,0,0,0,0,1,0,1,1,1,1


In [8]:
# train data using xgboost
features = df.drop('readmitted',axis=1)
target = df.readmitted
X_train,X_test,y_train,y_test=train_test_split(features,target,test_size=0.20)
 
xg = XGBClassifier(objective="binary:logistic", learning_rate=0.1, n_estimators=150, max_depth=8,
                min_child_weight=3, gamma=0.2, subsample=0.9, colsample_bytree=0.7)
xg.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.7, gamma=0.2,
              learning_rate=0.1, max_delta_step=0, max_depth=8,
              min_child_weight=3, missing=None, n_estimators=150, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=0.9, verbosity=1)

In [20]:
def predict_readmission(data_dict):
        """
        Predicting the readmission status using input from user
        :param data_dict: data_dict of a patient
        :return: Prediction probability for readmission
        """
        
        # convert data_dict back into dataframe for preprocessing
        data = pd.DataFrame(data_dict, index=[1])
        
        # preprocess raw categorical data
        data['insulin'] = data['insulin'].replace(['Steady','Up','Down'],1)
        data['insulin'] = data['insulin'].replace('No', 0)
        
        data['change']=data.loc[:,('change')].replace('No', 0)
        data['change']=data.loc[:,('change')].replace('Ch', 1)

        for i in range(0,10):
            data['age'] = data['age'].replace('[' + str(i*10) + '-' + str(10*(i+1)) + ')', i+1)

        data = data[~data['gender'].str.contains('Unknown/Invalid')]
        
        data['gender']=data['gender'].replace('Male',0)
        data['gender']=data['gender'].replace('Female',1)
        data['Female'] = data['gender']
        data.drop(['gender'],axis=1,inplace=True)
        
        data['metformin'] = data['metformin'].replace(['Steady','Up','Down'],1)
        data['metformin'] = data['metformin'].replace('No', 0)
        
        #data['readmitted'] = data.loc[:,('readmitted')].replace('NO', 0)
        #data['readmitted'] = data.loc[:,('readmitted')].replace(['>30','<30'], 1)

        def map_diag(diag_code):
            """
            Mapping diagnosis ID code to disease/disorder
            :param diag_code: number
            :return: category for a diagnosis
            """
            if "V" in str(diag_code) or "E" in str(diag_code):
                diag_category='external injury and supplemental'
            elif float(diag_code) is 0:
                diag_category='N/A'
            elif float(diag_code) < 140:
                diag_category='infectious and parasitic'
            elif float(diag_code) >= 140 and float(diag_code) < 240:
                diag_category='neoplasms'
            elif float(diag_code) >= 240 and float(diag_code) < 249:
                diag_category='thyroid'
            elif float(diag_code) >= 249 and float(diag_code) < 260:
                diag_category='diabetes'
            elif float(diag_code) >= 260 and float(diag_code) < 280:
                diag_category='nutritional, metabolic, immunity'
            elif float(diag_code) >= 280 and float(diag_code) < 290:
                diag_category='blood'
            elif float(diag_code) >= 290 and float(diag_code) < 320:
                diag_category='mental'
            elif float(diag_code) >= 320 and float(diag_code) <390:
                diag_category='nervous'
            elif float(diag_code) >= 390 and float(diag_code) < 460:
                diag_category='circulatory'
            elif float(diag_code) >= 460 and float(diag_code) < 520:
                diag_category='respiratory'
            elif float(diag_code) >= 520 and float(diag_code) < 580:
                diag_category='digestive'
            elif float(diag_code) >= 580 and float(diag_code) < 630:
                diag_category='genitourinary'
            elif float(diag_code) >= 630 and float(diag_code) < 680:
                diag_category='pregnancy'
            elif float(diag_code) >= 680 and float(diag_code) < 710:
                diag_category='skin'
            elif float(diag_code) >= 710 and float(diag_code) < 740:
                diag_category='musculoskeletal'
            elif float(diag_code) >= 740 and float(diag_code) < 760:
                diag_category='congenital'
            elif float(diag_code) >= 760 and float(diag_code) < 780:
                diag_category='perinatal'
            elif float(diag_code) >= 780 and float(diag_code) < 800:
                diag_category='symptoms'
            else: 
                diag_category='injury and poisoning'
            return diag_category

        data['diag_2'] = data['diag_2'].replace('?', 0).apply(map_diag)

        def function_circulatory(x):
            if x == 'circulatory':
                return 1
            else:
                return 0

        data['circulatory'] = data['diag_2'].apply(function_circulatory)

        #creating new column that groups diseases related to diabetes
        related_diseases = ['diag2_diabetes','diag2_musculoskeletal',
                           'diag2_thyroid','diag2_skin','diag2_nutritional, metabolic, immunity']

        def not_related (x):
            if x in related_diseases:
                return 0
            else:
                return 1

        data['not_diabetes_related'] = data['diag_2'].apply(not_related)

        data.drop(['diag_2'],axis=1,inplace=True)
        
        prediction = xg.predict_proba(data)
        return prediction * 100

In [10]:
# removing 'readmitted' column in holdout set
test = hold.drop('readmitted', axis=1)
test.head()

Unnamed: 0,num_lab_procedures,num_medications,time_in_hospital,age,number_diagnoses,num_procedures,number_inpatient,number_outpatient,number_emergency,gender,change,insulin,metformin,not_diabetes_related,diag_2,circulatory
98539,2,13,2,[80-90),9,0,1,0,0,Female,No,Steady,No,,707,
77708,49,9,1,[80-90),5,0,0,0,0,Female,Ch,No,Steady,,599,
5192,2,20,4,[30-40),5,0,2,0,0,Female,No,Steady,No,,305,
98047,32,22,2,[60-70),9,0,0,0,1,Male,Ch,Steady,Steady,,428,
50057,31,15,1,[50-60),4,5,0,0,0,Female,No,No,No,,403,


In [16]:
hold.sample(5)

Unnamed: 0,num_lab_procedures,num_medications,time_in_hospital,age,number_diagnoses,num_procedures,number_inpatient,number_outpatient,number_emergency,gender,change,insulin,metformin,not_diabetes_related,diag_2,circulatory,readmitted
97357,42,13,2,[60-70),6,0,1,1,0,Male,No,No,No,,197,,<30
49304,67,30,13,[70-80),9,3,0,0,0,Female,No,No,No,,491,,NO
38611,31,54,14,[70-80),9,6,0,0,0,Female,Ch,Up,No,,780,,NO
7808,67,9,3,[40-50),9,0,0,0,0,Male,No,No,No,,250.41,,>30
66605,64,20,6,[50-60),9,0,0,2,0,Female,No,No,No,,V42,,>30


In [12]:
# showing readmission true answer
example_hold = hold.loc[70148,:].to_dict()
example_hold

{'num_lab_procedures': 9,
 'num_medications': 27,
 'time_in_hospital': 4,
 'age': '[70-80)',
 'number_diagnoses': 7,
 'num_procedures': 2,
 'number_inpatient': 0,
 'number_outpatient': 0,
 'number_emergency': 0,
 'gender': 'Female',
 'change': 'No',
 'insulin': 'No',
 'metformin': 'No',
 'not_diabetes_related': nan,
 'diag_2': '427',
 'circulatory': nan,
 'readmitted': 'NO'}

In [13]:
example_test = test.loc[70148,:].to_dict()

In [21]:
# test function to see if it matches true answer
print(predict_readmission(example_test))

[[66.259636 33.740364]]
