In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

df = pd.read_csv("dataset_diabetes/diabetes_data_preprocessed_original.csv")

df.shape
df.info()

df['readmitted'] = df['readmitted'].replace('>30', 2)
df['readmitted'] = df['readmitted'].replace('<30', 1)
df['readmitted'] = df['readmitted'].replace('NO', 0)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 91023 entries, 0 to 91022
Data columns (total 43 columns):
Unnamed: 0                  91023 non-null int64
encounter_id                91023 non-null int64
patient_nbr                 91023 non-null int64
race                        91023 non-null float64
gender                      91023 non-null int64
age                         91023 non-null float64
admission_type_id           91023 non-null int64
discharge_disposition_id    91023 non-null int64
admission_source_id         91023 non-null int64
time_in_hospital            91023 non-null float64
num_medications             91023 non-null float64
diag_1                      91023 non-null float64
diag_2                      91023 non-null float64
diag_3                      91023 non-null float64
number_diagnoses            91023 non-null float64
max_glu_serum               91023 non-null int64
A1Cresult                   91023 non-null int64
metformin                   91023 non-null

In [2]:
feature_set = ['encounter_id',                
'patient_nbr',                 
'race',                        
'gender',                      
'age',                         
'admission_type_id',           
'discharge_disposition_id',    
'admission_source_id',         
'time_in_hospital',            
'num_medications',             
'diag_1',                      
'diag_2',                      
'diag_3',                      
'number_diagnoses',            
'max_glu_serum',               
'A1Cresult',                   
'metformin',                   
'repaglinide',                 
'nateglinide',                 
'chlorpropamide',              
'glimepiride',                 
'acetohexamide',               
'glipizide',                   
'glyburide',                   
'tolbutamide',                 
'pioglitazone',                
'rosiglitazone',               
'acarbose',                    
'miglitol',                    
'troglitazone',                
'tolazamide',                  
'insulin',                     
'glyburide-metformin',         
'glipizide-metformin',         
'glimepiride-pioglitazone',    
'metformin-rosiglitazone',     
'metformin-pioglitazone',      
'change',                      
'diabetesMed',                 
'number_services',             
'num_total_procedures']

the_input = df[feature_set]
the_output = df['readmitted']

from imblearn.over_sampling import SMOTE
from collections import Counter

smt = SMOTE(random_state=20)
train_input_new, train_output_new = smt.fit_sample(the_input, the_output)
train_input_new = pd.DataFrame(train_input_new, columns = list(the_input.columns))

X_train, X_test, Y_train, Y_test = train_test_split(train_input_new, train_output_new, test_size=0.20, random_state=0)

In [3]:
print(X_train, X_test, Y_train, Y_test)
print(len(X_train), len(X_test), len(Y_train), len(Y_test))

        encounter_id   patient_nbr      race    gender       age  \
73051   2.513316e+08  9.052808e+07  0.174986  0.000000 -1.306313   
12069   5.363576e+07  4.182106e+07  0.174986  0.000000  1.201972   
2374    1.722436e+07  1.130331e+08  0.174986  0.000000  0.574901   
59432   1.844201e+08  8.997041e+07  0.174986  1.000000  1.201972   
137638  7.735902e+07  4.865508e+06 -0.063156  1.000000 -0.770271   
120547  5.995989e+07  6.550024e+05 -0.158706  0.000000 -0.796098   
130754  1.008520e+08  5.983323e+07  0.174986  0.657987 -0.068498   
17144   6.846376e+07  3.802266e+06  0.174986  0.000000 -0.679242   
9687    4.556339e+07  2.445660e+05  0.174986  0.000000 -0.679242   
141814  2.698650e+08  8.558346e+07  0.174986  1.000000  0.713150   
90579   4.366869e+08  1.365454e+08  0.174986  1.000000 -0.679242   
112511  7.919471e+07  2.812667e+07  0.174986  0.787770 -0.279991   
82955   3.219662e+08  6.823030e+07  0.174986  0.000000 -0.052170   
29786   1.057375e+08  2.347804e+07  0.174986  1.

In [4]:
learning_rates = np.arange(0.01,0.11,0.01)
colsample_bytrees = np.arange(0.3,1.1,0.1)
gammas = np.arange(0,3,1)
subsamples = np.arange(0.8,1.0,0.1)
subsamples = np.concatenate((subsamples, [1.]))

print(learning_rates)
print(colsample_bytrees)
print(gammas)
print(subsamples)

for i in learning_rates:
    for j in colsample_bytrees:
        for k in gammas:
            for l in subsamples:
                print (i,j,k,l)

[0.01 0.02 0.03 0.04 0.05 0.06 0.07 0.08 0.09 0.1 ]
[0.3 0.4 0.5 0.6 0.7 0.8 0.9 1. ]
[0 1 2]
[0.8 0.9 1. ]
0.01 0.3 0 0.8
0.01 0.3 0 0.9
0.01 0.3 0 1.0
0.01 0.3 1 0.8
0.01 0.3 1 0.9
0.01 0.3 1 1.0
0.01 0.3 2 0.8
0.01 0.3 2 0.9
0.01 0.3 2 1.0
0.01 0.4 0 0.8
0.01 0.4 0 0.9
0.01 0.4 0 1.0
0.01 0.4 1 0.8
0.01 0.4 1 0.9
0.01 0.4 1 1.0
0.01 0.4 2 0.8
0.01 0.4 2 0.9
0.01 0.4 2 1.0
0.01 0.5 0 0.8
0.01 0.5 0 0.9
0.01 0.5 0 1.0
0.01 0.5 1 0.8
0.01 0.5 1 0.9
0.01 0.5 1 1.0
0.01 0.5 2 0.8
0.01 0.5 2 0.9
0.01 0.5 2 1.0
0.01 0.6000000000000001 0 0.8
0.01 0.6000000000000001 0 0.9
0.01 0.6000000000000001 0 1.0
0.01 0.6000000000000001 1 0.8
0.01 0.6000000000000001 1 0.9
0.01 0.6000000000000001 1 1.0
0.01 0.6000000000000001 2 0.8
0.01 0.6000000000000001 2 0.9
0.01 0.6000000000000001 2 1.0
0.01 0.7000000000000002 0 0.8
0.01 0.7000000000000002 0 0.9
0.01 0.7000000000000002 0 1.0
0.01 0.7000000000000002 1 0.8
0.01 0.7000000000000002 1 0.9
0.01 0.7000000000000002 1 1.0
0.01 0.7000000000000002 2 0.8
0.01 0.

0.08 1.0000000000000002 0 0.8
0.08 1.0000000000000002 0 0.9
0.08 1.0000000000000002 0 1.0
0.08 1.0000000000000002 1 0.8
0.08 1.0000000000000002 1 0.9
0.08 1.0000000000000002 1 1.0
0.08 1.0000000000000002 2 0.8
0.08 1.0000000000000002 2 0.9
0.08 1.0000000000000002 2 1.0
0.09 0.3 0 0.8
0.09 0.3 0 0.9
0.09 0.3 0 1.0
0.09 0.3 1 0.8
0.09 0.3 1 0.9
0.09 0.3 1 1.0
0.09 0.3 2 0.8
0.09 0.3 2 0.9
0.09 0.3 2 1.0
0.09 0.4 0 0.8
0.09 0.4 0 0.9
0.09 0.4 0 1.0
0.09 0.4 1 0.8
0.09 0.4 1 0.9
0.09 0.4 1 1.0
0.09 0.4 2 0.8
0.09 0.4 2 0.9
0.09 0.4 2 1.0
0.09 0.5 0 0.8
0.09 0.5 0 0.9
0.09 0.5 0 1.0
0.09 0.5 1 0.8
0.09 0.5 1 0.9
0.09 0.5 1 1.0
0.09 0.5 2 0.8
0.09 0.5 2 0.9
0.09 0.5 2 1.0
0.09 0.6000000000000001 0 0.8
0.09 0.6000000000000001 0 0.9
0.09 0.6000000000000001 0 1.0
0.09 0.6000000000000001 1 0.8
0.09 0.6000000000000001 1 0.9
0.09 0.6000000000000001 1 1.0
0.09 0.6000000000000001 2 0.8
0.09 0.6000000000000001 2 0.9
0.09 0.6000000000000001 2 1.0
0.09 0.7000000000000002 0 0.8
0.09 0.7000000000000002 0

In [None]:
learning_rates = np.arange(0.01,0.11,0.01)
colsample_bytrees = np.arange(0.3,1.1,0.1)
gammas = np.arange(0,3,1)
subsamples = np.arange(0.8,1.0,0.1)
subsamples = np.concatenate((subsamples, [1.]))

for i in learning_rates:
    for j in colsample_bytrees:
        for k in subsamples:
            for l in gammas:
                xg_reg = xgb.XGBClassifier(
                          learning_rate=i,  
                          colsample_bytree = j,
                          subsample = k,
                          gamma=l)

                # print("Cross Validation score: ", np.mean(cross_val_score(xg_reg, X_train, Y_train, cv=10)))
                xg_reg.fit(X_train,Y_train)

                Y_test_predict = xg_reg.predict(X_test)
                print(accuracy_score(Y_test, Y_test_predict))

0.6048543018920038
