In [1]:
%matplotlib inline

In [24]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

In [3]:
diabetes_data = pd.read_csv('data/dataset_diabetes/diabetic_data.csv')

In [4]:
diabetes_data.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [5]:
diabetes_data.dtypes

encounter_id                 int64
patient_nbr                  int64
race                        object
gender                      object
age                         object
weight                      object
admission_type_id            int64
discharge_disposition_id     int64
admission_source_id          int64
time_in_hospital             int64
payer_code                  object
medical_specialty           object
num_lab_procedures           int64
num_procedures               int64
num_medications              int64
number_outpatient            int64
number_emergency             int64
number_inpatient             int64
diag_1                      object
diag_2                      object
diag_3                      object
number_diagnoses             int64
max_glu_serum               object
A1Cresult                   object
metformin                   object
repaglinide                 object
nateglinide                 object
chlorpropamide              object
glimepiride         

In [6]:
diabetes_data.shape

(101766, 50)

In [7]:
diabetes_target = diabetes_data['readmitted']
diabetes_attributes = diabetes_data.drop('readmitted', axis = 1)

In [8]:
diabetes_attributes = pd.get_dummies(diabetes_attributes)

In [9]:
diabetes_attributes.shape

(101766, 2472)

In [10]:
diabetes_attributes_scaled = MinMaxScaler().fit_transform(diabetes_attributes)

In [12]:
attributes_train, attributes_test, targets_train, targets_test = train_test_split(
    diabetes_attributes_scaled,
    diabetes_target, 
    test_size = 0.2,
    random_state = 42
)

In [14]:
tree = DecisionTreeClassifier(max_depth = 3)
tree.fit(attributes_train, targets_train)

DecisionTreeClassifier(max_depth=3)

In [15]:
tree.score(attributes_train, targets_train)

0.5690438755957353

In [16]:
tree.score(attributes_test, targets_test)

0.5670629851626215

In [17]:
tree.feature_importances_

array([0.18220363, 0.11395714, 0.        , ..., 0.        , 0.        ,
       0.        ])

In [20]:
forest = RandomForestClassifier(n_estimators = 20, max_depth  = 3)

In [21]:
forest.fit(attributes_train, targets_train)

RandomForestClassifier(max_depth=3, n_estimators=20)

In [22]:
forest.score(attributes_train, targets_train)

0.5393799439886012

In [23]:
forest.score(attributes_test, targets_test)

0.5380760538469097

In [26]:
ada_boost = AdaBoostClassifier(base_estimator = DecisionTreeClassifier(max_depth = 1), n_estimators = 20)

In [27]:
ada_boost.fit(attributes_train, targets_train)

AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1),
                   n_estimators=20)

In [28]:
ada_boost.score(attributes_train, targets_train)

0.5903060973812214

In [29]:
ada_boost.score(attributes_test, targets_test)

0.587452097867741

In [30]:
ada_boost.estimator_weights_

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1.])

In [32]:
sorted(ada_boost.feature_importances_)[-10:]

[0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.15, 0.2, 0.3]