# Diabetes in 130 US hospitals for the years 1999 to 2008

## Checking algorithms

 * [Feature selection](#Feature-selection)
   - [Correlation feature selection](#Correlation-feature-selection)
   - [Recursive feature elimination](#Recursive-feature-elimination)
 * [Logistic regression](#Logistic-regression)
   - [Using statsmodels](#Using-statsmodels)
   - [Using scikit-learn](#Using-scikit-learn)

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib notebook

In [2]:
# Show every column and never truncate cell contents when displaying frames.
pd.set_option('display.max_columns', None)
# None means "no limit". The legacy -1 sentinel was deprecated in pandas 1.0
# and is rejected with a ValueError by pandas 2.x.
pd.set_option('display.max_colwidth', None)

In [3]:
# Load data/df_encoded.csv, using the first CSV column as the row index.
df_encoded = pd.read_csv(
    filepath_or_buffer='data/df_encoded.csv',
    index_col=0,
)

In [4]:
# Preview the first five rows of the loaded frame.
df_encoded.head(n=5)

Unnamed: 0,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,readmitted,race=Caucasian,gender=Female,age=[10-20),admission_type_id,discharge_disposition_id,admission_source_id,max_glu_serum=None,A1Cresult=None,metformin=No,repaglinide=No,nateglinide=No,chlorpropamide=No,glimepiride=No,acetohexamide=No,glipizide=No,glyburide=No,tolbutamide=No,pioglitazone=No,rosiglitazone=No,acarbose=No,miglitol=No,troglitazone=No,tolazamide=No,examide=No,citoglipton=No,insulin=Up,glyburide-metformin=No,glipizide-metformin=No,glimepiride-pioglitazone=No,metformin-rosiglitazone=No,metformin-pioglitazone=No,change=Ch,diabetesMed=Yes,diagnosis=others,race=AfricanAmerican,age=[20-30),glipizide=Steady,insulin=No,change=No,gender=Male,age=[40-50),insulin=Steady,diagnosis=neoplasms,age=[50-60),diagnosis=circulatory,age=[60-70),metformin=Steady,glimepiride=Steady,age=[70-80),glyburide=Steady,age=[80-90),age=[90-100),rosiglitazone=Steady,diagnosis=diabetes,glyburide=Up,repaglinide=Up,insulin=Down,diagnosis=respiratory,diagnosis=injury,diabetesMed=No,race=Other,A1Cresult=>7,acarbose=Steady,diagnosis=genitourinary,metformin=Up,troglitazone=Steady,diagnosis=musculoskeletal,diagnosis=digestive,A1Cresult=>8,age=[30-40),A1Cresult=Norm,glipizide=Down,repaglinide=Steady,glimepiride=Up,tolazamide=Steady,glipizide=Up,glyburide=Down,race=Asian,tolbutamide=Steady,rosiglitazone=Up,chlorpropamide=Steady,pioglitazone=Steady,race=Hispanic,age=[0-10),glimepiride=Down,metformin=Down,acarbose=Up,rosiglitazone=Down,pioglitazone=Up,glyburide-metformin=Steady,pioglitazone=Down,nateglinide=Steady,chlorpropamide=Down,chlorpropamide=Up,repaglinide=Down,glyburide-metformin=Down,glyburide-metformin=Up,nateglinide=Down,miglitol=Steady,acetohexamide=Steady,miglitol=Down,nateglinide=Up,glipizide-metformin=Steady,max_glu_serum=>300,max_glu_serum=Norm,miglitol=Up,max_glu_serum=>200,metformin-pioglitazone=Steady
0,3,59,0,18,0,0,0,9,1,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,7.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
1,2,11,5,13,2,0,1,6,0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,7.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
2,1,51,0,8,0,0,0,5,0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,7.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
3,3,31,6,16,0,0,0,9,1,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
4,4,70,1,21,0,0,0,7,0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,2.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0


In [5]:
# Print the index range, column count, dtypes and memory footprint.
df_encoded.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 55122 entries, 0 to 55121
Columns: 117 entries, time_in_hospital to metformin-pioglitazone=Steady
dtypes: float64(108), int64(9)
memory usage: 49.6 MB


In [6]:
# (rows, columns) of the loaded frame.
df_encoded.shape

(55122, 117)

# Feature selection

## Correlation feature selection

In [78]:
# Pearson correlation of every column with the target. DataFrame.corr()
# builds the full pairwise matrix, so compute it once and reuse the column.
target_corr = df_encoded.corr()['readmitted']
# Keep entries strictly below 1: this drops the target's self-correlation
# and also any NaN entries (NaN < 1 evaluates to False).
filter_ = target_corr < 1
# Rank features by the magnitude of their correlation, strongest first.
corr = target_corr[filter_].abs().sort_values(ascending=False)

In [79]:
# Show the first 20 entries of the sorted correlation series.
print(corr.head(20))

number_inpatient               0.144255
number_diagnoses               0.111798
number_emergency               0.079305
number_outpatient              0.072000
glimepiride-pioglitazone=No    0.061285
metformin-rosiglitazone=No     0.061285
time_in_hospital               0.059071
repaglinide=No                 0.049879
nateglinide=No                 0.048551
gender=Male                    0.046938
rosiglitazone=No               0.046364
acarbose=No                    0.043014
num_lab_procedures             0.039928
tolazamide=No                  0.037295
troglitazone=No                0.037295
nateglinide=Steady             0.036957
acarbose=Steady                0.034390
diagnosis=genitourinary        0.034372
tolbutamide=No                 0.033871
num_medications                0.033615
Name: readmitted, dtype: float64


## Recursive feature elimination

We're going to perform [feature ranking with recursive feature elimination](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFE.html), using the RFE class in scikit-learn. This procedure was taken from this blog post ['Building A Logistic Regression in Python, Step by Step'](https://towardsdatascience.com/building-a-logistic-regression-in-python-step-by-step-becd4d56c9c8). As the blog post mentions, this technique "is based on the idea to repeatedly construct a model and choose either the best or worst performing feature, setting the feature aside and then repeating the process with the rest of the features. This process is applied until all features in the dataset are exhausted. The goal of RFE is to select features by recursively considering smaller and smaller sets of features."

In [7]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

In [8]:
# Split the frame into the feature matrix (everything except the target
# column) and the target as a flat NumPy array.
target_name = 'readmitted'
X = df_encoded.drop(columns=target_name)
y = df_encoded[target_name].values.ravel()

The `readmitted` output column is reasonably balanced: readmission and non-readmission cases occur in comparable proportions.

In [9]:
# Share of rows where the target equals 1 and where it equals 0.
n_total = y.shape[0]
readmitted_ratio = (y == 1).sum() / n_total
not_readmitted_ratio = (y == 0).sum() / n_total
print(f'Ratio of readmission to total: {readmitted_ratio:.2f}')
print(f'Ratio of non-readmission to total: {not_readmitted_ratio:.2f}')

Ratio of readmission to total: 0.41
Ratio of non-readmission to total: 0.59


In [10]:
# Recursive feature elimination: repeatedly fit the logistic regression and
# prune features until 20 remain.
lg = LogisticRegression(solver='newton-cg')
# n_features_to_select must be passed by keyword: scikit-learn deprecated the
# positional form in 0.23 and raises a TypeError for it from 1.0 on.
rfe = RFE(lg, n_features_to_select=20)
rfe = rfe.fit(X, y)

In [31]:
# Report how many features RFE kept and the names of the kept columns.
selected_features = X.columns[rfe.support_].tolist()
print(f'Number of reduced features: {rfe.n_features_}')
print(f'List of reduced features: {selected_features}')

Number of reduced features: 20
List of reduced features: ['number_inpatient', 'discharge_disposition_id', 'A1Cresult=None', 'metformin=No', 'chlorpropamide=No', 'acetohexamide=No', 'glipizide=No', 'glipizide=Steady', 'age=[70-80)', 'age=[80-90)', 'glyburide=Up', 'repaglinide=Up', 'insulin=Down', 'diagnosis=musculoskeletal', 'diagnosis=digestive', 'age=[30-40)', 'tolbutamide=Steady', 'metformin=Down', 'miglitol=Steady', 'miglitol=Up']


In [18]:
# Restrict the feature matrix to the columns flagged by the RFE support mask.
kept_columns = X.columns[rfe.support_]
X_mod = X[kept_columns]

In [19]:
# Sanity check: print the shapes of the reduced feature matrix and the target.
for shape in (X_mod.shape, y.shape):
    print(shape)

(55122, 20)
(55122,)


# Logistic regression

## Using statsmodels

In [37]:
import statsmodels.api as sm
# Fit a logit model of y on the RFE-selected columns in X_mod.
# NOTE(review): X_mod is passed as-is, so the model has no intercept term
# (statsmodels does not add one automatically; sm.add_constant would).
# Confirm this is intended.
logit_model = sm.Logit(y, X_mod)
# NOTE(review): no alpha is passed; presumably the default penalty weight is 0,
# which would make this an unpenalized fit despite method='l1'. Verify against
# the statsmodels documentation.
result = logit_model.fit_regularized(method='l1')

Optimization terminated successfully.    (Exit mode 0)
            Current function value: 0.6639892874134398
            Iterations: 214
            Function evaluations: 214
            Gradient evaluations: 214


In [38]:
# Print the fitted model's fit statistics and coefficient table.
print(result.summary())

                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                55122
Model:                          Logit   Df Residuals:                    55102
Method:                           MLE   Df Model:                           19
Date:                Tue, 16 Jul 2019   Pseudo R-squ.:                 0.01947
Time:                        15:09:58   Log-Likelihood:                -36600.
converged:                       True   LL-Null:                       -37327.
Covariance Type:            nonrobust   LLR p-value:                4.209e-297
                                coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------
number_inpatient              0.5456      0.017     31.354      0.000       0.512       0.580
discharge_disposition_id     -0.5746      0.167     -3.443      0.001      -0.902     

In [39]:
# Print the alternative summary layout for the same fitted model.
print(result.summary2())

                                  Results: Logit
Model:                     Logit                 Pseudo R-squared:      0.019      
Dependent Variable:        y                     AIC:                   73240.8350 
Date:                      2019-07-16 15:10      BIC:                   73419.1811 
No. Observations:          55122                 Log-Likelihood:        -36600.    
Df Model:                  19                    LL-Null:               -37327.    
Df Residuals:              55102                 LLR p-value:           4.2087e-297
Converged:                 1.0000                Scale:                 1.0000     
No. Iterations:            214.0000                                                
-----------------------------------------------------------------------------------
                           Coef.    Std.Err.     z    P>|z|     [0.025     0.975]  
-----------------------------------------------------------------------------------
number_inpatient           

## Using scikit-learn

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split