## Diabetes

In [1]:
%matplotlib notebook
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from tqdm import tqdm

df = pd.read_csv("diabetic_data.csv", dtype='category').iloc[:100,2:]
df.shape

(100, 48)

In [2]:
df.head(5)

Unnamed: 0,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,payer_code,medical_specialty,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,Caucasian,Female,[0-10),?,6,25,1,1,?,Pediatrics-Endocrinology,...,No,No,No,No,No,No,No,No,No,NO
1,Caucasian,Female,[10-20),?,1,1,7,3,?,?,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,AfricanAmerican,Female,[20-30),?,1,1,7,2,?,?,...,No,No,No,No,No,No,No,No,Yes,NO
3,Caucasian,Male,[30-40),?,1,1,7,2,?,?,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,Caucasian,Male,[40-50),?,1,1,7,1,?,?,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [3]:
to_num = ['time_in_hospital', 'num_lab_procedures', 'num_procedures',
         'num_medications', 'number_outpatient', 'number_emergency',
         'number_inpatient', 'number_diagnoses']

### remove missing

In [4]:
df = df.drop(columns=['weight', 'payer_code', 'medical_specialty'])
df.head(2)

Unnamed: 0,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,Caucasian,Female,[0-10),6,25,1,1,41,0,1,...,No,No,No,No,No,No,No,No,No,NO
1,Caucasian,Female,[10-20),1,1,7,3,59,0,18,...,No,Up,No,No,No,No,No,Ch,Yes,>30


### split

In [5]:
X_train, X_test, y_train, y_test = train_test_split(
    df[df.columns[:-1]], df[df.columns[-1:]] , random_state = 0)

In [6]:
X_train.shape

(75, 44)

In [7]:
X_test.shape

(25, 44)

In [8]:
y_train.dtypes

readmitted    category
dtype: object

In [9]:
X_train.dtypes

race                        category
gender                      category
age                         category
admission_type_id           category
discharge_disposition_id    category
admission_source_id         category
time_in_hospital            category
num_lab_procedures          category
num_procedures              category
num_medications             category
number_outpatient           category
number_emergency            category
number_inpatient            category
diag_1                      category
diag_2                      category
diag_3                      category
number_diagnoses            category
max_glu_serum               category
A1Cresult                   category
metformin                   category
repaglinide                 category
nateglinide                 category
chlorpropamide              category
glimepiride                 category
acetohexamide               category
glipizide                   category
glyburide                   category
t

### scaling

In [10]:
scaler = MinMaxScaler()
for n in to_num:
    X_train[n] = X_train[n].astype('int')
    X_test[n] = X_test[n].astype('int')
    X_train[n] = scaler.fit_transform(X_train[n].reshape(-1, 1))
    X_test[n] = scaler.fit_transform(X_test[n].reshape(-1, 1))

  """
  


In [11]:
X_features = to_num
for c in X_train.columns:
    if X_train[c].dtypes != 'float64':
        X_train[c+'_cat'] = X_train[c].cat.codes        
        X_test[c+'_cat'] = X_test[c].cat.codes
        X_features += [c+'_cat']

In [12]:
y_train['readmitted' + '_cat'] = y_train['readmitted'].cat.codes        
y_test['readmitted' + '_cat'] = y_test['readmitted'].cat.codes        

In [13]:
y_train.columns

Index(['readmitted', 'readmitted_cat'], dtype='object')

In [14]:
X_train[X_features].dtypes

time_in_hospital                float64
num_lab_procedures              float64
num_procedures                  float64
num_medications                 float64
number_outpatient               float64
number_emergency                float64
number_inpatient                float64
number_diagnoses                float64
race_cat                           int8
gender_cat                         int8
age_cat                            int8
admission_type_id_cat              int8
discharge_disposition_id_cat       int8
admission_source_id_cat            int8
diag_1_cat                        int16
diag_2_cat                        int16
diag_3_cat                        int16
max_glu_serum_cat                  int8
A1Cresult_cat                      int8
metformin_cat                      int8
repaglinide_cat                    int8
nateglinide_cat                    int8
chlorpropamide_cat                 int8
glimepiride_cat                    int8
acetohexamide_cat                  int8


In [15]:
X_train[X_features].head(2)

Unnamed: 0,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,race_cat,gender_cat,...,examide_cat,citoglipton_cat,insulin_cat,glyburide-metformin_cat,glipizide-metformin_cat,glimepiride-pioglitazone_cat,metformin-rosiglitazone_cat,metformin-pioglitazone_cat,change_cat,diabetesMed_cat
48,0.416667,0.863014,0.833333,0.692308,0.0,0.0,0.0,1.0,3,1,...,0,0,3,1,0,0,0,0,0,1
6,0.25,0.931507,0.166667,0.769231,0.0,0.0,0.0,0.75,3,1,...,0,0,2,1,0,0,0,0,0,1


In [16]:
X_train[X_features].describe()

Unnamed: 0,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,race_cat,gender_cat,...,examide_cat,citoglipton_cat,insulin_cat,glyburide-metformin_cat,glipizide-metformin_cat,glimepiride-pioglitazone_cat,metformin-rosiglitazone_cat,metformin-pioglitazone_cat,change_cat,diabetesMed_cat
count,75.0,75.0,75.0,75.0,75.0,75.0,75.0,75.0,75.0,75.0,...,75.0,75.0,75.0,75.0,75.0,75.0,75.0,75.0,75.0,75.0
mean,0.324444,0.60274,0.253333,0.534872,0.013333,0.026667,0.106667,0.76,2.413333,0.48,...,0.0,0.0,1.493333,1.0,0.0,0.0,0.0,0.0,0.44,0.92
std,0.253198,0.217097,0.270357,0.206752,0.11547,0.162192,0.236719,0.265372,1.186561,0.502964,...,0.0,0.0,0.963851,0.0,0.0,0.0,0.0,0.0,0.49973,0.27312
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.166667,0.458904,0.0,0.384615,0.0,0.0,0.0,0.625,1.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,0.25,0.616438,0.166667,0.576923,0.0,0.0,0.0,0.875,3.0,0.0,...,0.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,0.5,0.767123,0.333333,0.692308,0.0,0.0,0.0,1.0,3.0,1.0,...,0.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,5.0,1.0,...,0.0,0.0,3.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0


## RBF kernel SVM with min max scaling

In [23]:
clf = SVC(C=10).fit(X_train[X_features], y_train['readmitted_cat'])

print('training accuracy: {:.2f}'.format(clf.score(X_train[X_features], y_train['readmitted_cat'])))
print('test accuracy: {:.2f}'.format(clf.score(X_test[X_features], y_test['readmitted_cat'])))

training accuracy: 1.00
test accuracy: 0.44


### optimize over accuracy

In [18]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score

clf = SVC()
grid_values = [
  {'C': [1, 10, 100, 1000], 'kernel': ['linear']},
  {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.01, 0.05, 0.1, 1, 10, 100], 'kernel': ['rbf']},
 ]

grid_clf_acc = GridSearchCV(clf, param_grid = grid_values)
grid_clf_acc.fit(X_train[X_features], y_train['readmitted_cat'])
y_decision_fn_scores_acc = grid_clf_acc.decision_function(X_test[X_features]) 

print('Grid best parameter (max. accuracy): ', grid_clf_acc.best_params_)
print('Grid best score (accuracy): ', grid_clf_acc.best_score_)

Grid best parameter (max. accuracy):  {'C': 10, 'gamma': 0.001, 'kernel': 'rbf'}
Grid best score (accuracy):  0.6133333333333333


In [27]:
y_decision_fn_scores_acc 

array([[-0.14198852,  0.95709511,  2.18489341],
       [-0.29439909,  2.30454117,  0.98985793],
       [-0.23067366,  2.1446654 ,  1.08600826],
       [ 2.03050349,  1.01198494, -0.04248843],
       [-0.470289  ,  2.5       ,  0.970289  ],
       [-0.2306963 ,  2.14459114,  1.08610516],
       [-0.36584018,  2.39851604,  0.96732415],
       [-0.23067366,  2.1446654 ,  1.08600826],
       [-0.23124447,  2.1469529 ,  1.08429157],
       [-0.27111444,  1.00493381,  2.26618063],
       [-0.23691429,  2.12265137,  1.11426292],
       [-0.25662168,  2.248443  ,  1.00817867],
       [-0.26419365,  1.02876354,  2.23543011],
       [-0.23576972,  2.3309608 ,  0.90480892],
       [-0.18410351,  2.13806134,  1.04604218],
       [-0.23067338,  2.14466479,  1.08600858],
       [-0.26297771,  2.25103883,  1.01193888],
       [-0.23097299,  2.14586439,  1.0851086 ],
       [-0.23545723,  2.12811103,  1.1073462 ],
       [-0.38628789,  1.14049775,  2.24579014],
       [-0.22672427,  2.15244918,  1.074

In [None]:
# precision recall curve only for binary class