## <font color=blue>Diabetes dataset age 70-100<font>

[Baseline Categorical](#SVM)

[One Hot](#hot)

In [1]:
%matplotlib notebook
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, precision_score, recall_score, \
f1_score, confusion_matrix, classification_report
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC, SVC
from tqdm import tqdm

In [2]:
def plot_confusion(clf, title, X_test, y_test):
    svm_predicted_mc = clf.predict(X_test)
    confusion_mc = confusion_matrix(y_test, svm_predicted_mc)
    df_cm = pd.DataFrame(confusion_mc, 
                         index = [i for i in range(0,3)], columns = [i for i in range(0,3)])

    plt.figure(figsize=(6,4))
    ax_ticks= ['<30', '>30', 'NO']
    sns.heatmap(df_cm, annot=True, xticklabels=ax_ticks, yticklabels=ax_ticks, fmt='g')
    plt.title(title + '\nAccuracy:{0:.3f}'.format(accuracy_score(y_test, 
                                                                           svm_predicted_mc)))
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    
    print('Micro-averaged precision = {:.2f} (treat instances equally)'
      .format(precision_score(y_test, svm_predicted_mc, average = 'micro')))
    print('Macro-averaged precision = {:.2f} (treat classes equally)'
      .format(precision_score(y_test, svm_predicted_mc, average = 'macro')))
    print('Micro-averaged f1 = {:.2f} (treat instances equally)'
          .format(f1_score(y_test, svm_predicted_mc, average = 'micro')))
    print('Macro-averaged f1 = {:.2f} (treat classes equally)'
          .format(f1_score(y_test, svm_predicted_mc, average = 'macro')))
    print(classification_report(y_test, svm_predicted_mc, target_names=ax_ticks))

In [3]:
df = pd.read_csv("diabetic_data.csv", dtype='category').iloc[:,2:]
df.shape

(101766, 48)

In [4]:
df['age'].head(2)

0     [0-10)
1    [10-20)
Name: age, dtype: category
Categories (10, object): [[0-10), [10-20), [20-30), [30-40), ..., [60-70), [70-80), [80-90), [90-100)]

In [5]:
df = df.loc[(df['age'] == '[70-80)') | (df['age'] == '[80-90)')
           | (df['age'] == '[90-100)')]

In [6]:
df.head(2)

Unnamed: 0,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,payer_code,medical_specialty,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
7,Caucasian,Male,[70-80),?,1,1,7,5,?,?,...,No,No,No,No,No,No,No,No,Yes,>30
8,Caucasian,Female,[80-90),?,2,1,4,13,?,?,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [7]:
df.describe()

Unnamed: 0,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,payer_code,medical_specialty,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
count,46058,46058,46058,46058,46058,46058,46058,46058,46058,46058,...,46058,46058,46058,46058,46058,46058,46058,46058,46058,46058
unique,6,3,3,9,8,25,17,14,17,57,...,1,4,4,2,1,2,1,2,2,3
top,Caucasian,Female,[70-80),?,1,1,7,3,MC,?,...,No,No,No,No,No,No,No,No,Yes,NO
freq,37688,26503,26068,44489,25117,20841,26737,7948,22529,23400,...,46058,23042,45777,46051,46058,46056,46058,25847,35049,24095


In [8]:
def show_unique(dataF):
    for c in dataF.columns:
        print(c, dataF[c].unique())
        print('*'*50)

show_unique(df)

race [Caucasian, AfricanAmerican, ?, Hispanic, Other, Asian]
Categories (6, object): [Caucasian, AfricanAmerican, ?, Hispanic, Other, Asian]
**************************************************
gender [Male, Female, Unknown/Invalid]
Categories (3, object): [Male, Female, Unknown/Invalid]
**************************************************
age [[70-80), [80-90), [90-100)]
Categories (3, object): [[70-80), [80-90), [90-100)]
**************************************************
weight [?, [50-75), [75-100), [100-125), [25-50), [0-25), [125-150), [150-175), [175-200)]
Categories (9, object): [?, [50-75), [75-100), [100-125), ..., [0-25), [125-150), [150-175), [175-200)]
**************************************************
admission_type_id [1, 2, 3, 6, 4, 5, 8, 7]
Categories (8, object): [1, 2, 3, 6, 4, 5, 8, 7]
**************************************************
discharge_disposition_id [1, 3, 6, 2, 5, ..., 15, 28, 24, 19, 27]
Length: 25
Categories (25, object): [1, 3, 6, 2, ..., 28, 24, 19, 27]


### <font color =blue>1. remove columns with missing data</font>

In [9]:
df = df.drop(columns=['weight', 'payer_code', 'medical_specialty'])
df.head(2)

Unnamed: 0,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
7,Caucasian,Male,[70-80),1,1,7,5,73,0,12,...,No,No,No,No,No,No,No,No,Yes,>30
8,Caucasian,Female,[80-90),2,1,4,13,68,2,28,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


### <font color=blue> 2. remove incomplete columns and rows </font>

In [10]:
# drop missing values
df = df.replace('?', np.nan)
df = df.replace('Unknown/Invalid', np.nan)
df = df.dropna()
df.shape

(44724, 45)

In [11]:
df = df.drop(columns= df.columns[df.nunique() == 1])

In [12]:
show_unique(df)

race [Caucasian, AfricanAmerican, Hispanic, Other, Asian]
Categories (5, object): [Caucasian, AfricanAmerican, Hispanic, Other, Asian]
**************************************************
gender [Male, Female]
Categories (2, object): [Male, Female]
**************************************************
age [[70-80), [80-90), [90-100)]
Categories (3, object): [[70-80), [80-90), [90-100)]
**************************************************
admission_type_id [1, 2, 3, 6, 4, 5, 8, 7]
Categories (8, object): [1, 2, 3, 6, 4, 5, 8, 7]
**************************************************
discharge_disposition_id [1, 3, 6, 2, 5, ..., 15, 28, 24, 19, 27]
Length: 25
Categories (25, object): [1, 3, 6, 2, ..., 28, 24, 19, 27]
**************************************************
admission_source_id [7, 4, 1, 2, 5, ..., 14, 11, 22, 25, 13]
Length: 17
Categories (17, object): [7, 4, 1, 2, ..., 11, 22, 25, 13]
**************************************************
time_in_hospital [5, 13, 12, 10, 2, ..., 7, 9, 4, 14,

### <font color = blue>3. categorical variables</font>

In [13]:
to_num = ['time_in_hospital', 'num_lab_procedures', 'num_procedures',
         'num_medications', 'number_outpatient', 'number_emergency',
         'number_inpatient', 'number_diagnoses']

to_cat_codes = list(set(df.columns) - set(to_num))

In [14]:
X_features = list(to_num)
for c in to_cat_codes:
    df[c+'_cat'] = df[c].cat.codes        
    X_features += [c+'_cat']
    
X_features.remove('readmitted_cat')
X_features

['time_in_hospital',
 'num_lab_procedures',
 'num_procedures',
 'num_medications',
 'number_outpatient',
 'number_emergency',
 'number_inpatient',
 'number_diagnoses',
 'discharge_disposition_id_cat',
 'diag_3_cat',
 'gender_cat',
 'diag_2_cat',
 'chlorpropamide_cat',
 'troglitazone_cat',
 'glipizide_cat',
 'nateglinide_cat',
 'insulin_cat',
 'admission_source_id_cat',
 'diabetesMed_cat',
 'admission_type_id_cat',
 'tolbutamide_cat',
 'A1Cresult_cat',
 'repaglinide_cat',
 'acarbose_cat',
 'glipizide-metformin_cat',
 'diag_1_cat',
 'pioglitazone_cat',
 'age_cat',
 'metformin_cat',
 'rosiglitazone_cat',
 'glyburide_cat',
 'acetohexamide_cat',
 'race_cat',
 'tolazamide_cat',
 'max_glu_serum_cat',
 'change_cat',
 'miglitol_cat',
 'glyburide-metformin_cat',
 'glimepiride_cat']

In [15]:
df['readmitted'].head(11)

7     >30
8      NO
9      NO
13     NO
18    >30
22     NO
23    >30
24     NO
26     NO
29    >30
31    >30
Name: readmitted, dtype: category
Categories (3, object): [<30, >30, NO]

In [16]:
df['readmitted_cat'].head(11)

7     1
8     2
9     2
13    2
18    1
22    2
23    1
24    2
26    2
29    1
31    1
Name: readmitted_cat, dtype: int8

#### <font color=red>Target Mapping: < 30 = 0, >30 = 1, NO = 2, </font>

### <font color=blue>4. split</font>

In [17]:
for n in to_num:
    df[n] = df[n].astype('int')

df[to_num].dtypes

time_in_hospital      int64
num_lab_procedures    int64
num_procedures        int64
num_medications       int64
number_outpatient     int64
number_emergency      int64
number_inpatient      int64
number_diagnoses      int64
dtype: object

## <font color=green>to Categorical</font>

In [18]:
X_train, X_test, y_train, y_test = train_test_split(
    df[X_features], df['readmitted_cat'] , random_state = 0)

In [19]:
X_train.head(2)

Unnamed: 0,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,discharge_disposition_id_cat,diag_3_cat,...,rosiglitazone_cat,glyburide_cat,acetohexamide_cat,race_cat,tolazamide_cat,max_glu_serum_cat,change_cat,miglitol_cat,glyburide-metformin_cat,glimepiride_cat
26485,4,70,0,15,0,0,1,9,0,188,...,1,1,0,3,0,2,1,0,1,1
98049,4,72,0,18,2,0,1,9,16,232,...,1,1,0,3,0,2,1,0,1,1


In [20]:
y_train.head(2)

26485    1
98049    0
Name: readmitted_cat, dtype: int8

In [21]:
X_test.head(2)

Unnamed: 0,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,discharge_disposition_id_cat,diag_3_cat,...,rosiglitazone_cat,glyburide_cat,acetohexamide_cat,race_cat,tolazamide_cat,max_glu_serum_cat,change_cat,miglitol_cat,glyburide-metformin_cat,glimepiride_cat
17739,2,54,1,22,0,0,0,7,0,90,...,1,1,0,3,0,2,1,0,1,1
49767,10,66,5,32,0,0,0,9,16,233,...,1,1,0,3,0,2,1,0,1,1


In [22]:
y_test.head(2)

17739    0
49767    1
Name: readmitted_cat, dtype: int8

### <font color=green>min max scaling</font>

In [23]:
X_train.dtypes

time_in_hospital                int64
num_lab_procedures              int64
num_procedures                  int64
num_medications                 int64
number_outpatient               int64
number_emergency                int64
number_inpatient                int64
number_diagnoses                int64
discharge_disposition_id_cat     int8
diag_3_cat                      int16
gender_cat                       int8
diag_2_cat                      int16
chlorpropamide_cat               int8
troglitazone_cat                 int8
glipizide_cat                    int8
nateglinide_cat                  int8
insulin_cat                      int8
admission_source_id_cat          int8
diabetesMed_cat                  int8
admission_type_id_cat            int8
tolbutamide_cat                  int8
A1Cresult_cat                    int8
repaglinide_cat                  int8
acarbose_cat                     int8
glipizide-metformin_cat          int8
diag_1_cat                      int16
pioglitazone

### <font color=green>before scale</font>

In [24]:
X_train.describe()

Unnamed: 0,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,discharge_disposition_id_cat,diag_3_cat,...,rosiglitazone_cat,glyburide_cat,acetohexamide_cat,race_cat,tolazamide_cat,max_glu_serum_cat,change_cat,miglitol_cat,glyburide-metformin_cat,glimepiride_cat
count,33543.0,33543.0,33543.0,33543.0,33543.0,33543.0,33543.0,33543.0,33543.0,33543.0,...,33543.0,33543.0,33543.0,33543.0,33543.0,33543.0,33543.0,33543.0,33543.0,33543.0
mean,4.680559,43.508392,1.207763,15.842858,0.398951,0.139612,0.614704,7.791223,6.936499,204.747429,...,1.059416,1.120025,3e-05,2.762424,0.000507,1.982679,0.562412,0.000954,1.006589,1.056703
std,3.036529,19.638966,1.621632,7.620044,1.330524,0.505624,1.123154,1.672566,6.830086,130.680163,...,0.246651,0.371884,0.00546,0.733813,0.022507,0.349831,0.496097,0.038597,0.082001,0.256811
min,1.0,1.0,0.0,1.0,0.0,0.0,0.0,3.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2.0,31.0,0.0,11.0,0.0,0.0,0.0,7.0,0.0,92.0,...,1.0,1.0,0.0,3.0,0.0,2.0,0.0,0.0,1.0,1.0
50%,4.0,45.0,1.0,15.0,0.0,0.0,0.0,9.0,8.0,189.0,...,1.0,1.0,0.0,3.0,0.0,2.0,1.0,0.0,1.0,1.0
75%,6.0,58.0,2.0,20.0,0.0,0.0,1.0,9.0,13.0,271.0,...,1.0,1.0,0.0,3.0,0.0,2.0,1.0,0.0,1.0,1.0
max,14.0,129.0,6.0,74.0,42.0,25.0,13.0,16.0,25.0,787.0,...,3.0,3.0,1.0,5.0,1.0,3.0,1.0,3.0,3.0,3.0


In [25]:
X_test.describe()

Unnamed: 0,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,discharge_disposition_id_cat,diag_3_cat,...,rosiglitazone_cat,glyburide_cat,acetohexamide_cat,race_cat,tolazamide_cat,max_glu_serum_cat,change_cat,miglitol_cat,glyburide-metformin_cat,glimepiride_cat
count,11181.0,11181.0,11181.0,11181.0,11181.0,11181.0,11181.0,11181.0,11181.0,11181.0,...,11181.0,11181.0,11181.0,11181.0,11181.0,11181.0,11181.0,11181.0,11181.0,11181.0
mean,4.717288,43.846794,1.206511,15.932206,0.400233,0.136571,0.610947,7.810035,6.959038,201.142116,...,1.057866,1.120293,0.0,2.762544,0.000715,1.982023,0.56274,0.000447,1.004025,1.051784
std,3.022979,19.378839,1.619541,7.663627,1.321623,0.472164,1.076486,1.670391,6.805404,127.692508,...,0.242151,0.376549,0.0,0.73188,0.029899,0.342226,0.49607,0.025018,0.063315,0.244257
min,1.0,1.0,0.0,1.0,0.0,0.0,0.0,3.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
25%,2.0,32.0,0.0,11.0,0.0,0.0,0.0,7.0,0.0,90.0,...,1.0,1.0,0.0,3.0,0.0,2.0,0.0,0.0,1.0,1.0
50%,4.0,45.0,1.0,15.0,0.0,0.0,0.0,9.0,8.0,188.0,...,1.0,1.0,0.0,3.0,0.0,2.0,1.0,0.0,1.0,1.0
75%,6.0,57.0,2.0,20.0,0.0,0.0,1.0,9.0,13.0,248.0,...,1.0,1.0,0.0,3.0,0.0,2.0,1.0,0.0,1.0,1.0
max,14.0,109.0,6.0,70.0,33.0,6.0,11.0,16.0,24.0,782.0,...,3.0,3.0,0.0,5.0,2.0,3.0,1.0,2.0,2.0,3.0


In [26]:
scaler = MinMaxScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_features)
X_test = pd.DataFrame(scaler.transform(X_test), columns = X_features)

### <font color=green>after scale</font>

In [27]:
X_train.iloc[:,:20].describe()

Unnamed: 0,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,discharge_disposition_id_cat,diag_3_cat,gender_cat,diag_2_cat,chlorpropamide_cat,troglitazone_cat,glipizide_cat,nateglinide_cat,insulin_cat,admission_source_id_cat,diabetesMed_cat,admission_type_id_cat
count,33543.0,33543.0,33543.0,33543.0,33543.0,33543.0,33543.0,33543.0,33543.0,33543.0,33543.0,33543.0,33543.0,33543.0,33543.0,33543.0,33543.0,33543.0,33543.0,33543.0
mean,0.28312,0.332097,0.201294,0.203327,0.009499,0.005584,0.047285,0.368556,0.27746,0.260162,0.422353,0.272154,0.000686,0.0,0.377138,0.002812,0.463097,0.323626,0.759473,0.144633
std,0.233579,0.153429,0.270272,0.104384,0.031679,0.020225,0.086396,0.128659,0.273203,0.166048,0.493942,0.142374,0.019675,0.0,0.126002,0.033689,0.267632,0.227208,0.42741,0.207776
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.076923,0.234375,0.0,0.136986,0.0,0.0,0.0,0.307692,0.0,0.1169,0.0,0.209893,0.0,0.0,0.333333,0.0,0.333333,0.0,1.0,0.0
50%,0.230769,0.34375,0.166667,0.191781,0.0,0.0,0.0,0.461538,0.32,0.240152,0.0,0.251337,0.0,0.0,0.333333,0.0,0.333333,0.5,1.0,0.0
75%,0.384615,0.445312,0.333333,0.260274,0.0,0.0,0.076923,0.461538,0.52,0.344346,1.0,0.358289,0.0,0.0,0.333333,0.0,0.666667,0.5,1.0,0.285714
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0


In [28]:
X_train.iloc[:,21:40].describe()

Unnamed: 0,A1Cresult_cat,repaglinide_cat,acarbose_cat,glipizide-metformin_cat,diag_1_cat,pioglitazone_cat,age_cat,metformin_cat,rosiglitazone_cat,glyburide_cat,acetohexamide_cat,race_cat,tolazamide_cat,max_glu_serum_cat,change_cat,miglitol_cat,glyburide-metformin_cat,glimepiride_cat
count,33543.0,33543.0,33543.0,33543.0,33543.0,33543.0,33543.0,33543.0,33543.0,33543.0,33543.0,33543.0,33543.0,33543.0,33543.0,33543.0,33543.0,33543.0
mean,0.640392,0.340518,0.001232,0.000149,0.363722,0.356657,0.247205,0.387234,0.353139,0.373342,3e-05,0.440606,0.000507,0.660893,0.562412,0.000318,0.33553,0.352234
std,0.16521,0.053356,0.022108,0.012208,0.164958,0.089475,0.304777,0.133336,0.082217,0.123961,0.00546,0.183453,0.022507,0.11661,0.496097,0.012866,0.027334,0.085604
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.666667,0.333333,0.0,0.0,0.273371,0.333333,0.0,0.333333,0.333333,0.333333,0.0,0.5,0.0,0.666667,0.0,0.0,0.333333,0.333333
50%,0.666667,0.333333,0.0,0.0,0.322946,0.333333,0.0,0.333333,0.333333,0.333333,0.0,0.5,0.0,0.666667,1.0,0.0,0.333333,0.333333
75%,0.666667,0.333333,0.0,0.0,0.46034,0.333333,0.5,0.333333,0.333333,0.333333,0.0,0.5,0.0,0.666667,1.0,0.0,0.333333,0.333333
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [29]:
y_train.describe()

count    33543.000000
mean         1.402051
std          0.689581
min          0.000000
25%          1.000000
50%          2.000000
75%          2.000000
max          2.000000
Name: readmitted_cat, dtype: float64

In [30]:
X_test.shape

(11181, 39)

In [31]:
X_test.iloc[:,:20].describe()

Unnamed: 0,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,discharge_disposition_id_cat,diag_3_cat,gender_cat,diag_2_cat,chlorpropamide_cat,troglitazone_cat,glipizide_cat,nateglinide_cat,insulin_cat,admission_source_id_cat,diabetesMed_cat,admission_type_id_cat
count,11181.0,11181.0,11181.0,11181.0,11181.0,11181.0,11181.0,11181.0,11181.0,11181.0,11181.0,11181.0,11181.0,11181.0,11181.0,11181.0,11181.0,11181.0,11181.0,11181.0
mean,0.285945,0.334741,0.201085,0.204551,0.009529,0.005463,0.046996,0.370003,0.278362,0.255581,0.426885,0.272414,0.000626,8.9e-05,0.377724,0.003488,0.463524,0.324272,0.757625,0.142154
std,0.232537,0.151397,0.269923,0.104981,0.031467,0.018887,0.082807,0.128492,0.272216,0.162252,0.494647,0.141379,0.020053,0.009457,0.127695,0.038581,0.268082,0.226807,0.428539,0.206599
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001271,0.0,0.002674,-0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.076923,0.242188,0.0,0.136986,0.0,0.0,0.0,0.307692,0.0,0.114358,0.0,0.209893,0.0,0.0,0.333333,0.0,0.333333,0.0,1.0,0.0
50%,0.230769,0.34375,0.166667,0.191781,0.0,0.0,0.0,0.461538,0.32,0.238882,0.0,0.251337,0.0,0.0,0.333333,0.0,0.333333,0.5,1.0,0.0
75%,0.384615,0.4375,0.333333,0.260274,0.0,0.0,0.076923,0.461538,0.52,0.315121,1.0,0.360963,0.0,0.0,0.333333,0.0,0.666667,0.5,1.0,0.285714
max,1.0,0.84375,1.0,0.945205,0.785714,0.24,0.846154,1.0,0.96,0.993647,1.0,0.998663,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [32]:
X_test.iloc[:,21:40].describe()

Unnamed: 0,A1Cresult_cat,repaglinide_cat,acarbose_cat,glipizide-metformin_cat,diag_1_cat,pioglitazone_cat,age_cat,metformin_cat,rosiglitazone_cat,glyburide_cat,acetohexamide_cat,race_cat,tolazamide_cat,max_glu_serum_cat,change_cat,miglitol_cat,glyburide-metformin_cat,glimepiride_cat
count,11181.0,11181.0,11181.0,11181.0,11181.0,11181.0,11181.0,11181.0,11181.0,11181.0,11181.0,11181.0,11181.0,11181.0,11181.0,11181.0,11181.0,11181.0
mean,0.640193,0.339564,0.000984,0.000179,0.366743,0.356021,0.248278,0.386429,0.352622,0.373431,0.0,0.440636,0.000715,0.660674,0.56274,0.000149,0.334675,0.350595
std,0.167276,0.049354,0.019151,0.013374,0.163248,0.088563,0.304671,0.13238,0.080717,0.125516,0.0,0.18297,0.029899,0.114075,0.49607,0.008339,0.021105,0.081419
min,0.0,0.0,0.0,0.0,0.001416,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.0
25%,0.666667,0.333333,0.0,0.0,0.273371,0.333333,0.0,0.333333,0.333333,0.333333,0.0,0.5,0.0,0.666667,0.0,0.0,0.333333,0.333333
50%,0.666667,0.333333,0.0,0.0,0.322946,0.333333,0.0,0.333333,0.333333,0.333333,0.0,0.5,0.0,0.666667,1.0,0.0,0.333333,0.333333
75%,0.666667,0.333333,0.0,0.0,0.46034,0.333333,0.5,0.333333,0.333333,0.333333,0.0,0.5,0.0,0.666667,1.0,0.0,0.333333,0.333333
max,1.0,1.0,0.666667,1.0,1.011331,1.0,1.0,1.0,1.0,1.0,0.0,1.0,2.0,1.0,1.0,0.666667,0.666667,1.0


In [33]:
y_test.describe()

count    11181.000000
mean         1.399428
std          0.696667
min          0.000000
25%          1.000000
50%          2.000000
75%          2.000000
max          2.000000
Name: readmitted_cat, dtype: float64

<a id='SVM'></a>

### <font color=green>Baseline Categorical</font>

In [34]:
%%time
from sklearn.dummy import DummyClassifier

d_major = DummyClassifier(strategy='most_frequent').fit(X_train, y_train)

print('train accuracy: {:.2f}'.format(d_major.score(X_train, y_train)))
print('test accuracy: {:.2f}'.format(d_major.score(X_test, y_test)))

train accuracy: 0.52
test accuracy: 0.52
CPU times: user 4.26 ms, sys: 11 µs, total: 4.28 ms
Wall time: 3.89 ms


In [35]:
plot_confusion(d_major, 'Categorical Dummy Classifier', X_test, y_test)

<IPython.core.display.Javascript object>

Micro-averaged precision = 0.52 (treat instances equally)
Macro-averaged precision = 0.17 (treat classes equally)
Micro-averaged f1 = 0.52 (treat instances equally)
Macro-averaged f1 = 0.23 (treat classes equally)
             precision    recall  f1-score   support

        <30       0.00      0.00      0.00      1372
        >30       0.00      0.00      0.00      3971
         NO       0.52      1.00      0.69      5838

avg / total       0.27      0.52      0.36     11181



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [36]:
%%time
clf = LinearSVC(verbose=True).fit(X_train, y_train)

print('training accuracy: {:.2f}'.format(clf.score(X_train, y_train)))
print('test accuracy: {:.2f}'.format(clf.score(X_test, y_test)))
plot_confusion(clf, 'Categorical Linear Kernel', X_test, y_test)

[LibLinear]training accuracy: 0.54
test accuracy: 0.54


<IPython.core.display.Javascript object>

Micro-averaged precision = 0.54 (treat instances equally)
Macro-averaged precision = 0.34 (treat classes equally)
Micro-averaged f1 = 0.54 (treat instances equally)
Macro-averaged f1 = 0.31 (treat classes equally)
             precision    recall  f1-score   support

        <30       0.00      0.00      0.00      1372
        >30       0.46      0.18      0.26      3971
         NO       0.55      0.91      0.69      5838

avg / total       0.45      0.54      0.45     11181

CPU times: user 5.09 s, sys: 3.7 ms, total: 5.09 s
Wall time: 4.83 s


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


### <font color=green>SVM optimise over accuracy</font>

In [37]:
%%time
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score

clf = SVC() 

grid_values = [
  {'C': [0.1, 1, 10, 100, 1000], 'kernel':['linear', 'sigmoid']}, 
    {'C': [0.1, 1, 10, 100, 1000], 'gamma': [0.001, 0.01, 0.05, 0.1, 1, 10, 100], 'kernel':['rbf']},
    {'C': [0.1, 1, 10, 100, 1000], 'degree':[2, 3, 4, 5], 'kernel': ['poly']}
 ]

grid_clf_acc = GridSearchCV(clf, param_grid = grid_values, n_jobs= 4)
grid_clf_acc.fit(X_train, y_train)
y_decision_fn_scores_acc = grid_clf_acc.decision_function(X_test) 

print('Grid best parameter (max. accuracy): ', grid_clf_acc.best_params_)
print('Grid best score (accuracy): ', grid_clf_acc.best_score_)

Grid best parameter (max. accuracy):  {'C': 100, 'gamma': 0.05, 'kernel': 'rbf'}
Grid best score (accuracy):  0.542646751930358
CPU times: user 8min 3s, sys: 3.59 s, total: 8min 7s
Wall time: 4h 27min 19s


In [38]:
pd.DataFrame(grid_clf_acc.cv_results_).sort_values(by=['mean_test_score'], ascending=False).iloc[:,2:8]



Unnamed: 0,mean_test_score,mean_train_score,param_C,param_degree,param_gamma,param_kernel
33,0.542647,0.555988,100,,0.05,rbf
34,0.541424,0.579301,100,,0.1,rbf
40,0.540769,0.586173,1000,,0.05,rbf
61,0.540590,0.551426,1000,2,,poly
39,0.539904,0.545449,1000,,0.01,rbf
27,0.538920,0.546582,10,,0.1,rbf
21,0.537847,0.596354,1,,1,rbf
62,0.537847,0.547655,1000,3,,poly
41,0.534657,0.637391,1000,,0.1,rbf
32,0.534269,0.534091,100,,0.01,rbf


In [39]:
plot_confusion(grid_clf_acc, 'Categorical Grid Search', X_test,y_test)

<IPython.core.display.Javascript object>

Micro-averaged precision = 0.54 (treat instances equally)
Macro-averaged precision = 0.35 (treat classes equally)
Micro-averaged f1 = 0.54 (treat instances equally)
Macro-averaged f1 = 0.32 (treat classes equally)
             precision    recall  f1-score   support

        <30       0.00      0.00      0.00      1372
        >30       0.49      0.18      0.27      3971
         NO       0.55      0.92      0.69      5838

avg / total       0.46      0.54      0.45     11181



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [40]:
# precision recall curve only for binary class

<a id='hot'></a>

## <font color=Orange>One Hot</font>

In [41]:
X_train, X_test, y_train, y_test = train_test_split(
    df[X_features], df['readmitted_cat'] , random_state = 0)

In [42]:
X_train.dtypes

time_in_hospital                int64
num_lab_procedures              int64
num_procedures                  int64
num_medications                 int64
number_outpatient               int64
number_emergency                int64
number_inpatient                int64
number_diagnoses                int64
discharge_disposition_id_cat     int8
diag_3_cat                      int16
gender_cat                       int8
diag_2_cat                      int16
chlorpropamide_cat               int8
troglitazone_cat                 int8
glipizide_cat                    int8
nateglinide_cat                  int8
insulin_cat                      int8
admission_source_id_cat          int8
diabetesMed_cat                  int8
admission_type_id_cat            int8
tolbutamide_cat                  int8
A1Cresult_cat                    int8
repaglinide_cat                  int8
acarbose_cat                     int8
glipizide-metformin_cat          int8
diag_1_cat                      int16
pioglitazone

In [43]:
X_train.describe()

Unnamed: 0,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,discharge_disposition_id_cat,diag_3_cat,...,rosiglitazone_cat,glyburide_cat,acetohexamide_cat,race_cat,tolazamide_cat,max_glu_serum_cat,change_cat,miglitol_cat,glyburide-metformin_cat,glimepiride_cat
count,33543.0,33543.0,33543.0,33543.0,33543.0,33543.0,33543.0,33543.0,33543.0,33543.0,...,33543.0,33543.0,33543.0,33543.0,33543.0,33543.0,33543.0,33543.0,33543.0,33543.0
mean,4.680559,43.508392,1.207763,15.842858,0.398951,0.139612,0.614704,7.791223,6.936499,204.747429,...,1.059416,1.120025,3e-05,2.762424,0.000507,1.982679,0.562412,0.000954,1.006589,1.056703
std,3.036529,19.638966,1.621632,7.620044,1.330524,0.505624,1.123154,1.672566,6.830086,130.680163,...,0.246651,0.371884,0.00546,0.733813,0.022507,0.349831,0.496097,0.038597,0.082001,0.256811
min,1.0,1.0,0.0,1.0,0.0,0.0,0.0,3.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2.0,31.0,0.0,11.0,0.0,0.0,0.0,7.0,0.0,92.0,...,1.0,1.0,0.0,3.0,0.0,2.0,0.0,0.0,1.0,1.0
50%,4.0,45.0,1.0,15.0,0.0,0.0,0.0,9.0,8.0,189.0,...,1.0,1.0,0.0,3.0,0.0,2.0,1.0,0.0,1.0,1.0
75%,6.0,58.0,2.0,20.0,0.0,0.0,1.0,9.0,13.0,271.0,...,1.0,1.0,0.0,3.0,0.0,2.0,1.0,0.0,1.0,1.0
max,14.0,129.0,6.0,74.0,42.0,25.0,13.0,16.0,25.0,787.0,...,3.0,3.0,1.0,5.0,1.0,3.0,1.0,3.0,3.0,3.0


In [44]:
X_test.describe()

Unnamed: 0,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,discharge_disposition_id_cat,diag_3_cat,...,rosiglitazone_cat,glyburide_cat,acetohexamide_cat,race_cat,tolazamide_cat,max_glu_serum_cat,change_cat,miglitol_cat,glyburide-metformin_cat,glimepiride_cat
count,11181.0,11181.0,11181.0,11181.0,11181.0,11181.0,11181.0,11181.0,11181.0,11181.0,...,11181.0,11181.0,11181.0,11181.0,11181.0,11181.0,11181.0,11181.0,11181.0,11181.0
mean,4.717288,43.846794,1.206511,15.932206,0.400233,0.136571,0.610947,7.810035,6.959038,201.142116,...,1.057866,1.120293,0.0,2.762544,0.000715,1.982023,0.56274,0.000447,1.004025,1.051784
std,3.022979,19.378839,1.619541,7.663627,1.321623,0.472164,1.076486,1.670391,6.805404,127.692508,...,0.242151,0.376549,0.0,0.73188,0.029899,0.342226,0.49607,0.025018,0.063315,0.244257
min,1.0,1.0,0.0,1.0,0.0,0.0,0.0,3.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
25%,2.0,32.0,0.0,11.0,0.0,0.0,0.0,7.0,0.0,90.0,...,1.0,1.0,0.0,3.0,0.0,2.0,0.0,0.0,1.0,1.0
50%,4.0,45.0,1.0,15.0,0.0,0.0,0.0,9.0,8.0,188.0,...,1.0,1.0,0.0,3.0,0.0,2.0,1.0,0.0,1.0,1.0
75%,6.0,57.0,2.0,20.0,0.0,0.0,1.0,9.0,13.0,248.0,...,1.0,1.0,0.0,3.0,0.0,2.0,1.0,0.0,1.0,1.0
max,14.0,109.0,6.0,70.0,33.0,6.0,11.0,16.0,24.0,782.0,...,3.0,3.0,0.0,5.0,2.0,3.0,1.0,2.0,2.0,3.0


In [45]:
to_num

['time_in_hospital',
 'num_lab_procedures',
 'num_procedures',
 'num_medications',
 'number_outpatient',
 'number_emergency',
 'number_inpatient',
 'number_diagnoses']

### <font color=orange>scale numerical</font>

In [46]:
scaler = MinMaxScaler()
X_train_hot = pd.DataFrame(scaler.fit_transform(X_train[to_num]), columns=to_num)
X_test_hot = pd.DataFrame(scaler.transform(X_test[to_num]), columns = to_num)

In [47]:
from sklearn.preprocessing import OneHotEncoder
hot_features = list(set(X_features) - set(to_num))
hot_features

['A1Cresult_cat',
 'change_cat',
 'nateglinide_cat',
 'tolbutamide_cat',
 'pioglitazone_cat',
 'acetohexamide_cat',
 'acarbose_cat',
 'troglitazone_cat',
 'glipizide_cat',
 'chlorpropamide_cat',
 'insulin_cat',
 'discharge_disposition_id_cat',
 'glyburide_cat',
 'glimepiride_cat',
 'race_cat',
 'admission_type_id_cat',
 'tolazamide_cat',
 'glyburide-metformin_cat',
 'diag_2_cat',
 'max_glu_serum_cat',
 'miglitol_cat',
 'glipizide-metformin_cat',
 'age_cat',
 'gender_cat',
 'diag_3_cat',
 'repaglinide_cat',
 'diabetesMed_cat',
 'rosiglitazone_cat',
 'admission_source_id_cat',
 'diag_1_cat',
 'metformin_cat']

In [48]:
enc = OneHotEncoder()
enc.fit(df[hot_features])
enc.n_values_

array([  4,   2,   4,   2,   4,   2,   4,   2,   4,   4,   4,  26,   4,
         4,   6,   8,   3,   4, 749,   4,   4,   2,  10,   2, 788,   4,
         2,   4,  17, 715,   4])

In [49]:
enc.feature_indices_

array([   0,    4,    6,   10,   12,   16,   18,   22,   24,   28,   32,
         36,   62,   66,   70,   76,   84,   87,   91,  840,  844,  848,
        850,  860,  862, 1650, 1654, 1656, 1660, 1677, 2392, 2396])

### <font color=orange> convert to one hot </font>

In [50]:
X_train_hot = pd.concat([X_train_hot, \
                         pd.DataFrame(enc.transform(X_train[hot_features]).toarray())], axis=1)

X_test_hot = pd.concat([X_test_hot, \
                         pd.DataFrame(enc.transform(X_test[hot_features]).toarray())], axis=1)

In [51]:
X_train_hot.head(2)

Unnamed: 0,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,0,1,...,1858,1859,1860,1861,1862,1863,1864,1865,1866,1867
0,0.230769,0.539062,0.0,0.191781,0.0,0.0,0.076923,0.461538,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.230769,0.554688,0.0,0.232877,0.047619,0.0,0.076923,0.461538,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [52]:
X_test_hot.head(2)

Unnamed: 0,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,0,1,...,1858,1859,1860,1861,1862,1863,1864,1865,1866,1867
0,0.076923,0.414062,0.166667,0.287671,0.0,0.0,0.0,0.307692,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.692308,0.507812,0.833333,0.424658,0.0,0.0,0.0,0.461538,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


### <font color=orange> SVM one hot</font>

In [53]:
%%time
from sklearn.dummy import DummyClassifier

d_major = DummyClassifier(strategy='most_frequent').fit(X_train_hot, y_train)

print('train accuracy: {:.2f}'.format(d_major.score(X_train_hot, y_train)))
print('test accuracy: {:.2f}'.format(d_major.score(X_test_hot, y_test)))
plot_confusion(d_major, 'One Hot Dummy Classifier', X_test_hot, y_test)

train accuracy: 0.52
test accuracy: 0.52


<IPython.core.display.Javascript object>

Micro-averaged precision = 0.52 (treat instances equally)
Macro-averaged precision = 0.17 (treat classes equally)
Micro-averaged f1 = 0.52 (treat instances equally)
Macro-averaged f1 = 0.23 (treat classes equally)
             precision    recall  f1-score   support

        <30       0.00      0.00      0.00      1372
        >30       0.00      0.00      0.00      3971
         NO       0.52      1.00      0.69      5838

avg / total       0.27      0.52      0.36     11181

CPU times: user 224 ms, sys: 337 ms, total: 561 ms
Wall time: 558 ms


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [None]:
%%time
clf = LinearSVC(verbose=True).fit(X_train_hot, y_train)

print('training accuracy: {:.2f}'.format(clf.score(X_train_hot, y_train)))
print('test accuracy: {:.2f}'.format(clf.score(X_test_hot, y_test)))
plot_confusion(clf, 'One Hot Linear Kernel', X_test_hot, y_test )

[LibLinear]training accuracy: 0.59
test accuracy: 0.54




<IPython.core.display.Javascript object>

Micro-averaged precision = 0.54 (treat instances equally)
Macro-averaged precision = 0.43 (treat classes equally)
Micro-averaged f1 = 0.54 (treat instances equally)
Macro-averaged f1 = 0.37 (treat classes equally)
             precision    recall  f1-score   support

        <30       0.24      0.02      0.03      1372
        >30       0.46      0.39      0.42      3971
         NO       0.58      0.77      0.66      5838

avg / total       0.50      0.54      0.50     11181

CPU times: user 18.2 s, sys: 124 ms, total: 18.3 s
Wall time: 17.8 s


### <font color=orange>SVM optimise over accuracy</font>

In [None]:
%%time
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score

clf = SVC() 

grid_values = [
  {'C': [0.1, 1, 10, 100, 1000], 'kernel':['linear', 'sigmoid']}, 
    {'C': [0.1, 1, 10, 100, 1000], 'gamma': [0.001, 0.01, 0.05, 0.1, 1, 10, 100], 'kernel':['rbf']},
    {'C': [0.1, 1, 10, 100, 1000], 'degree':[2, 3, 4, 5], 'kernel': ['poly']}
 ]

grid_clf_acc = GridSearchCV(clf, param_grid = grid_values, n_jobs= 4 )
grid_clf_acc.fit(X_train_hot, y_train)
y_decision_fn_scores_acc = grid_clf_acc.decision_function(X_test_hot) 

print('Grid best parameter (max. accuracy): ', grid_clf_acc.best_params_)
print('Grid best score (accuracy): ', grid_clf_acc.best_score_)

In [None]:
pd.DataFrame(grid_clf_acc.cv_results_).sort_values(by=['mean_test_score'], ascending=False).iloc[:,2:8]

In [None]:
plot_confusion(grid_clf_acc, 'One Hot Grid Search', X_test_hot, y_test)