## <font color=blue>Diabetes dataset<font>

In [1]:
%matplotlib notebook
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, precision_score, recall_score, \
f1_score, confusion_matrix, classification_report
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from tqdm import tqdm

# df = pd.read_csv("diabetic_data.csv", dtype='category').iloc[:500,2:]
df = pd.read_csv("diabetic_data.csv", dtype='category').iloc[:,2:]
df.shape

(101766, 48)

In [2]:
df.head(2)

Unnamed: 0,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,payer_code,medical_specialty,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,Caucasian,Female,[0-10),?,6,25,1,1,?,Pediatrics-Endocrinology,...,No,No,No,No,No,No,No,No,No,NO
1,Caucasian,Female,[10-20),?,1,1,7,3,?,?,...,No,Up,No,No,No,No,No,Ch,Yes,>30


In [3]:
df.describe()

Unnamed: 0,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,payer_code,medical_specialty,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
count,101766,101766,101766,101766,101766,101766,101766,101766,101766,101766,...,101766,101766,101766,101766,101766,101766,101766,101766,101766,101766
unique,6,3,10,10,8,26,17,14,18,73,...,1,4,4,2,2,2,2,2,2,3
top,Caucasian,Female,[70-80),?,1,1,7,3,?,?,...,No,No,No,No,No,No,No,No,Yes,NO
freq,76099,54708,26068,98569,53990,60234,57494,17756,40256,49949,...,101766,47383,101060,101753,101765,101764,101765,54755,78363,54864


In [4]:
def show_unique(dataF):
    for c in dataF.columns:
        print(c, dataF[c].unique())
        print('*'*50)

show_unique(df)

race [Caucasian, AfricanAmerican, ?, Other, Asian, Hispanic]
Categories (6, object): [Caucasian, AfricanAmerican, ?, Other, Asian, Hispanic]
**************************************************
gender [Female, Male, Unknown/Invalid]
Categories (3, object): [Female, Male, Unknown/Invalid]
**************************************************
age [[0-10), [10-20), [20-30), [30-40), [40-50), [50-60), [60-70), [70-80), [80-90), [90-100)]
Categories (10, object): [[0-10), [10-20), [20-30), [30-40), ..., [60-70), [70-80), [80-90), [90-100)]
**************************************************
weight [?, [75-100), [50-75), [0-25), [100-125), [25-50), [125-150), [175-200), [150-175), >200]
Categories (10, object): [?, [75-100), [50-75), [0-25), ..., [125-150), [175-200), [150-175), >200]
**************************************************
admission_type_id [6, 1, 2, 3, 4, 5, 8, 7]
Categories (8, object): [6, 1, 2, 3, 4, 5, 8, 7]
**************************************************
discharge_disposition_

### <font color =blue>1. remove columns with missing data</font>

In [5]:
df = df.drop(columns=['weight', 'payer_code', 'medical_specialty'])
df.head(2)

Unnamed: 0,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,Caucasian,Female,[0-10),6,25,1,1,41,0,1,...,No,No,No,No,No,No,No,No,No,NO
1,Caucasian,Female,[10-20),1,1,7,3,59,0,18,...,No,Up,No,No,No,No,No,Ch,Yes,>30


### <font color=blue> 2. remove incomplete columns and rows </font>

In [6]:
# drop missing values
df = df.replace('?', np.nan)
df = df.replace('Unknown/Invalid', np.nan)
df = df.dropna()
df.shape

(98052, 45)

In [7]:
df = df.drop(columns= df.columns[df.nunique() == 1])

In [8]:
show_unique(df)

race [Caucasian, AfricanAmerican, Other, Asian, Hispanic]
Categories (5, object): [Caucasian, AfricanAmerican, Other, Asian, Hispanic]
**************************************************
gender [Female, Male]
Categories (2, object): [Female, Male]
**************************************************
age [[10-20), [20-30), [30-40), [40-50), [50-60), [60-70), [70-80), [80-90), [90-100), [0-10)]
Categories (10, object): [[10-20), [20-30), [30-40), [40-50), ..., [70-80), [80-90), [90-100), [0-10)]
**************************************************
admission_type_id [1, 2, 3, 6, 4, 5, 8, 7]
Categories (8, object): [1, 2, 3, 6, 4, 5, 8, 7]
**************************************************
discharge_disposition_id [1, 3, 6, 2, 5, ..., 15, 24, 28, 19, 27]
Length: 26
Categories (26, object): [1, 3, 6, 2, ..., 24, 28, 19, 27]
**************************************************
admission_source_id [7, 2, 4, 1, 5, ..., 10, 22, 11, 25, 13]
Length: 17
Categories (17, object): [7, 2, 4, 1, ..., 22, 11, 

### <font color = blue>3. categorical variables</font>

In [9]:
to_num = ['time_in_hospital', 'num_lab_procedures', 'num_procedures',
         'num_medications', 'number_outpatient', 'number_emergency',
         'number_inpatient', 'number_diagnoses']

to_cat_codes = list(set(df.columns) - set(to_num))

In [10]:
X_features = list(to_num)
for c in to_cat_codes:
    df[c+'_cat'] = df[c].cat.codes        
    X_features += [c+'_cat']
    
X_features.remove('readmitted_cat')
X_features

['time_in_hospital',
 'num_lab_procedures',
 'num_procedures',
 'num_medications',
 'number_outpatient',
 'number_emergency',
 'number_inpatient',
 'number_diagnoses',
 'diag_3_cat',
 'A1Cresult_cat',
 'max_glu_serum_cat',
 'admission_source_id_cat',
 'admission_type_id_cat',
 'chlorpropamide_cat',
 'insulin_cat',
 'diag_2_cat',
 'glipizide_cat',
 'metformin_cat',
 'change_cat',
 'acetohexamide_cat',
 'glimepiride-pioglitazone_cat',
 'race_cat',
 'age_cat',
 'glyburide-metformin_cat',
 'diabetesMed_cat',
 'tolazamide_cat',
 'diag_1_cat',
 'discharge_disposition_id_cat',
 'glyburide_cat',
 'troglitazone_cat',
 'tolbutamide_cat',
 'rosiglitazone_cat',
 'gender_cat',
 'miglitol_cat',
 'glipizide-metformin_cat',
 'metformin-pioglitazone_cat',
 'pioglitazone_cat',
 'repaglinide_cat',
 'nateglinide_cat',
 'acarbose_cat',
 'glimepiride_cat']

In [11]:
df['readmitted'].head(11)

1     >30
2      NO
3      NO
4      NO
5     >30
6      NO
7     >30
8      NO
9      NO
10    >30
11    <30
Name: readmitted, dtype: category
Categories (3, object): [<30, >30, NO]

In [12]:
df['readmitted_cat'].head(11)

1     1
2     2
3     2
4     2
5     1
6     2
7     1
8     2
9     2
10    1
11    0
Name: readmitted_cat, dtype: int8

#### <font color=red>Target Mapping: < 30 = 0, >30 = 1, NO = 2, </font>

### <font color=blue>4. split</font>

In [13]:
for n in to_num:
    df[n] = df[n].astype('int')

df[to_num].dtypes

time_in_hospital      int64
num_lab_procedures    int64
num_procedures        int64
num_medications       int64
number_outpatient     int64
number_emergency      int64
number_inpatient      int64
number_diagnoses      int64
dtype: object

In [14]:
X_train, X_test, y_train, y_test = train_test_split(
    df[X_features], df['readmitted_cat'] , random_state = 0)

In [15]:
X_train.head(2)

Unnamed: 0,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,diag_3_cat,A1Cresult_cat,...,rosiglitazone_cat,gender_cat,miglitol_cat,glipizide-metformin_cat,metformin-pioglitazone_cat,pioglitazone_cat,repaglinide_cat,nateglinide_cat,acarbose_cat,glimepiride_cat
71878,5,50,1,23,0,0,0,9,23,2,...,1,0,0,0,0,1,1,0,0,1
58959,7,70,2,32,0,0,3,9,189,2,...,1,1,0,0,0,1,1,0,0,1


In [16]:
y_train.head(2)

71878    0
58959    2
Name: readmitted_cat, dtype: int8

In [17]:
X_test.head(2)

Unnamed: 0,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,diag_3_cat,A1Cresult_cat,...,rosiglitazone_cat,gender_cat,miglitol_cat,glipizide-metformin_cat,metformin-pioglitazone_cat,pioglitazone_cat,repaglinide_cat,nateglinide_cat,acarbose_cat,glimepiride_cat
101706,3,66,1,18,0,0,1,9,221,0,...,1,1,0,0,0,2,1,0,0,1
73363,2,4,0,11,1,0,1,7,46,2,...,1,1,0,0,0,1,1,0,0,1


In [18]:
y_test.head(2)

101706    1
73363     1
Name: readmitted_cat, dtype: int8

### <font color=blue>5. min max scaling</font>

In [19]:
X_train.dtypes

time_in_hospital                int64
num_lab_procedures              int64
num_procedures                  int64
num_medications                 int64
number_outpatient               int64
number_emergency                int64
number_inpatient                int64
number_diagnoses                int64
diag_3_cat                      int16
A1Cresult_cat                    int8
max_glu_serum_cat                int8
admission_source_id_cat          int8
admission_type_id_cat            int8
chlorpropamide_cat               int8
insulin_cat                      int8
diag_2_cat                      int16
glipizide_cat                    int8
metformin_cat                    int8
change_cat                       int8
acetohexamide_cat                int8
glimepiride-pioglitazone_cat     int8
race_cat                         int8
age_cat                          int8
glyburide-metformin_cat          int8
diabetesMed_cat                  int8
tolazamide_cat                   int8
diag_1_cat  

#### before scale

In [20]:
X_train.describe()

Unnamed: 0,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,diag_3_cat,A1Cresult_cat,...,rosiglitazone_cat,gender_cat,miglitol_cat,glipizide-metformin_cat,metformin-pioglitazone_cat,pioglitazone_cat,repaglinide_cat,nateglinide_cat,acarbose_cat,glimepiride_cat
count,73539.0,73539.0,73539.0,73539.0,73539.0,73539.0,73539.0,73539.0,73539.0,73539.0,...,73539.0,73539.0,73539.0,73539.0,73539.0,73539.0,73539.0,73539.0,73539.0,73539.0
mean,4.427678,43.155305,1.348998,16.0989,0.372496,0.201988,0.646229,7.509974,202.153225,1.895783,...,1.06349,0.46185,0.000422,0.000122,1.4e-05,1.071663,1.015801,0.007792,0.003155,1.050232
std,3.000109,19.748888,1.704406,8.0961,1.254258,0.930706,1.264435,1.833319,137.354758,0.517859,...,0.254166,0.498546,0.024178,0.011062,0.003688,0.271041,0.13697,0.09634,0.059148,0.240641
min,1.0,1.0,0.0,1.0,0.0,0.0,0.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2.0,31.0,0.0,11.0,0.0,0.0,0.0,6.0,90.0,2.0,...,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
50%,4.0,44.0,1.0,15.0,0.0,0.0,0.0,8.0,182.0,2.0,...,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
75%,6.0,57.0,2.0,20.0,0.0,0.0,1.0,9.0,273.0,2.0,...,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
max,14.0,129.0,6.0,79.0,42.0,76.0,21.0,16.0,789.0,3.0,...,3.0,1.0,3.0,1.0,1.0,3.0,3.0,3.0,3.0,3.0


In [21]:
X_test.describe()

Unnamed: 0,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,diag_3_cat,A1Cresult_cat,...,rosiglitazone_cat,gender_cat,miglitol_cat,glipizide-metformin_cat,metformin-pioglitazone_cat,pioglitazone_cat,repaglinide_cat,nateglinide_cat,acarbose_cat,glimepiride_cat
count,24513.0,24513.0,24513.0,24513.0,24513.0,24513.0,24513.0,24513.0,24513.0,24513.0,...,24513.0,24513.0,24513.0,24513.0,24513.0,24513.0,24513.0,24513.0,24513.0,24513.0
mean,4.40501,43.127932,1.355852,16.181659,0.388039,0.203892,0.648799,7.51846,202.719455,1.89683,...,1.061314,0.459144,0.000571,0.000163,0.0,1.074409,1.01542,0.007139,0.003386,1.050585
std,2.971848,19.600328,1.720647,8.145418,1.366935,0.978576,1.290618,1.829952,136.473911,0.51638,...,0.251371,0.498338,0.029953,0.012773,0.0,0.275634,0.134005,0.094684,0.062163,0.241135
min,1.0,1.0,0.0,1.0,0.0,0.0,0.0,3.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2.0,32.0,0.0,11.0,0.0,0.0,0.0,6.0,90.0,2.0,...,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
50%,4.0,44.0,1.0,15.0,0.0,0.0,0.0,8.0,182.0,2.0,...,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
75%,6.0,57.0,2.0,20.0,0.0,0.0,1.0,9.0,272.0,2.0,...,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
max,14.0,132.0,6.0,81.0,40.0,63.0,16.0,16.0,784.0,3.0,...,3.0,1.0,3.0,1.0,0.0,3.0,3.0,3.0,3.0,3.0


In [22]:
scaler = MinMaxScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_features)
X_test = pd.DataFrame(scaler.transform(X_test), columns = X_features)

#### after scale

In [23]:
X_train.iloc[:,:20].describe()

Unnamed: 0,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,diag_3_cat,A1Cresult_cat,max_glu_serum_cat,admission_source_id_cat,admission_type_id_cat,chlorpropamide_cat,insulin_cat,diag_2_cat,glipizide_cat,metformin_cat,change_cat,acetohexamide_cat
count,73539.0,73539.0,73539.0,73539.0,73539.0,73539.0,73539.0,73539.0,73539.0,73539.0,73539.0,73539.0,73539.0,73539.0,73539.0,73539.0,73539.0,73539.0,73539.0,73539.0
mean,0.263668,0.329338,0.224833,0.193576,0.008869,0.002658,0.030773,0.346921,0.256214,0.631928,0.661318,0.333784,0.146428,0.333637,0.466406,0.267631,0.373838,0.398374,0.54004,0.0
std,0.230778,0.154288,0.284068,0.103796,0.029863,0.012246,0.060211,0.141025,0.174087,0.17262,0.104091,0.246454,0.205807,0.010921,0.278901,0.152373,0.121723,0.144979,0.498398,0.0
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.076923,0.234375,0.0,0.128205,0.0,0.0,0.0,0.230769,0.114068,0.666667,0.666667,0.0,0.0,0.333333,0.333333,0.13253,0.333333,0.333333,0.0,0.0
50%,0.230769,0.335938,0.166667,0.179487,0.0,0.0,0.0,0.384615,0.230672,0.666667,0.666667,0.533333,0.0,0.333333,0.333333,0.251673,0.333333,0.333333,1.0,0.0
75%,0.384615,0.4375,0.333333,0.24359,0.0,0.0,0.047619,0.461538,0.346008,0.666667,0.666667,0.533333,0.285714,0.333333,0.666667,0.368139,0.333333,0.333333,1.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0


In [24]:
X_train.iloc[:,21:40].describe()

Unnamed: 0,race_cat,age_cat,glyburide-metformin_cat,diabetesMed_cat,tolazamide_cat,diag_1_cat,discharge_disposition_id_cat,glyburide_cat,troglitazone_cat,tolbutamide_cat,rosiglitazone_cat,gender_cat,miglitol_cat,glipizide-metformin_cat,metformin-pioglitazone_cat,pioglitazone_cat,repaglinide_cat,nateglinide_cat,acarbose_cat
count,73539.0,73539.0,73539.0,73539.0,73539.0,73539.0,73539.0,73539.0,73539.0,73539.0,73539.0,73539.0,73539.0,73539.0,73539.0,73539.0,73539.0,73539.0,73539.0
mean,0.414369,0.680076,0.335591,0.766355,0.00034,0.356752,0.210068,0.367279,2.7e-05,0.000231,0.354497,0.46185,0.000141,0.000122,1.4e-05,0.357221,0.3386,0.002597,0.001052
std,0.215786,0.173289,0.027776,0.423152,0.018435,0.170712,0.267686,0.114944,0.005215,0.015203,0.084722,0.498546,0.008059,0.011062,0.003688,0.090347,0.045657,0.032113,0.019716
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.5,0.555556,0.333333,1.0,0.0,0.265363,0.0,0.333333,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.333333,0.333333,0.0,0.0
50%,0.5,0.666667,0.333333,1.0,0.0,0.318436,0.0,0.333333,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.333333,0.333333,0.0,0.0
75%,0.5,0.777778,0.333333,1.0,0.0,0.488827,0.52,0.333333,0.0,0.0,0.333333,1.0,0.0,0.0,0.0,0.333333,0.333333,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [25]:
y_train.describe()

count    73539.000000
mean         1.423068
std          0.684500
min          0.000000
25%          1.000000
50%          2.000000
75%          2.000000
max          2.000000
Name: readmitted_cat, dtype: float64

In [26]:
X_test.shape

(24513, 41)

In [27]:
X_test.iloc[:,:20].describe()

Unnamed: 0,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,diag_3_cat,A1Cresult_cat,max_glu_serum_cat,admission_source_id_cat,admission_type_id_cat,chlorpropamide_cat,insulin_cat,diag_2_cat,glipizide_cat,metformin_cat,change_cat,acetohexamide_cat
count,24513.0,24513.0,24513.0,24513.0,24513.0,24513.0,24513.0,24513.0,24513.0,24513.0,24513.0,24513.0,24513.0,24513.0,24513.0,24513.0,24513.0,24513.0,24513.0,24513.0
mean,0.261924,0.329124,0.225975,0.194637,0.009239,0.002683,0.030895,0.347574,0.256932,0.632277,0.661051,0.335291,0.145188,0.333605,0.46884,0.268786,0.374359,0.398754,0.532779,4.1e-05
std,0.228604,0.153128,0.286775,0.104428,0.032546,0.012876,0.061458,0.140766,0.172971,0.172127,0.103387,0.246326,0.204768,0.009983,0.282084,0.153605,0.121854,0.145039,0.498935,0.006387
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001267,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.076923,0.242188,0.0,0.128205,0.0,0.0,0.0,0.230769,0.114068,0.666667,0.666667,0.0,0.0,0.333333,0.333333,0.13253,0.333333,0.333333,0.0,0.0
50%,0.230769,0.335938,0.166667,0.179487,0.0,0.0,0.0,0.384615,0.230672,0.666667,0.666667,0.533333,0.0,0.333333,0.333333,0.251673,0.333333,0.333333,1.0,0.0
75%,0.384615,0.4375,0.333333,0.24359,0.0,0.0,0.047619,0.461538,0.34474,0.666667,0.666667,0.533333,0.285714,0.333333,0.666667,0.369478,0.333333,0.333333,1.0,0.0
max,1.0,1.023438,1.0,1.025641,0.952381,0.828947,0.761905,1.0,0.993663,1.0,1.0,1.066667,1.0,1.0,1.0,1.001339,1.0,1.0,1.0,1.0


In [28]:
X_test.iloc[:,21:40].describe()

Unnamed: 0,race_cat,age_cat,glyburide-metformin_cat,diabetesMed_cat,tolazamide_cat,diag_1_cat,discharge_disposition_id_cat,glyburide_cat,troglitazone_cat,tolbutamide_cat,rosiglitazone_cat,gender_cat,miglitol_cat,glipizide-metformin_cat,metformin-pioglitazone_cat,pioglitazone_cat,repaglinide_cat,nateglinide_cat,acarbose_cat
count,24513.0,24513.0,24513.0,24513.0,24513.0,24513.0,24513.0,24513.0,24513.0,24513.0,24513.0,24513.0,24513.0,24513.0,24513.0,24513.0,24513.0,24513.0,24513.0
mean,0.415881,0.68245,0.335659,0.774813,0.00053,0.357147,0.208241,0.367152,4.1e-05,0.000204,0.353771,0.459144,0.00019,0.000163,0.0,0.358136,0.338473,0.00238,0.001129
std,0.213487,0.17305,0.027907,0.417714,0.024732,0.170499,0.266952,0.114152,0.006387,0.014281,0.08379,0.498338,0.009984,0.012773,0.0,0.091878,0.044668,0.031561,0.020721
min,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.5,0.555556,0.333333,1.0,0.0,0.265363,0.0,0.333333,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.333333,0.333333,0.0,0.0
50%,0.5,0.666667,0.333333,1.0,0.0,0.314246,0.0,0.333333,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.333333,0.333333,0.0,0.0
75%,0.5,0.777778,0.333333,1.0,0.0,0.488827,0.52,0.333333,0.0,0.0,0.333333,1.0,0.0,0.0,0.0,0.333333,0.333333,0.0,0.0
max,1.0,1.0,1.0,1.0,2.0,0.998603,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0


In [29]:
y_test.describe()

count    24513.000000
mean         1.414433
std          0.687174
min          0.000000
25%          1.000000
50%          2.000000
75%          2.000000
max          2.000000
Name: readmitted_cat, dtype: float64

## <font color=green> Baseline</font>

In [43]:
%%time
from sklearn.dummy import DummyClassifier

d_major = DummyClassifier(strategy='most_frequent').fit(X_train, y_train)

print(s, 'train accuracy: {:.2f}'.format(d_major.score(X_train, y_train)))
print(s, 'test accuracy: {:.2f}'.format(d_major.score(X_test, y_test)))

uniform train accuracy: 0.54
uniform test accuracy: 0.53
CPU times: user 8.01 ms, sys: 0 ns, total: 8.01 ms
Wall time: 7.23 ms


In [44]:
def plot_confusion(clf):
    svm_predicted_mc = clf.predict(X_test)
    confusion_mc = confusion_matrix(y_test, svm_predicted_mc)
    df_cm = pd.DataFrame(confusion_mc, 
                         index = [i for i in range(0,3)], columns = [i for i in range(0,3)])

    plt.figure(figsize=(6,4))
    ax_ticks= ['<30', '>30', 'NO']
    sns.heatmap(df_cm, annot=True, xticklabels=ax_ticks, yticklabels=ax_ticks)
    plt.title('SVM Linear Kernel \nAccuracy:{0:.3f}'.format(accuracy_score(y_test, 
                                                                           svm_predicted_mc)))
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    
    print('Micro-averaged precision = {:.2f} (treat instances equally)'
      .format(precision_score(y_test, svm_predicted_mc, average = 'micro')))
    print('Macro-averaged precision = {:.2f} (treat classes equally)'
      .format(precision_score(y_test, svm_predicted_mc, average = 'macro')))
    print('Micro-averaged f1 = {:.2f} (treat instances equally)'
          .format(f1_score(y_test, svm_predicted_mc, average = 'micro')))
    print('Macro-averaged f1 = {:.2f} (treat classes equally)'
          .format(f1_score(y_test, svm_predicted_mc, average = 'macro')))
    print(classification_report(y_test, svm_predicted_mc))

plot_confusion(d_major)

<IPython.core.display.Javascript object>

Micro-averaged precision = 0.53 (treat instances equally)
Macro-averaged precision = 0.18 (treat classes equally)
Micro-averaged f1 = 0.53 (treat instances equally)
Macro-averaged f1 = 0.23 (treat classes equally)
             precision    recall  f1-score   support

          0       0.00      0.00      0.00      2813
          1       0.00      0.00      0.00      8728
          2       0.53      1.00      0.69     12972

avg / total       0.28      0.53      0.37     24513



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [32]:
%%time
clf = LinearSVC(verbose=True).fit(X_train, y_train)

print('training accuracy: {:.2f}'.format(clf.score(X_train, y_train)))
print('test accuracy: {:.2f}'.format(clf.score(X_test, y_test)))
plot_confusion(clf)

[LibLinear]training accuracy: 0.56
test accuracy: 0.56


<IPython.core.display.Javascript object>

Micro-averaged precision = 0.56 (treat instances equally)
Macro-averaged precision = 0.35 (treat classes equally)
Micro-averaged f1 = 0.56 (treat instances equally)
Macro-averaged f1 = 0.33 (treat classes equally)
             precision    recall  f1-score   support

          0       0.00      0.00      0.00      2813
          1       0.49      0.20      0.28      8728
          2       0.57      0.92      0.70     12972

avg / total       0.48      0.56      0.47     24513

CPU times: user 11.6 s, sys: 11.7 ms, total: 11.6 s
Wall time: 11.4 s


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


## <font color=green>SVM</font>

### <font color=green>optimise over accuracy</font>

In [33]:
%%time
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score

clf = LinearSVC() 

grid_values = [
  {'C': [0.1, 1, 10, 100, 1000]}  
 ]

grid_clf_acc = GridSearchCV(clf, param_grid = grid_values, n_jobs=4)
# grid_clf_acc.fit(X_train, np.ravel(y_train))
grid_clf_acc.fit(X_train, y_train)
y_decision_fn_scores_acc = grid_clf_acc.decision_function(X_test) 

print('Grid best parameter (max. accuracy): ', grid_clf_acc.best_params_)
print('Grid best score (accuracy): ', grid_clf_acc.best_score_)

Grid best parameter (max. accuracy):  {'C': 10}
Grid best score (accuracy):  0.5620283115081793
CPU times: user 45.3 s, sys: 60 ms, total: 45.3 s
Wall time: 3min 1s


In [34]:
pd.DataFrame(grid_clf_acc.cv_results_).sort_values(by=['mean_test_score'], ascending=False).iloc[:,2:7]



Unnamed: 0,mean_test_score,mean_train_score,param_C,params,rank_test_score
2,0.562028,0.562606,10.0,{'C': 10},1
1,0.561879,0.562463,1.0,{'C': 1},2
0,0.560859,0.561539,0.1,{'C': 0.1},3
3,0.545955,0.545649,100.0,{'C': 100},4
4,0.453732,0.452754,1000.0,{'C': 1000},5


In [35]:
plot_confusion(grid_clf_acc)

<IPython.core.display.Javascript object>

Micro-averaged precision = 0.56 (treat instances equally)
Macro-averaged precision = 0.35 (treat classes equally)
Micro-averaged f1 = 0.56 (treat instances equally)
Macro-averaged f1 = 0.33 (treat classes equally)
             precision    recall  f1-score   support

          0       0.00      0.00      0.00      2813
          1       0.49      0.20      0.29      8728
          2       0.57      0.92      0.70     12972

avg / total       0.48      0.56      0.47     24513



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [36]:
y_decision_fn_scores_acc 

array([[-0.76085345, -0.15130482, -0.09013007],
       [-0.77797906, -0.27883622,  0.05921927],
       [-0.59202842, -0.22086706, -0.21302769],
       ...,
       [-0.77159068, -0.29713496,  0.06893687],
       [-0.82292563, -0.43837659,  0.24311493],
       [-0.77348539, -0.16794414, -0.04252339]])

In [37]:
# precision recall curve only for binary class