In [1]:
import numpy as np
import pandas as pd
from sklearn.svm import SVC

In [2]:
# Read drug200.csv into a DataFrame; the bare name on the last line renders it.
df = pd.read_csv(filepath_or_buffer='drug200.csv')
df

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,DrugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC
3,28,F,NORMAL,HIGH,7.798,drugX
4,61,F,LOW,HIGH,18.043,DrugY
...,...,...,...,...,...,...
195,56,F,LOW,HIGH,11.567,drugC
196,16,M,LOW,HIGH,12.006,drugC
197,52,M,NORMAL,HIGH,9.894,drugX
198,23,M,NORMAL,NORMAL,14.020,drugX


In [3]:
from sklearn.model_selection import train_test_split

# Separate the label column from the predictors.
target = 'Drug'
y = df[target]
X = df.drop(columns=[target])

# Hold out 20% of the rows for testing; random_state pins the shuffle so the
# same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=10
)

In [4]:
# Partition the feature names by dtype in a single pass: object ('O') columns
# go to `categorical`, every other column goes to `numerical`.
categorical, numerical = [], []
for name, dtype in X_train.dtypes.items():
    (categorical if dtype == 'O' else numerical).append(name)

In [5]:
# Show which feature names were classified as categorical (object dtype).
categorical

['Sex', 'BP', 'Cholesterol']

In [6]:
# Show which feature names were classified as numerical (non-object dtype).
numerical

['Age', 'Na_to_K']

##### Preprocessing 

In [7]:
# Encoding
import category_encoders as ce

# One-hot encode the categorical columns. The encoder is fitted on the
# training split only, so the category -> indicator-column mapping
# (Sex_1, Sex_2, BP_1, ...) is learned from training data.
encoder = ce.OneHotEncoder(cols = categorical)
X_train = encoder.fit_transform(X_train)

# Apply the already-fitted mapping to the test split. Re-fitting here
# (fit_transform) could assign the numeric suffixes to different categories
# than in the training split, silently scrambling the features the model sees.
X_test = encoder.transform(X_test)

In [8]:
# Preview the first rows of the training features after one-hot encoding.
X_train.head()

Unnamed: 0,Age,Sex_1,Sex_2,BP_1,BP_2,BP_3,Cholesterol_1,Cholesterol_2,Na_to_K
190,58,1,0,1,0,0,1,0,18.991
7,41,1,0,0,1,0,1,0,11.037
26,31,1,0,1,0,0,1,0,30.366
138,51,1,0,1,0,0,0,1,11.343
58,60,1,0,0,0,1,0,1,10.091


In [9]:
# Drop one indicator column per one-hot encoded feature: the dropped column is
# fully determined by the remaining ones, so keeping it only adds redundancy.
to_drop = ['Sex_1','BP_1','Cholesterol_1']
X_train = X_train.drop(columns=to_drop)
X_test = X_test.drop(columns=to_drop)

# Remember the surviving column names for later use.
cols = X_train.columns

###### Feature Scaling

In [10]:
from sklearn.preprocessing import RobustScaler

# Fit the scaler on the training split only, then apply the same fitted
# transformation to both splits so no test-set statistics leak into training.
scaler = RobustScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [11]:
# Re-wrap the scaled arrays as DataFrames so the feature names are kept.
# `cols` is passed as-is: wrapping it in a list (`[cols]`) makes pandas build a
# one-level MultiIndex, turning every column label into a 1-tuple instead of a
# plain string.
X_train = pd.DataFrame(X_train, columns=cols)
X_test = pd.DataFrame(X_test, columns=cols)

In [13]:
# Preview the first rows of the training features after scaling.
X_train.head()

Unnamed: 0,Age,Sex_2,BP_2,BP_3,Cholesterol_2,Na_to_K
0,0.40367,0.0,0.0,0.0,0.0,0.533932
1,-0.220183,0.0,1.0,0.0,0.0,-0.311811
2,-0.587156,0.0,0.0,0.0,0.0,1.743428
3,0.146789,0.0,0.0,0.0,1.0,-0.279274
4,0.477064,0.0,0.0,1.0,1.0,-0.412398


##### Hyperparameters for SVM

| Hyperparameter | Value                     |
|----------------|---------------------------|
| C              | Positive float            |
|                | Default value: 1.0        |
| kernel         | ['linear', 'poly', 'rbf', 'sigmoid'] |
|                | Default value: 'rbf'      |
| gamma          | ['scale', 'auto'] or positive float |
|                | Default value: 'scale'    |
| degree         | Positive integer          |
|                | Default value: 3          |


###### With default Hyperparameters

In [15]:
from sklearn.model_selection import cross_val_score

# Baseline: 5-fold cross-validation of an SVC left at its default
# hyperparameters. Note that `model` holds the per-fold scores, not a fitted
# estimator; the cell displays their mean.
model = cross_val_score(estimator=SVC(), X=X_train, y=y_train, cv=5)
np.mean(model)

0.95625

###### Using grid search library for tuning 

In [18]:
# Candidate values for each SVC hyperparameter explored by the grid search.
kernel_val = ['linear', 'poly', 'rbf', 'sigmoid']
gamma_val = ['scale', 'auto']
degree_val = [2, 3, 4]
C_val = [1, 5, 10, 20]

# The full grid has 4 * 4 * 2 * 3 = 96 combinations -- far too many to try by
# hand, which is why the search is delegated to the grid search API.

In [20]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid for the SVC: every combination of these values is tried.
svc_param_grid = {
    'C': C_val,
    'kernel': kernel_val,
    'degree': degree_val,
    'gamma': gamma_val,
}

# Exhaustive search scored with 5-fold cross-validation; training scores are
# not recorded.
model_tuning = GridSearchCV(
    estimator=SVC(),
    param_grid=svc_param_grid,
    cv=5,
    return_train_score=False,
)
model_tuning.fit(X_train, y_train)

{'mean_fit_time': array([0.00538507, 0.00319405, 0.00359449, 0.00319114, 0.00239291,
        0.00319037, 0.00299139, 0.00299244, 0.00319223, 0.00299225,
        0.00299211, 0.00239339, 0.00259304, 0.00279336, 0.00319138,
        0.00299172, 0.00279264, 0.00299401, 0.0031919 , 0.00299253,
        0.00259299, 0.00279288, 0.00319138, 0.00299296, 0.00259318,
        0.00299182, 0.00299139, 0.0021945 , 0.00279169, 0.00259128,
        0.00299025, 0.00319133, 0.00259318, 0.00279255, 0.0031909 ,
        0.00259252, 0.00259266, 0.00299273, 0.00299201, 0.00299077,
        0.00518312, 0.00378933, 0.00438905, 0.00378928, 0.00239329,
        0.00299239, 0.00338392, 0.00319133, 0.00259295, 0.00279241,
        0.00299187, 0.00259218, 0.00219445, 0.00279379, 0.00319057,
        0.00279284, 0.00279455, 0.00299211, 0.00299191, 0.00279284,
        0.00239348, 0.00299382, 0.00299354, 0.00279269, 0.00279255,
        0.00299211, 0.00319114, 0.00279121, 0.00259299, 0.00279202,
        0.0029912 , 0.00279241,

In [21]:
# Turn the grid search's cv_results_ into a table, one row per parameter
# combination, and display it.
df_results = pd.DataFrame(data=model_tuning.cv_results_)
df_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_degree,param_gamma,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.005385,0.001954,0.002792,1.163032e-03,1,2,scale,linear,"{'C': 1, 'degree': 2, 'gamma': 'scale', 'kerne...",0.93750,0.87500,1.00000,0.93750,0.90625,0.93125,0.041458,43
1,0.003194,0.000407,0.001592,4.936433e-04,1,2,scale,poly,"{'C': 1, 'degree': 2, 'gamma': 'scale', 'kerne...",0.78125,0.81250,0.84375,0.87500,0.93750,0.85000,0.053765,69
2,0.003594,0.000483,0.002190,3.934552e-04,1,2,scale,rbf,"{'C': 1, 'degree': 2, 'gamma': 'scale', 'kerne...",0.96875,0.93750,0.96875,0.96875,0.93750,0.95625,0.015309,7
3,0.003191,0.000399,0.001796,3.986603e-04,1,2,scale,sigmoid,"{'C': 1, 'degree': 2, 'gamma': 'scale', 'kerne...",0.90625,0.84375,0.90625,1.00000,0.78125,0.88750,0.072887,63
4,0.002393,0.000488,0.001596,4.885199e-04,1,2,auto,linear,"{'C': 1, 'degree': 2, 'gamma': 'auto', 'kernel...",0.93750,0.87500,1.00000,0.93750,0.90625,0.93125,0.041458,43
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91,0.002394,0.000488,0.001595,4.881112e-04,20,4,scale,sigmoid,"{'C': 20, 'degree': 4, 'gamma': 'scale', 'kern...",0.78125,0.84375,0.93750,0.84375,0.71875,0.82500,0.072887,75
92,0.002988,0.000631,0.000997,8.609519e-07,20,4,auto,linear,"{'C': 20, 'degree': 4, 'gamma': 'auto', 'kerne...",0.93750,0.93750,1.00000,0.96875,0.96875,0.96250,0.023385,1
93,0.003191,0.000401,0.002194,7.472338e-04,20,4,auto,poly,"{'C': 20, 'degree': 4, 'gamma': 'auto', 'kerne...",0.65625,0.75000,0.75000,0.62500,0.71875,0.70000,0.050775,90
94,0.003588,0.000489,0.001995,1.211955e-06,20,4,auto,rbf,"{'C': 20, 'degree': 4, 'gamma': 'auto', 'kerne...",0.96875,0.93750,1.00000,0.90625,0.96875,0.95625,0.031869,7


In [31]:
# Keep only the searched hyperparameters and their mean CV score, best first.
summary_columns = ['param_C','param_degree','param_gamma','param_kernel','mean_test_score']
tuning_results = (
    df_results
    .loc[:, summary_columns]
    .sort_values(by='mean_test_score', ascending=False)
)

In [32]:
# Tuned parameters: the top row after sorting by mean CV score.
# NOTE(review): the results shown above contain more than one combination with
# the same top score (0.9625), so this row is only one of several tied winners.
tuning_results.head(1)

Unnamed: 0,param_C,param_degree,param_gamma,param_kernel,mean_test_score
76,20,2,auto,linear,0.9625


##### Model Selection

In [33]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [34]:
# Candidate models for model selection. Each entry pairs an unfitted estimator
# ('model') with the hyperparameter grid ('params') to search for it.
model_params = {
    'svm' : {
        'model' : SVC(),
        'params' : {
            'C' : C_val,
            'kernel' : kernel_val,
            'degree' : degree_val,
            'gamma' : gamma_val
        }
    },
    'random_forest' : {
        # Random forests are stochastic (bootstrap samples, random feature
        # subsets); seed the estimator so its CV score is reproducible from
        # run to run. The seed matches the one used for the train/test split.
        'model' : RandomForestClassifier(random_state=10),
        'params' : {
            'n_estimators' : [1,5,10],
            'criterion' : ['gini', 'entropy']
        }
    },
    'log_regression' : {
        'model' : LogisticRegression(),
        'params' : {
            'C' : [1,5,10]
        }
    },
}

In [36]:
# Run a 5-fold grid search for every candidate model and record its best
# cross-validated score together with the winning hyperparameters.
scores = []

for name, spec in model_params.items():
    classifier = GridSearchCV(
        estimator=spec['model'],
        param_grid=spec['params'],
        cv=5,
        return_train_score=False,
    )
    classifier.fit(X_train, y_train)
    summary = dict(
        model=name,
        best_score=classifier.best_score_,
        tuned_params=classifier.best_params_,
    )
    scores.append(summary)

In [37]:
# Tabulate the collected results, one row per candidate model, and display them.
result_columns = ['model', 'best_score', 'tuned_params']
df_all_model = pd.DataFrame(data=scores, columns=result_columns)
df_all_model

Unnamed: 0,model,best_score,tuned_params
0,svm,0.9625,"{'C': 20, 'degree': 2, 'gamma': 'scale', 'kern..."
1,random_forest,0.975,"{'criterion': 'gini', 'n_estimators': 10}"
2,log_regression,0.95,{'C': 5}
