# Support Vector Implementation

In [1]:
# Read data_Set
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

In [2]:
data = pd.read_csv('hepatitis.csv', na_values='?')
data.head()


Unnamed: 0,ID,target,age,gender,steroid,antivirals,fatigue,malaise,anorexia,liverBig,...,spleen,spiders,ascites,varices,bili,alk,sgot,albu,protime,histology
0,1,2,30,2,1.0,2,2.0,2.0,2.0,1.0,...,2.0,2.0,2.0,2.0,1.0,85.0,18.0,4.0,,1
1,2,2,50,1,1.0,2,1.0,2.0,2.0,1.0,...,2.0,2.0,2.0,2.0,0.9,135.0,42.0,3.5,,1
2,3,2,78,1,2.0,2,1.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,0.7,96.0,32.0,4.0,,1
3,4,2,31,1,,1,2.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,0.7,46.0,52.0,4.0,80.0,1
4,5,2,34,1,2.0,2,2.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,1.0,,200.0,4.0,,1


In [3]:
data.shape


(155, 21)

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 155 entries, 0 to 154
Data columns (total 21 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   ID          155 non-null    int64  
 1   target      155 non-null    int64  
 2   age         155 non-null    int64  
 3   gender      155 non-null    int64  
 4   steroid     154 non-null    float64
 5   antivirals  155 non-null    int64  
 6   fatigue     154 non-null    float64
 7   malaise     154 non-null    float64
 8   anorexia    154 non-null    float64
 9   liverBig    145 non-null    float64
 10  liverFirm   144 non-null    float64
 11  spleen      150 non-null    float64
 12  spiders     150 non-null    float64
 13  ascites     150 non-null    float64
 14  varices     150 non-null    float64
 15  bili        149 non-null    float64
 16  alk         126 non-null    float64
 17  sgot        151 non-null    float64
 18  albu        139 non-null    float64
 19  protime     88 non-null     f

In [5]:
data.isna().sum()

ID             0
target         0
age            0
gender         0
steroid        1
antivirals     0
fatigue        1
malaise        1
anorexia       1
liverBig      10
liverFirm     11
spleen         5
spiders        5
ascites        5
varices        5
bili           6
alk           29
sgot           4
albu          16
protime       67
histology      0
dtype: int64

In [6]:
data.drop(['ID'], axis=1, inplace=True)

#### type casting

In [7]:
cat_cols = data.columns[data.nunique()<5]
cat_cols

Index(['target', 'gender', 'steroid', 'antivirals', 'fatigue', 'malaise',
       'anorexia', 'liverBig', 'liverFirm', 'spleen', 'spiders', 'ascites',
       'varices', 'histology'],
      dtype='object')

In [8]:
num_cols = data.columns[data.nunique() >= 5]
num_cols

Index(['age', 'bili', 'alk', 'sgot', 'albu', 'protime'], dtype='object')

In [9]:
data[cat_cols] = data[cat_cols].astype("category")

#### Independent and Target variable split

In [10]:
X, y = data.drop(['target'], axis=1), data['target']
X.head()

Unnamed: 0,age,gender,steroid,antivirals,fatigue,malaise,anorexia,liverBig,liverFirm,spleen,spiders,ascites,varices,bili,alk,sgot,albu,protime,histology
0,30,2,1.0,2,2.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,1.0,85.0,18.0,4.0,,1
1,50,1,1.0,2,1.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,0.9,135.0,42.0,3.5,,1
2,78,1,2.0,2,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,0.7,96.0,32.0,4.0,,1
3,31,1,,1,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,0.7,46.0,52.0,4.0,80.0,1
4,34,1,2.0,2,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,,200.0,4.0,,1


In [11]:
y.shape

(155,)

#### Train and test data split

In [12]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
# X_train = pd.DataFrame(X_train)
# X_test = pd.DataFrame(X_test)
# y_train = pd.DataFrame(y_train)
# y_test = pd.DataFrame(y_test)
y_train.value_counts()

2    99
1    25
Name: target, dtype: int64

In [14]:
y_test.value_counts()

2    24
1     7
Name: target, dtype: int64

#### Impute categorical missing data

In [15]:
cat_cols = cat_cols.drop('target')
cat_imputer = SimpleImputer(strategy='most_frequent')
cat_imputer.fit(x_train[cat_cols])
train_cat_df = cat_imputer.transform(x_train[cat_cols])
train_cat_df = pd.DataFrame(train_cat_df, columns=cat_cols)
train_cat_df.isna().sum()

gender        0
steroid       0
antivirals    0
fatigue       0
malaise       0
anorexia      0
liverBig      0
liverFirm     0
spleen        0
spiders       0
ascites       0
varices       0
histology     0
dtype: int64

In [16]:
train_cat_df.head()

Unnamed: 0,gender,steroid,antivirals,fatigue,malaise,anorexia,liverBig,liverFirm,spleen,spiders,ascites,varices,histology
0,1,1,2,1,1,2,2,1,2,1,2,2,2
1,1,2,2,2,2,2,2,2,1,2,2,2,2
2,1,1,2,2,2,2,2,2,2,2,2,2,1
3,1,1,2,1,1,2,2,2,2,2,1,2,2
4,1,2,2,2,2,2,2,2,2,2,2,2,1


#### Impute numerical missing data

In [17]:
num_imputer = KNNImputer()
num_imputer.fit(x_train[num_cols])
train_num_df = num_imputer.transform(x_train[num_cols])
train_num_df = pd.DataFrame(train_num_df, columns=num_cols)
train_num_df.isna().sum()

age        0
bili       0
alk        0
sgot       0
albu       0
protime    0
dtype: int64

In [18]:
#impute missing values in test data
test_cat_df = cat_imputer.transform(x_test[cat_cols])
test_cat_df = pd.DataFrame(test_cat_df, columns=cat_cols)
test_cat_df.isna().sum()

gender        0
steroid       0
antivirals    0
fatigue       0
malaise       0
anorexia      0
liverBig      0
liverFirm     0
spleen        0
spiders       0
ascites       0
varices       0
histology     0
dtype: int64

In [19]:
test_num_df = num_imputer.transform(x_test[num_cols])
test_num_df = pd.DataFrame(test_num_df, columns=num_cols)
test_num_df.isna().sum()

age        0
bili       0
alk        0
sgot       0
albu       0
protime    0
dtype: int64

#### Concat num + cat features

In [20]:
imputed_train = pd.concat([train_cat_df, train_num_df], axis=1)
imputed_train.head()

Unnamed: 0,gender,steroid,antivirals,fatigue,malaise,anorexia,liverBig,liverFirm,spleen,spiders,ascites,varices,histology,age,bili,alk,sgot,albu,protime
0,1,1,2,1,1,2,2,1,2,1,2,2,2,30.0,0.8,147.0,128.0,3.9,100.0
1,1,2,2,2,2,2,2,2,1,2,2,2,2,42.0,1.5,85.0,40.0,4.16,82.4
2,1,1,2,2,2,2,2,2,2,2,2,2,1,30.0,0.7,100.0,31.0,4.0,100.0
3,1,1,2,1,1,2,2,2,2,2,1,2,2,33.0,0.7,63.0,80.0,3.0,31.0
4,1,2,2,2,2,2,2,2,2,2,2,2,1,27.0,0.8,82.8,38.0,4.2,79.4


In [21]:
imputed_test = pd.concat([test_cat_df, test_num_df], axis=1)
imputed_test.head()

Unnamed: 0,gender,steroid,antivirals,fatigue,malaise,anorexia,liverBig,liverFirm,spleen,spiders,ascites,varices,histology,age,bili,alk,sgot,albu,protime
0,1,2,2,2,2,2,2,2,2,2,2,2,1,36.0,0.7,62.0,224.0,4.2,100.0
1,1,2,2,1,2,2,2,1,1,1,2,1,2,51.0,1.0,82.8,20.0,3.0,63.0
2,1,1,2,1,1,2,2,2,2,2,2,2,1,62.0,1.0,143.6,60.0,3.22,42.2
3,1,1,1,1,1,2,2,2,2,2,2,2,1,51.0,1.0,78.0,58.0,4.6,52.0
4,1,1,2,1,1,2,2,2,2,1,2,2,2,61.0,1.32,103.0,56.0,3.7,55.6


In [22]:
dummy_imputed_train = pd.get_dummies(imputed_train, drop_first=True)
dummy_imputed_test = pd.get_dummies(imputed_test, drop_first=True)

In [23]:
dummy_imputed_train

Unnamed: 0,age,bili,alk,sgot,albu,protime,gender_2,steroid_2.0,antivirals_2,fatigue_2.0,malaise_2.0,anorexia_2.0,liverBig_2.0,liverFirm_2.0,spleen_2.0,spiders_2.0,ascites_2.0,varices_2.0,histology_2
0,30.0,0.80,147.0,128.0,3.90,100.0,0,0,1,0,0,1,1,0,1,0,1,1,1
1,42.0,1.50,85.0,40.0,4.16,82.4,0,1,1,1,1,1,1,1,0,1,1,1,1
2,30.0,0.70,100.0,31.0,4.00,100.0,0,0,1,1,1,1,1,1,1,1,1,1,0
3,33.0,0.70,63.0,80.0,3.00,31.0,0,0,1,0,0,1,1,1,1,1,0,1,1
4,27.0,0.80,82.8,38.0,4.20,79.4,0,1,1,1,1,1,1,1,1,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119,34.0,2.80,127.0,182.0,3.80,50.0,0,0,1,0,0,1,0,0,1,0,1,1,0
120,35.0,1.50,138.0,58.0,2.60,56.0,0,0,1,0,1,1,1,1,0,0,0,1,1
121,47.0,2.08,83.0,60.0,3.70,41.2,0,0,0,1,1,1,1,1,1,1,1,1,0
122,33.0,1.00,158.4,60.0,4.00,64.6,0,1,1,1,1,1,1,1,1,1,1,1,1


In [24]:
dummy_imputed_test

Unnamed: 0,age,bili,alk,sgot,albu,protime,gender_2,steroid_2.0,antivirals_2,fatigue_2.0,malaise_2.0,anorexia_2.0,liverBig_2.0,liverFirm_2.0,spleen_2.0,spiders_2.0,ascites_2.0,varices_2.0,histology_2
0,36.0,0.7,62.0,224.0,4.2,100.0,0,1,1,1,1,1,1,1,1,1,1,1,0
1,51.0,1.0,82.8,20.0,3.0,63.0,0,1,1,0,1,1,1,0,0,0,1,0,1
2,62.0,1.0,143.6,60.0,3.22,42.2,0,0,1,0,0,1,1,1,1,1,1,1,0
3,51.0,1.0,78.0,58.0,4.6,52.0,0,0,0,0,0,1,1,1,1,1,1,1,0
4,61.0,1.32,103.0,56.0,3.7,55.6,0,0,1,0,0,1,1,1,1,0,1,1,1
5,37.0,0.7,26.0,58.0,4.5,100.0,0,1,1,1,1,1,1,1,1,1,1,1,0
6,52.0,1.5,104.0,69.0,2.9,43.0,0,0,1,1,1,1,1,1,1,1,1,1,1
7,70.0,1.7,109.0,528.0,2.8,35.0,0,0,1,0,0,0,1,1,1,1,1,1,1
8,53.0,1.5,81.0,19.0,4.1,48.0,1,0,1,0,1,1,1,1,0,0,1,0,1
9,39.0,1.0,34.0,15.0,4.0,54.0,0,1,1,0,1,1,1,1,1,1,1,1,0


#### scale data

In [25]:
dummy_imputed_train[num_cols].head()

Unnamed: 0,age,bili,alk,sgot,albu,protime
0,30.0,0.8,147.0,128.0,3.9,100.0
1,42.0,1.5,85.0,40.0,4.16,82.4
2,30.0,0.7,100.0,31.0,4.0,100.0
3,33.0,0.7,63.0,80.0,3.0,31.0
4,27.0,0.8,82.8,38.0,4.2,79.4


In [26]:
scaler = StandardScaler()
scaler.fit(x_train[num_cols])
dummy_imputed_train[num_cols] = scaler.transform(dummy_imputed_train[num_cols]) 
dummy_imputed_train[num_cols].head()

Unnamed: 0,age,bili,alk,sgot,albu,protime
0,-0.844602,-0.554052,0.81099,0.482862,0.112699,1.521833
1,0.090493,0.039168,-0.467988,-0.515102,0.509344,0.794304
2,-0.844602,-0.638798,-0.158558,-0.617167,0.265255,1.521833
3,-0.610829,-0.638798,-0.921819,-0.061482,-1.260303,-1.330411
4,-1.078376,-0.554052,-0.513371,-0.537783,0.570366,0.670293


In [27]:
dummy_imputed_test[num_cols].head()

Unnamed: 0,age,bili,alk,sgot,albu,protime
0,36.0,0.7,62.0,224.0,4.2,100.0
1,51.0,1.0,82.8,20.0,3.0,63.0
2,62.0,1.0,143.6,60.0,3.22,42.2
3,51.0,1.0,78.0,58.0,4.6,52.0
4,61.0,1.32,103.0,56.0,3.7,55.6


In [28]:
dummy_imputed_test[num_cols] = scaler.transform(dummy_imputed_test[num_cols]) 
dummy_imputed_test[num_cols].head()

Unnamed: 0,age,bili,alk,sgot,albu,protime
0,-0.377055,-0.638798,-0.942448,1.571549,0.570366,1.521833
1,0.791815,-0.38456,-0.513371,-0.741912,-1.260303,-0.007631
2,1.648986,-0.38456,0.740853,-0.288292,-0.92468,-0.867438
3,0.791815,-0.38456,-0.612389,-0.310973,1.180589,-0.462337
4,1.571061,-0.113374,-0.096672,-0.333654,-0.192413,-0.313524


#### MODEL BUILDING

In [29]:
# C (default=1.0) 
# kernel(default=’rbf’)
# degree (default=3)
# gamma {‘scale’, ‘auto’}
svm_param = {'C':range(1,5), 'kernel':['linear','ploy','rbf'], 'degree':range(1,4), 'gamma':[0.1,1,2,3]}
model1 = SVC()
gridserachcv = GridSearchCV(model1, svm_param, verbose=10, n_jobs=-1)
gridserachcv.fit(dummy_imputed_train, y_train)

Fitting 5 folds for each of 144 candidates, totalling 720 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1903s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done  48 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done  61 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0678s.) Setting batch_size=4.
[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done 106 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0747s.) Setting batch_size=8.
[Parallel(n_jobs=-1)]: Done 152 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done 224 tasks      | elapsed: 

GridSearchCV(cv=None, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'C': range(1, 5), 'degree': range(1, 4),
                         'gamma': [0.1, 1, 2, 3],
                         'kernel': ['linear', 'ploy', 'rbf']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=10)

#### MODEL EVALUATION

In [30]:
gridserachcv.best_estimator_

SVC(C=1, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=1, gamma=0.1, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [31]:
gridserachcv.best_params_

{'C': 1, 'degree': 1, 'gamma': 0.1, 'kernel': 'rbf'}

In [32]:
y_train_pred = gridserachcv.predict(dummy_imputed_train)
y_test_pred = gridserachcv.predict(dummy_imputed_test)
print(classification_report(y_train,y_train_pred))

              precision    recall  f1-score   support

           1       1.00      0.76      0.86        25
           2       0.94      1.00      0.97        99

    accuracy                           0.95       124
   macro avg       0.97      0.88      0.92       124
weighted avg       0.95      0.95      0.95       124



In [33]:
print(f'TRAIN DATA ACCURACY: {accuracy_score(y_train,y_train_pred)}')

TRAIN DATA ACCURACY: 0.9516129032258065


In [34]:
print(f'TEST DATA ACCURACY: {accuracy_score(y_test, y_test_pred)}')

TEST DATA ACCURACY: 0.7741935483870968
