# Hyperparameter Tuning with SVMs


# The Tools

In [63]:
import pandas as pd


# The Data
Import breast cancer data from UCI Machine Learning Repository: https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+%28Original%29

In [83]:
#import the raw data
raw_data = pd.read_csv("breast-cancer-wisconsin.data", 
                       names = ["id",  "Clump Thickness", "Uniformity of Cell Size","Uniformity of Cell Shape", 
                                  "Marginal Adhesion", "Single Epithelial Cell Size","Bare Nuclei", "Bland Chromatin",
                                  "Normal Nucleoli", "Mitoses", "Class"],na_values=['?'])
print(raw_data.shape)
raw_data.head()

(699, 11)


Unnamed: 0,id,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1.0,3,1,1,2
1,1002945,5,4,4,5,7,10.0,3,2,1,2
2,1015425,3,1,1,1,2,2.0,3,1,1,2
3,1016277,6,8,8,1,3,4.0,3,7,1,2
4,1017023,4,1,1,3,2,1.0,3,1,1,2


In [84]:
raw_data.columns

Index(['id', 'Clump Thickness', 'Uniformity of Cell Size',
       'Uniformity of Cell Shape', 'Marginal Adhesion',
       'Single Epithelial Cell Size', 'Bare Nuclei', 'Bland Chromatin',
       'Normal Nucleoli', 'Mitoses', 'Class'],
      dtype='object')

In [85]:
raw_data.Class.value_counts()

2    458
4    241
Name: Class, dtype: int64

In [86]:
raw_data.dtypes

id                               int64
Clump Thickness                  int64
Uniformity of Cell Size          int64
Uniformity of Cell Shape         int64
Marginal Adhesion                int64
Single Epithelial Cell Size      int64
Bare Nuclei                    float64
Bland Chromatin                  int64
Normal Nucleoli                  int64
Mitoses                          int64
Class                            int64
dtype: object

In [87]:
pd.set_option("display.max_rows",None)

In [88]:
raw_data["Bare Nuclei"].value_counts().to_frame().transpose()

Unnamed: 0,1.0,10.0,5.0,2.0,3.0,8.0,4.0,9.0,7.0,6.0
Bare Nuclei,402,132,30,30,28,21,19,9,8,4


In [55]:
# raw_data["Bare Nuclei"]

In [56]:
# raw_data[raw_data["Bare Nuclei"] == "?"]
# raw_data[raw_data["Bare Nuclei"] != "?"].median()

In [89]:
raw_data.isna().sum()

id                              0
Clump Thickness                 0
Uniformity of Cell Size         0
Uniformity of Cell Shape        0
Marginal Adhesion               0
Single Epithelial Cell Size     0
Bare Nuclei                    16
Bland Chromatin                 0
Normal Nucleoli                 0
Mitoses                         0
Class                           0
dtype: int64

In [28]:
raw_data['Class'].unique()

array([2, 4], dtype=int64)

In [92]:
raw_data["Bare Nuclei"].fillna(1,inplace = True)

In [93]:
raw_data["Bare Nuclei"].value_counts().to_frame().transpose()

Unnamed: 0,1.0,10.0,5.0,2.0,3.0,8.0,4.0,9.0,7.0,6.0
Bare Nuclei,418,132,30,30,28,21,19,9,8,4


In [90]:
# raw_data['Class'].map({4:1, 2:0})

In [94]:
raw_data['Class'] = raw_data['Class'].map({4:1, 2:0})
raw_data.head()

Unnamed: 0,id,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1.0,3,1,1,0
1,1002945,5,4,4,5,7,10.0,3,2,1,0
2,1015425,3,1,1,1,2,2.0,3,1,1,0
3,1016277,6,8,8,1,3,4.0,3,7,1,0
4,1017023,4,1,1,3,2,1.0,3,1,1,0


In [95]:
from sklearn.model_selection import train_test_split

In [101]:
X_train,X_test,y_train,y_test = train_test_split(raw_data.drop("Class",axis = 1),raw_data.Class,test_size=0.3)

In [106]:
from sklearn.svm import SVC
model = SVC(kernel = "linear")

In [107]:
model.fit(X_train,y_train)

SVC(kernel='linear')

In [108]:
model.predict(X_test)

array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)

In [109]:
model.score(X_test, y_test)

0.6619047619047619

In [110]:
model_C = SVC(C=1)
model_C.fit(X_train, y_train)
model_C.score(X_test, y_test)

0.638095238095238

In [95]:
model_C = SVC(C=10)
model_C.fit(X_train, y_train)
model_C.score(X_test, y_test)

0.9781021897810219

In [96]:
model_C

SVC(C=10, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [111]:
model_g = SVC(gamma=10)
model_g.fit(X_train, y_train)
model_g.score(X_test, y_test)

0.6476190476190476

In [112]:
model_linear_kernal = SVC(kernel='linear')
model_linear_kernal.fit(X_train, y_train)
model_linear_kernal.score(X_test, y_test)

0.6619047619047619

# ADVANTAGE

# DISADVANTAGE

In [None]:
SVM algorithm is not suitable for large data sets.
SVM does not perform very well, when the data set has more noise i.e. target classes are overlapping.
In cases where number of features for each data point exceeds the number of training data sample , the SVM will under perform