In [None]:
## In this notebook, we use scikit-learn to train an SVM classifier on the breast cancer (Wisconsin Diagnostic) classification dataset:
"https://archive-beta.ics.uci.edu/ml/datasets/breast+cancer+wisconsin+diagnostic"

In [38]:
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn import datasets
from sklearn.svm import SVC


In [47]:
# Load the breast cancer dataset shipped with scikit-learn and separate it
# into the feature matrix (X) and the class labels (Y).

breast_cancer_dataset = datasets.load_breast_cancer()
X, Y = breast_cancer_dataset.data, breast_cancer_dataset.target

# X is a 2-D array: one row per datapoint, one column per feature.
n_datapoints, n_features = X.shape

print("The number of all datapoints is ", n_datapoints)

print("The number of features per datapoint is ", n_features)

# Hold-out split: the first 500 datapoints train the model, the remainder test it.

X_train, Y_train = X[:500], Y[:500]
X_test, Y_test = X[500:], Y[500:]

print("The number of training datapoints is ", len(X_train))
print("The number of testing  datapoints is ", len(X_test))

The number of all datapoints is  569
The number of features per datapoint is  30
The number of training datapoints is  500
The number of testing  datapoints is  69


In [48]:
# Build the classifier: features are standardized first, then fed to a
# linear-kernel SVM. Other kernel options are listed at
# https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
linear_svm = SVC(kernel="linear", verbose=True)
classification_model = make_pipeline(StandardScaler(), linear_svm)

# Train on the training split. fit() is kept as the cell's last expression,
# so the fitted pipeline is displayed as the cell output.
classification_model.fit(X_train, Y_train)



[LibSVM]

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('svc', SVC(kernel='linear', verbose=True))])

In [49]:
# Evaluate the fitted model by computing its classification accuracy on the
# held-out test dataset.

# Predict all test labels in one batched call rather than calling predict()
# once per datapoint inside a Python loop.
predicted_labels = classification_model.predict(X_test)

# CA_cnt is the number of test datapoints whose predicted label equals the
# true label. It is kept as a plain int so the percentage below is computed
# with ordinary Python arithmetic.
CA_cnt = int((predicted_labels == Y_test).sum())

print("The classification accuracy of the test dataset is {}%".format(100*CA_cnt/len(X_test)))


The classificaiton accuracy of the test dataset is 97.10144927536231%


Homework (for practice only): Use the "decision_function_shape" option of the SVC class to perform SVM classification on the iris dataset. You can load the data in the same manner as we did for the breast cancer dataset in this notebook. Hint: Consult the documentation at "https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html" and select the one-vs-all (OVA) method; in scikit-learn, it is called one-vs-rest ("ovr"). Note that this option only changes the shape of the decision function that is returned; internally, SVC always trains one-vs-one classifiers for multi-class problems.