# 2021 Intro. to Machine Learning 
## Program Assignment #3 - Support Vector Machine


### 0816153 陳琮方

In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

## Data Input

In [2]:
raw_data_path = './Dataset/ionosphere.data'

raw_data = pd.read_csv(raw_data_path, header = None)

print(f'Data Shape = {raw_data.shape}')

Data Shape = (351, 35)


## Data Preprocessing

In [3]:
# Shuffle
raw_data = raw_data.sample(frac=1).reset_index(drop=True)

print(f'Dataset size = {raw_data.shape}\n')

X = raw_data.iloc[:, :-1]
Y = raw_data.iloc[:, -1:]

print(f'X size: {X.shape}')
print(f'Y size: {Y.shape}')

Dataset size = (351, 35)

X size: (351, 34)
Y size: (351, 1)


### Holdout validation

In [4]:
from sklearn import model_selection
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = model_selection.train_test_split(
                                        X, Y, test_size=0.30, shuffle=True, random_state=50)

## Support Vector Machine

### Parameter Search

In [5]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

parameters_lin = { }
parameters_pol = { 'degree' : [2, 5, 8], 'gamma' : [1.0, 3.0], 'coef0' : [0.0, 2.0] }
parameters_gam = { 'gamma' : [1.0, 3.0]}

clf_lin = GridSearchCV(SVC(kernel = 'linear'), parameters_lin, cv = 5)
clf_pol = GridSearchCV(SVC(kernel = 'poly')  , parameters_pol, cv = 5)
clf_rbf = GridSearchCV(SVC(kernel = 'rbf')   , parameters_gam, cv = 5)

clf_lin.fit(X_train, Y_train)
clf_pol.fit(X_train, Y_train)
clf_rbf.fit(X_train, Y_train)

GridSearchCV(cv=5, estimator=SVC(), param_grid={'gamma': [1.0, 3.0]})

#### Linear

In [6]:
df = pd.DataFrame(clf_lin.cv_results_)
df = df[[
    'mean_test_score', 'split0_test_score',
    'split1_test_score', 'split2_test_score', 'split3_test_score', 'split4_test_score', 'rank_test_score'
]].sort_values(by=['rank_test_score'])
df

Unnamed: 0,mean_test_score,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,rank_test_score
0,0.853061,0.877551,0.857143,0.836735,0.816327,0.877551,1


#### Polynomial

In [7]:
df = pd.DataFrame(clf_pol.cv_results_)
df = df[[
    'mean_test_score', 'param_coef0', 'param_degree', 'param_gamma', 'split0_test_score',
    'split1_test_score', 'split2_test_score', 'split3_test_score', 'split4_test_score', 'rank_test_score'
]].sort_values(by=['rank_test_score'])
df

Unnamed: 0,mean_test_score,param_coef0,param_degree,param_gamma,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,rank_test_score
0,0.865306,0.0,2,1.0,0.877551,0.857143,0.857143,0.795918,0.938776,1
6,0.861224,2.0,2,1.0,0.877551,0.836735,0.857143,0.795918,0.938776,2
1,0.840816,0.0,2,3.0,0.77551,0.836735,0.836735,0.857143,0.897959,3
8,0.828571,2.0,5,1.0,0.816327,0.836735,0.816327,0.795918,0.877551,4
7,0.828571,2.0,2,3.0,0.77551,0.836735,0.836735,0.836735,0.857143,5
9,0.820408,2.0,5,3.0,0.795918,0.795918,0.816327,0.816327,0.877551,6
3,0.812245,0.0,5,3.0,0.816327,0.755102,0.816327,0.795918,0.877551,7
10,0.812245,2.0,8,1.0,0.795918,0.755102,0.816327,0.795918,0.897959,7
11,0.812245,2.0,8,3.0,0.795918,0.77551,0.77551,0.795918,0.918367,7
2,0.808163,0.0,5,1.0,0.816327,0.734694,0.816327,0.795918,0.877551,10


#### RBF

In [8]:
df = pd.DataFrame(clf_rbf.cv_results_)
df = df[[
    'mean_test_score', 'param_gamma', 'split0_test_score',
    'split1_test_score', 'split2_test_score', 'split3_test_score', 'split4_test_score', 'rank_test_score'
]].sort_values(by=['rank_test_score'])
df

Unnamed: 0,mean_test_score,param_gamma,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,rank_test_score
0,0.906122,1.0,0.979592,0.897959,0.836735,0.918367,0.897959,1
1,0.885714,3.0,0.938776,0.897959,0.795918,0.897959,0.897959,2


### Result

In [9]:
from sklearn.metrics import accuracy_score, recall_score, confusion_matrix, precision_score

def getResult(y_test, y_pred):
    ret = {}
    
    ret['Accuracy'] = round(accuracy_score(y_test, y_pred), 4) * 100
    ret['Recall'] = np.round(recall_score(y_test, y_pred, average=None), 2)
    ret['Precision'] = np.round(precision_score(y_test, y_pred, average=None), 2)
    ret['Confusion Matrix'] = confusion_matrix(y_test, y_pred)
    
    return ret

In [10]:
result = {}

result['Linear'] = getResult(Y_test, clf_lin.predict(X_test))
result['RBF'] = getResult(Y_test, clf_rbf.predict(X_test))
result['Polynomial'] = getResult(Y_test, clf_pol.predict(X_test))

df = pd.DataFrame.from_dict(result, orient = 'index')
df

Unnamed: 0,Accuracy,Recall,Precision,Confusion Matrix
Linear,90.57,"[0.71, 1.0]","[1.0, 0.88]","[[24, 10], [0, 72]]"
RBF,91.51,"[0.97, 0.89]","[0.8, 0.98]","[[33, 1], [8, 64]]"
Polynomial,91.51,"[0.79, 0.97]","[0.93, 0.91]","[[27, 7], [2, 70]]"


## Question
### Show the average performance of K-fold cross-validation of parameter search in tables for each kernel

The result showed in upper cells which are in "Parameter Search" section.