To analyze the SVM approach to the classification task we used the model provided by **scikit-learn** (https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html#sklearn.svm.SVC). The implementation is based on the library **libsvm** (https://www.csie.ntu.edu.tw/~cjlin/libsvm/). 

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import RepeatedStratifiedKFold, GridSearchCV
from sklearn.svm import SVC

In [None]:
# return a number rounded to a fixed number of significant digits

from math import log10 , floor

def round_it(x, sig):
    '''
    Round a number to a fixed number of significant digits
    Parameters
    ---
    x : float
        Number to round
    sig : int
        Number of significant digits to keep
    Returns
    ---
    x rounded to sig significant digits; zero is returned unchanged
    '''
    if x == 0:
        # log10(0) is undefined, and zero needs no rounding anyway
        return x
    # position of the leading digit decides how many decimals to keep
    return round(x, sig - int(floor(log10(abs(x)))) - 1)

In [None]:
def open_monks(path):
    '''
    Function to open monks datasets
    Parameters
    ---
    path : str
        It's the path of the file
    Returns
    ---
    monks_df : pandas DataFrame
        the df that contains the dataset: first column renamed to 'target',
        last column renamed to 'id', columns in between renamed to 0, 1, 2, ...
    
    '''

    # the context manager closes the file even if reading fails
    with open(path, 'r') as file:
        content = file.read().split('\n')  # split to separate different data

    # Blank lines are skipped explicitly, so a file without a trailing newline
    # does not lose its last record. [1:] drops the first space-separated token
    # (presumably the empty string caused by a leading space - verify against the data files).
    rows = [line.split(' ')[1:] for line in content if line.strip()]
    monks_df = pd.DataFrame(rows)  # creation of the df using separation by ' '

    # The lines below change names to the columns
    last = monks_df.shape[1] - 1
    dict_for_rename = {0: 'target', last: 'id'}
    dict_for_rename.update({k: k - 1 for k in range(1, last)})
    return monks_df.rename(columns=dict_for_rename)

In [None]:
# load the train/test files of the three MONK problems (train first, then test)

monks1_train, monks1_test = map(open_monks, ('MONK/monks-1.train', 'MONK/monks-1.test'))

monks2_train, monks2_test = map(open_monks, ('MONK/monks-2.train', 'MONK/monks-2.test'))

monks3_train, monks3_test = map(open_monks, ('MONK/monks-3.train', 'MONK/monks-3.test'))

In [None]:
def hot_encoding(df):
    '''
    One-hot encode the categorical attributes of a monks dataframe
    Parameters
    ---
    df: pandas DataFrame
        first column is the target, the following six columns are the attributes
    Return
    ---
    X_hot: ndarray of input encoded data
    y: array of target data
    '''

    # attributes live in column positions 1..6; every one of them is encoded
    categorical = df[df.columns[1:7]]
    encoded = pd.get_dummies(categorical, columns=categorical.columns)

    # targets are taken from the first column and converted to int
    labels = np.array(df[df.columns[0]].values, dtype=int)

    return encoded.values, labels

def data_preparation(df):
    '''
    Split a monks dataframe into integer input and target arrays (no encoding)
    Parameters
    ---
    df: pandas DataFrame
        first column is the target, the following six columns are the attributes
    Return
    ---
    X: ndarray of input data
    y: array of target data
    '''

    cols = df.columns

    # attributes: column positions 1..6, cast to int
    X = df[cols[1:7]].values.astype(int)

    # targets: first column, converted to int
    y = np.array(df[cols[0]].values, dtype=int)

    return X, y

### First approach

- we decided to explore the following kernels: ***rbf***, ***polynomial***, ***sigmoid*** and ***linear***. It's important to specify that for Monk 1, Monk 2 and Monk 3 we used the same procedure, in particular in terms of model selection and assessment; 
- we used a repeated stratified k-fold CV for model selection, in particular n=10 repetitions and k=5 folds;
- for model assessment we use a hold-out test set;
- Although the scikit-learn library prefers encoded datasets (https://scikit-learn.org/stable/modules/svm.html#tips-on-practical-use on **1.4.5. Tips on Practical Use** paragraph), we also applied to the non-encoded data the same procedure used for the encoded data. Performance on encoded data is better (in terms of test accuracy) for Monk 1 and Monk 2, but the same as on non-encoded data for Monk 3;

### Monk 1: data preparation

In [None]:
# Both views of Monk 1 are laid out as [X_design, y_design, X_test, y_test]:
# design data is used for model selection, test data for model assessment
_enc_1 = [*hot_encoding(monks1_train), *hot_encoding(monks1_test)]  # with one-hot encoding
_raw_1 = [*data_preparation(monks1_train), *data_preparation(monks1_test)]  # no one-hot encoding

X_design_enc_1, y_design_enc_1, X_test_enc_1, y_test_enc_1 = _enc_1
X_design_1, y_design_1, X_test_1, y_test_1 = _raw_1

# number of samples for Monk1 dataset
N_1 = len(X_design_1)

dataset_1 = [_enc_1, _raw_1]

### Monk 2: data preparation

In [None]:
# Both views of Monk 2 are laid out as [X_design, y_design, X_test, y_test]
_enc_2 = [*hot_encoding(monks2_train), *hot_encoding(monks2_test)]  # with one-hot encoding
_raw_2 = [*data_preparation(monks2_train), *data_preparation(monks2_test)]  # no one-hot encoding

X_design_enc_2, y_design_enc_2, X_test_enc_2, y_test_enc_2 = _enc_2
X_design_2, y_design_2, X_test_2, y_test_2 = _raw_2

# number of samples for Monk2 dataset
N_2 = len(X_design_2)

dataset_2 = [_enc_2, _raw_2]

### Monk 3: data preparation

In [None]:
# Both views of Monk 3 are laid out as [X_design, y_design, X_test, y_test]
_enc_3 = [*hot_encoding(monks3_train), *hot_encoding(monks3_test)]  # with one-hot encoding
_raw_3 = [*data_preparation(monks3_train), *data_preparation(monks3_test)]  # no one-hot encoding

X_design_enc_3, y_design_enc_3, X_test_enc_3, y_test_enc_3 = _enc_3
X_design_3, y_design_3, X_test_3, y_test_3 = _raw_3

# number of samples for Monk3 dataset
N_3 = len(X_design_3)


dataset_3 = [_enc_3, _raw_3]

In [None]:
# first index: choice of Monk to consider -> Monk 1:0, Monk 2:1, Monk 3:2;
# second index: choice of encoded(0) and non-encoded(1) dataset;
# third index: choice of array -> 0: design inputs, 1: design targets, 2: test inputs, 3: test targets;
dataset = [dataset_1, dataset_2, dataset_3]

# number of design samples of each Monk, addressed with the same first index as `dataset`
N=[N_1, N_2, N_3]


Choose which Monk you want to analyze

In [None]:
# zero-based index into `dataset` and `N`: 0 -> Monk 1, 1 -> Monk 2, 2 -> Monk 3
monk = 2
# after you change this value re-run the cells below: the grid searches read `monk` when they are called

**Parameters of SVC:**
-  'C': float, default=1.0
- 'kernel': {‘linear’, ‘poly’, ‘rbf’, ‘sigmoid’, ‘precomputed’} or callable, default=’rbf’
- 'degree': int, default=3
- 'gamma': {‘scale’, ‘auto’} or float, default=’scale’
- 'coef0': float, default=0.0
- 'tol': float, default=1e-3
- 'shrinking': bool, default=True            
- 'max_iter': int, default=-1 (it means no limit)

The **shrinking** parameter enables a heuristic that reduces the training time with respect to the base algorithm (shrinking=False) while leading to only small changes in the solution, as theoretically shown in libsvm. For the Monk datasets we set shrinking=False because we didn't observe significant differences (it will be very useful for the Machine Learning cup). 

Assumption: other parameters like 'tol' and 'max_iter' are fixed at their default values.

In [None]:
# Hyperparameter grid of the first approach: one search spanning several kernels.
# It is bound to `param_grid` because that is the name read by the GridSearchCV
# call of this approach (the grid is not rbf-specific).
param_grid = {
    'C': [1e-1, 1, 10],
    'kernel': ['poly', 'rbf', 'sigmoid'],
    'gamma': [1e-3, 1e-2, 1e-1],
    'degree': [2, 3, 4, 6],
    'coef0': [0, 1e-2, 1e-1],
    'shrinking': [False],
}


In [None]:
i_ = 0  # 0: encoded data, 1: no-encoded data

# exhaustive search scored with 5-fold stratified CV repeated 10 times;
# refit=True retrains the best configuration on the whole design set
grid = GridSearchCV(
    SVC(),
    param_grid=param_grid,
    cv=RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state=0),
    n_jobs=-1,
    refit=True,
)

# the first two entries of the selected dataset are the design inputs and targets
grid.fit(*dataset[monk][i_][:2])

In [None]:
# useful dataframes for next analysis

# results of the multi-kernel search fitted above (`grid`)
a = pd.DataFrame(grid.cv_results_)

# configurations tied for the best rank
b = a[a['rank_test_score'] == 1]

### Different approach

The following code allows us to explore separately each kernel we analyzed previously (i.e. a different grid search for each kernel). Moreover we further study the linear kernel. This new approach allows us to avoid useless searches: each kernel ignores some of the available hyperparameters, which makes a joint search redundant. Studying each kernel separately leads to a deeper analysis of the different kernels and of the SVC in general. For example, we took into account the fraction of support vectors, with the goal of choosing a model that guarantees good efficiency in terms of data needed to get a prediction. 

Assumption: we restrict our analysis to encoded dataset.

### Model selection and assessment

In [None]:
def my_grid_search(param_grid, i):
    '''
    Run a grid search for an SVC on the design data of the selected Monk
    Parameters
    ---
    param_grid : dict
        hyperparameter grid passed to GridSearchCV
    i : int
        second index of `dataset` (0: encoded data, 1: no-encoded data)
    Returns
    ---
    the fitted GridSearchCV object

    The Monk is chosen by the global `monk`; the data come from the global `dataset`.
    '''

    X_design, y_design = dataset[monk][i][:2]

    folds = RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state=0)
    search = GridSearchCV(
        SVC(),
        param_grid=param_grid,
        cv=folds,
        n_jobs=-1,
        refit=True,
        return_train_score=True,
    )

    # fit returns the estimator itself, so the fitted search is what gets returned
    return search.fit(X_design, y_design)

### RBF kernel

In [None]:
# 1) choice of hyperpar.'s ranges to explore
# both ranges hold 9 log-spaced values, one per decade from 1e-4 to 1e4

C_interval_rbf = np.logspace(-4, 4, 9)
gamma_interval_rbf = np.logspace(-4, 4, 9)

In [None]:
%%time
# 3) grid search

i = 0  # 0: encoded data, 1: no-encoded data

# rbf-only grid: C and gamma are the only hyperparameters that vary
param_grid_rbf = dict(
    C=C_interval_rbf,
    kernel=['rbf'],
    gamma=gamma_interval_rbf,
    shrinking=[False],
)

grid_rbf = my_grid_search(param_grid_rbf, i)  # grid search already fitted

In [None]:
# 4) useful dataframes for next analysis

# a_rbf: one row per grid point; b_rbf: only the rows ranked first
a_rbf = pd.DataFrame(grid_rbf.cv_results_)
b_rbf = a_rbf.loc[a_rbf['rank_test_score'].eq(1)]

In [None]:
# useful code for tasks 5) and 7)

def heatmap_rbf(a, score_var):
    '''
    Draw the heatmap of a score over the (C, gamma) grid
    Parameters
    ---
    a : pandas DataFrame
        must contain the columns 'param_C', 'param_gamma' and score_var
    score_var : str
        name of the column shown in the cells
    '''

    k = a[['param_C', 'param_gamma', score_var]]
    # keyword arguments: DataFrame.pivot does not accept them positionally in pandas >= 2.0
    glue = k.pivot(index='param_C', columns='param_gamma', values=score_var)
    sns.heatmap(glue, cmap="crest", linewidth=.5, annot=True) #annot can be removed if numbers in the cells are bulky
    plt.tight_layout()
    plt.show()

In [None]:
# 5) visualization of rbf kernel's grid search (by heatmap): C vs gamma (REALLY INTERESTING!)
# the cells show 'mean_test_score', i.e. the validation score of each grid point

heatmap_rbf(a_rbf, 'mean_test_score')

In [None]:
# useful code for following tasks: 6) and 9)

def plot_rbf(a, b, fix_1, var_1, score_var):
    '''
    Plot score_var against var_1, one figure per value of fix_1 found among the best models
    Parameters
    ---
    a : pandas DataFrame
        all the models (blue points and line)
    b : pandas DataFrame
        best models (red points); its unique fix_1 values decide which figures are drawn
    fix_1 : str
        column kept fixed in each figure
    var_1 : str
        column on the x axis (log scale)
    score_var : str
        column on the y axis
    '''
    for j, value in enumerate(np.unique(b[fix_1])):
        # select once the rows of both frames that share the fixed value
        rows_a = a[a[fix_1] == value]
        rows_b = b[b[fix_1] == value]

        plt.figure(j)
        plt.plot(rows_a[var_1], rows_a[score_var])
        plt.scatter(rows_a[var_1], rows_a[score_var], color='b')
        plt.scatter(rows_b[var_1], rows_b[score_var], label='best models', color='r')

        plt.title('{}={}'.format(fix_1, round_it(value, 3)))
        plt.xlabel(var_1)
        plt.ylabel(score_var)
        plt.xscale('log')
        plt.legend()  # without it the 'best models' label is never displayed
        plt.grid()

    plt.show()

In [None]:
# 6) validation score vs C: one figure for each gamma value that appears among the best models

score_var='mean_test_score'
fix_1='param_gamma'
var_1='param_C'

plot_rbf(a_rbf, b_rbf, fix_1, var_1, score_var)

We can see that the models with the best validation accuracy show the expected relationship between C and gamma in order to
avoid underfitting/overfitting:
- high gamma requires low C (to counter possible overfitting)
- low gamma requires high C (to counter possible underfitting)
- moreover, from this grid search visualization it is immediately clear which hyperparameter subspace is worth exploring further
- For Monk 3: mean number of support vectors among the whole grid (a_rbf) and among the best models (b_rbf) according to validation accuracy; in particular we look at sv_a_1 and sv_b_2:
    - sv_a_1 =110 +-26
    - sv_b_1 = 66 +- 19
    
    - sv_a_2 = 79+-40
    - sv_b_2 = 56+-20 

It seems that the best models don't have the lowest number of support vectors...
Furthermore, for this kernel we can observe this behaviour better from the heatmap of the number of support vectors (see analysis 8)).


In [None]:
# code for support vector analysis

def sv_analysis_rbf(df):
    '''
    Refit one rbf SVC for each row of df and record its fraction of support vectors
    Parameters
    ---
    df : pandas DataFrame
        must contain the columns 'param_C' and 'param_gamma'
    Returns
    ---
    sv_matrix : pandas DataFrame
        columns 'param_C', 'param_gamma', 'fsv' (support vectors / N[monk]), same row order as df
    model_list : list
        the fitted SVC models, same order as df

    The design data are read from the globals `dataset`, `monk` and `i`.
    '''
    X_design, y_design = dataset[monk][i][0], dataset[monk][i][1]
    model_list = []
    rows = []

    for C, gamma in df[['param_C', 'param_gamma']].values:
        svc = SVC(C=C, kernel='rbf', gamma=gamma, shrinking=False)
        svc.fit(X_design, y_design)

        # fraction of design samples kept as support vectors
        rows.append([C, gamma, svc.n_support_.sum() / N[monk]])
        model_list.append(svc)

    sv_matrix = pd.DataFrame(rows, columns=['param_C', 'param_gamma', 'fsv'], dtype=float)
    return sv_matrix, model_list

In [None]:
# 7) support vectors analysis for each cell of the grid search **by heatmap**
# ([0] keeps only the dataframe with the fraction of support vectors, not the fitted models)

sv_a = sv_analysis_rbf(a_rbf)[0]

heatmap_rbf(sv_a, 'fsv')


In [None]:
# 8) among the best models (with the best validation accuracy) we select the one with the lowest
# fraction of support vectors; sv_b and b_rbf share the same row order, so one position indexes both

sv_b, model_list_b = sv_analysis_rbf(b_rbf)
best_pos = np.argmin(sv_b['fsv'])
rbf_final = model_list_b[best_pos]
n_sv_final = rbf_final.n_support_.sum()

print('best model chosen:\n{}'.format(rbf_final))
print('number of support vectors:\n{}'.format(n_sv_final))
print('fraction of support vectors:\n{}'.format(round_it(n_sv_final/N[monk], 2)))

val_acc = b_rbf.iloc[best_pos]['mean_test_score']
test_acc = rbf_final.score(dataset[monk][i][2], dataset[monk][i][3])
print('validation accuracy:\n{}'.format(round_it(val_acc, 3)) )
print('test accuracy:\n{}'.format(round_it(test_acc, 3)) )


In [None]:
# 9) fraction of support vectors vs C, one figure for each gamma value among the best models

sv_a, sv_b = sv_analysis_rbf(a_rbf)[0], sv_analysis_rbf(b_rbf)[0]
# 'fsv' is the only score column produced by sv_analysis_rbf
score_var='fsv'
fix_1='param_gamma'
var_1='param_C'

plot_rbf(sv_a, sv_b, fix_1, var_1, score_var)

### Polynomial kernel

In [None]:
# 1) choice of hyperpar.'s ranges to explore

C_interval_poly = np.logspace(-3, 1, 5)  # 1e-3 ... 1e1, one value per decade
gamma_interval_poly = np.logspace(-1, 3, 5)  # 1e-1 ... 1e3, one value per decade
degree_interval = np.array(range(2, 8, 2))  # even degrees only: 2, 4, 6
coef0_interval_poly = - np.logspace(-3, 3, 7)  # negative values: -1e-3 ... -1e3

For Monk 3, setting odd values for the degree creates a lot of problems. For this reason we explored only even values (with good performance).

In [None]:
# 2)

# polynomial-only grid: C, gamma, degree and coef0 vary
# NOTE(review): 'shrinking' is [True] here, while the other per-kernel grids and
# sv_analysis_poly use False - confirm this difference is intended
param_grid_poly= {
    'C': C_interval_poly, 
    
    'kernel': ['poly'],    
    
    'gamma': gamma_interval_poly, 
    'degree': degree_interval,    
    'coef0':  coef0_interval_poly,  
    'shrinking': [True]
}

In [None]:
%%time
# 3) grid search

i = 0  # 0: encoded data, 1: no-encoded data (second index of `dataset`)
grid_poly = my_grid_search(param_grid_poly, i) # grid search already fitted

In [None]:
# 4) useful dataframes for next analysis

# a_poly: one row per grid point; b_poly: only the rows ranked first
a_poly = pd.DataFrame(grid_poly.cv_results_)
b_poly = a_poly.loc[a_poly['rank_test_score'].eq(1)]

In [None]:
# useful code for tasks 6) and 8.A)

def heatmap_poly(a, b, fix_1, fix_2, var_1, var_2, score_var):
    '''
    Draw one heatmap of score_var over (var_1, var_2) for each distinct pair
    of (fix_1, fix_2) values found among the best models
    Parameters
    ---
    a : pandas DataFrame
        all the models; supplies the cells of each heatmap
    b : pandas DataFrame
        best models; its distinct (fix_1, fix_2) pairs decide which heatmaps are drawn
    fix_1, fix_2 : str
        columns kept fixed in each heatmap
    var_1, var_2 : str
        columns used as rows and columns of the heatmap
    score_var : str
        column shown in the cells
    '''

    # distinct (fix_1, fix_2) pairs; astype(None) converts the values to float
    X = np.unique(b[[fix_1, fix_2]].values.astype(None), axis=0)

    for j, (value_1, value_2) in enumerate(X):
        plt.figure(j)

        matrix = a[(a[fix_1] == value_1) & (a[fix_2] == value_2)]
        matrix = matrix[[var_1, var_2, score_var]]

        # keyword arguments: DataFrame.pivot does not accept them positionally in pandas >= 2.0
        glue = matrix.pivot(index=var_1, columns=var_2, values=score_var)
        sns.heatmap(glue, cmap="crest", linewidth=.5)
        plt.title('{}={}, {}={}'.format(fix_1, value_1, fix_2, value_2))

    plt.show()


In [None]:
# 6) varying 2 hyp. and fixing the others (at the values for the best validation_accuracy):
# validation score over (C, gamma), one heatmap per (coef0, degree) pair among the best models

score_var='mean_test_score'

fix_1= 'param_coef0'
fix_2= 'param_degree'

var_1= 'param_C'
var_2= 'param_gamma'

heatmap_poly(a_poly, b_poly, fix_1, fix_2, var_1, var_2, score_var)

In [None]:
# code for support vectors analysis: 
# 7) using b_poly, choice of the model with lowest fraction of support vector
# 8) using a_poly, number of support vector for each model included in the grid search

def sv_analysis_poly(df):
    '''
    Refit one polynomial SVC for each row of df and record its fraction of support vectors
    Parameters
    ---
    df : pandas DataFrame
        must contain the columns 'param_C', 'param_gamma', 'param_degree' and 'param_coef0'
    Returns
    ---
    sv_matrix : pandas DataFrame
        the four hyperparameter columns plus 'fsv' (support vectors / N[monk]), same row order as df
    model_list : list
        the fitted SVC models, same order as df

    The design data are read from the globals `dataset`, `monk` and `i`.
    '''
    param_columns = ['param_C', 'param_gamma', 'param_degree', 'param_coef0']
    X_design, y_design = dataset[monk][i][0], dataset[monk][i][1]
    model_list = []
    rows = []

    for C, gamma, degree, coef0 in df[param_columns].values:
        svc = SVC(
            C=C,
            kernel='poly',
            gamma=gamma,
            degree=int(degree),  # stays an integer even if df stores the degree as float
            coef0=coef0,
            shrinking=False,
        )
        svc.fit(X_design, y_design)

        # fraction of design samples kept as support vectors
        rows.append([C, gamma, degree, coef0, svc.n_support_.sum() / N[monk]])
        model_list.append(svc)

    sv_matrix = pd.DataFrame(rows, columns=param_columns + ['fsv'], dtype=float)
    return sv_matrix, model_list

**Some notes:**
- mean number of support vectors among the whole grid (a_poly) and among the best models (b_poly) according to validation accuracy; in particular we look at sv_a_1 and sv_b_2:
    - sv_a_1 = 93+-29
    - sv_b_1 = 99+-0.0 (3 models with same num. of sv)
    
    - sv_a_2 = 101+-27
    - sv_b_2 = 99+-0.0 (9 models with same num. of sv) 

it seems like the best models don't have a lowest number of support vectors...

In [None]:
# 7) choice of the model with lowest fraction of support vectors among the best ones;
# sv_b and b_poly share the same row order, so one position indexes both

sv_b, model_list_b = sv_analysis_poly(b_poly)
best_pos = np.argmin(sv_b['fsv'])

poly_final = model_list_b[best_pos]
val_acc = b_poly.iloc[best_pos]['mean_test_score']
test_acc = poly_final.score(dataset[monk][i][2], dataset[monk][i][3])

print('best model we choose:\n{}'.format(poly_final))
print('fraction of support vectors:\n{}'.format(poly_final.n_support_.sum()/N[monk]))
print('validation accuracy:\n{}'.format(round_it(val_acc, 3)) )
print('test accuracy:\n{}'.format(test_acc) )


In [None]:
# 8.a) fraction of support vectors for each model included in the grid search **by heatmap**

sv_a, sv_b = sv_analysis_poly(a_poly)[0], sv_analysis_poly(b_poly)[0]

# 'fsv' is the only score column produced by sv_analysis_poly
score_var='fsv'

fix_1 = 'param_C'
fix_2 = 'param_coef0'

var_1= 'param_degree'
var_2= 'param_gamma'

heatmap_poly(sv_a, sv_b, fix_1, fix_2, var_1, var_2, score_var)

In [None]:
# 8.b) fraction of support vectors for each model included in the grid search **by plot of one attribute**
# NOTE(review): plot_poly is not defined in the visible part of this notebook - confirm it exists before running

sv_a, sv_b = sv_analysis_poly(a_poly)[0], sv_analysis_poly(b_poly)[0]

# 'fsv' is the only score column produced by sv_analysis_poly
score_var='fsv'

fix_1= 'param_degree'
fix_2= 'param_coef0'
fix_3= 'param_C'

var_1= 'param_gamma'

plot_poly(sv_a, sv_b, fix_1, fix_2, fix_3, var_1, score_var, 'log')

### Sigmoid kernel

In [None]:
# it's not easy to read the best hyperparameters off b_sigmoid directly, so print the distinct
# values of each one (b_sigmoid is built in step 4 of this section, which must be run first)

for template, column in (
    ('C best:\n{}', 'param_C'),
    ('gamma best:\n{}', 'param_gamma'),
    ('coef0 best:\n{}', 'param_coef0'),
):
    print(template.format(np.unique(b_sigmoid[column])))

# these outputs are used to set a better grid search

In [None]:
# 1) choice of hyperpar.'s ranges to explore
# the three ranges hold 7 log-spaced values, one per decade from 1e-3 to 1e3

C_interval_sigmoid = np.logspace(-3, 3, 7)
gamma_interval_sigmoid = np.logspace(-3, 3, 7)
coef0_interval_sigmoid = np.logspace(-3, 3, 7)

In [None]:
# 2)

# sigmoid-only grid: C, gamma and coef0 vary, shrinking is kept off
param_grid_sigmoid= {
    
    'C': C_interval_sigmoid, 
    
    'kernel': ['sigmoid'],  
    
    'gamma': gamma_interval_sigmoid,
    'coef0': coef0_interval_sigmoid,  
    
    'shrinking': [False]
}

In [None]:
%%time
# 3) grid search

i = 0  # 0: encoded data, 1: no-encoded data (second index of `dataset`)
grid_sigmoid = my_grid_search(param_grid_sigmoid, i) # grid search already fitted

In [None]:
# 4) useful dataframes for next analysis

# a_sigmoid: one row per grid point; b_sigmoid: only the rows ranked first
a_sigmoid = pd.DataFrame(grid_sigmoid.cv_results_)
b_sigmoid = a_sigmoid.loc[a_sigmoid['rank_test_score'].eq(1)]

In [None]:
# useful code for tasks 5) and 8.b)

def plot_sigmoid(a, b, fix_1, fix_2, var_1, score_var, scale):
    '''
    Plot score_var against var_1, one figure per distinct (fix_1, fix_2) pair
    found among the best models
    Parameters
    ---
    a : pandas DataFrame
        all the models (blue points and line)
    b : pandas DataFrame
        best models (red points); its distinct (fix_1, fix_2) pairs decide which figures are drawn
    fix_1, fix_2 : str
        columns kept fixed in each figure
    var_1 : str
        column on the x axis
    score_var : str
        column on the y axis
    scale : str
        scale of the x axis, passed to plt.xscale
    '''

    # np.unique with axis=0 avoids repeating the same pair of fixed values
    pairs = np.unique(b[[fix_1 , fix_2]].values.astype(None), axis=0)

    for fig_id, pair in enumerate(pairs):
        first, second = pair[0], pair[1]
        all_rows = a[(a[fix_1] == first) & (a[fix_2] == second)]
        best_rows = b[(b[fix_1] == first) & (b[fix_2] == second)]

        plt.figure(fig_id)
        plt.plot(all_rows[var_1], all_rows[score_var])
        plt.scatter(all_rows[var_1], all_rows[score_var], color='b')
        plt.scatter(best_rows[var_1], best_rows[score_var], label='best models', color='r')

        plt.title('{}={}, {}={}'.format(fix_1, first, fix_2, second))
        plt.xlabel(var_1)
        plt.ylabel(score_var)
        plt.xscale(scale)  #pay attention when you set degree!
        plt.legend()
        plt.grid()

    plt.show()


In [None]:
# 5) (plot) one hyp. vs validation_accuracy fixing the others hyp. (at values that maximize the validation_accuracy):
# validation score vs gamma, one figure per (coef0, C) pair among the best models

score_var='mean_test_score'

fix_1= 'param_coef0'
fix_2= 'param_C'

var_1= 'param_gamma'

plot_sigmoid(a_sigmoid, b_sigmoid, fix_1, fix_2, var_1, score_var,'log')

In [None]:
#useful code for tasks 6) and 8.A)

def heatmap_sigmoid(a, b, fix_1, var_1, var_2, score_var):
    '''
    Draw one heatmap of score_var over (var_1, var_2) for each value of fix_1
    found among the best models
    Parameters
    ---
    a : pandas DataFrame
        all the models; supplies the cells of each heatmap
    b : pandas DataFrame
        best models; its unique fix_1 values decide which heatmaps are drawn
    fix_1 : str
        column kept fixed in each heatmap
    var_1, var_2 : str
        columns used as rows and columns of the heatmap
    score_var : str
        column shown in the cells
    '''
    for j, value in enumerate(np.unique(b[fix_1].values)):  # values for fixed attribute
        plt.figure(j)

        matrix = a[a[fix_1] == value][[var_1, var_2, score_var]]
        # keyword arguments: DataFrame.pivot does not accept them positionally in pandas >= 2.0
        glue = matrix.pivot(index=var_1, columns=var_2, values=score_var)
        sns.heatmap(glue, cmap="crest", linewidth=.5)
        plt.title('for {}_best={}'.format(fix_1, value))

    plt.show()

In [None]:
# 6) (heatmap) varying 2 hyp. and fixing the third one (at the values for the best validation_accuracy):
# validation score over (coef0, gamma), one heatmap per C value among the best models

score_var='mean_test_score'

fix_1 = 'param_C'

var_1= 'param_coef0'
var_2= 'param_gamma'

heatmap_sigmoid(a_sigmoid, b_sigmoid, fix_1, var_1, var_2, score_var)

In [None]:
# code for support vectors analysis: 
# 7) using b_sigmoid, choice of the model with lowest number of support vector
# 8) using a_sigmoid, number of support vector for each model included in the grid search

def sv_analysis_sigmoid(df):
    '''
    Refit one sigmoid SVC for each row of df and record its fraction of support vectors
    Parameters
    ---
    df : pandas DataFrame
        must contain the columns 'param_C', 'param_gamma' and 'param_coef0'
    Returns
    ---
    sv_matrix : pandas DataFrame
        the three hyperparameter columns plus 'fsv' (support vectors / N[monk]), same row order as df
    model_list : list
        the fitted SVC models, same order as df

    The design data are read from the globals `dataset`, `monk` and `i`.
    '''
    param_columns = ['param_C', 'param_gamma', 'param_coef0']
    X_design, y_design = dataset[monk][i][0], dataset[monk][i][1]
    model_list = []
    rows = []

    for C, gamma, coef0 in df[param_columns].values:
        svc = SVC(C=C, kernel='sigmoid', gamma=gamma, coef0=coef0, shrinking=False)
        svc.fit(X_design, y_design)

        # fraction of design samples kept as support vectors
        rows.append([C, gamma, coef0, svc.n_support_.sum() / N[monk]])
        model_list.append(svc)

    sv_matrix = pd.DataFrame(rows, columns=param_columns + ['fsv'], dtype=float)
    return sv_matrix, model_list

**Some notes:**
- we can observe that there isn't a symmetry with respect to the sign of coef0 (look at b_sigmoid)
- mean number of support vectors among the whole grid (a_sigmoid) and among the best models (b_sigmoid) according to validation accuracy; in particular we look at sv_a_1 and sv_b_2:
    - sv_a_1 = 99+-30
    - sv_b_1 = 58+-25
    - sv_a_2 = 89+-34
    - sv_b_2 = 53+-22

It seems that the best models have a lower number of support vectors...

In [None]:
# 7) choice of the model with lowest fraction of support vectors among the best ones;
# sv_b and b_sigmoid share the same row order, so one position indexes both

sv_b, model_list_b = sv_analysis_sigmoid(b_sigmoid)
best_pos = np.argmin(sv_b['fsv'])

sigmoid_final = model_list_b[best_pos]
print('best model choosen:\n{}'.format(sigmoid_final))
print('number of support vectors:\n{}'.format(sigmoid_final.n_support_.sum()))
print('fraction of support vectors:\n{}'.format(round_it(sigmoid_final.n_support_.sum()/N[monk],2)))
# the validation accuracy is the mean CV score, not its standard deviation
print('validation accuracy:\n{}'.format(round_it(b_sigmoid.iloc[best_pos]['mean_test_score'], 3)) )
print('test accuracy:\n{}'.format(round_it(sigmoid_final.score(dataset[monk][i][2], dataset[monk][i][3]), 4)) )

In [None]:
# 8.a) fraction of support vectors for each model included in the grid search **by heatmap**
# (sv_analysis_sigmoid refits every model, so this cell repeats the training of the whole grid)

sv_a, sv_b = sv_analysis_sigmoid(a_sigmoid)[0], sv_analysis_sigmoid(b_sigmoid)[0]

score_var='fsv'

fix_1 = 'param_C'

var_1= 'param_coef0'
var_2= 'param_gamma'

heatmap_sigmoid(sv_a, sv_b, fix_1, var_1, var_2, score_var)

In [None]:
# 8.B) fraction of support vectors vs gamma, one figure per (coef0, C) pair among the best models

sv_a, sv_b = sv_analysis_sigmoid(a_sigmoid)[0], sv_analysis_sigmoid(b_sigmoid)[0]

# 'fsv' is the only score column produced by sv_analysis_sigmoid
score_var='fsv'

fix_1= 'param_coef0'
fix_2= 'param_C'

var_1= 'param_gamma'

plot_sigmoid(sv_a, sv_b, fix_1, fix_2, var_1, score_var,'log')

### Linear kernel

In [None]:
# 1) choice of hyperpar.'s ranges to explore
# 13 log-spaced values, one per decade from 1e-6 to 1e6

C_interval_linear = np.logspace(-6, 6, 13) 

In [None]:
# 2) 
# linear-only grid: C is the only hyperparameter that varies, shrinking is kept off
param_grid_linear= {
    
    'C': C_interval_linear, #must be strictly positive     
    
    'kernel': ['linear'],                   
    
    'shrinking': [False]
}

In [None]:
%%time
# 3) grid search in practice, first run the previous code!

i = 0  # 0: encoded data, 1: no-encoded data (second index of `dataset`)
grid_linear = my_grid_search(param_grid_linear, i) # grid search already fitted

In [None]:
# 4) useful dataframes for further analysis

# a_linear: one row per grid point; b_linear: only the rows ranked first
a_linear = pd.DataFrame(grid_linear.cv_results_)
b_linear = a_linear.loc[a_linear['rank_test_score'].eq(1)]

**Some notes:**
- mean number of support vectors among the whole grid (a_linear) and among the best models (b_linear) according to validation accuracy; in particular we look at sv_a_1 and sv_b_2:
    - sv_a_1 = 82 +- 34
    - sv_b_1 = 67 +- 23
    - sv_a_2 = 65+-23
    - sv_b_2 = 55+-6

It seems that the best models have a lower number of support vectors... BUT look at 8.b): increasing C lowers the number of support vectors.

In [None]:
# useful code for task 5) and 8.b)

def plot_linear(a, b, score_var):
    '''
    Plot score_var against C on a log-scaled x axis
    Parameters
    ---
    a : pandas DataFrame
        all the models (blue points and line)
    b : pandas DataFrame
        best models (red points)
    score_var : str
        column on the y axis
    '''

    all_C, all_scores = a['param_C'], a[score_var]

    plt.plot(all_C, all_scores)
    plt.xlabel('param_C')
    plt.scatter(all_C, all_scores, color='b')
    plt.scatter(b['param_C'], b[score_var], color='r', label='best models')
    plt.legend()
    plt.grid()
    plt.ylabel(score_var)
    plt.xscale('log')

    plt.show()

In [None]:
# 5) C vs validation_accuracy (best models highlighted in red)

score_var= 'mean_test_score'

plot_linear(a_linear, b_linear, score_var)

In [None]:
# code for support vectors analysis: 
# 7) using b_linear , choice of the model with lowest number of support vector
# 8) using a_linear, number of support vector for each model included in the grid search

def sv_analysis_linear(df):
    '''
    Refit one linear SVC for each C in df and record its fraction of support vectors
    Parameters
    ---
    df : pandas DataFrame
        must contain the column 'param_C'
    Returns
    ---
    sv_matrix : pandas DataFrame
        columns 'param_C' and 'fsv' (support vectors / N[monk]), same row order as df
    model_list : list
        the fitted SVC models, same order as df

    The design data are read from the globals `dataset`, `monk` and `i`.
    '''
    fitted_models = []
    records = []

    for c_value in np.array(df['param_C']):
        model = SVC(C=c_value, kernel='linear', shrinking=False)
        model.fit(dataset[monk][i][0], dataset[monk][i][1])

        # fraction of design samples kept as support vectors
        fraction = model.n_support_.sum()/N[monk]
        records.append(np.array([c_value, fraction]))
        fitted_models.append(model)

    table = pd.DataFrame(np.array(records))
    table = table.rename(mapper={0:'param_C', 1:'fsv'}, axis=1)
    return table, fitted_models


In [None]:
# 7) performance and other information about the model with best validation_accuracy and lowest fraction of support vectors

sv_b, model_list_b = sv_analysis_linear(b_linear)
linear_final = model_list_b[np.argmin(sv_b['fsv'])] 

print('best model chosen:\n{}'.format(linear_final))
print('number of support vectors:\n{}'.format(linear_final.n_support_.sum()))
# normalise by the size of the selected Monk, not by a fixed one
print('fraction of support vectors:\n{}'.format(linear_final.n_support_.sum()/N[monk]))
print('test accuracy:\n{}'.format(linear_final.score(dataset[monk][i][2], dataset[monk][i][3])) )

In [None]:
# 8.b) fraction of support vectors vs C (best models highlighted in red)

sv_a, sv_b = sv_analysis_linear(a_linear)[0], sv_analysis_linear(b_linear)[0]
score_var= 'fsv'

plot_linear(sv_a, sv_b, score_var)