In [1]:
import numpy as np
from sklearn.decomposition import KernelPCA
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
import joblib

In [2]:
# Load the dataset from the two .npz archives.
# np.load returns an NpzFile that keeps the underlying archive open until it
# is closed; the `with` block closes both archives as soon as the arrays have
# been read into memory, so no file descriptors are leaked.
with np.load('../dataset/X.npz') as dict_X, np.load('../dataset/y.npz') as dict_y:
    # 'arr_0' is the key np.savez gives to the first positionally-saved array.
    # X and y are later passed to the grid search as fit(X, y).
    X = dict_X['arr_0']
    y = dict_y['arr_0']

In [3]:
# Cross-validation scheme for the kPCA + RBF-SVM model trained below:
# three stratified shuffle splits, each holding out 25% of the samples,
# with a fixed seed so the splits are reproducible.
cv_splits = StratifiedShuffleSplit(
    random_state=42,
    test_size=0.25,
    n_splits=3,
)

In [4]:
# Building blocks of the kPCA -> SVM pipeline used for parameter optimization / cross-validation.

# Number of kernel-PCA components kept
# (fewer components --> faster optimization but a less accurate solution)
n_kPCA = 10

# Kernel PCA with an RBF kernel; gamma is not set here because it is one of the tuned parameters
kPCA = KernelPCA(
    kernel='rbf',
    n_components=n_kPCA,
    eigen_solver='arpack',
    random_state=42,
)

# SVM with a Radial Basis Function (rbf) kernel and balanced class weights
# (although the dataset is already pretty balanced)
sv_clf = SVC(class_weight='balanced', kernel='rbf')

In [5]:
# The output of k-PCA does not have unit variance,
# so k-PCA is chained with a standard scaler.
scaler = StandardScaler()
scale_kPCA = make_pipeline(kPCA, scaler)
# The full model combines (kPCA scaled to unit variance) with the SVM classifier.
# NOTE: the nesting matters - the parameter grid addresses the kPCA step as
# 'pipeline__kernelpca__...' and the classifier as 'svc__...'.
model = make_pipeline(scale_kPCA, sv_clf)

In [6]:
# Parameter grid for tuning the kPCA -> scaler -> RBF-SVM pipeline.
# The SVM ranges are taken from the previous analysis done with plain PCA.
C_range = np.linspace(start=1e4, stop=6e4, num=3)
gamma_range = np.linspace(start=1e-3, stop=6e-2, num=3)
# Candidate values for the gamma of the kPCA RBF kernel
kPCA_gamma = [1e-11, 5e-11, 1e-10]
# 3 x 3 x 3 grid: gamma of kPCA, gamma of the RBF SVM and C of the RBF SVM
sv_grid = {
    'pipeline__kernelpca__gamma': kPCA_gamma,
    'svc__gamma': gamma_range,
    'svc__C': C_range,
}

In [7]:
# Grid search with cross-validation over the full pipeline
# (kPCA transformation feeding the SVM classifier), using the splits defined above
kPCA_sv_clf = GridSearchCV(
    estimator=model,
    param_grid=sv_grid,
    cv=cv_splits,
)


In [8]:
# Run the grid search on (X, y) and time it with IPython's %time magic.
# The grid holds 3 x 3 x 3 = 27 parameter combinations, each evaluated on the 3 CV splits.
%time kPCA_sv_clf.fit(X, y)

Wall time: 9min 39s


GridSearchCV(cv=StratifiedShuffleSplit(n_splits=3, random_state=42, test_size=0.25,
            train_size=None),
             error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('pipeline',
                                        Pipeline(memory=None,
                                                 steps=[('kernelpca',
                                                         KernelPCA(alpha=1.0,
                                                                   coef0=1,
                                                                   copy_X=True,
                                                                   degree=3,
                                                                   eigen_solver='arpack',
                                                                   fit_inverse_transform=False,
                                                                   gamma=None,
                                                   

In [9]:
# Parameter combination selected by the grid search (the candidate ranked first)
kPCA_sv_clf.best_params_

{'pipeline__kernelpca__gamma': 5e-11, 'svc__C': 35000.0, 'svc__gamma': 0.0305}

In [10]:
# Full cross-validation report: per-candidate fit/score timings, parameters and ranks
kPCA_sv_clf.cv_results_

{'mean_fit_time': array([4.95687246, 3.90645258, 3.82671094, 4.68822765, 3.6177907 ,
        3.65283457, 4.84519728, 3.71598427, 3.74085792, 4.53040449,
        3.93859132, 6.03402448, 6.26933217, 3.91121825, 3.73065662,
        4.89549788, 3.69085407, 3.67177248, 4.69937491, 4.70143874,
        5.04189221, 4.76914883, 3.82359989, 3.58413792, 4.67357628,
        3.75800681, 3.63572264]),
 'std_fit_time': array([1.02626903, 0.04058899, 0.17987183, 0.20753852, 0.02671841,
        0.06470819, 0.2549243 , 0.10434458, 0.14100621, 0.24835646,
        0.32755084, 0.88710501, 1.1578613 , 0.56676396, 0.05812211,
        0.35787672, 0.13752855, 0.09191286, 0.73047149, 0.7256265 ,
        0.85028587, 0.16977534, 0.35733864, 0.09675516, 0.07832751,
        0.08185235, 0.00645646]),
 'mean_score_time': array([2.68313511, 2.56743066, 2.61732515, 2.44939613, 2.50831922,
        2.5547324 , 2.62348731, 2.42965579, 2.73797695, 2.71703998,
        2.84800839, 4.42297204, 3.51977563, 2.79550227, 2.737144

In [11]:
# Print every tested parameter combination next to its test-score rank (1 = best)
idxRank = kPCA_sv_clf.cv_results_['rank_test_score']
for params, rank in zip(kPCA_sv_clf.cv_results_['params'], idxRank):
    print(f"{params} - {rank}")

{'pipeline__kernelpca__gamma': 1e-11, 'svc__C': 10000.0, 'svc__gamma': 0.001} - 25
{'pipeline__kernelpca__gamma': 1e-11, 'svc__C': 10000.0, 'svc__gamma': 0.0305} - 6
{'pipeline__kernelpca__gamma': 1e-11, 'svc__C': 10000.0, 'svc__gamma': 0.06} - 6
{'pipeline__kernelpca__gamma': 1e-11, 'svc__C': 35000.0, 'svc__gamma': 0.001} - 22
{'pipeline__kernelpca__gamma': 1e-11, 'svc__C': 35000.0, 'svc__gamma': 0.0305} - 4
{'pipeline__kernelpca__gamma': 1e-11, 'svc__C': 35000.0, 'svc__gamma': 0.06} - 11
{'pipeline__kernelpca__gamma': 1e-11, 'svc__C': 60000.0, 'svc__gamma': 0.001} - 18
{'pipeline__kernelpca__gamma': 1e-11, 'svc__C': 60000.0, 'svc__gamma': 0.0305} - 10
{'pipeline__kernelpca__gamma': 1e-11, 'svc__C': 60000.0, 'svc__gamma': 0.06} - 11
{'pipeline__kernelpca__gamma': 5e-11, 'svc__C': 10000.0, 'svc__gamma': 0.001} - 26
{'pipeline__kernelpca__gamma': 5e-11, 'svc__C': 10000.0, 'svc__gamma': 0.0305} - 2
{'pipeline__kernelpca__gamma': 5e-11, 'svc__C': 10000.0, 'svc__gamma': 0.06} - 13
{'pipeli

In [18]:
# Persist the fitted grid search to disk; the file name records the number of kPCA components
filename = f"../dataset/grid_{n_kPCA}_kPCA_3D_param_refinement.sav"
joblib.dump(value=kPCA_sv_clf, filename=filename)

['../dataset/grid_10_kPCA_3D_param_refinement.sav']

In [13]:
# I would need a pipeline of k-PCA and SVM to do my parameter optimization/cross-validation

# initialize k-PCA with n_components (fewer components --> faster optimization but a less accurate solution)
# n_kPCA = 10

# kPCA = KernelPCA(n_components=n_kPCA, kernel='rbf', random_state=42, eigen_solver='arpack')

# initialize SVM with Radial Basis Functions (rbf) and balanced weights (although dataset is pretty balanced)
# optimal parameters chosen for PCA are first used here
# I will test performance on gamma kPCA parameter
# sv_clf = SVC(kernel='rbf', class_weight='balanced', C=1e4, gamma=0.004)
# model = make_pipeline(kPCA, sv_clf)

In [14]:
# Now I'm defining gamma tuning for X_train_kPCA,y_train using SVM classifier with RBF
# From previous kPCA analysis we estimated sigma --> 20000 (Wang paper)
# Here I tested for this log range first: np.logspace(-13,-3,11)*5
# Test refinement
# kPCA_gamma = np.logspace(-11,-9,9)*5
# gamma_grid = dict(kernelpca__gamma = kPCA_gamma)

In [15]:
# from previous tests I confirm gamma choice from Wang paper was a good estimation of the order
# now I do a cv test on different C and gamma parameters for the SVM classifier for a fixed gamma on kPCA

# n_kPCA = 10
# value at Wang Paper
# gamma_rbf = 1.3e-9
# My test value based on previous analysis (fixed parameters at SVC RBF)
# gamma_rbf = 5e-10
# kPCA = KernelPCA(n_components=n_kPCA, kernel='rbf', gamma=gamma_rbf, random_state=42, eigen_solver='arpack')
# initialize SVM with Radial Basis Functions (rbf) and balanced weights (although dataset is pretty balanced)
# sv_clf = SVC(kernel='rbf', class_weight='balanced')

In [16]:
# cross-validation kPCA gamma optimization with kPCA transformation into SVM classifier (with fixed parameters from PCA)
# kPCA_sv_clf = GridSearchCV(model, param_grid = gamma_grid, cv = cv_splits)

In [17]:
# fit different X train splits and time
# %time kPCA_sv_clf.fit(X, y)