In [3]:
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.decomposition import PCA
from sklearn.decomposition import KernelPCA
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.svm import SVC

In [4]:
# Perform a single analysis based on the best cross-validation (CV) result.
# Load the dataset from the .npz archives. np.load returns an archive object
# that holds an open file handle for .npz files, so each archive is opened in
# a `with` block and the array is read out before the archive is closed.
# NOTE: dict_X / dict_y stay bound after the block but refer to closed archives.
with np.load('../dataset/X.npz') as dict_X:
    # 'arr_0' is the key the array is stored under
    # (presumably saved positionally with np.savez -- confirm against the writer)
    X = dict_X['arr_0']
with np.load('../dataset/y.npz') as dict_y:
    y = dict_y['arr_0']

In [5]:
# Cross-validation scheme: 5 shuffled, stratified train/test splits, each
# holding out 25% of the samples as the test set; the seed is fixed at 42.
cv_splits = StratifiedShuffleSplit(
    n_splits=5,
    test_size=0.25,
    random_state=42,
)

In [6]:
# Build the PCA -> SVM pipeline that is cross-validated over the train/test splits.

# Number of principal components kept by the PCA step.
n_PCA = 50
pca = PCA(
    n_components=n_PCA,
    whiten=True,
    svd_solver='randomized',
    random_state=42,
)

# SVM with a Radial Basis Function (rbf) kernel and balanced class weights
# (the original note says the dataset is already fairly balanced).
# C and gamma are taken from a previous analysis, which reported:
# {'svc__C': 10000.0, 'svc__gamma': 0.004}
sv_clf = SVC(
    C=1e4,
    gamma=0.004,
    kernel='rbf',
    class_weight='balanced',
)

# Chain the PCA transform and the classifier into a single estimator.
model = make_pipeline(pca, sv_clf)

In [7]:
# cross-validation over train/test samples with PCA transformation into SVM classifier
%time cv_pca_50_svm = cross_val_score(model, X, y, cv=cv_splits)

Wall time: 29.8 s


In [8]:
# Display the cross-validation scores obtained for the PCA (50 components) + SVM pipeline
cv_pca_50_svm

array([0.9089404 , 0.89403974, 0.92218543, 0.91059603, 0.88410596])

In [None]:
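# TODO: disabled grid search -- re-enabling it needs GridSearchCV imported and
# sv_grid defined (neither appears in the cells shown here).
# pca_sv_clf = GridSearchCV(model, param_grid = sv_grid, cv = cv_splits)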