In [1]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from sklearn.decomposition import PCA
from sklearn.decomposition import KernelPCA
from mpl_toolkits.mplot3d import Axes3D
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
import joblib

In [2]:
# Goal of this notebook: load the data, set up train/test splits, reduce dimensionality, and train a machine-learning classifier

In [3]:
# Load the feature array and the label array from their .npz archives.
# np.load returns an NpzFile for .npz input, which keeps the archive open until
# it is closed; opening each one in a `with` block releases the file handle as
# soon as the array has been read into memory.
with np.load('../dataset/X.npz') as dict_X:
    # 'arr_0' is the key np.savez assigns to the first array saved without a name
    X = dict_X['arr_0']
with np.load('../dataset/y.npz') as dict_y:
    y = dict_y['arr_0']

In [4]:
# First experiment: PCA for dimensionality reduction followed by an RBF-kernel SVM.
# Cross-validation scheme used for the parameter search: 5 stratified shuffle
# splits, each holding out 25% of the samples, with a fixed seed.
cv_splits = StratifiedShuffleSplit(
    n_splits=5,
    test_size=0.25,
    random_state=42,
)

In [5]:
# Estimator tuned by the grid search: a PCA step feeding an SVM classifier,
# chained in a single pipeline so both are cross-validated together.

# Number of principal components kept by the PCA step
n_PCA = 10

# Classifier stage: SVM with a Radial Basis Function (rbf) kernel and balanced
# class weights (kept even though the original note says the dataset is
# already fairly balanced).
# Possible parameter to explore later: break_ties (at increased computational cost).
sv_clf = SVC(class_weight='balanced', kernel='rbf')

# Reduction stage: whitened PCA using the randomized SVD solver with a fixed seed
pca = PCA(
    n_components=n_PCA,
    svd_solver='randomized',
    whiten=True,
    random_state=42,
)

# make_pipeline names the steps 'pca' and 'svc'; the grid keys use the 'svc__' prefix
model = make_pipeline(pca, sv_clf)

In [6]:
# Hyper-parameter grid for the RBF SVM step of the pipeline:
# four evenly spaced values each for C and gamma (16 combinations in total).
# Log-spaced alternatives, kept for reference:
#   C_range = np.logspace(3, 6, 4)
#   gamma_range = np.logspace(-6, -3, 4)
C_range = np.linspace(start=1e4, stop=1e5, num=4)
gamma_range = np.linspace(start=1e-3, stop=1e-2, num=4)

# Keys follow the '<step name>__<parameter>' convention, targeting the 'svc' step
sv_grid = {
    "svc__gamma": gamma_range,
    "svc__C": C_range,
}

In [7]:
# Grid search with cross-validation over the SVM parameters in sv_grid.
# The estimator is the whole PCA -> SVC pipeline, evaluated on the splits in cv_splits.
pca_sv_clf = GridSearchCV(
    estimator=model,
    param_grid=sv_grid,
    cv=cv_splits,
)


In [8]:
# Run the grid search on the full (X, y) arrays; the train/validation splits are
# drawn from cv_splits, which was passed to GridSearchCV as `cv`.
# The %time line magic reports how long the whole search takes.
%time pca_sv_clf.fit(X, y)


Wall time: 6min


GridSearchCV(cv=StratifiedShuffleSplit(n_splits=5, random_state=42, test_size=0.25,
            train_size=None),
             error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('pca',
                                        PCA(copy=True, iterated_power='auto',
                                            n_components=10, random_state=42,
                                            svd_solver='randomized', tol=0.0,
                                            whiten=True)),
                                       ('svc',
                                        SVC(C=1.0, break_ties=False,
                                            cache_size=200,
                                            class_weight='balanced', coef...
                                            degree=3, gamma='scale',
                                            kernel='rbf', max_iter=-1,
                                            probability=False,
               

In [9]:
# Display the parameter combination selected as best by the grid search
pca_sv_clf.best_params_

{'svc__C': 10000.0, 'svc__gamma': 0.004}

In [10]:
# Display the full cross-validation report of the grid search
# (a dict of arrays: fit/score timings, parameter values, ranks, ...)
pca_sv_clf.cv_results_

{'mean_fit_time': array([3.99077063, 3.87034569, 4.00058374, 3.58612094, 4.4649106 ,
        3.85004935, 3.70663748, 3.57896991, 4.94288049, 4.26689992,
        4.38804421, 4.20090299, 5.0964191 , 4.18946352, 4.13698249,
        3.97581635]),
 'std_fit_time': array([0.18088794, 0.20950016, 0.23976956, 0.14504039, 0.11226537,
        0.07496811, 0.10176069, 0.0923548 , 0.39641619, 0.22903394,
        0.59889044, 0.17673092, 0.27276113, 0.1394811 , 0.17512186,
        0.11836445]),
 'mean_score_time': array([0.27115655, 0.26296902, 0.30260506, 0.27431054, 0.25144677,
        0.25320306, 0.2743331 , 0.25467515, 0.28529801, 0.28654294,
        0.32861891, 0.29699383, 0.28668938, 0.27152362, 0.30453453,
        0.26519794]),
 'std_score_time': array([0.02366311, 0.0150726 , 0.06036086, 0.03297585, 0.00324421,
        0.00607991, 0.04180485, 0.00476809, 0.05788874, 0.04189429,
        0.07311265, 0.03256366, 0.02350129, 0.02067299, 0.03891422,
        0.0088195 ]),
 'param_svc__C': masked_ar

In [11]:
# List every tested parameter combination next to its rank in the grid search
# (rank 1 is the best-scoring combination).
idxRank = pca_sv_clf.cv_results_['rank_test_score']
for candidate, rank in zip(pca_sv_clf.cv_results_['params'], idxRank):
    print(f"{candidate} - {rank}")

{'svc__C': 10000.0, 'svc__gamma': 0.001} - 16
{'svc__C': 10000.0, 'svc__gamma': 0.004} - 1
{'svc__C': 10000.0, 'svc__gamma': 0.007000000000000001} - 2
{'svc__C': 10000.0, 'svc__gamma': 0.01} - 6
{'svc__C': 40000.0, 'svc__gamma': 0.001} - 4
{'svc__C': 40000.0, 'svc__gamma': 0.004} - 5
{'svc__C': 40000.0, 'svc__gamma': 0.007000000000000001} - 7
{'svc__C': 40000.0, 'svc__gamma': 0.01} - 9
{'svc__C': 70000.0, 'svc__gamma': 0.001} - 3
{'svc__C': 70000.0, 'svc__gamma': 0.004} - 7
{'svc__C': 70000.0, 'svc__gamma': 0.007000000000000001} - 12
{'svc__C': 70000.0, 'svc__gamma': 0.01} - 11
{'svc__C': 100000.0, 'svc__gamma': 0.001} - 10
{'svc__C': 100000.0, 'svc__gamma': 0.004} - 15
{'svc__C': 100000.0, 'svc__gamma': 0.007000000000000001} - 12
{'svc__C': 100000.0, 'svc__gamma': 0.01} - 14


In [None]:
# TODO: perform one more training run over the samples using more PCA modes (components)


In [12]:
# save the model to disk
# filename = f"../dataset/grid_{n_PCA}_16.sav"
# joblib.dump(pca_sv_clf, filename)

In [13]:
# clf = clf.fit(X_train_pca, y_train)
# X_t_t = pca.transform(X_train)

In [14]:
# C_range
# C_range = np.linspace(1e4, 1e5, 4)
# print(C_range)

In [15]:
# gamma_range
# gamma_range = np.linspace(1e-3, 1e-2, 4)
# print(gamma_range)