In [111]:
from sklearn import datasets
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [112]:
iris = datasets.load_iris()
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target)

In [113]:
iris.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename'])

In [114]:
"Features: {}, Targets: {}".format(iris.feature_names, iris.target_names)

"Features: ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)'], Targets: ['setosa' 'versicolor' 'virginica']"

In [115]:
X_test.shape

(38, 4)

## First we try a model with Scaler -> SVC

In [116]:
model = make_pipeline(StandardScaler(), SVC(gamma='auto'))
model.fit(X_train, y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('svc', SVC(gamma='auto'))])

In [117]:
y_pred = model.predict(X_test)
"accuracy: {}, l1: {}, l2: {}".format(accuracy_score(y_test, y_pred), mean_absolute_error(y_test, y_pred), mean_absolute_error(y_test, y_pred))

'accuracy: 0.9473684210526315, l1: 0.05263157894736842, l2: 0.05263157894736842'

## Now we try a model with just SVC

In [118]:
model_no_scale = SVC(gamma='auto')
model_no_scale.fit(X_train, y_train)

SVC(gamma='auto')

In [119]:
y_pred_no_scale = model_no_scale.predict(X_test)
"accuracy: {}, l1: {}, l2: {}".format(accuracy_score(y_test, y_pred_no_scale), mean_absolute_error(y_test, y_pred_no_scale), mean_absolute_error(y_test, y_pred_no_scale))

'accuracy: 0.9736842105263158, l1: 0.02631578947368421, l2: 0.02631578947368421'

## Now we try a model with PCA  first

In [120]:
model_with_pca = make_pipeline(StandardScaler(), PCA(n_components=2), SVC(gamma='auto'))
model_with_pca.fit(X_train, y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('pca', PCA(n_components=2)), ('svc', SVC(gamma='auto'))])

In [121]:
y_pred_with_pca = model_with_pca.predict(X_test)
"accuracy: {}, l1: {}, l2: {}".format(accuracy_score(y_test, y_pred_with_pca), mean_absolute_error(y_test, y_pred_with_pca), mean_absolute_error(y_test, y_pred_with_pca))

'accuracy: 0.868421052631579, l1: 0.13157894736842105, l2: 0.13157894736842105'

## Now we use grid search to pick best pca component number

In [122]:
model_with_pca_grid = make_pipeline(StandardScaler(), PCA(), SVC(gamma='auto'))
cross_val = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
grid = dict()
grid['pca__n_components'] = [1,2,3]
search = GridSearchCV(model_with_pca_grid, grid, scoring='neg_mean_absolute_error', cv=cross_val, n_jobs=1)
model_with_pca_grid.get_params().keys() # Prints the params to optimise over

dict_keys(['memory', 'steps', 'verbose', 'standardscaler', 'pca', 'svc', 'standardscaler__copy', 'standardscaler__with_mean', 'standardscaler__with_std', 'pca__copy', 'pca__iterated_power', 'pca__n_components', 'pca__random_state', 'pca__svd_solver', 'pca__tol', 'pca__whiten', 'svc__C', 'svc__break_ties', 'svc__cache_size', 'svc__class_weight', 'svc__coef0', 'svc__decision_function_shape', 'svc__degree', 'svc__gamma', 'svc__kernel', 'svc__max_iter', 'svc__probability', 'svc__random_state', 'svc__shrinking', 'svc__tol', 'svc__verbose'])

In [123]:
search.fit(iris.data, iris.target)

GridSearchCV(cv=RepeatedKFold(n_repeats=3, n_splits=10, random_state=1),
             estimator=Pipeline(steps=[('standardscaler', StandardScaler()),
                                       ('pca', PCA()),
                                       ('svc', SVC(gamma='auto'))]),
             n_jobs=1, param_grid={'pca__n_components': [1, 2, 3]},
             scoring='neg_mean_absolute_error')

In [124]:
search.best_params_

{'pca__n_components': 3}

## Now we retry our PCA with 3 components instead of 2

In [125]:
model_with_pca_3 = make_pipeline(StandardScaler(), PCA(n_components=3), SVC(gamma='auto'))
model_with_pca_3.fit(X_train, y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('pca', PCA(n_components=3)), ('svc', SVC(gamma='auto'))])

In [129]:
y_pred_with_pca_3 = model_with_pca_3.predict(X_test)
"accuracy: {}, l1: {}, l2: {}".format(accuracy_score(y_test, y_pred_with_pca_3), mean_absolute_error(y_test, y_pred_with_pca_3), mean_absolute_error(y_test, y_pred_with_pca_3))

'accuracy: 0.9473684210526315, l1: 0.05263157894736842, l2: 0.05263157894736842'