In [109]:
from sklearn.model_selection import (cross_val_score, train_test_split, GridSearchCV)
from sklearn.svm import SVC
from sklearn.datasets import load_digits
from sklearn.metrics import (precision_recall_curve, get_scorer_names)

import numpy as np
import matplotlib.pyplot as plt

# ___Model Selection - Optimizing Classifiers for Different Evaluation Metrics___
--------------

In [1]:
# Model selection -> the process of selecting the best classifier for our application.

# 1) Using the same dataset for training and testing -> overfits badly and doesn't generalize well to new data.
# However, this can be a useful sanity check for the functionalities of the S.E and feature generation pipelines.

# 2) Using a single train-test split -> 
# This is fast and easy, however does not give information on how well the model may perform on new data with different compositions.

# 3) Using k-fold cross validation, where the evaluation metric is averaged across a set of train-test splits.
# Produces more reliable models.
# We can use grid search to search over a range of parameter values to figure out the best values for model parameters.

In [2]:
# The default evaluation metric used for cross_val_score or grid search cross-validation is accuracy (R2 score)
# However, we have the liberty to choose the metric of our choosing through the "scoring" kwarg

In [4]:
digits = load_digits()
# train_x, test_x, train_y, test_y = train_test_split(digits.data, digits.target, train_size = 0.8)
# train_test_split is not needed !

In [25]:
mask = np.logical_or(digits.target == 1, digits.target == 0)

In [26]:
x, y = digits.data[mask], digits.target[mask]

In [27]:
svClassifier = SVC(C = 0.5, kernel = "linear")

In [29]:
cross_val_score(estimator = svClassifier, X = x, y = y, cv = 10, scoring = "accuracy")

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [32]:
cross_val_score(estimator = svClassifier, X = x, y = y, cv = 10, scoring = "roc_auc")

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [33]:
cross_val_score(estimator = svClassifier, X = x, y = y, cv = 10, scoring = "recall")

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [34]:
# Above, we did not do any parameter tusing.
# We simply evaluated a predefined model's performance over different sets of train-test splits, using different evaluation metrics.

## ___Grid Search___
-------------

In [45]:
x, y = digits.data, digits.target

In [46]:
digits.data.shape, x.shape

((1797, 64), (1797, 64))

In [47]:
digits.target.shape, y.shape

((1797,), (1797,))

In [48]:
train_x, test_x, train_y, test_y = train_test_split(x, y, train_size = 0.75)

In [50]:
svClassifier = SVC(kernel = "rbf")

In [70]:
params = {"gamma": np.logspace(0.001, 100, num = 10)}

In [71]:
params

{'gamma': array([1.00230524e+000, 1.29419584e+011, 1.67109061e+022, 2.15774441e+033,
        2.78612117e+044, 3.59749335e+055, 4.64515275e+066, 5.99791076e+077,
        7.74461798e+088, 1.00000000e+100])}

In [72]:
classifier_grid = GridSearchCV(estimator = svClassifier, param_grid = params)

In [74]:
classifier_grid.fit(train_x, train_y)

In [75]:
classifier_grid.decision_function(test_x)

array([[ 6.04883779,  5.03059309,  9.13080169, ...,  1.94775409,
        -0.18764523,  3.00966903],
       [ 6.04883779,  5.03059309,  9.13080169, ...,  1.94775409,
        -0.18764523,  3.00966903],
       [ 6.04883779,  5.03059309,  9.13080169, ...,  1.94775409,
        -0.18764523,  3.00966903],
       ...,
       [ 6.04883779,  5.03059309,  9.13080169, ...,  1.94775409,
        -0.18764523,  3.00966903],
       [ 6.04883779,  5.03059309,  9.13080169, ...,  1.94775409,
        -0.18764523,  3.00966903],
       [ 6.04883779,  5.03059309,  9.13080169, ...,  1.94775409,
        -0.18764523,  3.00966903]])

In [76]:
classifier_grid.best_estimator_

In [77]:
classifier_grid.best_params_

{'gamma': 1.0023052380778996}

In [78]:
classifier_grid.best_score_

0.1069062370921107

In [86]:
# There are many metrics supported by the "scoring" parameter.

get_scorer_names()

['accuracy',
 'adjusted_mutual_info_score',
 'adjusted_rand_score',
 'average_precision',
 'balanced_accuracy',
 'completeness_score',
 'explained_variance',
 'f1',
 'f1_macro',
 'f1_micro',
 'f1_samples',
 'f1_weighted',
 'fowlkes_mallows_score',
 'homogeneity_score',
 'jaccard',
 'jaccard_macro',
 'jaccard_micro',
 'jaccard_samples',
 'jaccard_weighted',
 'matthews_corrcoef',
 'max_error',
 'mutual_info_score',
 'neg_brier_score',
 'neg_log_loss',
 'neg_mean_absolute_error',
 'neg_mean_absolute_percentage_error',
 'neg_mean_gamma_deviance',
 'neg_mean_poisson_deviance',
 'neg_mean_squared_error',
 'neg_mean_squared_log_error',
 'neg_median_absolute_error',
 'neg_negative_likelihood_ratio',
 'neg_root_mean_squared_error',
 'normalized_mutual_info_score',
 'positive_likelihood_ratio',
 'precision',
 'precision_macro',
 'precision_micro',
 'precision_samples',
 'precision_weighted',
 'r2',
 'rand_score',
 'recall',
 'recall_macro',
 'recall_micro',
 'recall_samples',
 'recall_weighted',

In [89]:
# A classifier's decision boundary changes when it is optimized for a specific evaluation metric.

In [91]:
train_x, test_x, train_y, test_y = train_test_split(digits.data, digits.target, train_size = 0.75)

In [96]:
train_x.shape

(1347, 64)

In [97]:
# all rows, 2 columns

train_x[:, [20, 59]]

array([[10., 16.],
       [16., 16.],
       [14.,  0.],
       ...,
       [14., 13.],
       [ 6., 16.],
       [16., 11.]])

In [98]:
JITTER = 0.125

x_train = train_x[:, [20, 59]] + np.random.rand(train_x.shape[0], 2) - JITTER
x_test = test_x[:, [20, 59]] + np.random.rand(test_x.shape[0], 2) - JITTER

In [101]:
svc = SVC(kernel = "linear")

In [102]:
gparams = {"class_weight": ["balanced", {1: 2}, {1: 3}, {1: 4}, {1: 5}, {1: 10}, {1: 20}, {1: 50}]}

## ___Precision - Recall Tradeoff___
------------------

In [111]:
train_x, test_x, train_y, test_y = train_test_split(digits.data, digits.target, train_size = 0.65)
JITTER = 0.125
x_train = train_x[:, [20, 59]] + np.random.rand(train_x.shape[0], 2) - JITTER
x_test = test_x[:, [20, 59]] + np.random.rand(test_x.shape[0], 2) - JITTER

In [113]:
svc = SVC(kernel = "linear", class_weight = "balanced").fit(x_train, train_y)

In [115]:
pred_probs = svc.decision_function(x_test)

In [116]:
precision_recall_curve(test_y, pred_probs)

ValueError: multiclass format is not supported

In [117]:
# Man fuck this shit!