###  Created by Luis A. Sanchez-Perez (alejand@umich.edu).
<p><span style="color:green"><b>Copyright &#169;</b> Do not distribute or use without authorization from author.</span></p>

### Model selection of Elastic Nets
Hyperparameter search for an Elastic Net classifier using [GridSearchCV](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html) and [RandomizedSearchCV](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html)

In [1]:
import scipy
import scipy.io
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline
import utils

In [2]:
# Loads dataset
dataset = scipy.io.loadmat('E:/datasets/classification/ionosphere.mat')

In [3]:
# Prints dataset description
dataset['Description']

array(['Ionosphere dataset from the UCI machine learning repository:                   ',
       'http://archive.ics.uci.edu/ml/datasets/Ionosphere                              ',
       'X is a 351x34 real-valued matrix of predictors. Y is a categorical response:   ',
       '"b" for bad radar returns and "g" for good radar returns.                      ',
       'This is a binary classification problem.                                       '],
      dtype='<U79')

In [4]:
# Prepares data for classification
predictors = dataset['X'][:,2:] # the first two columns are useless
positive = (dataset['Y'] == 'g').flatten()
responses = np.zeros(predictors.shape[0])
responses[positive] = 1

In [5]:
# Splits dataset into train/test
X, X_holdout, y, y_holdout = train_test_split(predictors, responses, test_size=0.3, random_state=0)

In [6]:
# Creates pipeline
clf = LogisticRegression(penalty='elasticnet', solver='saga', max_iter=300)
sc = StandardScaler()
estimators = [('normalizer', sc), ('classifier', clf)]
pipe = Pipeline(estimators)
pipe

Pipeline(memory=None,
         steps=[('normalizer',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('classifier',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=300,
                                    multi_class='auto', n_jobs=None,
                                    penalty='elasticnet', random_state=None,
                                    solver='saga', tol=0.0001, verbose=0,
                                    warm_start=False))],
         verbose=False)

### Experiment 1
Performs model selection of the following hyperparameters applied to the bank dataset (customers leaving):
* C (Regularization)
* L1/L2 Ratio

This is perform using a [GridSearchCV](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html).

In [7]:
# Defines grid
hyperparams = [{
    'classifier__C': np.linspace(0.001, 1, 25),
    'classifier__l1_ratio': np.linspace(0, 0.9, 10)
}]

In [8]:
# Performs grid search
validator = GridSearchCV(pipe, cv=5, param_grid=hyperparams, scoring='accuracy', n_jobs=-1, verbose=1)
results = validator.fit(X,y)

Fitting 5 folds for each of 250 candidates, totalling 1250 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done 1025 tasks      | elapsed:    3.8s
[Parallel(n_jobs=-1)]: Done 1250 out of 1250 | elapsed:    4.3s finished


In [9]:
# Report results
utils.report_search(validator.cv_results_)


Model with rank: 1
Mean validation score: 0.878 (std: 0.029)
Parameters: {'classifier__C': 0.209125, 'classifier__l1_ratio': 0.0}

Model with rank: 2
Mean validation score: 0.873 (std: 0.033)
Parameters: {'classifier__C': 0.292375, 'classifier__l1_ratio': 0.0}

Model with rank: 3
Mean validation score: 0.873 (std: 0.030)
Parameters: {'classifier__C': 0.1675, 'classifier__l1_ratio': 0.0}


In [10]:
# Selects best configuration after search
best = validator.best_estimator_

In [11]:
# Retrains the model now on the available dataset (without validation/dev set). This step is optional!!!
best.fit(X, y)

Pipeline(memory=None,
         steps=[('normalizer',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('classifier',
                 LogisticRegression(C=0.209125, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=0.0, max_iter=300,
                                    multi_class='auto', n_jobs=None,
                                    penalty='elasticnet', random_state=None,
                                    solver='saga', tol=0.0001, verbose=0,
                                    warm_start=False))],
         verbose=False)

In [12]:
# Training performance
y_pred = best.predict(X)
utils.report_classification(y, y_pred, avg='macro', title='Train')

Train (Metrics): 

Accuracy: 0.91
F1 Score: 0.90
Recall: 0.89
Precision: 0.92

Confusion Matrix:
 [[ 66  16]
 [  5 158]]


In [13]:
# Holdout performance
y_pred = best.predict(X_holdout)
utils.report_classification(y_holdout, y_pred, avg='macro', title='Holdout')

Holdout (Metrics): 

Accuracy: 0.86
F1 Score: 0.85
Recall: 0.83
Precision: 0.89

Confusion Matrix:
 [[30 14]
 [ 1 61]]


### Experiment 2
Performs model selection of the following hyperparameters applied to the bank dataset (customers leaving):
* C (Regularization)
* L1/L2 Ratio

This is perform using a [RandomizedSearchCV](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html).

In [14]:
# Defines parameters distributions
hyperparams_dist = [{
    'classifier__C': scipy.stats.distributions.uniform(loc=0, scale=0.99), # uniform [0, 0.99]
    'classifier__l1_ratio': scipy.stats.distributions.uniform(), # uniform [0, 1]
}]

In [15]:
# Performs randomized search
validator = RandomizedSearchCV(pipe, cv=5, param_distributions=hyperparams,
                               scoring='accuracy', n_jobs=-1, verbose=1, n_iter=20)
results = validator.fit(X,y)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:    0.0s


Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.1s finished


In [16]:
# Report results
utils.report_search(validator.cv_results_)


Model with rank: 1
Mean validation score: 0.873 (std: 0.030)
Parameters: {'classifier__l1_ratio': 0.0, 'classifier__C': 0.1675}

Model with rank: 2
Mean validation score: 0.869 (std: 0.036)
Parameters: {'classifier__l1_ratio': 0.0, 'classifier__C': 0.12587500000000001}

Model with rank: 3
Mean validation score: 0.865 (std: 0.036)
Parameters: {'classifier__l1_ratio': 0.1, 'classifier__C': 0.875125}


In [17]:
# Selects best configuration after search
best = validator.best_estimator_

In [18]:
# Retrains the model now on the available dataset (without validation/dev set). This step is optional!!!
best.fit(X, y)

Pipeline(memory=None,
         steps=[('normalizer',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('classifier',
                 LogisticRegression(C=0.1675, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=0.0, max_iter=300,
                                    multi_class='auto', n_jobs=None,
                                    penalty='elasticnet', random_state=None,
                                    solver='saga', tol=0.0001, verbose=0,
                                    warm_start=False))],
         verbose=False)

In [19]:
# Training performance
y_pred = best.predict(X)
utils.report_classification(y, y_pred, avg='macro', title='Train')

Train (Metrics): 

Accuracy: 0.91
F1 Score: 0.89
Recall: 0.88
Precision: 0.91

Confusion Matrix:
 [[ 65  17]
 [  6 157]]


In [20]:
# Holdout performance
y_pred = best.predict(X_holdout)
utils.report_classification(y_holdout, y_pred, avg='macro', title='Holdout')

Holdout (Metrics): 

Accuracy: 0.86
F1 Score: 0.85
Recall: 0.83
Precision: 0.89

Confusion Matrix:
 [[30 14]
 [ 1 61]]
