In [37]:
# Importing the necessary libraries.
import time

import numpy as np
from scipy.stats import uniform
from sklearn import linear_model, datasets
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler

import warnings

# Suppress every warning category for the remainder of the session.
warnings.simplefilter("ignore")

# Seed NumPy's legacy global generator so draws taken from it are repeatable.
np.random.seed(seed=0)

<h2 style = "font-size:28px;font-family:Calibri">
    Selecting Best Models using Exhaustive Search
</h2>

In [3]:
# Load the bundled Iris dataset and unpack its feature matrix and class labels.
iris = datasets.load_iris()
features, target = iris.data, iris.target

In [4]:
# Exhaustive search: try every (C, penalty) pair for a liblinear logistic
# regression and score each candidate with 5-fold cross-validation.
logistic = linear_model.LogisticRegression(solver="liblinear", max_iter=500)

# Candidate values: both penalties, and 10 values of C spaced
# logarithmically from 10**0 to 10**4.
penalty = ["l1", "l2"]
C = np.logspace(start=0, stop=4, num=10)
hyperparams = {"C": C, "penalty": penalty}

gridsearch = GridSearchCV(estimator=logistic, param_grid=hyperparams, cv=5, verbose=0)

# fit() returns the fitted search object, so best_model is the search itself.
best_model = gridsearch.fit(features, target)
print(best_model.best_estimator_)

LogisticRegression(C=7.742636826811269, max_iter=500, penalty='l1',
                   solver='liblinear')


In [5]:
# Read the winning hyperparameters off the refit best estimator.
best_params = best_model.best_estimator_.get_params()
print(f"Best Penalty: {best_params['penalty']}")
print(f"Best C: {best_params['C']}")

Best Penalty: l1
Best C: 7.742636826811269


In [6]:
# Predict labels for the same samples the search was fitted on; the search
# object delegates predict() to its refit best estimator.
best_model.predict(X=features)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

<h2 style = "font-size:28px;font-family:Calibri">
    Selecting Best Models using Randomized Search
</h2>

In [7]:
# Randomized search: instead of a fixed grid, draw C from a continuous
# uniform distribution on [0, 4] and pick the penalty from the same two
# options as before, evaluating 100 sampled candidates with 5-fold CV.
C = uniform(loc=0, scale=4)
hyperparams = {"C": C, "penalty": penalty}

randomizedsearch = RandomizedSearchCV(
    estimator=logistic,
    param_distributions=hyperparams,
    n_iter=100,
    cv=5,
    n_jobs=-1,
    verbose=0,
    random_state=1,
)

best_model = randomizedsearch.fit(features, target)
print(best_model.best_estimator_)

LogisticRegression(C=1.668088018810296, max_iter=500, penalty='l1',
                   solver='liblinear')


In [8]:

# Illustrate the search distribution: ten draws from a uniform on [0, 4].
uniform(0, 4).rvs(size=10)

array([1.75440605, 3.95349535, 0.40817924, 0.83550702, 0.64523807,
       2.6124333 , 1.01316641, 1.86524309, 0.97770237, 0.63587833])

In [9]:
# Read the winning hyperparameters off the refit best estimator.
best_params = best_model.best_estimator_.get_params()
print(f"Best Penalty: {best_params['penalty']}")
print(f"Best C: {best_params['C']}")

Best Penalty: l1
Best C: 1.668088018810296


In [10]:
# Predict labels for the same samples the search was fitted on; the search
# object delegates predict() to its refit best estimator.
best_model.predict(X=features)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2,
       2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

The number of sampled combinations of hyperparameters (i.e., the number
 of candidate models trained) is specified with the n_iter (number of
 iterations) setting. It’s worth noting that *RandomizedSearchCV* isn’t
 inherently faster than *GridSearchCV*, but can often achieve comparable
 performance to GridSearchCV in less time simply by testing fewer
 combinations.

<h2 style = "font-size:28px;font-family:Calibri">
    Selecting Best Models from Multiple Learning Algorithms
</h2>

In [11]:
# Search across two learning algorithms at once. The pipeline has a single
# "classifier" step; each sub-grid swaps in a different estimator for that
# step together with that estimator's own hyperparameters.
pipe = Pipeline(steps=[("classifier", RandomForestClassifier())])

# Sub-grid 1: liblinear logistic regression over penalty and C.
logistic_grid = {
    "classifier": [linear_model.LogisticRegression(max_iter=500, solver="liblinear")],
    "classifier__penalty": ["l1", "l2"],
    "classifier__C": np.logspace(start=0, stop=4, num=10),
}

# Sub-grid 2: random forest over ensemble size and features tried per split.
forest_grid = {
    "classifier": [RandomForestClassifier()],
    "classifier__n_estimators": [10, 100, 1000],
    "classifier__max_features": [1, 2, 3],
}

search_space = [logistic_grid, forest_grid]

gridsearch = GridSearchCV(estimator=pipe, param_grid=search_space, cv=5, verbose=0)
best_model = gridsearch.fit(features, target)
print(best_model.best_estimator_)

Pipeline(steps=[('classifier',
                 LogisticRegression(C=7.742636826811269, max_iter=500,
                                    penalty='l1', solver='liblinear'))])


In [12]:
# Show which estimator ended up in the pipeline's "classifier" step.
print(best_model.best_estimator_.named_steps["classifier"])

LogisticRegression(C=7.742636826811269, max_iter=500, penalty='l1',
                   solver='liblinear')


In [13]:
# Predict labels for the same samples the search was fitted on; the search
# object delegates predict() to its refit best pipeline.
best_model.predict(X=features)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

<h2 style = "font-size:28px;font-family:Calibri">
    Selecting Best Models When Preprocessing
</h2>
We use FeatureUnion to combine two preprocessing steps: standardizing the
feature values (StandardScaler) and Principal Component Analysis (PCA).
This object is called preprocess and contains both of our preprocessing steps.


In [14]:
# Tune a preprocessing choice together with the classifier. The feature
# union applies the scaler and PCA to the same input and concatenates
# their outputs.
preprocess = FeatureUnion(
    transformer_list=[("std", StandardScaler()), ("pca", PCA())]
)

pipe = Pipeline(
    steps=[
        ("preprocess", preprocess),
        ("classifier", linear_model.LogisticRegression(solver="liblinear", max_iter=1000)),
    ]
)

# Double-underscore keys address nested parameters: step -> transformer -> parameter.
search_space = [
    {
        "preprocess__pca__n_components": [1, 2, 3],
        "classifier__penalty": ["l1", "l2"],
        "classifier__C": np.logspace(start=0, stop=4, num=10),
    }
]

clf = GridSearchCV(estimator=pipe, param_grid=search_space, cv=5, n_jobs=-1, verbose=0)
best_model = clf.fit(features, target)
print(best_model.best_estimator_)

Pipeline(steps=[('preprocess',
                 FeatureUnion(transformer_list=[('std', StandardScaler()),
                                                ('pca', PCA(n_components=1))])),
                ('classifier',
                 LogisticRegression(C=7.742636826811269, max_iter=1000,
                                    penalty='l1', solver='liblinear'))])


In [17]:
# Number of principal components selected by the search.
print(best_model.best_params_["preprocess__pca__n_components"])

1


<h2 style = "font-size:28px;font-family:Calibri">
    Speeding the Model Selection with Parallelization
</h2>

Use all the cores on your machine by setting n_jobs=-1, which trains
 multiple models simultaneously.

In [27]:
# Large exhaustive search (1000 values of C x 2 penalties, 5-fold CV) run
# with n_jobs=-1 so candidate fits are spread over all available cores.
# The wall-clock time of the fit is measured and printed so the effect of
# parallelization can be compared against a single-core run.
logistic = linear_model.LogisticRegression(max_iter = 500, solver = 'liblinear')
C = np.logspace(0, 4, 1000)
hyperparams = dict(C = C, penalty = penalty)

gridsearch = GridSearchCV(logistic, hyperparams, cv = 5, n_jobs = -1, verbose = 2)

# perf_counter is a monotonic clock intended for measuring short durations.
start_time = time.perf_counter()
best_model = gridsearch.fit(features, target)
elapsed_parallel = time.perf_counter() - start_time

print(best_model.best_estimator_)
print(f"Elapsed with n_jobs=-1: {elapsed_parallel:.2f} s")

Fitting 5 folds for each of 2000 candidates, totalling 10000 fits
LogisticRegression(C=5.926151812475554, max_iter=500, penalty='l1',
                   solver='liblinear')


In [29]:
# Same search restricted to a single core (n_jobs = 1). The wall-clock
# time of the fit is measured and printed so it can be compared with the
# n_jobs = -1 run instead of being assumed to be slower.
gridsearch = GridSearchCV(logistic, hyperparams, cv = 5, n_jobs = 1, verbose = 1)

# perf_counter is a monotonic clock intended for measuring short durations.
start_time = time.perf_counter()
best_model = gridsearch.fit(features, target)
elapsed_serial = time.perf_counter() - start_time

print(best_model.best_estimator_)
print(f"Elapsed with n_jobs=1: {elapsed_serial:.2f} s")

Fitting 5 folds for each of 2000 candidates, totalling 10000 fits
LogisticRegression(C=5.926151812475554, max_iter=500, penalty='l1',
                   solver='liblinear')


<h2 style = "font-size:28px;font-family:Calibri">
    Speeding Up Model Selection Using Algorithm-Specific Methods
</h2>

In [32]:
# Algorithm-specific shortcut: LogisticRegressionCV cross-validates over its
# own candidate C values (Cs=100 requests 100 of them) without a separate
# search object. fit() returns the estimator itself, so it can be chained.
logit = linear_model.LogisticRegressionCV(
    solver="liblinear",
    max_iter=500,
    Cs=100,
).fit(features, target)
print(logit)

LogisticRegressionCV(Cs=100, max_iter=500, solver='liblinear')


<h2 style = "font-size:28px;font-family:Calibri">
    Evaluating Performance After Model Selection
</h2>

In [38]:
# Nested cross-validation: the inner GridSearchCV picks C on each training
# split, while the outer cross_val_score evaluates the whole search
# procedure on held-out folds. The cell displays the mean outer score.
C = np.logspace(start=0, stop=4, num=20)
hyperparams = {"C": C}
gridsearch = GridSearchCV(
    estimator=logistic,
    param_grid=hyperparams,
    cv=5,
    n_jobs=-1,
    verbose=0,
)
cross_val_score(gridsearch, features, target).mean()

0.9733333333333334

In [39]:
# Fit the search on the full dataset; fit() returns the search object itself.
best_model = gridsearch.fit(X=features, y=target)

In [41]:
# Per-fold outer scores of the nested cross-validation (one value per fold).
cross_val_score(estimator=gridsearch, X=features, y=target)

array([1.        , 1.        , 0.93333333, 0.93333333, 1.        ])

<h2 style = "font-size:28px;font-family:Calibri">
    End of Day 9 :)
</h2>