# tutorial 4: in-pipeline hyperparameter screens

This tutorial shows you how to use the **GridSearchEnsemble** class to:

* screen ML model hyperparameters during model fitting
* make ensemble predictions using the results of a hyperparameter screen

## Stacked generalization with parameter selection

In this example, internal cross-validation is used to estimate the quality of each set of hyperparameters as well as to generate features for meta-prediction with a support vector machine.  The top two parameter sets are chosen to create the final model.

In [3]:
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
import pipecaster as pc

# Hyperparameter grid to screen.  Each combination of these values shows up
# as one row in the table returned by get_screen_results().
screen_specs = {
     'learning_rate':[0.1, 10],
     'n_estimators':[2, 10],
}

# Seed the synthetic dataset so the same samples are generated on every run.
# Re-run the notebook to refresh the recorded outputs after this change.
X, y = make_classification(random_state=0)

# Screen the grid with internal cross-validation, keep the two top-ranked
# parameter sets (k=2), and combine their predictions with an SVC
# meta-predictor.
clf = pc.GridSearchEnsemble(
                 param_dict=screen_specs,
                 base_predictor_cls=GradientBoostingClassifier,
                 meta_predictor=SVC(),
                 internal_cv=5, scorer='auto',
                 score_selector=pc.RankScoreSelector(k=2),
                 base_processes='max')
clf.fit(X, y)

# Last expression of the cell: display the screen results table.
clf.get_screen_results()

Unnamed: 0_level_0,selections,score
parameters,Unnamed: 1_level_1,Unnamed: 2_level_1
"{'learning_rate': 10, 'n_estimators': 10}",+,0.84974
"{'learning_rate': 0.1, 'n_estimators': 2}",-,0.839536
"{'learning_rate': 10, 'n_estimators': 2}",+,0.839536
"{'learning_rate': 0.1, 'n_estimators': 10}",-,0.829332


In [3]:
# 3-fold cross-validation of the whole screen-plus-ensemble estimator,
# scored by balanced accuracy.
cross_val_score(
    clf, X, y,
    scoring='balanced_accuracy',
    cv=3,
)

array([0.79861111, 0.85294118, 0.81801471])

## Parameter selection (without ensemble prediction)

In this example, the meta-predictor is dropped and the best parameter set is used to make the final model.

In [1]:
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
import pipecaster as pc

# Hyperparameter grid to screen.  Each combination of these values shows up
# as one row in the table returned by get_screen_results().
screen_specs = {
     'learning_rate':[0.1, 10],
     'n_estimators':[2, 10],
}

# Seed the synthetic dataset so the same samples are generated on every run.
# Re-run the notebook to refresh the recorded outputs after this change.
X, y = make_classification(random_state=0)

# No meta_predictor is passed here: only the single top-ranked parameter set
# (k=1) is kept and used for the final model.
clf = pc.GridSearchEnsemble(
                 param_dict=screen_specs,
                 base_predictor_cls=GradientBoostingClassifier,
                 internal_cv=5, scorer='auto',
                 score_selector=pc.RankScoreSelector(k=1),
                 base_processes='max')
clf.fit(X, y)

# Last expression of the cell: display the screen results table.
clf.get_screen_results()

File descriptor limit 256 is too low for production servers and may result in connection errors. At least 8192 is recommended. --- Fix with 'ulimit -n 8192'
2021-03-01 21:37:19,596	INFO services.py:1173 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8266[39m[22m


Unnamed: 0_level_0,selections,score
parameters,Unnamed: 1_level_1,Unnamed: 2_level_1
"{'learning_rate': 0.1, 'n_estimators': 2}",+,0.89
"{'learning_rate': 10, 'n_estimators': 10}",-,0.88
"{'learning_rate': 10, 'n_estimators': 2}",-,0.87
"{'learning_rate': 0.1, 'n_estimators': 10}",-,0.86


In [2]:
# 3-fold cross-validation of the parameter-selecting estimator,
# scored by balanced accuracy.
cross_val_score(
    clf, X, y,
    scoring='balanced_accuracy',
    cv=3,
)

array([0.97058824, 0.97058824, 0.84375   ])

### Screening screens: the double stack

In [27]:
### Scratch log of the configurations tried while debugging the double stack.
### Sections (1)-(3) each rebind `ensemble_clf`, so only the last assignment
### is in effect once this cell has run.  The estimator classes used here
### must already be imported (see the import cell further down).

### (0) FAILED
screen_specs = {
     'learning_rate':[.1, 1],
     'n_estimators':[5, 10]
}

screen_clf = pc.GridSearchEnsemble(
                 screen_specs, GradientBoostingClassifier, SVC(),
                 internal_cv=5, scorer='auto',
                 score_selector=pc.RankScoreSelector(k=2))

### (1) PASSED
predictors = [LogisticRegression(),
              KNeighborsClassifier(),  GradientBoostingClassifier(),
              RandomForestClassifier(), GaussianNB()]

ensemble_clf = pc.Ensemble(
                 base_predictors=predictors,
                 meta_predictor=None,
                 internal_cv=5,
                 scorer='auto',
                 base_processes='max')

### (2) FAILED, wrong inputs selected, all channels cv scores exactly 0.5
base_predictors = [LogisticRegression(),
              KNeighborsClassifier(), GradientBoostingClassifier(),
              RandomForestClassifier(), GaussianNB()]

ensemble_clf = pc.Ensemble(
                 base_predictors=base_predictors, meta_predictor=SVC(),
                 internal_cv=5, scorer='auto',
                 score_selector=pc.RankScoreSelector(k=3))

### (3) MIXED: correct inputs selected, but all cv scores were 
#              identical for the different base predictors

base_predictors = [LogisticRegression(),
              KNeighborsClassifier(), GradientBoostingClassifier(),
              RandomForestClassifier(), GaussianNB()]

# Use the list defined for this section; the original passed `predictors`
# from section (1), which left the list above unused.
ensemble_clf = pc.Ensemble(
                 base_predictors=base_predictors, meta_predictor=pc.SoftVotingClassifier(),
                 internal_cv=5, scorer='auto',
                 score_selector=pc.RankScoreSelector(k=3))

In [1]:
from sklearn.datasets import make_classification
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
import pipecaster as pc

In [14]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
import pipecaster as pc

# Synthetic multi-input dataset: 3 informative and 7 random input matrices,
# i.e. 10 channels in total.  X_types records which kind each channel is.
Xs, y, X_types = pc.make_multi_input_classification(
    n_informative_Xs=3,
    n_random_Xs=7,
    class_sep=10,
)

# Base learners for the inner (per-channel) ensemble.
predictors = [
    LogisticRegression(),
    KNeighborsClassifier(),
    GradientBoostingClassifier(),
    RandomForestClassifier(),
    GaussianNB(),
]

# Inner stack: screen the base learners, keep the top 3, combine with an SVC.
ensemble_clf = pc.Ensemble(
    base_predictors=predictors,
    meta_predictor=SVC(),
    internal_cv=5,
    scorer='auto',
    score_selector=pc.RankScoreSelector(k=3),
)

# Outer stack: apply the inner ensemble across the 10 channels, keep the
# top 3 channels, and combine them with a second SVC.
clf = pc.MultichannelPipeline(n_channels=10)
clf.add_layer(
    pc.ChannelEnsemble(
        ensemble_clf,
        SVC(),
        internal_cv=5,
        scorer='auto',
        score_selector=pc.RankScoreSelector(k=3),
    ),
    pipe_processes='max',
)

# 3-fold cross-validation of the complete double stack.
pc.cross_val_score(clf, Xs, y, cv=3)

[0.5, 0.39889705882352944, 0.6691176470588236]

In [15]:
# Fit the multichannel pipeline on all samples so that the fitted models
# can be inspected in the cells that follow.
clf.fit(Xs, y)

Unnamed: 0_level_0,layer_0,out_0
channel,Unnamed: 1_level_1,Unnamed: 2_level_1
0,ChannelEnsemble,→
1,▽,
2,▽,
3,▽,
4,▽,
5,▽,
6,▽,
7,▽,
8,▽,
9,▽,


In [16]:
# Retrieve the model at position (0, 0) of the fitted pipeline -- the
# ChannelEnsemble added as the only layer -- and tabulate its channel screen.
channel_ensemble = clf.get_model(0, 0)
df = channel_ensemble.get_screen_results()
# Annotate each channel with its ground-truth type (informative / random)
# so that the selections can be checked against what should have been picked.
df['inputs'] = X_types
df

Unnamed: 0_level_0,performance,selections,inputs
channel,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.5,-,random
1,0.5,-,informative
2,0.5,-,random
3,0.5,-,informative
4,0.5,-,informative
5,0.5,-,random
6,0.5,-,random
7,0.5,+++,random
8,0.5,+++,random
9,0.5,+++,random


In [17]:
# Raw per-channel scores stored on the fitted ChannelEnsemble (one value per
# channel); compare with the 'performance' column of the screen results.
channel_ensemble.scores_

(0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5)

In [18]:
# Unwrap the per-channel models held by the fitted ChannelEnsemble.
# Entries that are None are reported as "(not selected)" below.
ensemble_clfs = [pc.transform_wrappers.unwrap_model(m)
                 for m in clf.get_model(0, 0).base_models]

# The loop variable is deliberately NOT called `ensemble_clf`: that name is
# the notebook-level ensemble template used to build the pipeline, and
# rebinding it here would silently change state seen by other cells.
for i, channel_clf in enumerate(ensemble_clfs):
    print('\nchannel {}, input = {}'.format(i, X_types[i]))
    if channel_clf is None:
        print('(not selected)')
        continue
    # Inner-ensemble scores and model selections for this channel.
    print(channel_clf.scores_)
    print(channel_clf.get_screen_results())


channel 0, input = random
(not selected)

channel 1, input = informative
(not selected)

channel 2, input = random
(not selected)

channel 3, input = informative
(not selected)

channel 4, input = informative
(not selected)

channel 5, input = random
(not selected)

channel 6, input = random
(not selected)

channel 7, input = random
(0.56, 0.45, 0.44, 0.44, 0.46)
                              performance selections
model                                               
LogisticRegression()                 0.56        +++
KNeighborsClassifier()               0.45        +++
GradientBoostingClassifier()         0.44          -
RandomForestClassifier()             0.44          -
GaussianNB()                         0.46        +++

channel 8, input = random
(0.54, 0.56, 0.51, 0.51, 0.56)
                              performance selections
model                                               
LogisticRegression()                 0.54        +++
KNeighborsClassifier()               0.56    

In [19]:
# NOTE(review): `toolbelt` is not referenced anywhere else in this notebook;
# presumably a personal helper used to signal that the run has finished --
# confirm it is available (or drop this cell) before sharing the tutorial.
import toolbelt
toolbelt.play_rick1()