# tutorial 4: in-pipeline hyperparameter screens

This tutorial shows you how to use the **GridSearchEnsemble** class to:

* screen ML model hyperparameters during model fitting
* make ensemble predictions using the results of a hyperparameter screen

## Stacked generalization with parameter selection

In this example, internal cross validation is used to estimate the quality of each set of hyperparameters as well as to generate features for meta-prediction with a support vector machine.  The top two parameter sets are chosen to create the final model.

In [3]:
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
import pipecaster as pc

# Hyperparameter grid to screen.  The results table displayed at the end of
# this cell has one row per combination of these values (2 x 2 = 4 rows).
screen_specs = {
     'learning_rate':[0.1, 10],
     'n_estimators':[2, 10],
}

# Seed the synthetic dataset so that every run of this cell, and of the cross
# validation cell that follows, works on the same samples.
X, y = make_classification(random_state=0)

clf = pc.GridSearchEnsemble(
                 param_dict=screen_specs,
                 base_predictor_cls=GradientBoostingClassifier,
                 # support vector machine used for meta-prediction
                 meta_predictor=SVC(),
                 internal_cv=5, scorer='auto',
                 # keep the two top-ranked parameter sets
                 score_selector=pc.RankScoreSelector(k=2),
                 # NOTE(review): 'max' presumably means "use all available
                 # processes" -- confirm against the pipecaster docs
                 base_processes='max')
clf.fit(X, y)

# Last expression of the cell: shows the score and the selection status of
# each screened parameter set.
clf.get_screen_results()

Unnamed: 0_level_0,selections,score
parameters,Unnamed: 1_level_1,Unnamed: 2_level_1
"{'learning_rate': 10, 'n_estimators': 10}",+,0.84974
"{'learning_rate': 0.1, 'n_estimators': 2}",-,0.839536
"{'learning_rate': 10, 'n_estimators': 2}",+,0.839536
"{'learning_rate': 0.1, 'n_estimators': 10}",-,0.829332


In [3]:
# 3-fold cross validation of the whole GridSearchEnsemble, scored by balanced accuracy.
cross_val_score(clf, X, y, scoring='balanced_accuracy', cv=3)

array([0.79861111, 0.85294118, 0.81801471])

## Parameter selection (without ensemble prediction)

In this example, the meta-predictor is dropped and the best parameter set is used to make the final model.

In [1]:
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
import pipecaster as pc

# Hyperparameter grid to screen.  The results table displayed at the end of
# this cell has one row per combination of these values (2 x 2 = 4 rows).
screen_specs = {
     'learning_rate':[0.1, 10],
     'n_estimators':[2, 10],
}

# Seed the synthetic dataset so that every run of this cell, and of the cross
# validation cell that follows, works on the same samples.
X, y = make_classification(random_state=0)

# No meta_predictor is passed here, and the selector keeps a single parameter
# set (k=1): the best-scoring set alone makes the final model.
clf = pc.GridSearchEnsemble(
                 param_dict=screen_specs,
                 base_predictor_cls=GradientBoostingClassifier,
                 internal_cv=5, scorer='auto',
                 score_selector=pc.RankScoreSelector(k=1),
                 # NOTE(review): 'max' presumably means "use all available
                 # processes" -- confirm against the pipecaster docs
                 base_processes='max')
clf.fit(X, y)

# Last expression of the cell: shows the score and the selection status of
# each screened parameter set.
clf.get_screen_results()

File descriptor limit 256 is too low for production servers and may result in connection errors. At least 8192 is recommended. --- Fix with 'ulimit -n 8192'
2021-03-01 21:37:19,596	INFO services.py:1173 -- View the Ray dashboard at http://127.0.0.1:8266


Unnamed: 0_level_0,selections,score
parameters,Unnamed: 1_level_1,Unnamed: 2_level_1
"{'learning_rate': 0.1, 'n_estimators': 2}",+,0.89
"{'learning_rate': 10, 'n_estimators': 10}",-,0.88
"{'learning_rate': 10, 'n_estimators': 2}",-,0.87
"{'learning_rate': 0.1, 'n_estimators': 10}",-,0.86


In [2]:
# 3-fold cross validation of the parameter-selecting model, scored by balanced accuracy.
cross_val_score(clf, X, y, scoring='balanced_accuracy', cv=3)

array([0.97058824, 0.97058824, 0.84375   ])

### Screening screens: the double stack

In [None]:
# Imports for every name used in this cell, so that it also runs on a fresh
# kernel: the notebook's shared import cell only appears after this one.
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
import pipecaster as pc

# This cell collects alternative definitions of `channel_clf`, each preceded
# by the author's note on how that variant behaved.  Every assignment
# overwrites the previous one, so only the last definition is in effect once
# the whole cell has run.

# (0) FAILED: channel selection failed, channel ensemble scores all 0.5,
# ensemble scores mixed and track well with channel information content.

screen_specs = {
     'learning_rate':[.1, 1],
     'n_estimators':[5, 10]
}

channel_clf = pc.GridSearchEnsemble(
                 screen_specs, GradientBoostingClassifier, SVC(),
                 internal_cv=3, scorer='auto',
                 score_selector=pc.RankScoreSelector(k=2))

# (1) PASSED, correct channel selected, channel ensemble scores track with information,
# ensemble scores track with information content and are heterogeneous
base_predictors = [LogisticRegression(),
              KNeighborsClassifier(),  GradientBoostingClassifier(),
              RandomForestClassifier(), GaussianNB()]

channel_clf = pc.Ensemble(
                 base_predictors=base_predictors,
                 meta_predictor=None,
                 internal_cv=3,
                 scorer='auto',
                 base_processes=1)

### (2) PASS
base_predictors = [LogisticRegression(),
              KNeighborsClassifier(), GradientBoostingClassifier(),
              RandomForestClassifier(), GaussianNB()]

channel_clf = pc.Ensemble(
                 base_predictors=base_predictors, meta_predictor=pc.SoftVotingClassifier(),
                 internal_cv=3, scorer='auto',
                 score_selector=pc.RankScoreSelector(k=3))

# (3) FAILED: wrong channels selected, channel ensemble scores all 0.5,
# ensemble scores heterogeneous and track well with input type
base_predictors = [LogisticRegression(),
              KNeighborsClassifier(), GradientBoostingClassifier(),
              RandomForestClassifier(), GaussianNB()]

channel_clf = pc.Ensemble(
                 base_predictors=base_predictors, meta_predictor=SVC(),
                 internal_cv=3, scorer='auto',
                 score_selector=pc.RankScoreSelector(k=3))

# (4) PASS
base_predictors = [LogisticRegression(),
              KNeighborsClassifier(), GradientBoostingClassifier(),
              RandomForestClassifier(), GaussianNB()]

channel_clf = pc.Ensemble(
                 base_predictors=base_predictors, meta_predictor=GradientBoostingClassifier(),
                 internal_cv=3, scorer='auto',
                 score_selector=pc.RankScoreSelector(k=3))

# (5) PASS
# This is the definition that remains bound to `channel_clf` after the cell.
base_predictors = [LogisticRegression(),
              KNeighborsClassifier(), GradientBoostingClassifier(),
              RandomForestClassifier(), GaussianNB()]

channel_clf = pc.Ensemble(
                 base_predictors=base_predictors, meta_predictor=SVC(probability=True),
                 internal_cv=3, scorer='auto',
                 score_selector=pc.RankScoreSelector(k=5))

In [1]:
from sklearn.datasets import make_classification
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
import pipecaster as pc

In [2]:
# Author's recorded outcome, labelled experiment (3) -- FAILED: wrong channels
# selected, channel ensemble scores all 0.5, ensemble scores heterogeneous and
# track well with input type.
# NOTE(review): the meta-predictor in this cell is LogisticRegression();
# confirm that the outcome above was recorded for this configuration.
base_predictors = [
    LogisticRegression(),
    KNeighborsClassifier(),
    GradientBoostingClassifier(),
    RandomForestClassifier(),
    GaussianNB(),
]

# Per-channel ensemble: the three top-ranked base predictors are selected.
channel_clf = pc.Ensemble(
    base_predictors=base_predictors,
    meta_predictor=LogisticRegression(),
    internal_cv=3,
    scorer='auto',
    score_selector=pc.RankScoreSelector(k=3),
)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
import pipecaster as pc

# Synthetic multi-input problem: 3 informative and 7 random input matrices.
# X_types labels each input and is used further down to annotate the results.
Xs, y, X_types = pc.make_multi_input_classification(
    n_informative_Xs=3,
    n_random_Xs=7,
    class_sep=1,
)

# Ten-channel pipeline with a single layer: a ChannelEnsemble built from the
# `channel_clf` template defined in an earlier cell, with an SVC on top and
# the three top-ranked channels selected.
clf = pc.MultichannelPipeline(n_channels=10)
clf.add_layer(
    pc.ChannelEnsemble(
        channel_clf,
        SVC(),
        internal_cv=5,
        scorer='auto',
        score_selector=pc.RankScoreSelector(k=3),
    ),
    pipe_processes='max',
)

# Last expression of the cell: 3-fold cross validation of the whole pipeline.
pc.cross_val_score(clf, Xs, y, cv=3)

In [None]:
# Fit the multichannel pipeline on the full dataset so its models can be inspected below.
clf.fit(Xs, y)

In [5]:
# Fetch a fitted model from the pipeline.
# NOTE(review): the arguments (0, 0) presumably mean layer 0, channel 0 -- confirm.
channel_ensemble = clf.get_model(0, 0)
df = channel_ensemble.get_screen_results()
# Add the input type of each channel next to its performance and selection status.
df['inputs'] = X_types
df

Unnamed: 0_level_0,performance,selections,inputs
channel,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.5,-,random
1,0.5,-,informative
2,0.5,-,informative
3,0.5,-,random
4,0.5,-,informative
5,0.5,-,random
6,0.5,-,random
7,0.5,+++,random
8,0.5,+++,random
9,0.5,+++,random


In [6]:
# Raw per-channel scores stored on the fitted channel ensemble.
channel_ensemble.scores_

[0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5]

In [7]:
# Collect the base models of the fitted channel ensemble and strip their
# transform wrappers.  Entries that come back as None are reported as
# "(not selected)" below.
channel_clfs = clf.get_model(0, 0).base_models
channel_clfs = [pc.transform_wrappers.unwrap_model(m) for m in channel_clfs]

# The loop variable is deliberately not named `channel_clf`: that global holds
# the unfitted template handed to pc.ChannelEnsemble when the pipeline is
# built, and rebinding it here would silently change what that cell uses if it
# is run again.
for i, fitted_clf in enumerate(channel_clfs):
    print('\nchannel {}, input = {}'.format(i, X_types[i]))
    if fitted_clf is None:
        print('(not selected)')
        continue
    # Scores of the models inside this channel, then the same as a table.
    print(fitted_clf.scores_)
    print(fitted_clf.get_screen_results())


channel 0, input = random
(not selected)

channel 1, input = informative
(not selected)

channel 2, input = informative
(not selected)

channel 3, input = random
(not selected)

channel 4, input = informative
(not selected)

channel 5, input = random
(not selected)

channel 6, input = random
(not selected)

channel 7, input = random
[0.5, 0.53, 0.5, 0.47, 0.43000000000000005]
                              performance selections
model                                               
LogisticRegression()                 0.50        +++
KNeighborsClassifier()               0.53        +++
GradientBoostingClassifier()         0.50        +++
RandomForestClassifier()             0.47          -
GaussianNB()                         0.43          -

channel 8, input = random
[0.5, 0.52, 0.5, 0.5, 0.5]
                              performance selections
model                                               
LogisticRegression()                 0.50          -
KNeighborsClassifier()              

In [8]:
# NOTE(review): `toolbelt` is not used anywhere else in this notebook and is
# not part of the tutorial's subject; presumably a personal helper that signals
# the end of a long run -- confirm, and drop this cell before publishing.
import toolbelt
toolbelt.play_rick1()

In [41]:
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.datasets import make_classification, make_regression
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import roc_auc_score, balanced_accuracy_score, explained_variance_score
import pipecaster as pc
import numpy as np

# Synthetic regression problem: 500 samples, 20 features, 3 of them informative.
X, y = make_regression(
    n_samples=500,
    n_features=20,
    n_informative=3,
)

# Two regressors to compare in the cells that follow.  The `clf*` names are
# kept as they are because those cells refer to them.
clf1 = LinearRegression()
clf2 = GradientBoostingRegressor(n_estimators=10)

In [42]:
# Mean explained variance of the linear regressor across sklearn's cross-validation folds.
np.mean(cross_val_score(clf1, X, y, scoring='explained_variance'))

1.0

In [43]:
# Mean explained variance of the gradient boosting regressor across sklearn's cross-validation folds.
np.mean(cross_val_score(clf2, X, y, scoring='explained_variance'))

0.6934843476701928

In [44]:
# Same measurement for the linear regressor, this time through pipecaster's
# cross_val_score with the `predict` method and the explained variance metric.
np.mean(pc.cross_val_score(clf1, X, y, 
                           predict_methods='predict', scorers=explained_variance_score))

1.0

In [45]:
# Same measurement for the gradient boosting regressor, this time through
# pipecaster's cross_val_score with the `predict` method and the explained
# variance metric.
np.mean(pc.cross_val_score(clf2, X, y, 
                           predict_methods='predict', scorers=explained_variance_score))

0.6971880677324713

In [31]:
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
import pipecaster as pc
# Small synthetic problem in which all three features are informative.
X, y = make_classification(
    n_features=3,
    n_informative=3,
    n_redundant=0,
)

base_clf = GradientBoostingClassifier()
# The list holds the very same estimator object three times.
base_clfs = [base_clf] * 3

# Soft-voting ensemble over the three base classifiers; the selector keeps
# the two top-ranked ones.
clf = pc.Ensemble(
    base_clfs,
    pc.SoftVotingClassifier(),
    internal_cv=3,
    score_selector=pc.RankScoreSelector(k=2),
)

# Last expression of the cell: baseline score of a single base classifier.
cross_val_score(base_clf, X, y, scoring='balanced_accuracy', cv=3)

AttributeError: module 'pipecaster' has no attribute 'Ensemble'

In [None]:
# Cross validate the ensemble `clf` with pipecaster's cross_val_score, using its default settings.
pc.cross_val_score(clf, X, y)

In [8]:
# Scratch probe: a 4 x 2 matrix of unseeded random values.
np.random.rand(4,2)

array([[0.59453016, 0.23542929],
       [0.2211852 , 0.43608604],
       [0.71609175, 0.72278445],
       [0.73470326, 0.20535604]])

In [10]:
# NOTE(review): as recorded in the output of this cell, the call raises
# "ValueError: multi_class must be in ('ovo', 'ovr')".  The labels contain
# three distinct classes ('a', 'b', 'c') and no multi_class option is passed.
# The random score matrix has only two columns, which presumably would not
# fit three classes either -- confirm what this probe was meant to check.
roc_auc_score(['a','b','a','c'], np.random.rand(4,2))

ValueError: multi_class must be in ('ovo', 'ovr')

In [14]:
# Fit a default SVC and keep its decision_function output on the training data
# as pred_1, for comparison with the predict_proba output taken separately.
c = SVC().fit(X,y)
pred_1 = c.decision_function(X)

In [16]:
# Refit with probability=True (rebinding `c`) and keep the predict_proba output
# on the training data as pred_2.
c = SVC(probability=True).fit(X,y)
pred_2 = c.predict_proba(X)

In [20]:
# Number of dimensions of the decision_function output.
len(pred_1.shape)

1

In [17]:
import pandas as pd
# Preview the first rows of the decision_function output as a table.
pd.DataFrame(pred_1).head()

Unnamed: 0,0
0,1.34246
1,0.274603
2,-0.576541
3,-0.999637
4,-0.456869


In [18]:
# Preview the first rows of the predict_proba output as a table.
pd.DataFrame(pred_2).head()

Unnamed: 0,0,1
0,0.07076,0.92924
1,0.362954,0.637046
2,0.743174,0.256826
3,0.865783,0.134217
4,0.697578,0.302422
