<a href="https://colab.research.google.com/github/bmreiniger/datascience.stackexchange/blob/master/SO63974211.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install scikit-learn==0.23



In [2]:
from sklearn import config_context
from sklearn.base import BaseEstimator
from sklearn.compose import ColumnTransformer
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils._estimator_html_repr import estimator_html_repr


In [3]:
# Load the breast-cancer dataset as pandas objects (feature frame X, target y)
# so that columns can be selected by name further down.
X, y = load_breast_cancer(
    return_X_y=True,
    as_frame=True,
)

In [4]:
# Quick look at the feature frame: print its (rows, columns) shape, then let
# the notebook render the first rows as the cell's output.
print(X.shape)
X.head()

(569, 30)


Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,radius error,texture error,perimeter error,area error,smoothness error,compactness error,concavity error,concave points error,symmetry error,fractal dimension error,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,1.095,0.9053,8.589,153.4,0.006399,0.04904,0.05373,0.01587,0.03003,0.006193,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,0.5435,0.7339,3.398,74.08,0.005225,0.01308,0.0186,0.0134,0.01389,0.003532,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,0.7456,0.7869,4.585,94.03,0.00615,0.04006,0.03832,0.02058,0.0225,0.004571,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,0.4956,1.156,3.445,27.23,0.00911,0.07458,0.05661,0.01867,0.05963,0.009208,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,0.7572,0.7813,5.438,94.44,0.01149,0.02461,0.05688,0.01885,0.01756,0.005115,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


# Pass the extra features off as predictions

In [5]:
from sklearn.base import ClassifierMixin, TransformerMixin
from sklearn.pipeline import Pipeline
class IdentityPassthrough(ClassifierMixin, BaseEstimator):
    """Pseudo-classifier whose "predictions" are simply its input features.

    Placed as the last step of a base-estimator pipeline inside a stacking
    ensemble, it lets selected raw columns reach the final estimator as if
    they were a base model's predictions.  It defines only ``predict`` (no
    ``predict_proba`` / ``decision_function``), so a stacking ensemble that
    picks the stack method automatically ends up calling ``predict``.

    Inheriting from ``BaseEstimator`` supplies ``get_params`` / ``set_params``
    and the standard estimator repr, so the object can be cloned and displayed
    like any other scikit-learn estimator.  There are no hyper-parameters,
    hence no ``__init__``.
    """

    def fit(self, X, y=None):
        """Do nothing and return ``self``; there is nothing to learn.

        Parameters
        ----------
        X : array-like
            Ignored.
        y : array-like, optional
            Ignored; accepted for API compatibility.

        Returns
        -------
        self : IdentityPassthrough
        """
        return self

    def predict(self, X):
        """Return ``X`` unchanged so it is stacked as this model's output.

        Parameters
        ----------
        X : array-like
            Features reaching this step.

        Returns
        -------
        X : same object that was passed in
        """
        return X

# Branch 1: forward two raw columns to the final estimator by routing them
# through the identity "classifier".
extra_feature_selector = ColumnTransformer(
    [('pass', 'passthrough', ['mean perimeter', 'mean area'])]
)
partial_passthrough = Pipeline([
    ('pass', extra_feature_selector),
    ('ident', IdentityPassthrough()),
])

# Branches 2 and 3: real base learners, both restricted to the same two columns.
base_features = ColumnTransformer(
    [('pass', 'passthrough', ['mean radius', 'mean texture'])]
)
tree_on_base = Pipeline([
    ('select', base_features),
    ('tree', DecisionTreeClassifier(random_state=42)),
])
knn_on_base = Pipeline([
    ('select', base_features),
    ('knn', KNeighborsClassifier()),
])

# No final_estimator is given, so StackingClassifier uses its default one.
model = StackingClassifier(
    estimators=[
        ('pass', partial_passthrough),
        ('tree', tree_on_base),
        ('knn', knn_on_base),
    ],
)

model.fit(X, y)

StackingClassifier(estimators=[('pass',
                                Pipeline(steps=[('pass',
                                                 ColumnTransformer(transformers=[('pass',
                                                                                  'passthrough',
                                                                                  ['mean '
                                                                                   'perimeter',
                                                                                   'mean '
                                                                                   'area'])])),
                                                ('ident',
                                                 <__main__.IdentityPassthrough object at 0x7faf357e3438>)])),
                               ('tree',
                                Pipeline(steps=[('select',
                                                 ColumnTransformer(transfor

In [6]:
# Show the fitted ensemble as scikit-learn's HTML diagram rather than its text
# repr; the display setting is only active inside this `with` block.
with config_context(display='diagram'):
    display(model)

In [7]:
# Emit the diagram's HTML on a single line so it can be copy-pasted elsewhere:
# newlines become spaces and double spaces are squeezed in one pass.
diagram_html = estimator_html_repr(model)
single_line_html = diagram_html.replace('\n', ' ').replace('  ', ' ')
print(single_line_html)

<style>div.sk-top-container {color: black;background-color: white;}div.sk-toggleable {background-color: white;}label.sk-toggleable__label {cursor: pointer;display: block;width: 100%;margin-bottom: 0;padding: 0.2em 0.3em;box-sizing: border-box;text-align: center;}div.sk-toggleable__content {max-height: 0;max-width: 0;overflow: hidden;text-align: left;background-color: #f0f8ff;}div.sk-toggleable__content pre {margin: 0.2em;color: black;border-radius: 0.25em;background-color: #f0f8ff;}input.sk-toggleable__control:checked~div.sk-toggleable__content {max-height: 200px;max-width: 100%;overflow: auto;}div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}input.sk-hidden--visually {border: 0;clip: rect(1px 1px 1px 1px);clip: rect(1px, 1px, 1px, 1px);height: 1px;margin: -1px;overflow: hidden;padding: 0;position: absolute;width: 1px;}div.

In [8]:
# Score on the very data the model was fitted on (same X, y as in fit), so this
# is not a generalization estimate; it is recorded only to check that this
# model matches the next one.
model.score(X, y)

0.9156414762741653

# Selection at the meta-estimator

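This approach is finicky because you have to get the column indexing right.  With `passthrough=True`, the final estimator receives the base models' predictions followed by all of the original features.  The final estimator's `select` step therefore takes columns 0 and 1, which are the base models' predictions (but if this weren't binary classification, or if some base estimator weren't calling `predict_proba`, the number of prediction columns would be different), and then columns 4 and 5, which are the desired additional features (`mean perimeter` and `mean area`, originally at positions 2 and 3), shifted two places to the right because of the base models' predictions.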

In [9]:
from sklearn.base import ClassifierMixin, TransformerMixin
from sklearn.pipeline import Pipeline

# Both base learners are trained on the same two columns.
base_features = ColumnTransformer(
    [('pass', 'passthrough', ['mean radius', 'mean texture'])]
)
tree_on_base = Pipeline([
    ('select', base_features),
    ('tree', DecisionTreeClassifier(random_state=42)),
])
knn_on_base = Pipeline([
    ('select', base_features),
    ('knn', KNeighborsClassifier()),
])

# The final estimator picks its inputs by position: 0 and 1 are the two base
# models' prediction columns; 4 and 5 are 'mean perimeter' and 'mean area'
# (positions 2 and 3 in X) shifted right by those two prediction columns.
meta_selector = ColumnTransformer([('select', 'passthrough', [0, 1, 4, 5])])
meta_learner = Pipeline([
    ('select', meta_selector),
    ('model', LogisticRegression()),
])

model = StackingClassifier(
    estimators=[
        ('tree', tree_on_base),
        ('knn', knn_on_base),
    ],
    final_estimator=meta_learner,
    passthrough=True,
)

model.fit(X, y)

StackingClassifier(estimators=[('tree',
                                Pipeline(steps=[('select',
                                                 ColumnTransformer(transformers=[('pass',
                                                                                  'passthrough',
                                                                                  ['mean '
                                                                                   'radius',
                                                                                   'mean '
                                                                                   'texture'])])),
                                                ('tree',
                                                 DecisionTreeClassifier(random_state=42))])),
                               ('knn',
                                Pipeline(steps=[('select',
                                                 ColumnTransformer(transformers=[('pass',
 

In [10]:
# Show the second ensemble as scikit-learn's HTML diagram rather than its text
# repr; the display setting is only active inside this `with` block.
with config_context(display='diagram'):
    display(model)

In [11]:
# Emit the diagram's HTML on a single line so it can be copy-pasted elsewhere:
# newlines become spaces and double spaces are squeezed in one pass.
diagram_html = estimator_html_repr(model)
single_line_html = diagram_html.replace('\n', ' ').replace('  ', ' ')
print(single_line_html)

<style>div.sk-top-container {color: black;background-color: white;}div.sk-toggleable {background-color: white;}label.sk-toggleable__label {cursor: pointer;display: block;width: 100%;margin-bottom: 0;padding: 0.2em 0.3em;box-sizing: border-box;text-align: center;}div.sk-toggleable__content {max-height: 0;max-width: 0;overflow: hidden;text-align: left;background-color: #f0f8ff;}div.sk-toggleable__content pre {margin: 0.2em;color: black;border-radius: 0.25em;background-color: #f0f8ff;}input.sk-toggleable__control:checked~div.sk-toggleable__content {max-height: 200px;max-width: 100%;overflow: auto;}div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}input.sk-hidden--visually {border: 0;clip: rect(1px 1px 1px 1px);clip: rect(1px, 1px, 1px, 1px);height: 1px;margin: -1px;overflow: hidden;padding: 0;position: absolute;width: 1px;}div.

In [12]:
# Score on the same data used for fitting (not a generalization estimate);
# check that this matches the previous model's score.
model.score(X, y)

0.9156414762741653

In [13]:
# Inspect what the stacking layer produces for X.  The resulting 32 columns are
# consistent with 2 base-model prediction columns plus the 30 original features
# that passthrough=True appends.
base_out = model.transform(X)
base_out.shape

(569, 32)