# An Example of the Pipeline Functionality

This package builds on sklearn pipelines to make them more flexible and to widen their use cases. The main contribution is a modification of sklearn pipelines that lets them fit, transform and predict on data dictionaries.

In [1]:
import sku

import numpy as np
from sklearn.datasets import load_digits
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

We will load and use a toy dataset that ships with sklearn:

In [2]:
# Load the digits toy dataset bundled with sklearn.
data = load_digits()

X = data['data']
# Copy the targets: a later cell writes -1 into `y` in place, and without
# the copy that would also overwrite the labels stored inside `data`.
y = data['target'].copy()

Let's make 75% of the data unlabelled, to emphasise the flexibility of this new pipeline. Here, we mark a point as unlabelled by giving it a y value of $-1$.

In [3]:
# Seeded generator so the same points are unlabelled on every run; this also
# makes the cell safe to re-run (the same indices are selected each time).
rng = np.random.default_rng(0)
idx_unlabelled = rng.choice(y.shape[0], size=int(y.shape[0]*0.75), replace=False)

# -1 is the marker used throughout this notebook for "no label"
y[idx_unlabelled] = -1

In [4]:
# Report how many targets kept their label and how many were set to -1.
print(
    f'There are {np.count_nonzero(y != -1)} labelled points '
    f'and {np.count_nonzero(y == -1)} unlabelled points.'
)

There are 450 labelled points and 1347 unlabelled points.


We now make a dictionary containing the data:

In [5]:
# Data dictionary: labelled features and targets, plus the features of the
# points whose target was replaced by -1.
train_data = dict(
    X=X[y != -1],
    y=y[y != -1],
    X_unlabelled=X[y == -1],
)

In [6]:
# Bare expression: the notebook displays the repr of the data dictionary.
train_data

{'X': array([[ 0.,  0.,  0., ..., 16.,  9.,  0.],
        [ 0.,  0., 12., ..., 10.,  0.,  0.],
        [ 0.,  0.,  0., ..., 11.,  3.,  0.],
        ...,
        [ 0.,  0.,  0., ...,  9.,  0.,  0.],
        [ 0.,  0.,  5., ...,  3.,  0.,  0.],
        [ 0.,  0.,  6., ...,  6.,  0.,  0.]]),
 'y': array([2, 5, 6, 8, 2, 6, 1, 7, 3, 1, 0, 2, 3, 5, 9, 3, 9, 8, 4, 4, 2, 2,
        5, 7, 9, 5, 8, 4, 5, 9, 8, 2, 8, 9, 0, 1, 1, 0, 8, 2, 2, 4, 5, 0,
        1, 1, 6, 8, 5, 4, 4, 4, 9, 8, 1, 3, 6, 9, 2, 4, 6, 8, 3, 0, 0, 1,
        7, 3, 4, 9, 5, 0, 1, 3, 1, 9, 1, 0, 3, 5, 4, 2, 8, 4, 5, 0, 1, 3,
        1, 2, 4, 7, 8, 9, 9, 5, 6, 5, 9, 7, 7, 1, 2, 7, 3, 6, 6, 2, 4, 6,
        3, 3, 7, 4, 6, 1, 5, 4, 8, 7, 8, 9, 7, 3, 5, 5, 6, 8, 9, 8, 4, 1,
        5, 2, 1, 3, 4, 4, 2, 2, 0, 6, 1, 8, 9, 2, 9, 8, 9, 8, 4, 5, 4, 8,
        9, 1, 5, 8, 0, 9, 8, 7, 3, 1, 2, 2, 1, 5, 9, 1, 3, 8, 3, 6, 5, 7,
        8, 2, 1, 3, 8, 9, 1, 3, 4, 6, 7, 8, 6, 8, 3, 0, 2, 7, 2, 3, 4, 6,
        4, 1, 6, 3, 7, 1, 6, 1, 3, 7, 2

Pipeline time!

We first define the possible transformers and predictors that we will be using, and wrap them using either the transformer or model wrapper.

In [7]:
# Name -> wrapped sklearn object. The names are the ones used when a
# pipeline is described by a string.
pipeline_objects = dict(

    # Scaler: fitted on the unlabelled features, then used to transform
    # both the labelled and the unlabelled features.
    standard_scaler=sku.SKTransformerWrapperDD(
        StandardScaler,
        fit_on=['X_unlabelled'],
        transform_on=[['X'], ['X_unlabelled']],
        # extra keyword arguments given here are passed on to the
        # StandardScaler object
    ),

    # Classifier: fitted on the labelled data only.
    gbt=sku.SKModelWrapperDD(
        HistGradientBoostingClassifier,
        fit_on=['X', 'y'],
        predict_on=['X'],
        # extra keyword arguments given here are passed on to the
        # HistGradientBoostingClassifier object
    ),

)

The wrappers above allow you to wrap any sklearn transformer or model in a way that allows it to accept a data dictionary in its `.fit()` method.

Now we can define a pipeline:

In [8]:
# Keys of `pipeline_objects`, in the order they should run, joined by '--'.
pipeline_name = '--'.join(('standard_scaler', 'gbt'))

And use the `pipeline_constructor` to build our pipeline:

In [9]:
# Build the pipeline described by `pipeline_name` from the wrapped objects
# registered in `pipeline_objects`.
pipeline = sku.pipeline_constructor(pipeline_name, pipeline_objects)

This looks like:

In [10]:
# Bare expression: the notebook displays the repr of the constructed pipeline.
pipeline

It is now as easy as sklearn to fit and predict using this pipeline:

In [11]:
# The whole data dictionary is passed as X; each wrapped step was configured
# with the keys it should fit on (`fit_on`) when `pipeline_objects` was built.
pipeline.fit(X = train_data)

In [12]:
# Predict from the same data dictionary; the 'gbt' step was configured with
# predict_on=['X'], i.e. the labelled features.
pipeline.predict(X = train_data)

array([2, 5, 6, 8, 2, 6, 1, 7, 3, 1, 0, 2, 3, 5, 9, 3, 9, 8, 4, 4, 2, 2,
       5, 7, 9, 5, 8, 4, 5, 9, 8, 2, 8, 9, 0, 1, 1, 0, 8, 2, 2, 4, 5, 0,
       1, 1, 6, 8, 5, 4, 4, 4, 9, 8, 1, 3, 6, 9, 2, 4, 6, 8, 3, 0, 0, 1,
       7, 3, 4, 9, 5, 0, 1, 3, 1, 9, 1, 0, 3, 5, 4, 2, 8, 4, 5, 0, 1, 3,
       1, 2, 4, 7, 8, 9, 9, 5, 6, 5, 9, 7, 7, 1, 2, 7, 3, 6, 6, 2, 4, 6,
       3, 3, 7, 4, 6, 1, 5, 4, 8, 7, 8, 9, 7, 3, 5, 5, 6, 8, 9, 8, 4, 1,
       5, 2, 1, 3, 4, 4, 2, 2, 0, 6, 1, 8, 9, 2, 9, 8, 9, 8, 4, 5, 4, 8,
       9, 1, 5, 8, 0, 9, 8, 7, 3, 1, 2, 2, 1, 5, 9, 1, 3, 8, 3, 6, 5, 7,
       8, 2, 1, 3, 8, 9, 1, 3, 4, 6, 7, 8, 6, 8, 3, 0, 2, 7, 2, 3, 4, 6,
       4, 1, 6, 3, 7, 1, 6, 1, 3, 7, 2, 2, 9, 9, 2, 5, 1, 3, 5, 8, 9, 5,
       1, 5, 2, 6, 4, 0, 0, 1, 3, 1, 0, 9, 7, 5, 2, 8, 2, 5, 5, 3, 5, 6,
       1, 2, 3, 4, 6, 9, 5, 7, 1, 8, 2, 1, 9, 5, 2, 7, 1, 9, 7, 3, 4, 5,
       7, 8, 2, 4, 5, 8, 1, 5, 6, 7, 0, 5, 4, 3, 2, 6, 3, 3, 6, 9, 0, 0,
       7, 2, 7, 3, 9, 1, 7, 3, 1, 0, 6, 1, 7, 2, 2,

And you can even score, by passing the key for the targets:

In [13]:
# Score against the targets stored under the 'y' key. The `.1%` format spec
# multiplies by 100 and rounds to one decimal place, so the message never
# shows floating-point noise such as 92.22222222222223%.
print(f"The accuracy on the training data was: {pipeline.score(train_data, 'y'):.1%}")

The accuracy on the training data was: 100.0%


But what about trying multiple pipelines?

In [14]:
# set the pipeline names to test
pipeline_names = [
                    'standard_scaler--gbt',
                    'gbt',
                    ]

These can also be run in parallel by specifying `n_jobs`:

In [15]:
# Cross-validated comparison of every pipeline listed in `pipeline_names`.
pscv = sku.PipelineSearchCV(
        pipeline_names=pipeline_names,
        name_to_object=pipeline_objects,
        # metric name -> scoring function
        metrics={'accuracy':accuracy_score},
        # fixed random_state so the shuffled folds, and therefore the
        # reported scores, are the same on every run
        cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=0),
        # NOTE(review): presumably the data-dictionary keys that are split
        # into folds for fitting / evaluation; confirm against the sku docs
        split_fit_on=['X','y'],
        split_transform_on=['X','y'],
        verbose=True,
        # evaluate in parallel
        n_jobs=2
        )

In [16]:
# Bare expression: the notebook displays the repr of the search object.
pscv

In [17]:
# Run the search on the data dictionary and keep the returned results table.
results = pscv.fit(train_data)

Searching: 100%|▉▉▉▉▉▉▉▉▉▉| 10/10 [00:19<00:00,  1.96s/it]


In [18]:
# Display the results table: one row per pipeline, split and metric.
results

Unnamed: 0,pipeline,split_number,metric,value,splitter,params,train_id,param_updates
0,standard_scaler--gbt,0,accuracy,0.922222,StratifiedKFold,"{'memory': None, 'steps': [['standard_scaler',...",7254b111-a13c-4db9-a161-1914c0e34e1f,
1,standard_scaler--gbt,1,accuracy,0.888889,StratifiedKFold,"{'memory': None, 'steps': [['standard_scaler',...",7254b111-a13c-4db9-a161-1914c0e34e1f,
2,standard_scaler--gbt,2,accuracy,0.922222,StratifiedKFold,"{'memory': None, 'steps': [['standard_scaler',...",7254b111-a13c-4db9-a161-1914c0e34e1f,
3,standard_scaler--gbt,3,accuracy,0.911111,StratifiedKFold,"{'memory': None, 'steps': [['standard_scaler',...",7254b111-a13c-4db9-a161-1914c0e34e1f,
4,standard_scaler--gbt,4,accuracy,0.933333,StratifiedKFold,"{'memory': None, 'steps': [['standard_scaler',...",7254b111-a13c-4db9-a161-1914c0e34e1f,
5,gbt,0,accuracy,0.922222,StratifiedKFold,"{'memory': None, 'steps': [['gbt', SKModelWrap...",40492f3f-b9eb-4299-84a3-61c625639481,
6,gbt,1,accuracy,0.944444,StratifiedKFold,"{'memory': None, 'steps': [['gbt', SKModelWrap...",40492f3f-b9eb-4299-84a3-61c625639481,
7,gbt,2,accuracy,0.933333,StratifiedKFold,"{'memory': None, 'steps': [['gbt', SKModelWrap...",40492f3f-b9eb-4299-84a3-61c625639481,
8,gbt,3,accuracy,0.922222,StratifiedKFold,"{'memory': None, 'steps': [['gbt', SKModelWrap...",40492f3f-b9eb-4299-84a3-61c625639481,
9,gbt,4,accuracy,0.922222,StratifiedKFold,"{'memory': None, 'steps': [['gbt', SKModelWrap...",40492f3f-b9eb-4299-84a3-61c625639481,
