# Supervised Learning Pipeline with sklearn

In [17]:
import pandas as pd
import numpy as np

from sklearn import set_config
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import (
    StandardScaler,
    MinMaxScaler,
    PolynomialFeatures,
    KBinsDiscretizer,
    PowerTransformer,
    OneHotEncoder,
)
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.model_selection import train_test_split, GridSearchCV

### Create model switcher class for pipeline grid search

In [18]:
from sklearn.base import BaseEstimator

class ModelSwitcher(BaseEstimator):
    def __init__(
        self,
        estimator = SGDClassifier(),
    ):
        """
        A custome BaseEstimator that can switch between classifiers.
        :param estimator: sklearn object - the classifier
        """
        self.estimator = estimator
        
    def fit(self, X, y=None, **kwargs):
        self.estimator.fit(X, y)
        return self
    
    def predict(self, X, y=None):
        return self.estimator.predict(X)
    
    def predict_proba(self, X):
        return self.estimator.predict_proba(X)
    
    def score(self, X, y):
        return self.estimator.score(X, y)

### Make preprocessor with ColumnTransformer

In [19]:
def make_preprocessor(
    numeric_features, 
    categorical_features,
    polynomial_features,
    discretized_features,
    mapped_features,
):
    if numeric_features:
        numeric_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ])
    else:
        numeric_transformer = 'drop'
        
    if categorical_features:
        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ])
    else:
        categorical_transformer = 'drop'
        
    if polynomial_features:
        polynomial_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('polynomial', PolynomialFeatures(degree=2, interaction_only=True)),
            ('scaler', StandardScaler())
        ])
    else:
        discretized_transformer = 'drop'
        
    if discretized_features:
        discretized_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('discretizer', KBinsDiscretizer(n_bins=3, encode='onehot')),
        ])
    else:
        discretized_transformer = 'drop'
    
    if mapped_features:
        mapped_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('mapper', PowerTransformer()),
        ])
    else:
        mapped_transformer = 'drop'
        
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features),
            ('poly', polynomial_transformer, polynomial_features),
            ('discret', discretized_transformer, discretized_features),
            ('map', mapped_transformer, mapped_features),
        ])
    
    return preprocessor

### Make Grid Search on pipeline

In [20]:
def make_grid_search(estimator, param_grid, scoring, n_jobs, cv):
    if not param_grid:
        raise ValueError('Grid Search Hyperparameters is None!')
    else:
        grid = GridSearchCV(
            estimator=estimator,
            param_grid=param_grid,
            scoring=scoring, 
            n_jobs=n_jobs, 
            cv=cv
        )
    
    return grid

**=======================Your program start from here**=========================

### Load data

In [30]:
from sklearn.datasets import fetch_openml

X, y = fetch_openml('titanic', version=1, as_frame=True, return_X_y=True, parser='auto')
y = y.map({'1': 1, '0':0}).astype(int)

### Manually set hyperparameters for pipeline grid search

In [33]:
#==========================SELECT FEATURES FOR PREPROCESSING PATHS HERE================================
# select what features to use for model training
numeric_features = ['age', 'fare']
categorical_features = ['embarked', 'sex', 'pclass']
polynomial_features = ['age', 'fare']
discretized_features = ['fare']
mapped_features = ['age']

#================================ADD PREPROCESSING METHODS HERE========================================
# preprocessor hyperparameters needs to be set according to the above selected features
if numeric_features:
    param_num_preprocessor = {
        'preprocessor__num__imputer__strategy': ['mean', 'median'],
        'preprocessor__num__scaler': [StandardScaler(), MinMaxScaler()]
    }
else:
    param_num_preprocessor = {}
    
if categorical_features:
    param_cat_preprocessor = {
        'preprocessor__cat__imputer__strategy': ['constant', 'most_frequent'],
    }
else:
    param_cat_preprocessor = {}

if polynomial_features:
    param_poly_preprocessor = {
        'preprocessor__poly__imputer__strategy': ['mean'],
        'preprocessor__poly__polynomial': [PolynomialFeatures(degree=2, interaction_only=True)],
        'preprocessor__poly__scaler': [StandardScaler()],
    }
else:
    param_poly_preprocessor = {}
    
if discretized_features:
    param_discret_preprocessor = {
        'preprocessor__discret__imputer__strategy': ['mean'],
        'preprocessor__discret__discretizer': [KBinsDiscretizer()],
        'preprocessor__discret__discretizer__n_bins': [3, 5],
    }
else:
    param_discret_preprocessor = {}

if mapped_features:
    param_map_preprocessor = {
        'preprocessor__map__imputer__strategy': ['mean'],
        'preprocessor__map__mapper': [PowerTransformer()],
    }
else:
    param_map_preprocessor = {}
    
# concate num/cat/poly/discret/map preprocessors to get the final preprocessor
param_preprocessor = {
    **param_num_preprocessor,
    **param_cat_preprocessor,
    **param_poly_preprocessor,
    **param_discret_preprocessor,
    **param_map_preprocessor,
}

#=================================ADD DIFFERENT MODELS HERE=================================
# set hyperparameters for different candidate estimators
param_model = [
    {
        'model_switcher__estimator': [SGDClassifier()],
        'model_switcher__estimator__penalty': ['l1', 'l2', 'elasticnet'],
        'model_switcher__estimator__max_iter': [150],
        'model_switcher__estimator__tol': [1e-4],
        'model_switcher__estimator__loss': ['hinge'],
    },
    # {
    #     'model_switcher__estimator': [LogisticRegression()],
    #     'model_switcher__estimator__C': [5, 10, 20],
    # },
]

# concate preprocessor hyperparameters and model prameters to get grid search parameters
param_grid = [{**param, **param_preprocessor} for param in param_model]

# make preprocessor
preprocessor = make_preprocessor(
    numeric_features=numeric_features,
    categorical_features=categorical_features,
    polynomial_features=polynomial_features,
    discretized_features=discretized_features,
    mapped_features=mapped_features,
)

# make the final pipeline
final_pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model_switcher', ModelSwitcher()),
])

# make grid search
grid = make_grid_search(
    estimator=final_pipe,
    param_grid=param_grid,
    scoring='f1',
    n_jobs=1,
    cv=3,
)

# illustrate the final pipeline diagram
set_config(display='diagram')
display(grid)

### start grid search on pipeline

In [35]:
grid.fit(X, y)

print('Best parameter set:')
for key, value in grid.best_params_.items():
    print(key, ':', value)

print('\nBest F1 score:', grid.best_score_)

Best parameter set:
model_switcher__estimator : SGDClassifier(max_iter=150, penalty='elasticnet', tol=0.0001)
model_switcher__estimator__loss : hinge
model_switcher__estimator__max_iter : 150
model_switcher__estimator__penalty : elasticnet
model_switcher__estimator__tol : 0.0001
preprocessor__cat__imputer__strategy : constant
preprocessor__discret__discretizer : KBinsDiscretizer(n_bins=3)
preprocessor__discret__discretizer__n_bins : 3
preprocessor__discret__imputer__strategy : mean
preprocessor__map__imputer__strategy : mean
preprocessor__map__mapper : PowerTransformer()
preprocessor__num__imputer__strategy : median
preprocessor__num__scaler : StandardScaler()
preprocessor__poly__imputer__strategy : mean
preprocessor__poly__polynomial : PolynomialFeatures(interaction_only=True)
preprocessor__poly__scaler : StandardScaler()

Best F1 score: 0.6393085497651785
