# Column Transformer with Mixed Types Code-Along (https://scikit-learn.org/stable/auto_examples/compose/plot_column_transformer_mixed_types.html#sphx-glr-auto-examples-compose-plot-column-transformer-mixed-types-py)

In [2]:
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV

np.random.seed(0)

In [4]:
###Load data from https://www.openml.org/d/40945
X, y = fetch_openml('titanic', version=1, as_frame=True, return_X_y=True)

In [6]:
###Define estimators for each variable type to be applied
###Also define as variables the columns/features for which each
###type of transformer is to be applied (e.g., numeric features/numeric transformers, etc.)

###Pre-processing pipeline

numeric_features = ['age', 'fare']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_features = ['embarked', 'sex', 'pclass']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)])


In [7]:
###Append classifier to preprocessing pipeline
###This creates a full prediction pipeline

clf = Pipeline(steps=[('preprocessor', preprocessor),
                     ('classifier', LogisticRegression())])

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=.2)

clf.fit(X_train, y_train)
print('model score: %.3f' % clf.score(X_test, y_test))

model score: 0.790


In [9]:
###Visualize the pipeline as an HTML representation inside Jupyter notebook

from sklearn import set_config
set_config(display='diagram')
clf

## Use ColumnTransformer by selecting column by data types

In [10]:
subset_feature = ['embarked', 'sex', 'pclass', 'age', 'fare']

In [11]:
X = X[subset_feature]

In [12]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   embarked  1307 non-null   category
 1   sex       1309 non-null   category
 2   pclass    1309 non-null   float64 
 3   age       1046 non-null   float64 
 4   fare      1308 non-null   float64 
dtypes: category(2), float64(3)
memory usage: 33.6 KB


In [14]:
from sklearn.compose import make_column_selector as selector

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, selector(dtype_exclude="category")),
    ('cat', categorical_transformer, selector(dtype_include="category"))])

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

clf.fit(X_train, y_train)
print('model score: %.3f' % clf.score(X_test, y_test))

model score: 0.794


In [20]:
param_grid = {
    'preprocessor__num__imputer__strategy':['mean', 'median'],
    'classifier__C': [0.1, 1.0, 10, 100],
}

grid_search = GridSearchCV(clf, param_grid, cv=10)

%timeit grid_search.fit(X_train, y_train)

print(('best logistic regression from grid search %.3f'
      % grid_search.score(X_test, y_test)))

3.56 s ± 404 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
best logistic regression from grid search 0.794
