# This Jupyter Notebook contains the full code needed to write the ColumnTransformer blog

## Import Necessary Packages

In [65]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline

from pytz import timezone

## Import Data and some pre-transformation data prep

In [2]:
# read the csvs with waits and weather
df = pd.read_csv('./data/dec2019.csv')
weather_df = pd.read_csv('./data/dec2019weather.csv')

In [3]:
# rename the columns
df.columns = ['date_hour', 'wait_hrs']

# cut the date_hours to the hour (no minutes/seconds) and convert to string for merging
df['date_hour'] = pd.to_datetime(df['date_hour'], utc=True).values.astype('datetime64[h]')
df['date_hour'] = df['date_hour'].astype('str')

In [4]:
# create dataframe of all possible departure hours in the month (as string for merging)
# note that I chose to include non-ferry service hours at this stage
dts = pd.DataFrame(columns=['date_hour'])
dts['date_hour'] = pd.date_range(start='2019-12-01 00:00', 
                    end='2019-12-31 23:30', 
                    freq='H',
                   ).astype('str')

In [5]:
# merge/join the waits to the dataframe of all departures
df_expanded = dts.merge(df, how='left', on='date_hour')

# cast as datetime with timezone UTC
df_expanded['date_hour'] = pd.to_datetime(df_expanded['date_hour'], utc=True)

# adjust time to PST
df_expanded['date_hour'] = [dt.astimezone(timezone('US/Pacific')) for dt in df_expanded['date_hour']]

# remove non-sailing times (1 to 4 am for Edmonds (1-3 for Kingston))
df_expanded = df_expanded.set_index('date_hour')
df_expanded = df_expanded.between_time('5:00', '00:59')

# reset index for modeling
df_expanded = df_expanded.reset_index()

In [6]:
weather_df.columns = ['date', 'max_temp', 'avg_temp', 'min_temp']

In [7]:
weather_df['date'] = pd.to_datetime(weather_df['date'])

In [8]:
df_expanded['date'] = pd.to_datetime(df_expanded['date_hour']).values.astype('datetime64[D]')
df_expanded = df_expanded.merge(weather_df, how='left', on='date')
df_expanded.head()

Unnamed: 0,date_hour,wait_hrs,date,max_temp,avg_temp,min_temp
0,2019-11-30 16:00:00-08:00,,2019-12-01,45,42.7,39
1,2019-11-30 17:00:00-08:00,,2019-12-01,45,42.7,39
2,2019-11-30 18:00:00-08:00,,2019-12-01,45,42.7,39
3,2019-11-30 19:00:00-08:00,,2019-12-01,45,42.7,39
4,2019-11-30 20:00:00-08:00,,2019-12-01,45,42.7,39


## Simple Column Transformer Example

In [9]:
# a little cheating to extract the day of the week 
# and hour of the day w/out using a transformer 
# (see below for the "real" version)
df_simple = df_expanded.copy()
df_simple['weekday'] = [dt.weekday() for dt in df_simple['date_hour']]
df_simple['hour'] = [dt.hour for dt in df_simple['date_hour']]

In [10]:
df_simple.head()

Unnamed: 0,date_hour,wait_hrs,date,max_temp,avg_temp,min_temp,weekday,hour
0,2019-11-30 16:00:00-08:00,,2019-12-01,45,42.7,39,5,16
1,2019-11-30 17:00:00-08:00,,2019-12-01,45,42.7,39,5,17
2,2019-11-30 18:00:00-08:00,,2019-12-01,45,42.7,39,5,18
3,2019-11-30 19:00:00-08:00,,2019-12-01,45,42.7,39,5,19
4,2019-11-30 20:00:00-08:00,,2019-12-01,45,42.7,39,5,20


In [11]:
X = df_simple.drop(columns='wait_hrs')
y = df_simple['wait_hrs'].fillna(value=0)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=111)

In [12]:
# define column transformer and set n_jobs to have it run on all cores
col_transformer = ColumnTransformer(
                    transformers=[
                        ('ss', StandardScaler(), ['max_temp', 'avg_temp', 'min_temp']),
                        ('ohe', OneHotEncoder(), ['weekday', 'hour'])],
                    remainder='drop',
                    n_jobs=-1
                    )

In [13]:
X_train_transformed = col_transformer.fit_transform(X_train)

In [14]:
X_train_transformed

<465x30 sparse matrix of type '<class 'numpy.float64'>'
	with 2325 stored elements in Compressed Sparse Row format>

In [15]:
lr = LinearRegression()

pipe = Pipeline([
            ("preprocessing", col_transformer),
            ("lr", lr)
       ])

In [16]:
pipe.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('preprocessing',
                 ColumnTransformer(n_jobs=-1, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('ss',
                                                  StandardScaler(copy=True,
                                                                 with_mean=True,
                                                                 with_std=True),
                                                  ['max_temp', 'avg_temp',
                                                   'min_temp']),
                                                 ('ohe',
                                                  OneHotEncoder(categorical_features=None,
                                                                categories=None,
                                                                drop=None,
                        

In [17]:
preds_train = pipe.predict(X_train)
preds_test = pipe.predict(X_test)

In [18]:
preds_train[0:5]

array([ 0.03650713, -0.17621553, -0.17410003,  0.43019621, -0.03297714])

In [19]:
preds_test[0:5]

array([ 0.34257865,  0.26820199,  0.230672  ,  0.13353943, -0.05221186])

In [20]:
col_transformer.get_feature_names

<bound method ColumnTransformer.get_feature_names of ColumnTransformer(n_jobs=-1, remainder='drop', sparse_threshold=0.3,
                  transformer_weights=None,
                  transformers=[('ss',
                                 StandardScaler(copy=True, with_mean=True,
                                                with_std=True),
                                 ['max_temp', 'avg_temp', 'min_temp']),
                                ('ohe',
                                 OneHotEncoder(categorical_features=None,
                                               categories=None, drop=None,
                                               dtype=<class 'numpy.float64'>,
                                               handle_unknown='error',
                                               n_values=None, sparse=True),
                                 ['weekday', 'hour'])],
                  verbose=False)>

In [21]:
col_transformer.named_transformers_['ohe'].get_feature_names()

array(['x0_0.0', 'x0_1.0', 'x0_2.0', 'x0_3.0', 'x0_4.0', 'x0_5.0',
       'x0_6.0', 'x1_0.0', 'x1_5.0', 'x1_6.0', 'x1_7.0', 'x1_8.0',
       'x1_9.0', 'x1_10.0', 'x1_11.0', 'x1_12.0', 'x1_13.0', 'x1_14.0',
       'x1_15.0', 'x1_16.0', 'x1_17.0', 'x1_18.0', 'x1_19.0', 'x1_20.0',
       'x1_21.0', 'x1_22.0', 'x1_23.0'], dtype=object)

In [22]:
for transformer in col_transformer.named_transformers_.values():
    try:
        transformer.get_feature_names()
    except:
        print('SS col')
    else:
        print(transformer.get_feature_names())

SS col
['x0_0.0' 'x0_1.0' 'x0_2.0' 'x0_3.0' 'x0_4.0' 'x0_5.0' 'x0_6.0' 'x1_0.0'
 'x1_5.0' 'x1_6.0' 'x1_7.0' 'x1_8.0' 'x1_9.0' 'x1_10.0' 'x1_11.0'
 'x1_12.0' 'x1_13.0' 'x1_14.0' 'x1_15.0' 'x1_16.0' 'x1_17.0' 'x1_18.0'
 'x1_19.0' 'x1_20.0' 'x1_21.0' 'x1_22.0' 'x1_23.0']
SS col


## More complex column transformer example: imputing THEN standard scale/ohe

In [23]:
# define transformers
si_0 = SimpleImputer(strategy='constant', fill_value=0)
ss = StandardScaler()
ohe = OneHotEncoder()

# define column groups with same processing
cat_vars = ['weekday', 'hour']
num_vars = ['max_temp', 'avg_temp', 'min_temp']

# set up pipelines for each column group
categorical_pipe = Pipeline([
                        ('si_0', si_0), 
                        ('ohe', ohe)
                    ])
numeric_pipe = Pipeline([
                    ('si_0', si_0), 
                    ('ss', ss)
                    ])

# set up columnTransformer
col_transformer = ColumnTransformer(
                    transformers=[
                        ('nums', numeric_pipe, num_vars),
                        ('cats', categorical_pipe, cat_vars)
                    ],
                    remainder='drop',
                    n_jobs=-1
                    )

In [24]:
pipe = Pipeline([
            ("preprocessing", col_transformer),
            ("lr", lr)
       ])

In [25]:
pipe.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('preprocessing',
                 ColumnTransformer(n_jobs=-1, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('nums',
                                                  Pipeline(memory=None,
                                                           steps=[('si_0',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=0,
                                                                                 missing_values=nan,
                                                                                 strategy='constant',
                                                                 

In [26]:
preds_train = pipe.predict(X_train)
preds_test = pipe.predict(X_test)

In [27]:
preds_train[0:10]

array([ 0.03650713, -0.17621553, -0.17410003,  0.43019621, -0.03297714,
        0.02531061,  0.49059563,  0.1217838 , -0.01996503,  0.00551134])

In [28]:
preds_test[0:10]

array([ 0.34257865,  0.26820199,  0.230672  ,  0.13353943, -0.05221186,
        0.31987361,  0.46759719,  0.26077382,  0.09713265,  0.33097677])

In [29]:
col_transformer.named_transformers_['cats'].named_steps['ohe'].get_feature_names()

array(['x0_0.0', 'x0_1.0', 'x0_2.0', 'x0_3.0', 'x0_4.0', 'x0_5.0',
       'x0_6.0', 'x1_0.0', 'x1_5.0', 'x1_6.0', 'x1_7.0', 'x1_8.0',
       'x1_9.0', 'x1_10.0', 'x1_11.0', 'x1_12.0', 'x1_13.0', 'x1_14.0',
       'x1_15.0', 'x1_16.0', 'x1_17.0', 'x1_18.0', 'x1_19.0', 'x1_20.0',
       'x1_21.0', 'x1_22.0', 'x1_23.0'], dtype=object)

## Create your own custom transformer

In [147]:
from sklearn.base import TransformerMixin, BaseEstimator

class DateTransformer(TransformerMixin, BaseEstimator):
    """Extracts features from datetime column
    
    Returns:
      hour: hour
      day: Between 1 and the number of days in the given month of the given year.
      month: Between 1 and 12 inclusive.
      year: four-digit year
      weekday:day of the week as an integer, where Monday is 0 and Sunday is 6
   """

    def fit(self, x, y=None):
        return self

    
    def transform(self, x, y=None):
        result = pd.DataFrame(x, columns=['date_hour'])
        result['hour'] = [dt.hour for dt in result['date_hour']]
        result['day'] = [dt.day for dt in result['date_hour']]
        result['month'] = [dt.month for dt in result['date_hour']]
        result['year'] = [dt.year for dt in result['date_hour']]
        result['weekday'] = [dt.weekday() for dt in result['date_hour']]
        return result[['hour', 'day', 'month', 'year', 'weekday']]
    
    
    def get_feature_names(self):
        return ['hour','day', 'month', 'year', 'weekday']

In [148]:
X = df_expanded.drop(columns='wait_hrs')
y = df_simple['wait_hrs'].fillna(value=0)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=111)

In [149]:
X.head()

Unnamed: 0,date_hour,date,max_temp,avg_temp,min_temp
0,2019-11-30 16:00:00-08:00,2019-12-01,45,42.7,39
1,2019-11-30 17:00:00-08:00,2019-12-01,45,42.7,39
2,2019-11-30 18:00:00-08:00,2019-12-01,45,42.7,39
3,2019-11-30 19:00:00-08:00,2019-12-01,45,42.7,39
4,2019-11-30 20:00:00-08:00,2019-12-01,45,42.7,39


In [150]:
time_preprocessing = Pipeline([
                            ('date', DateTransformer()),
                            ('ohe', OneHotEncoder(categories='auto'))
                        ])

ct = ColumnTransformer(
                    transformers=[
                        ('ss', StandardScaler(), ['max_temp', 'avg_temp', 'min_temp']),
                        ('date_exp', time_preprocessing, ['date_hour'])],
                    remainder='drop',
                    )

pipe = Pipeline([('preprocessor', ct),
                 ('lr', lr)])

In [151]:
pipe.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('ss',
                                                  StandardScaler(copy=True,
                                                                 with_mean=True,
                                                                 with_std=True),
                                                  ['max_temp', 'avg_temp',
                                                   'min_temp']),
                                                 ('date_exp',
                                                  Pipeline(memory=None,
                                                           steps=[('date',
                                                                   DateTransformer()),
                               

In [140]:
preds_train = pipe.predict(X_train)
preds_test = pipe.predict(X_test)

In [141]:
lr.coef_

array([-6.82076705e-02,  2.62009695e-02, -5.21708067e-02, -6.52233237e-02,
       -1.11211785e-01, -9.46140689e-02, -9.04810834e-02, -1.11230439e-01,
       -9.62220441e-02, -7.99745376e-02, -7.21982462e-02,  1.62152613e-01,
        1.20068920e-01,  1.21630847e-01,  3.53592239e-01,  2.88009948e-01,
        1.63729282e-01, -4.27206526e-02, -8.15189424e-02, -7.12604340e-02,
       -1.00026123e-01, -9.98540705e-02, -9.26480995e-02, -7.06554219e-02,
       -3.81379900e-02, -6.29363481e-02,  7.59606596e-03, -4.60395030e-02,
       -2.92341090e-02,  6.01875823e-03,  2.37595657e-02, -1.55318118e-01,
       -1.44277277e-01, -1.31853503e-03, -1.31386259e-01, -1.06119083e-02,
       -9.77607993e-02, -7.09326755e-02, -1.41606296e-01, -1.89719433e-01,
       -6.25971437e-02,  1.19002353e-01,  5.29794446e-02, -1.59556621e-02,
       -5.71940050e-02,  2.93585503e-01,  1.72053864e-01,  7.57916892e-03,
        1.37067836e-01,  6.40806520e-02,  1.33687587e-01,  6.27911284e-02,
       -1.86649232e-02,  

In [142]:
ct.named_transformers_['date_exp'].named_steps['ohe'].get_feature_names()

array(['x0_0', 'x0_5', 'x0_6', 'x0_7', 'x0_8', 'x0_9', 'x0_10', 'x0_11',
       'x0_12', 'x0_13', 'x0_14', 'x0_15', 'x0_16', 'x0_17', 'x0_18',
       'x0_19', 'x0_20', 'x0_21', 'x0_22', 'x0_23', 'x1_1', 'x1_2',
       'x1_3', 'x1_4', 'x1_5', 'x1_6', 'x1_7', 'x1_8', 'x1_9', 'x1_10',
       'x1_11', 'x1_12', 'x1_13', 'x1_14', 'x1_15', 'x1_16', 'x1_17',
       'x1_18', 'x1_19', 'x1_20', 'x1_21', 'x1_22', 'x1_23', 'x1_24',
       'x1_25', 'x1_26', 'x1_27', 'x1_28', 'x1_29', 'x1_30', 'x1_31',
       'x2_11', 'x2_12', 'x3_2019', 'x4_0', 'x4_1', 'x4_2', 'x4_3',
       'x4_4', 'x4_5', 'x4_6'], dtype=object)

In [143]:
ct.named_transformers_['date_exp'].named_steps['date'].get_feature_names()

['hour', 'day', 'month', 'year', 'weekday']

## Rare features with ColumnTransformer

In [197]:
df = pd.DataFrame()
df['cat1'] = [0, 1, 1, 1, 1, 1, 1, 1, 1, 1]
df['cat2'] = [0, 0, 0, 0, 0, 2, 2, 2, 2, 2]
df['num1'] = [np.nan, 1, 1.1, .9, .8, np.nan, 2, 2.2, 1.5, np.nan]
df['num2'] = [1.1, 1.1, 1.1, 1.1, 1.1, 1.2, 1.2, 1.2, 1.2, 1.2]

target = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

X_train, X_test, y_train, y_test = train_test_split(df, target, random_state=111)

In [198]:
num_pipe = Pipeline([
                ('si', SimpleImputer(add_indicator=True)),
                ('ss', StandardScaler())
            ])

ct = ColumnTransformer(
            transformers=[('ohe', OneHotEncoder(categories=[[0,1], [0,2]]), ['cat1', 'cat2']),
                          ('numeric', num_pipe, ['num1', 'num2'])])

In [199]:
pipe = Pipeline([
            ('preprocessor', ct),
            ('lr', lr)
])

In [200]:
pipe.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('ohe',
                                                  OneHotEncoder(categorical_features=None,
                                                                categories=[[0,
                                                                             1],
                                                                            [0,
                                                                             2]],
                                                                drop=None,
                                                                dtype=<class 'numpy.float64'>,
                                                                handle_unknown='error',
                                

In [201]:
preds_train = pipe.predict(X_train)

In [202]:
preds_test = pipe.predict(X_test)

In [205]:
ct.fit_transform(X_train)

array([[ 0.00000000e+00,  1.00000000e+00,  1.00000000e+00,
         0.00000000e+00, -3.04388929e-01, -8.66025404e-01,
        -4.08248290e-01],
       [ 0.00000000e+00,  1.00000000e+00,  1.00000000e+00,
         0.00000000e+00, -5.65293726e-01, -8.66025404e-01,
        -4.08248290e-01],
       [ 0.00000000e+00,  1.00000000e+00,  1.00000000e+00,
         0.00000000e+00, -8.26198522e-01, -8.66025404e-01,
        -4.08248290e-01],
       [ 0.00000000e+00,  1.00000000e+00,  0.00000000e+00,
         1.00000000e+00,  2.04375424e+00,  1.15470054e+00,
        -4.08248290e-01],
       [ 0.00000000e+00,  1.00000000e+00,  0.00000000e+00,
         1.00000000e+00,  7.39230257e-01,  1.15470054e+00,
        -4.08248290e-01],
       [ 0.00000000e+00,  1.00000000e+00,  0.00000000e+00,
         1.00000000e+00, -5.79325024e-16,  1.15470054e+00,
         2.44948974e+00],
       [ 0.00000000e+00,  1.00000000e+00,  1.00000000e+00,
         0.00000000e+00, -1.08710332e+00, -8.66025404e-01,
        -4.0824829

In [206]:
ct.fit_transform(X_test)

array([[ 0.        ,  1.        ,  0.        ,  1.        ,  0.        ,
         0.70710678, -1.41421356],
       [ 1.        ,  0.        ,  1.        ,  0.        ,  0.        ,
        -1.41421356,  0.70710678],
       [ 0.        ,  1.        ,  0.        ,  1.        ,  0.        ,
         0.70710678,  0.70710678]])