# Data Science Plumbing: Peeking Into Scikit-Learn Pipelines 

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.utils import estimator_html_repr
from IPython.display import HTML

In [2]:
# For Jupyter notebook with Binder
# If the imports fail, uncomment the following and install the libraries below

# !pip install pandas
# !pip install numpy
# !pip install scikit-learn
# !pip install ipython

## Obtaining the Data

In [3]:
# Load the housing dataset and bin `population` into three quantile-based groups
df = pd.read_csv("https://raw.githubusercontent.com/ageron/handson-ml/master/datasets/housing/housing.csv")
df['popbin'] = pd.qcut(df['population'], q=3, labels = ['small', 'medium', 'large'])
target = df['median_house_value']
features = df.loc[:, ['popbin', 'median_income', 'ocean_proximity']]

# Randomly assign 10% of observations to be missing.
# A seeded generator keeps the missing-value pattern (and therefore every
# statistic inspected later in the notebook) identical across re-runs.
rng = np.random.default_rng(0)
features = features.mask(rng.random(features.shape) < .1)

# Sanity check: the masking must have introduced at least one missing value
assert features.isnull().values.any(), "expected missing values after masking"

# Hold out 20% of the rows for testing; fixed random_state for a repeatable split
X_train, X_test, y_train, y_test = train_test_split(
   features,
   target,
   test_size= 0.2,
   random_state= 0)

## Setting up the Pipeline

Create the steps for the transformations.

In [4]:
# steps for standardizing the numerical variables
# --- step definitions: one list of (name, estimator) pairs per kind of column ---

# numerical variables: median imputation, then standardization
num_steps = [("imputer", SimpleImputer(strategy="median")),
             ("standardize", StandardScaler())]

# ordinal categorical variables: most-frequent imputation, then ordinal encoding
ord_steps = [("imputer", SimpleImputer(strategy="most_frequent")),
             ("encoding", OrdinalEncoder())]

# one-hot categorical variables: fill gaps with the constant 'missing', then one-hot encode
ohe_steps = [("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
             ("encoding", OneHotEncoder(handle_unknown="ignore"))]

# --- wrap each list of steps in its own Pipeline ---
num_transformer = Pipeline(steps=num_steps)
ord_transformer = Pipeline(steps=ord_steps)
ohe_transformer = Pipeline(steps=ohe_steps)

Define the columns to be transformed and the order of the transformations for `ColumnTransformer()`

In [5]:
# columns to be transformed
# which columns are routed to which sub-pipeline
num_cols, ord_cols, ohe_cols = ['median_income'], ['popbin'], ['ocean_proximity']

# (name, transformer, columns) triples, listed in the order num, ord, ohe
transformer_steps = list(zip(
    ('num', 'ord', 'ohe'),
    (num_transformer, ord_transformer, ohe_transformer),
    (num_cols, ord_cols, ohe_cols),
))


Create the final model pipeline.

In [6]:
# Full model: column-wise preprocessing first, then a linear regression on its output
steps = [('transformation', ColumnTransformer(transformers=transformer_steps))]
steps.append(('linreg', LinearRegression()))

pipe = Pipeline(steps=steps)

# `lm` is the pipeline after fitting on the training split
lm = pipe.fit(X_train, y_train)

## Breaking Into the Pipeline

### See All Pipeline Steps

The code below outputs all the steps of the fitted pipeline `lm`

In [7]:
# `steps` lists the (name, estimator) pairs of the pipeline, in order
lm.steps

[('transformation',
  ColumnTransformer(transformers=[('num',
                                   Pipeline(steps=[('imputer',
                                                    SimpleImputer(strategy='median')),
                                                   ('standardize',
                                                    StandardScaler())]),
                                   ['median_income']),
                                  ('ord',
                                   Pipeline(steps=[('imputer',
                                                    SimpleImputer(strategy='most_frequent')),
                                                   ('encoding',
                                                    OrdinalEncoder())]),
                                   ['popbin']),
                                  ('ohe',
                                   Pipeline(steps=[('imputer',
                                                    SimpleImputer(fill_value='missing',
                

As an alternative, look at the steps identified in the rendered html below:

In [8]:
# Save the pipeline diagram as a standalone HTML file, then display it.
# The encoding is set explicitly so that any non-ASCII characters in the
# rendered HTML are written the same way regardless of the platform default.
with open('lm_pipe.html', 'w', encoding='utf-8') as f:
    f.write(estimator_html_repr(lm))
HTML(filename="lm_pipe.html")

### Obtain Means/Medians Used For Imputation

Access the `SimpleImputer()` from the pipeline.

*Note: `median` was selected as the imputation strategy earlier*

In [9]:
# pipeline step -> per-column sub-pipeline ('num') -> its 'imputer' step
lm['transformation'].named_transformers_['num'].named_steps['imputer']

SimpleImputer(strategy='median')

Once this has been accessed, it is possible to access the imputation statistic like below:

In [10]:
# fill value of the numeric imputer (configured with strategy='median')
lm['transformation'].named_transformers_['num'].named_steps['imputer'].statistics_

array([3.5481])

In [11]:
# fill value of the ordinal imputer (configured with strategy='most_frequent')
lm['transformation'].named_transformers_['ord'].named_steps['imputer'].statistics_

array(['small'], dtype=object)

### Obtain Mean and Variance Used For Standardization

Similarly, it's possible to extract the mean and variance used during the standardization part of the process.

In [12]:
# mean stored on the 'standardize' step of the numeric sub-pipeline
lm['transformation'].named_transformers_['num'].named_steps['standardize'].mean_

array([3.84507467])

In [13]:
# variance stored on the 'standardize' step of the numeric sub-pipeline
lm['transformation'].named_transformers_['num'].named_steps['standardize'].var_

array([3.28633539])

### Extract Feature Names for Numeric and Ordinal Columns

In [14]:
# (name, transformer, columns) triples held by the ColumnTransformer step
lm['transformation'].transformers_

[('num',
  Pipeline(steps=[('imputer', SimpleImputer(strategy='median')),
                  ('standardize', StandardScaler())]),
  ['median_income']),
 ('ord',
  Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
                  ('encoding', OrdinalEncoder())]),
  ['popbin']),
 ('ohe',
  Pipeline(steps=[('imputer',
                   SimpleImputer(fill_value='missing', strategy='constant')),
                  ('encoding', OneHotEncoder(handle_unknown='ignore'))]),
  ['ocean_proximity'])]

For numerical variables:

In [15]:
# first triple ('num'); its third element is the list of columns it handles
lm['transformation'].transformers_[0][2]

['median_income']

Similarly, for ordinal variables:  

In [16]:
# second triple ('ord'); its third element is the list of columns it handles
lm['transformation'].transformers_[1][2]

['popbin']

### Extract Feature Names for One Hot Encoded Columns

First extract the step that does the one hot encoding:

In [17]:
# third triple ('ohe'); its second element is the sub-pipeline, from which the 'encoding' step is taken
lm['transformation'].transformers_[2][1]['encoding']

OneHotEncoder(handle_unknown='ignore')

Now it's possible to extract the names of the extra columns created by one hot encoding:

In [18]:
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer `get_feature_names_out()` when the installed version provides it,
# and fall back to the old method on older installations.
ohe_encoder = lm.named_steps['transformation'].transformers_[2][1].named_steps['encoding']
feature_names_fn = getattr(ohe_encoder, 'get_feature_names_out', None) or ohe_encoder.get_feature_names
feature_names_fn()

array(['x0_<1H OCEAN', 'x0_INLAND', 'x0_ISLAND', 'x0_NEAR BAY',
       'x0_NEAR OCEAN', 'x0_missing'], dtype=object)

To make the column names more descriptive:

In [19]:
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer `get_feature_names_out()` when the installed version provides it,
# and fall back to the old method on older installations.
ohe_encoder = lm.named_steps['transformation'].transformers_[2][1].named_steps['encoding']
feature_names_fn = getattr(ohe_encoder, 'get_feature_names_out', None) or ohe_encoder.get_feature_names
# Passing the original column names replaces the generic `x0_` prefix
feature_names_fn(ohe_cols)

array(['ocean_proximity_<1H OCEAN', 'ocean_proximity_INLAND',
       'ocean_proximity_ISLAND', 'ocean_proximity_NEAR BAY',
       'ocean_proximity_NEAR OCEAN', 'ocean_proximity_missing'],
      dtype=object)

### See Coefficients from a Regression Model

Extract the model from the pipeline:

In [20]:
# look up the regression step of the pipeline by its name
lm['linreg']

LinearRegression()

Finally, the code below extracts the coefficients that the model used:

In [21]:
# coefficients stored on the 'linreg' step of the pipeline
lm['linreg'].coef_

array([  67500.35145838,    4099.03711046,  -20896.98599474,
       -102046.16223437,  173664.20500333,   -2201.04941408,
         -6156.4249682 ,  -42363.58239194])