# Please use scikit-learn pipelines for data preparation and cross-validation

<img src ='IMG_6352.jpg' style='width: 500px' align='left'>

### environment

In [1]:
import sys
sys.version

'3.7.4 (default, Aug 13 2019, 15:17:50) \n[Clang 4.0.1 (tags/RELEASE_401/final)]'

### imports

In [2]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

#libraries
######################################################################

import os
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns

import pickle

#configuration options
######################################################################

# widen the pandas display limits so wide/long frames are shown in full
pd.options.display.max_columns = 1000
pd.options.display.max_rows = 400

# apply seaborn's default plot styling
sns.set()

# single seed reused wherever a random_state is needed
seed = 12345

In [3]:
pd.__version__

'1.2.5'

In [4]:
np.__version__

'1.19.5'

In [5]:
sklearn.__version__

'0.24.1'

## dataset

In [6]:
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split

In [7]:
# Titanic dataset (version 1) from OpenML, returned as a pandas
# feature frame X and a target series y
X, y = fetch_openml(
    name="titanic",
    version=1,
    as_frame=True,
    return_X_y=True,
)

In [8]:
X.head()

Unnamed: 0,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1.0,"Allen, Miss. Elisabeth Walton",female,29.0,0.0,0.0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1.0,"Allison, Master. Hudson Trevor",male,0.9167,1.0,2.0,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1.0,"Allison, Miss. Helen Loraine",female,2.0,1.0,2.0,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1.0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1.0,2.0,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1.0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1.0,2.0,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [9]:
y.head()

0    1
1    1
2    0
3    0
4    0
Name: survived, dtype: category
Categories (2, object): ['0', '1']

In [10]:
# hold out 20% of the rows for testing; the seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=seed,
)

## pandas vs scikit-learn

### na imputation

the `pandas` way

In [13]:
X_train.shape

(1047, 13)

In [11]:
X_train.isna().sum()

pclass         0
name           0
sex            0
age          203
sibsp          0
parch          0
ticket         0
fare           1
cabin        820
embarked       1
boat         676
body         950
home.dest    438
dtype: int64

In [12]:
X_train.dtypes

pclass        float64
name           object
sex          category
age           float64
sibsp         float64
parch         float64
ticket         object
fare          float64
cabin          object
embarked     category
boat           object
body          float64
home.dest      object
dtype: object

In [14]:
X_copy = X_train.copy()

you can separate categorical from numeric and impute with different techniques

In [18]:
X_copy.select_dtypes('number').head(1)

Unnamed: 0,pclass,age,sibsp,parch,fare,body
0,1.0,29.0,0.0,0.0,211.3375,


In [19]:
X_copy.select_dtypes('object').head(1)

Unnamed: 0,name,ticket,cabin,boat,home.dest
0,"Allen, Miss. Elisabeth Walton",24160,B5,2,"St Louis, MO"


you have to store the imputation values yourself (e.g. in a dict) so that the same values can be reused on the test set

In [23]:
#impute_values = {'age': X_copy['age'].median(), 'cabin':'Other'}

In [20]:
# fill missing ages with the median age of the training copy
age_median = X_copy['age'].median()
X_copy['age'] = X_copy['age'].fillna(age_median)

In [21]:
X_copy['cabin'] = X_copy['cabin'].fillna('Other')

In [22]:
X_copy.isna().sum()

pclass         0
name           0
sex            0
age            0
sibsp          0
parch          0
ticket         0
fare           1
cabin          0
embarked       1
boat         676
body         950
home.dest    438
dtype: int64

In [24]:
#X_test['age'] = X_test['age'].fillna(impute_values['age'])

the `scikit-learn` way

In [26]:
from sklearn.impute import SimpleImputer

In [25]:
# keep only the numeric columns of each split
X_train_num = X_train.select_dtypes(include='number')
X_test_num = X_test.select_dtypes(include='number')

In [27]:
X_train_num.head(1)

Unnamed: 0,pclass,age,sibsp,parch,fare,body
0,1.0,29.0,0.0,0.0,211.3375,


In [28]:
X_test_num.head(1)

Unnamed: 0,pclass,age,sibsp,parch,fare,body
1097,3.0,6.0,3.0,1.0,21.075,


In [29]:
imputer = SimpleImputer(strategy = 'median')

In [30]:
imputer.fit(X_train_num)

SimpleImputer(strategy='median')

In [31]:
X_train_num.isna().sum()

pclass      0
age       203
sibsp       0
parch       0
fare        1
body      950
dtype: int64

In [33]:
X_train_num.columns

Index(['pclass', 'age', 'sibsp', 'parch', 'fare', 'body'], dtype='object')

In [32]:
imputer.statistics_

array([  3. ,  28. ,   0. ,   0. ,  14.5, 165. ])

In [34]:
X_train_imp = imputer.transform(X_train_num)
X_test_imp = imputer.transform(X_test_num)

### one hot encoding

In [38]:
X_train.head(1)

Unnamed: 0,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1.0,"Allen, Miss. Elisabeth Walton",female,29.0,0.0,0.0,24160,211.3375,B5,S,2,,"St Louis, MO"


In [39]:
# the three columns treated as categorical in the encoding examples
X_train_cat = X_train.loc[:, ['embarked', 'sex', 'pclass']]
X_test_cat = X_test.loc[:, ['embarked', 'sex', 'pclass']]

the `pandas` way

In [40]:
pd.get_dummies(X_train_cat).head(1)

Unnamed: 0,pclass,embarked_C,embarked_Q,embarked_S,sex_female,sex_male
0,1.0,0,0,1,1,0


In [41]:
pd.get_dummies(X_test_cat).head(1)

Unnamed: 0,pclass,embarked_C,embarked_Q,embarked_S,sex_female,sex_male
1097,3.0,0,0,1,0,1


the `scikit-learn` way

In [42]:
from sklearn.preprocessing import OneHotEncoder

In [43]:
ohe = OneHotEncoder(handle_unknown='ignore')

In [44]:
ohe.fit(X_train_cat)

OneHotEncoder(handle_unknown='ignore')

In [45]:
ohe.categories_

[array(['C', 'Q', 'S', nan], dtype=object),
 array(['female', 'male'], dtype=object),
 array([1., 2., 3.])]

In [47]:
# get_feature_names was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out when the installed version provides it and fall
# back to the old method otherwise (e.g. on 0.24)
ohe_feature_names = getattr(ohe, 'get_feature_names_out', None) or ohe.get_feature_names
ohe_feature_names(input_features=X_train_cat.columns)

array(['embarked_C', 'embarked_Q', 'embarked_S', 'embarked_nan',
       'sex_female', 'sex_male', 'pclass_1.0', 'pclass_2.0', 'pclass_3.0'],
      dtype=object)

In [48]:
ohe.transform(X_train_cat)

<1047x9 sparse matrix of type '<class 'numpy.float64'>'
	with 3141 stored elements in Compressed Sparse Row format>

In [49]:
ohe.transform(X_test_cat)

<262x9 sparse matrix of type '<class 'numpy.float64'>'
	with 786 stored elements in Compressed Sparse Row format>

## full dataprep with scikit pipelines

In [50]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [51]:
# two chained steps: median imputation followed by standard scaling
pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

In [52]:
pipe.fit(X_train_num)

Pipeline(steps=[('imputer', SimpleImputer(strategy='median')),
                ('scaler', StandardScaler())])

In [56]:
# steps are accessible by name; show the medians the fitted imputer learned
# (one value per numeric column)
pipe['imputer'].statistics_

array([  3. ,  28. ,   0. ,   0. ,  14.5, 165. ])

In [54]:
pipe['scaler']

StandardScaler()

In [57]:
pipe.transform(X_train_num)

array([[-1.56053521, -0.0508991 , -0.47518462, -0.45030234,  3.76745111,
         0.00440384],
       [-0.35649662, -0.90356063, -0.47518462, -0.45030234,  0.87198647,
         0.00440384],
       [-0.35649662, -0.51598721, -0.47518462, -0.45030234, -0.35592371,
         0.00440384],
       ...,
       [-0.35649662, -0.12841378, -0.47518462, -0.45030234, -0.23084762,
         0.00440384],
       [-1.56053521,  0.41418902, -0.47518462, -0.45030234,  3.77086465,
         0.00440384],
       [-0.35649662, -0.98107532, -0.47518462, -0.45030234, -0.4199049 ,
         0.00440384]])

In [58]:
pipe.transform(X_test_num)

array([[ 0.84754197, -1.83373686,  2.325106  ,  0.69681822, -0.22927215,
         0.00440384],
       [-1.56053521,  1.0343065 ,  0.45824558, -0.45030234,  0.49264725,
         0.00440384],
       [ 0.84754197, -0.12841378, -0.47518462, -0.45030234, -0.50918194,
         0.00440384],
       ...,
       [ 0.84754197, -0.59350189,  0.45824558, -0.45030234, -0.3799928 ,
         0.00440384],
       [-1.56053521,  0.02661559,  0.45824558,  1.84393877,  2.51153315,
        -1.00526497],
       [ 0.84754197, -0.12841378, -0.47518462, -0.45030234, -0.52012206,
         0.00440384]])

### everything in a single step with ColumnTransformer

numeric dataprep

In [59]:
# numeric columns: fill gaps with the column median, then standardise
numeric_features = ['age', 'fare']
numeric_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
)

categorical dataprep

In [60]:
# categorical columns: fill gaps with the most frequent value, then one-hot
# encode (handle_unknown='ignore' so unseen categories do not raise at
# transform time)
categorical_features = ['embarked', 'sex', 'pclass']
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('ohe', OneHotEncoder(handle_unknown='ignore')),
    ]
)

both together with `ColumnTransformer`

In [61]:
# route each column list to its own sub-pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ]
)

In [62]:
preprocessor.fit(X_train)

ColumnTransformer(transformers=[('num',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='median')),
                                                 ('scaler', StandardScaler())]),
                                 ['age', 'fare']),
                                ('cat',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('ohe',
                                                  OneHotEncoder(handle_unknown='ignore'))]),
                                 ['embarked', 'sex', 'pclass'])])

In [63]:
from sklearn import set_config

set_config(display='diagram')

In [64]:
preprocessor

In [65]:
preprocessor.named_transformers_

{'num': Pipeline(steps=[('imputer', SimpleImputer(strategy='median')),
                 ('scaler', StandardScaler())]),
 'cat': Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
                 ('ohe', OneHotEncoder(handle_unknown='ignore'))]),
 'remainder': 'drop'}

access individual elements with keys

In [66]:
preprocessor.named_transformers_['num']['imputer']

we can also use the fitted pipeline to transform the test set in one line

In [67]:
preprocessor.transform(X_test)

array([[-1.83373686, -0.22927215,  0.        , ...,  0.        ,
         0.        ,  1.        ],
       [ 1.0343065 ,  0.49264725,  1.        , ...,  1.        ,
         0.        ,  0.        ],
       [-0.12841378, -0.50918194,  0.        , ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [-0.59350189, -0.3799928 ,  0.        , ...,  0.        ,
         0.        ,  1.        ],
       [ 0.02661559,  2.51153315,  0.        , ...,  1.        ,
         0.        ,  0.        ],
       [-0.12841378, -0.52012206,  1.        , ...,  0.        ,
         0.        ,  1.        ]])

<img src='https://memegenerator.net/img/instances/76641465.jpg' align='left' style='width: 500px'>

## pipelines with training

In [68]:
from sklearn.linear_model import LogisticRegression

In [69]:
# preprocessing and model chained together, so fit/score take raw frames
classifier = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression()),
])

classifier.fit(X_train, y_train)

In [70]:
classifier.score(X_train, y_train)

0.7851002865329513

In [71]:
classifier.score(X_test, y_test)

0.8053435114503816

## pipelines with grid search

In [72]:
from sklearn.model_selection import GridSearchCV

from sklearn.tree import DecisionTreeClassifier

In [73]:
# same preprocessing, now feeding a decision tree; the tree gets the
# notebook-wide seed because ties between equally good splits can be
# broken at random, which would make the grid-search results vary between runs
classifier = Pipeline(steps=[('preprocessor', preprocessor),
                             ('classifier', DecisionTreeClassifier(random_state=seed))])

In [74]:
classifier.get_params()

{'memory': None,
 'steps': [('preprocessor', ColumnTransformer(transformers=[('num',
                                    Pipeline(steps=[('imputer',
                                                     SimpleImputer(strategy='median')),
                                                    ('scaler', StandardScaler())]),
                                    ['age', 'fare']),
                                   ('cat',
                                    Pipeline(steps=[('imputer',
                                                     SimpleImputer(strategy='most_frequent')),
                                                    ('ohe',
                                                     OneHotEncoder(handle_unknown='ignore'))]),
                                    ['embarked', 'sex', 'pclass'])])),
  ('classifier', DecisionTreeClassifier())],
 'verbose': False,
 'preprocessor': ColumnTransformer(transformers=[('num',
                                  Pipeline(steps=[('imputer',
             

In [75]:
classifier

In [76]:
# keys follow the <step>__<sub-step>__<parameter> naming of the pipeline, so
# the search tunes both the numeric imputer and the tree
param_grid = dict(
    preprocessor__num__imputer__strategy=['mean', 'median'],
    classifier__max_depth=[2, 4, 6, 8, 10],
    classifier__min_samples_split=[10, 15, 20],
)

In [77]:
# grid search over the whole pipeline, scored by ROC AUC.
# cv=5 matches the recorded cv_results_ in this notebook, which lists
# split0..split4 test scores (five folds)
gs = GridSearchCV(classifier, param_grid=param_grid, scoring='roc_auc', cv=5)

In [78]:
gs.fit(X_train, y_train)

In [79]:
gs.best_params_

{'classifier__max_depth': 6,
 'classifier__min_samples_split': 20,
 'preprocessor__num__imputer__strategy': 'median'}

In [82]:
pd.DataFrame(gs.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__max_depth,param_classifier__min_samples_split,param_preprocessor__num__imputer__strategy,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.01791,0.002086,0.009397,0.001732,2,10,mean,"{'classifier__max_depth': 2, 'classifier__min_...",0.78547,0.809931,0.754947,0.826128,0.811486,0.797592,0.025001,21
1,0.017166,0.002632,0.009016,0.001306,2,10,median,"{'classifier__max_depth': 2, 'classifier__min_...",0.78547,0.809931,0.754947,0.826128,0.811486,0.797592,0.025001,21
2,0.016655,0.001997,0.009111,0.001023,2,15,mean,"{'classifier__max_depth': 2, 'classifier__min_...",0.78547,0.809931,0.754947,0.826128,0.811486,0.797592,0.025001,21
3,0.019861,0.00081,0.009355,0.000528,2,15,median,"{'classifier__max_depth': 2, 'classifier__min_...",0.78547,0.809931,0.754947,0.826128,0.811486,0.797592,0.025001,21
4,0.019269,0.000815,0.009492,0.00047,2,20,mean,"{'classifier__max_depth': 2, 'classifier__min_...",0.78547,0.809931,0.754947,0.826128,0.811486,0.797592,0.025001,21
5,0.020601,0.000727,0.010721,0.000558,2,20,median,"{'classifier__max_depth': 2, 'classifier__min_...",0.78547,0.809931,0.754947,0.826128,0.811486,0.797592,0.025001,21
6,0.015431,0.000894,0.007707,0.000448,4,10,mean,"{'classifier__max_depth': 4, 'classifier__min_...",0.794356,0.853383,0.811931,0.861595,0.819796,0.828212,0.025417,8
7,0.015762,0.001503,0.008574,0.001304,4,10,median,"{'classifier__max_depth': 4, 'classifier__min_...",0.794356,0.851479,0.811931,0.861595,0.815987,0.82707,0.025338,11
8,0.019137,0.001152,0.009873,0.000557,4,15,mean,"{'classifier__max_depth': 4, 'classifier__min_...",0.794356,0.853383,0.811931,0.861595,0.821082,0.82847,0.025337,6
9,0.018188,0.001573,0.009249,0.001202,4,15,median,"{'classifier__max_depth': 4, 'classifier__min_...",0.794356,0.851479,0.811931,0.861595,0.821082,0.828089,0.024971,10


In [80]:
gs.score(X_train, y_train)

0.8932626067787269

In [81]:
gs.score(X_test, y_test)

0.8620527306967984

<img src='IMG_6355.jpg' style='width: 500px' align='left'>