In [1]:
from __future__ import print_function
from __future__ import absolute_import
from __future__ import division

import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion, make_union


--------------------------------------------------
What is this talk about ?
    - Machine Learning
        - Python
            - scikit-learn
                - pipeline
------------------------------------------------                

--------------------------------------------
Agenda
    - sklearn pipelines
    - Modeling example
    - piece by piece walk-through
    - data science modeling lifecycle
        - development
        - deployment 
------------------------------------------        

-----------------------------------------
* Any DS or ML pipeline consists of multiple steps -
    
    - fetching data
    - splitting data
    - creating features
    - selecting feature
    - hyper-parameter optimization
    - final model
-------------------------------------------    

-----------------------------
    - iris data
    - present in sklearn
    - data for three types of iris flower
        - Setosa, Versicolour, Virginica
        - attributes are
            - sepal length
            - sepal width
            - petal length
            - petal width
            
-----------------------------------------------


--------------------------------    
    - In this talk, 
        - we try to develop a classification model, and 
        - demonstrate usefulness of sklearn pipeline in the process
--------------------------------------            

In [2]:
# Load the iris dataset bundled with scikit-learn.
from sklearn import datasets

iris = datasets.load_iris()

# Feature matrix as a DataFrame with explicit snake_case column names.
X = pd.DataFrame(iris.data, columns=['sepal_length', 'sepal_width', 'petal_length', 'petal_width'])

# Target as a single-column DataFrame; later cells flatten it with
# `.values.ravel()` before passing it to `fit`.
Y = pd.DataFrame(iris.target, columns=['iris_type'])

In [3]:
print(X.head())

   sepal_length  sepal_width  petal_length  petal_width
0           5.1          3.5           1.4          0.2
1           4.9          3.0           1.4          0.2
2           4.7          3.2           1.3          0.2
3           4.6          3.1           1.5          0.2
4           5.0          3.6           1.4          0.2


In [4]:
print(Y.head())

   iris_type
0          0
1          0
2          0
3          0
4          0


In [5]:
from sklearn.preprocessing import MinMaxScaler

# `train_test_split` lives in `sklearn.model_selection` since scikit-learn 0.18;
# the old `sklearn.cross_validation` module was removed in 0.20.  Fall back to
# the legacy location so the notebook keeps running on older installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

from sklearn.decomposition import PCA

from sklearn.ensemble import RandomForestClassifier



In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

--------------------------------
        Non-pipeline approach
------------------------------        

In [7]:
# Train data - Steps
# 1. transform data - min max scaling
# 2. feature selection - use PCA
# 3. final prediction - logistic regression, random forest, gbrt etc.

X_train_non_pipeline = pd.DataFrame(MinMaxScaler().fit_transform(X_train), columns=X_train.columns)

X_train_non_pipeline = PCA().fit_transform(X_train_non_pipeline)

print(X_train_non_pipeline[:5,:2 ])

[[-0.74585898  0.08940225]
 [-0.54574131  0.52234229]
 [ 0.2432025   0.12212192]
 [-0.64190346  0.03415288]
 [-0.71419809 -0.08911659]]


In [8]:
rf = RandomForestClassifier()
rf.fit(X_train_non_pipeline, y_train.values.ravel())

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [9]:
# test data
# NOTE: brand-new MinMaxScaler and PCA objects are fit on X_test here
# (`fit_transform`), so the test set is scaled and projected with parameters
# learned from the test data itself, not with the ones learned from X_train
# that `rf` was trained on.

X_test_non_pipeline = pd.DataFrame(MinMaxScaler().fit_transform(X_test), columns=X_test.columns)

X_test_non_pipeline = PCA().fit_transform(X_test_non_pipeline)
y_test_pred = rf.predict(X_test_non_pipeline)

In [10]:
#getting accuracy

print("Accuracy on traun data ", rf.score(X_train_non_pipeline, y_train))
print("Accuracy on test data ", rf.score(X_test_non_pipeline, y_test))

Accuracy on traun data  0.983333333333
Accuracy on test data  0.3


Pipeline based model building

In [11]:
# creating pipeline which consists of scaler, PCA based feature selection and Random Forest Classifier
pipeline_rf = Pipeline([
        ('min_max_scaler', MinMaxScaler()),
        ('pca_based_feature_selection', PCA()),
        ('random_forest', RandomForestClassifier())
    ])

In [12]:
# fit the pipeline on training data
pipeline_rf.fit(X_train, y_train.values.ravel())

Pipeline(steps=[('min_max_scaler', MinMaxScaler(copy=True, feature_range=(0, 1))), ('pca_based_feature_selection', PCA(copy=True, n_components=None, whiten=False)), ('random_forest', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', m...n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

In [13]:
# get the score of the pipeline on training data

print("Accuracy on traun data ", pipeline_rf.score(X_train, y_train))
print("Accuracy on test data ",pipeline_rf.score(X_test, y_test))


Accuracy on traun data  0.983333333333
Accuracy on test data  0.966666666667


------------------------------
    - Accuracy of pipeline based method on test data is much higher
    - Why so?
    - Let's check if final outputs are different
--------------------------------    

In [14]:
y_test_pred_pipeline = pipeline_rf.predict(X_test)

In [15]:
(y_test_pred == y_test_pred_pipeline).all()

False

-------------------------------------

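    - Why are the answers different?
        - In the non-pipeline approach, MinMaxScaler and PCA were re-fit on the test data (fit_transform)
          instead of re-using the transformers fitted on the training data (transform)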

----------------------------------------------
    - Pipeline helps avoid easy and common mistakes which can happen due to time crunch
    - Makes code 
        - more readable
        - easy to debug
        - change and experiment

----------------------------------------------------------
Pipeline
    - combination of transformers, followed by an estimator
    - 'n - 1' transformers in an 'n' step pipeline
    - the n-th step has to be an estimator
------------------------------------------------


------------------------------------------
Transformer:
    - has two main methods
        - fit(X, [y]) : Applies fit on X
        - transform(X, [y]) : Applies transformation on X
    - example: MinMaxScaler()
    
--------------------------------------------

-------------------------------------
Estimator:
    - general term for sklearn object which makes prediction
    - has two main methods
        - fit(X,[y]) - fits the model on given X and y
        - predict(X) - predicts (class or value) for given X
    - example, RandomForestClassifier()

----------------------------------------------------------------------

------------------------------------
Pipeline, again
    - pipeline.fit() - calls fit and transform on each transformer step, then fit on the final estimator
        * example, pipeline_rf.fit() call hierarchy would be
            - MinMaxScaler().fit_transform().PCA().fit_transform().RandomForestClassifier().fit()
            
---------------------------------------

-------------------------------------------------------------------------
    - pipeline.predict() - calls transform on each transformer step, followed by predict on the final estimator
        * example, pipeline_rf.predict() calls hierarchy would be
            - MinMaxScaler().transform().PCA().transform().RandomForestClassifier().predict()
---------------------------------------------------------------




--------------------------------------------------------------
    - to create pipeline, steps are given as a list of tuples
    - each tuple contains a name and an instance of a transformer; the last tuple holds the estimator
        * example:: Pipeline([
                            ('min_max_scaler', MinMaxScaler()),
                            ('pca_based_feature_selection', PCA()),
                            ('random_forest', RandomForestClassifier())
                            ])
---------------------------------------------------

-----------------------------------------------------------------
    - get all the steps of a pipeline using pipeline.steps
        * example,

In [16]:
pipeline_rf.steps

[('min_max_scaler', MinMaxScaler(copy=True, feature_range=(0, 1))),
 ('pca_based_feature_selection',
  PCA(copy=True, n_components=None, whiten=False)),
 ('random_forest',
  RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
              max_depth=None, max_features='auto', max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
              oob_score=False, random_state=None, verbose=0,
              warm_start=False))]

---------------------------------------------------------------------------------
    - set pipeline parameters using 'set_params' (parameter names follow the '<step name>__<parameter name>' convention)
        * example,

In [17]:
pipeline_rf.set_params(pca_based_feature_selection__n_components=2)

Pipeline(steps=[('min_max_scaler', MinMaxScaler(copy=True, feature_range=(0, 1))), ('pca_based_feature_selection', PCA(copy=True, n_components=2, whiten=False)), ('random_forest', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_...n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

In [18]:
pipeline_rf.steps[1]

('pca_based_feature_selection', PCA(copy=True, n_components=2, whiten=False))

-------------------------

setting parameters of multiple steps

In [19]:
#example, setting multiple parameters

pipeline_rf.set_params(pca_based_feature_selection__n_components=1, 
                       random_forest__max_features=0.5, 
                       random_forest__max_depth=10)

Pipeline(steps=[('min_max_scaler', MinMaxScaler(copy=True, feature_range=(0, 1))), ('pca_based_feature_selection', PCA(copy=True, n_components=1, whiten=False)), ('random_forest', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features=0.5, max_leaf_...n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

In [20]:
pipeline_rf.steps[2][1]

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features=0.5, max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [21]:
pipeline_rf.steps[1][1]

PCA(copy=True, n_components=1, whiten=False)

---------------------------------

- let's check results with changed parameters

In [22]:
pipeline_rf.fit(X_train, y_train.values.ravel())

pipeline_rf.score(X_test, y_test)

0.96666666666666667

---------------------------------------------------------------

    - hyper-parameter optimization is super easy
    - pipeline supports grid search of parameter in very elegant and easy to use manner
        * example, 
--------------------------------------------------------------        

In [23]:
n_components =[ 1, 2]
max_depth = [1, 10, 20, 30]
max_features= [0.1, 0.2, 0.5 , 0.7]

In [24]:
param_grid ={
    'pca_based_feature_selection__n_components': n_components,
    'random_forest__max_features': max_features,
    'random_forest__max_depth': max_depth
}

In [25]:
# GridSearchCV lives in `sklearn.model_selection` since scikit-learn 0.18; fall
# back to the legacy `sklearn.grid_search` module on older installations.
try:
    from sklearn.model_selection import GridSearchCV
except ImportError:
    from sklearn.grid_search import GridSearchCV

---------------------------------------------------
**GridSearchCV is part of sklearn.model_selection in v0.18**

---------------------------------------------

In [26]:
grid = GridSearchCV(pipeline_rf, cv=3, n_jobs=2, param_grid=param_grid)

In [27]:
grid.fit(X_train, y_train.values.ravel())

GridSearchCV(cv=3, error_score='raise',
       estimator=Pipeline(steps=[('min_max_scaler', MinMaxScaler(copy=True, feature_range=(0, 1))), ('pca_based_feature_selection', PCA(copy=True, n_components=1, whiten=False)), ('random_forest', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features=0.5, max_leaf_...n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))]),
       fit_params={}, iid=True, n_jobs=2,
       param_grid={'random_forest__max_features': [0.1, 0.2, 0.5, 0.7], 'pca_based_feature_selection__n_components': [1, 2], 'random_forest__max_depth': [1, 10, 20, 30]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

----------------------------

In [28]:
print("total number of search: ", len(n_components)*len(max_depth)*len(max_features))

total number of search:  32


In [29]:
# `grid_scores_` only exists on the legacy GridSearchCV; newer scikit-learn
# versions expose `cv_results_` instead, so use whichever attribute is present.
len(grid.cv_results_['params']) if hasattr(grid, 'cv_results_') else len(grid.grid_scores_)

32

-----------------------------------
    - Actual total runs would be 32 * 3
-------------------------------------

In [30]:
print("Getting scores for all search")
# `grid_scores_` only exists on the legacy GridSearchCV; newer scikit-learn
# versions expose `cv_results_` (a dict of arrays), shown here as a table with
# the same information: mean score, std and parameters of every candidate.
if hasattr(grid, 'cv_results_'):
    scores = pd.DataFrame(grid.cv_results_)[['mean_test_score', 'std_test_score', 'params']]
else:
    scores = grid.grid_scores_
scores

Getting scores for all search


[mean: 0.82500, std: 0.10970, params: {'random_forest__max_features': 0.1, 'pca_based_feature_selection__n_components': 1, 'random_forest__max_depth': 1},
 mean: 0.77500, std: 0.14371, params: {'random_forest__max_features': 0.2, 'pca_based_feature_selection__n_components': 1, 'random_forest__max_depth': 1},
 mean: 0.66667, std: 0.00672, params: {'random_forest__max_features': 0.5, 'pca_based_feature_selection__n_components': 1, 'random_forest__max_depth': 1},
 mean: 0.94167, std: 0.03129, params: {'random_forest__max_features': 0.7, 'pca_based_feature_selection__n_components': 1, 'random_forest__max_depth': 1},
 mean: 0.88333, std: 0.04570, params: {'random_forest__max_features': 0.1, 'pca_based_feature_selection__n_components': 1, 'random_forest__max_depth': 10},
 mean: 0.88333, std: 0.04570, params: {'random_forest__max_features': 0.2, 'pca_based_feature_selection__n_components': 1, 'random_forest__max_depth': 10},
 mean: 0.89167, std: 0.04072, params: {'random_forest__max_features'

In [31]:
print("Best set of parameter is \n", grid.best_params_)

Best set of parameter is 
 {'random_forest__max_features': 0.7, 'pca_based_feature_selection__n_components': 1, 'random_forest__max_depth': 1}


----------------------------------------------
    - make_pipeline is shorthand for pipeline
        * example,

In [32]:
make_pipeline(MinMaxScaler(), PCA(), RandomForestClassifier(random_state=100))

Pipeline(steps=[('minmaxscaler', MinMaxScaler(copy=True, feature_range=(0, 1))), ('pca', PCA(copy=True, n_components=None, whiten=False)), ('randomforestclassifier', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=Non...timators=10, n_jobs=1,
            oob_score=False, random_state=100, verbose=0, warm_start=False))])

    - name of each step is name of the transformer and estimator in lower case, without any special character in between
    - easier to use, create and remember 
        - easy to forget the names which we give to each step
 -------------------------------------------------------------------------------

In [33]:
from sklearn.pipeline import FeatureUnion

In [34]:
from sklearn.base import TransformerMixin, BaseEstimator


In [35]:
class FactorExtractor(BaseEstimator, TransformerMixin):
    """Transformer that pulls one or more named columns out of a DataFrame.

    Parameters
    ----------
    factor : str or list of str, optional
        Column label (or list of labels) to select.  It is stored untouched,
        so it can also be supplied later through `set_params` or a grid search.
    """

    def __init__(self, factor=None):
        self.factor = factor

    def fit(self, df, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, df, y=None):
        """Return the selected column(s) of `df` as a 2-D NumPy array.

        A list of labels yields one output column per label; a single label
        yields a column vector reshaped to (-1, 1).
        """
        selected = df[self.factor].values
        if not isinstance(self.factor, list):
            # a single column comes back 1-D; make it a column vector
            selected = selected.reshape(-1, 1)
        return selected

In [36]:
fe = FactorExtractor('sepal_length')

In [37]:
(fe.fit_transform(X).ravel() == X['sepal_length']).all()

True

In [38]:
fu = FeatureUnion(
    [('step1',FactorExtractor('sepal_length')),
    ('step2', FactorExtractor('sepal_width'))
    ])

In [39]:
val = fu.fit_transform(X)

In [40]:
val.shape

(150L, 2L)

--------------------------------
    - FeatureUnion() returns a plain array (one column per transformer, no column names), so we need to extract the values from it
---------------

    - So, we need to extract the values from val
-----------------------------

In [41]:
print("Is first 150 value of val similar to sepal length:: ", (val[:,0] == X.sepal_length).all())

print("Is last 150 value of val similar to sepal width:: ", (val[:, 1] == X.sepal_width).all())

Is first 150 value of val similar to sepal length::  True
Is last 150 value of val similar to sepal width::  True


In [42]:
def make_feature_union_df(steps, df):
    """Run a FeatureUnion over `df` and return its output as a labelled DataFrame.

    Parameters
    ----------
    steps : list of (name, transformer) tuples
        Passed straight to `FeatureUnion`.  Each transformer is expected to
        contribute exactly one output column; the value of its first
        parameter (the `factor` of a `FactorExtractor`) becomes the name of
        that column.
    df : pandas.DataFrame
        Input data handed to `FeatureUnion.fit_transform`.

    Returns
    -------
    pandas.DataFrame
        One column per step, in step order.
    """
    fu = FeatureUnion(steps)
    val = fu.fit_transform(df)

    # `dict.values()` is a non-indexable view on Python 3, so take the first
    # value through an iterator instead of `.values()[0]`.
    column_name = [next(iter(transformer.get_params().values()))
                   for _, transformer in steps]

    # The union output already holds one column per step, so wrap it directly
    # instead of assigning columns one by one into an empty frame.
    return pd.DataFrame(val, columns=column_name)
        
                            
    

In [43]:
steps = [('step1',FactorExtractor('sepal_length')),
    ('step2', FactorExtractor('sepal_width'))]

make_feature_union_df(steps, X).head(3)

Unnamed: 0,sepal_length,sepal_width
0,5.1,3.5
1,4.9,3.0
2,4.7,3.2


In [44]:
make_feature_union_df(steps, X).equals( X[['sepal_length', 'sepal_width']])

True

-----------------------------------------------------
Putting it all together to make final pipeline
    - in final pipeline, we want to use only two features out of 4, which are sepal length and sepal width
------------------------------------------------

In [45]:
fu = FeatureUnion(
    [('step1',FactorExtractor('sepal_length')),
    ('step2', FactorExtractor('sepal_width'))])

In [46]:
pipeline_with_two_features = Pipeline(
    [
        ('feature_union', fu),
        ('min_max', MinMaxScaler()),
        ('pca', PCA()),
        ('random_forest', RandomForestClassifier(random_state=100))
    ])

In [47]:
pipeline_with_two_features.fit(X_train, y_train.values.ravel())

Pipeline(steps=[('feature_union', FeatureUnion(n_jobs=1,
       transformer_list=[('step1', FactorExtractor(factor='sepal_length')), ('step2', FactorExtractor(factor='sepal_width'))],
       transformer_weights=None)), ('min_max', MinMaxScaler(copy=True, feature_range=(0, 1))), ('pca', PCA(copy=True, n_compo...timators=10, n_jobs=1,
            oob_score=False, random_state=100, verbose=0, warm_start=False))])

In [48]:

print("Train set accuracy with ",
      pipeline_with_two_features.score(X_train, y_train.values.ravel()))

print("Test set accuracy with ",
      pipeline_with_two_features.score(X_test, y_test.values.ravel()))

Train set accuracy with  0.925
Test set accuracy with  0.866666666667


-------------------------------------------------------------
pipeline with features as parameters
--------------------------------------------------------------

In [49]:
pipeline_with_feature_as_parameter = make_pipeline(
        FactorExtractor(),
        MinMaxScaler(),
        PCA(),
        RandomForestClassifier(random_state=100)
)

In [50]:
pipeline_with_feature_as_parameter

Pipeline(steps=[('factorextractor', FactorExtractor(factor=None)), ('minmaxscaler', MinMaxScaler(copy=True, feature_range=(0, 1))), ('pca', PCA(copy=True, n_components=None, whiten=False)), ('randomforestclassifier', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_...timators=10, n_jobs=1,
            oob_score=False, random_state=100, verbose=0, warm_start=False))])

In [51]:
factor= [['sepal_length', 'sepal_width'], 
         ['sepal_length', 'sepal_width', 'petal_length'],
         ['sepal_length', 'sepal_width', 'petal_length', 'petal_width'],]

In [52]:
param_grid_feature ={
    'factorextractor__factor':factor,
    'pca__n_components': n_components,
    'randomforestclassifier__max_features': max_features,
    'randomforestclassifier__max_depth': max_depth
}

In [None]:
grid_with_feature_as_parameter = GridSearchCV(
    pipeline_with_feature_as_parameter, 
    cv=3, 
    n_jobs=2, 
    param_grid=param_grid_feature
)

In [None]:
grid_with_feature_as_parameter.fit(X_train, y_train.values.ravel())

In [None]:
grid_with_feature_as_parameter.best_params_

In [None]:
# `score` returns the mean accuracy of the classifier (higher is better),
# so the values are labelled as accuracy rather than error.
print("Train accuracy on best set of hyper-parameters ",
      grid_with_feature_as_parameter.score(X_train, y_train.values.ravel()))

print("Test accuracy on best set of hyper-parameters ",
      grid_with_feature_as_parameter.score(X_test, y_test.values.ravel()))

----------------------------------------------------------
    - Using pipeline we can perform selection of feature as well
    
    - Pipeline can be used to write integration tests, so as to evaluate that the entire pipeline is working properly (Not covered here)
    
    
    ** This is all about development **
-------------------------------------------------------------

-------------------------------------------------------------
    - Now let's look at deployment
        - We know the best Pipeline Parameters
        - Train final pipeline with this best parameters learnt from GridSearchCV() 
-------------------------------------------------------------    

    - set parameters for final pipeline
    - use the best parameters learned from GridSearchCV()
 -----------------------------------------------------------------

In [None]:
pipeline_with_feature_as_parameter.set_params(**grid_with_feature_as_parameter.best_params_)

------------------------------------
    - train the final pipeline
 ----------------------------------

In [None]:
pipeline_with_feature_as_parameter.fit(X_train, y_train.values.ravel())

----------------------------------
    - Predict on train and test data to see if we are getting similar accuracy numbers as that of GridSearchCV()
-----------------------------------

In [None]:
# `score` returns the mean accuracy of the classifier (higher is better),
# so the values are labelled as accuracy rather than error.
print("Train accuracy on best set of hyper-parameters ",
      pipeline_with_feature_as_parameter.score(X_train, y_train.values.ravel()))

print("Test accuracy on best set of hyper-parameters ",
      pipeline_with_feature_as_parameter.score(X_test, y_test.values.ravel()))

---------------------------------------------------
    - Accuracy on train and test is the same as that of GridSearchCV()
---------------------------------------------------

    - Now this trained model can be dumped for deployment
---------------------------------------------------

In [None]:
import joblib

In [None]:
# NOTE(review): hard-coded absolute, user-specific path - adjust it (or make it
# configurable) before running this cell on another machine.
joblib.dump(pipeline_with_feature_as_parameter, '/home/pawan/pipeline_with_feature_as_parameter.pkl')

---------------------------------------------------

    - load the dumped model
------------------------------------------

In [None]:
# NOTE(review): reads back the hard-coded path written by the `joblib.dump` cell.
# joblib files are pickle-based, so only load files that come from a trusted source.
model = joblib.load('/home/pawan/pipeline_with_feature_as_parameter.pkl')

In [None]:
model.steps

In [None]:
# `score` returns the mean accuracy of the classifier (higher is better),
# so the values are labelled as accuracy rather than error.
print("Train accuracy on best set of hyper-parameters ",
      model.score(X_train, y_train.values.ravel()))

print("Test accuracy on best set of hyper-parameters ",
      model.score(X_test, y_test.values.ravel()))

In [None]:
# `score` returns the mean accuracy of the classifier (higher is better),
# so the value is labelled as accuracy rather than error.
print("Accuracy on best set of hyper-parameters for complete dataset ",
      pipeline_with_feature_as_parameter.score(X, Y.values.ravel()))


----------------------------------------------------------------------------------------

** Thank You!!