# Sklearn pipeline

In [1]:
%%capture
# Execute the dataset notebook with its output suppressed by %%capture.
# NOTE(review): this presumably defines Xy_train_df, which later cells use
# without defining it themselves — confirm against 001_dataset-regression.ipynb.
%run ./001_dataset-regression.ipynb

In [2]:
import pandas as pd
from sklearn_pandas import DataFrameMapper

from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import PolynomialFeatures
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression

# Loading the train/test splits from CSV is disabled here; the frames are
# expected to exist already (presumably created by the %run cell above —
# TODO confirm).
# Xy_train_df = pd.read_csv("Xy_train.csv")
# Xy_test_df = pd.read_csv("Xy_test.csv")

# Mappers that separate the feature column X from the target column y.
# A transformer of None selects the column without transforming it.
mapper_X = DataFrameMapper(features=[(['X'], None)])
mapper_y = DataFrameMapper(features=[(['y'], None)])

In [3]:
class DummyEstimator(BaseEstimator):
    """Placeholder estimator for the final ``model`` step of a pipeline.

    It is meant to be swapped out by the ``model`` candidates of a
    parameter grid, so it learns nothing. The methods follow the
    scikit-learn estimator signature (``fit(X, y)`` returning ``self``)
    so that a pipeline which still contains the placeholder does not
    fail with a ``TypeError`` when data is passed in.
    """

    def fit(self, X=None, y=None):
        """Accept training data, learn nothing, and return ``self``.

        Parameters
        ----------
        X : optional
            Ignored. Defaults to ``None`` so a bare ``fit()`` still works.
        y : optional
            Ignored.

        Returns
        -------
        DummyEstimator
            This instance, as the scikit-learn API expects from ``fit``.
        """
        return self

    def score(self, X=None, y=None):
        """Return ``None``: the placeholder has no meaningful score.

        Parameters
        ----------
        X : optional
            Ignored.
        y : optional
            Ignored.
        """
        return None

# Pipeline: column selection -> polynomial expansion -> regressor.
# The last step is only a placeholder; each grid below supplies the model.
reg_pipe = Pipeline(
    steps=[
        ('feature_mapper', mapper_X),
        ('polynomial_features', PolynomialFeatures()),
        ('model', DummyEstimator()),
    ]
)

# Two parameter grids, one per model family.
search_space = [
    # Decision tree: with/without polynomial features x three depths
    # (2 * 3 = 6 candidates).
    {
        'polynomial_features': ['passthrough', PolynomialFeatures()],
        'model': [DecisionTreeRegressor()],
        'model__max_depth': [1, 2, 3],
    },
    # Linear regression: with/without intercept (2 candidates). This grid
    # does not override 'polynomial_features', so the pipeline's own step
    # is kept.
    {
        'model': [LinearRegression()],
        'model__fit_intercept': [True, False],
    },
]

# Exhaustive search over both grids with GridSearchCV's default settings.
gs = GridSearchCV(estimator=reg_pipe, param_grid=search_space)

In [4]:
# Extract the target column with the y-mapper, then run the grid search on
# the full training frame (the pipeline's first step picks out X itself).
train_target = mapper_y.fit_transform(Xy_train_df)
gs.fit(Xy_train_df, train_target)

GridSearchCV(estimator=Pipeline(steps=[('feature_mapper',
                                        DataFrameMapper(features=[(['X'],
                                                                   None)])),
                                       ('polynomial_features',
                                        PolynomialFeatures()),
                                       ('model', DummyEstimator())]),
             param_grid=[{'model': [DecisionTreeRegressor(max_depth=3)],
                          'model__max_depth': [1, 2, 3],
                          'polynomial_features': ['passthrough',
                                                  PolynomialFeatures()]},
                         {'model': [LinearRegression()],
                          'model__fit_intercept': [True, False]}])

In [5]:
# One row per candidate: 6 decision-tree settings (rows 0-5) followed by
# 2 linear-regression settings (rows 6-7). Row 7 is the last candidate,
# LinearRegression with fit_intercept=False.
cv_results_df = pd.DataFrame(gs.cv_results_)
cv_results_df.iloc[7]

mean_fit_time                                                        0.00589619
std_fit_time                                                         0.00304638
mean_score_time                                                       0.0156991
std_score_time                                                        0.0261977
param_model                                                  LinearRegression()
param_model__max_depth                                                      NaN
param_polynomial_features                                                   NaN
param_model__fit_intercept                                                False
params                        {'model': LinearRegression(), 'model__fit_inte...
split0_test_score                                                      0.471647
split1_test_score                                                      0.700336
split2_test_score                                                     -0.116926
split3_test_score                       

## References

```{bibliography} ./references.bib
:filter: docname in docnames
```