# Pipelines

In [16]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

#### Loading data

In [1]:
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split

In [2]:
boston = load_boston()
X_train, X_test, y_train, y_test = train_test_split(boston.data, boston.target, train_size=0.7)

In [4]:
from sklearn.preprocessing import StandardScaler, RobustScaler, QuantileTransformer
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.decomposition import PCA
from sklearn.linear_model import Ridge

#### Creating the pipeline

In [7]:
from sklearn.pipeline import Pipeline

In [8]:
pipe = Pipeline([
    ("scaler" , StandardScaler()),
    ("reduce_dim", PCA()),
    ("regressor", Ridge())
])

In [9]:
pipe.fit(X_train, y_train)

Pipeline(steps=[('scaler', StandardScaler()), ('reduce_dim', PCA()),
                ('regressor', Ridge())])

#### assessing results

In [10]:
pipe.score(X_test, y_test)

0.7237123624022682

In [17]:
pipe.steps

# accessing attributes via the pipe!
pipe.steps[1][1].explained_variance_

[('scaler', StandardScaler()), ('reduce_dim', PCA()), ('regressor', Ridge())]

array([5.88988292, 1.48909432, 1.32200253, 0.87055315, 0.83255315,
       0.68255086, 0.57862458, 0.42419237, 0.29696508, 0.23691306,
       0.18240998, 0.16095158, 0.07013363])

## Hyperparameter Tuning

In [25]:
from sklearn.model_selection import GridSearchCV

In [26]:
param_grid = {"reduce_dim__n_components": range(1,11),
              "regressor__alpha": [n**2 for n in range(-6, 6)],
             }

In [30]:
gridsearch = GridSearchCV(pipe, param_grid, verbose=0)
gridsearch.fit(X_train, y_train)
gridsearch.best_params_

GridSearchCV(estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('reduce_dim', PCA()),
                                       ('regressor', Ridge())]),
             param_grid={'reduce_dim__n_components': range(1, 11),
                         'regressor__alpha': [36, 25, 16, 9, 4, 1, 0, 1, 4, 9,
                                              16, 25]})

{'reduce_dim__n_components': 8, 'regressor__alpha': 16}

#### Extreme Hyperparameter Tuning!!

In [40]:
# using a list of dictionaries to try out different algorithms each with its own range of hyperparameters to try out
param_grid = [
             {"scaler": [StandardScaler(), RobustScaler(), QuantileTransformer(n_quantiles=200)],
              "reduce_dim": [PCA()],
              "reduce_dim__n_components": range(1,11),
              "regressor__alpha": [n**2 for n in range(-6, 6)]
             },
             {"scaler": [StandardScaler(), RobustScaler(), QuantileTransformer(n_quantiles=200)],
              "reduce_dim": [SelectKBest(f_regression)],
              "reduce_dim__k": range(1,11),
              "regressor__alpha": [n**2 for n in range(-6, 6)]
             }
             ]

In [41]:
gridsearch = GridSearchCV(pipe, param_grid, verbose=0)
gridsearch.fit(X_train, y_train)
print('Final score is: ', gridsearch.score(X_test, y_test))

GridSearchCV(estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('reduce_dim', PCA()),
                                       ('regressor', Ridge())]),
             param_grid=[{'reduce_dim': [PCA(n_components=9)],
                          'reduce_dim__n_components': range(1, 11),
                          'regressor__alpha': [36, 25, 16, 9, 4, 1, 0, 1, 4, 9,
                                               16, 25],
                          'scaler': [StandardScaler(), RobustScaler(),
                                     QuantileTransformer(n_quantiles=200)]},
                         {'reduce_dim': [SelectKBest(score_func=<function f_regression at 0x7f9efa5d1790>)],
                          'reduce_dim__k': range(1, 11),
                          'regressor__alpha': [36, 25, 16, 9, 4, 1, 0, 1, 4, 9,
                                               16, 25],
                          'scaler': [StandardScaler(), RobustScaler(),
                

Final score is:  0.6879329603141091


In [42]:
gridsearch.best_params_

{'reduce_dim': PCA(n_components=9),
 'reduce_dim__n_components': 9,
 'regressor__alpha': 1,
 'scaler': RobustScaler()}

# Combining pipelines   
### (Feature Union)

In [43]:
from sklearn.datasets import load_iris

In [44]:
iris = load_iris()
X = iris.data
y = iris.target

In [46]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest

In [47]:
pca = PCA(n_components=2)
kbest = SelectKBest(k=3)

In [57]:
combined_features = FeatureUnion([("pca", pca), ("univ_selection", kbest)])
combined_features

FeatureUnion(transformer_list=[('pca', PCA(n_components=2)),
                               ('univ_selection', SelectKBest(k=3))])

In [49]:
# We will initialize the classifier
svm = SVC(kernel="linear")

In [50]:
# create pipeline
pipeline = Pipeline([("features", combined_features),
                     ("svm", svm)
                    ])

In [51]:
# set param_grid
param_grid = {"features__pca__n_components": [1,2,3],
              "features__univ_selection__k": [1,2,3],
              "svm__C": [0.1,1,10]
             }

In [55]:
gridsearch = GridSearchCV(pipeline, param_grid, verbose=1, refit=True)
gridsearch.fit(X, y)

Fitting 5 folds for each of 27 candidates, totalling 135 fits


GridSearchCV(estimator=Pipeline(steps=[('features',
                                        FeatureUnion(transformer_list=[('pca',
                                                                        PCA(n_components=2)),
                                                                       ('univ_selection',
                                                                        SelectKBest(k=3))])),
                                       ('svm', SVC(kernel='linear'))]),
             param_grid={'features__pca__n_components': [1, 2, 3],
                         'features__univ_selection__k': [1, 2, 3],
                         'svm__C': [0.1, 1, 10]},
             verbose=1)

In [56]:
gridsearch.best_params_

{'features__pca__n_components': 2,
 'features__univ_selection__k': 3,
 'svm__C': 1}