# Scikit-learn Processing Pipelines

## Data preprocessing

In [1]:
import warnings

import numpy as np
import pandas as pd
import sklearn.linear_model as lm
import sklearn.metrics as metrics
from sklearn import datasets
from sklearn import preprocessing
from sklearn.feature_selection import f_classif, SelectKBest, f_regression
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.pipeline import make_pipeline, Pipeline

warnings.filterwarnings('ignore')

In [2]:
# One-hot ("dummy") coding: every distinct level gets its own indicator column
pd.get_dummies(list('ABCABD'))

Unnamed: 0,A,B,C,D
0,1,0,0,0
1,0,1,0,0
2,0,0,1,0
3,1,0,0,0
4,0,1,0,0
5,0,0,0,1


In [3]:
# Simulated regression data: only the first n_features_info columns carry signal
np.random.seed(42)
n_samples, n_features, n_features_info = 100, 5, 3
X = np.random.randn(n_samples, n_features)
beta = np.zeros(n_features)
beta[:n_features_info] = 1
Xbeta = X.dot(beta)
eps = np.random.randn(n_samples)
y = Xbeta + eps

# Put the inputs and the target on deliberately mismatched scales
X[:, 0] *= 1e6      # first feature: multiplied by a large factor
X[:, 1] += 1e6      # second feature: shifted by a large offset
y = 100 * y + 1000  # target: rescaled and shifted

print("== Linear regression: scaling is not required ==")
model = lm.LinearRegression()
model.fit(X, y)
print("Coefficients:", model.coef_, model.intercept_)
r2_folds = cross_val_score(estimator=model, X=X, y=y, cv=5)
print("Test R2:%.2f" % r2_folds.mean())

print('---------------------------------------')
print("== Lasso without scaling ==")
model = lm.LassoCV(cv=5)
model.fit(X, y)
print("Coefficients:", model.coef_, model.intercept_)
r2_folds = cross_val_score(estimator=model, X=X, y=y, cv=5)
print("Test R2:%.2f" % r2_folds.mean())

print('---------------------------------------')
print("== Lasso with scaling ==")
# NOTE(review): the scaler is fitted on all samples before cross-validation,
# so every test fold has contributed to the centering/scaling statistics.
scaler = preprocessing.StandardScaler()
Xc = scaler.fit_transform(X)
model = lm.LassoCV(cv=5)
model.fit(Xc, y)
print("Coefficients:", model.coef_, model.intercept_)
r2_folds = cross_val_score(estimator=model, X=Xc, y=y, cv=5)
print("Test R2:%.2f" % r2_folds.mean())

== Linear regression: scaling is not required ==
Coefficients: [ 1.05421281e-04  1.13551103e+02  9.78705905e+01  1.60747221e+01
 -7.23145329e-01] -113550117.82706574
Test R2:0.77
---------------------------------------
== Lasso without scaling ==
Coefficients: [8.24902914e-05 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00] 985.9940020104059
Test R2:0.10
---------------------------------------
== Lasso with scaling ==
Coefficients: [ 90.49136877 107.71407934  93.17561877  11.84506925  -0.        ] 982.3027936469599
Test R2:0.77


In [4]:
# Chain the scaler and the estimator so that cross_val_score re-fits the
# scaler on each training fold instead of on the whole dataset.
model = make_pipeline(preprocessing.StandardScaler(), lm.LassoCV(cv=5))

# or, equivalently, with the step names spelled out explicitly
# (Pipeline is imported in the top import cell alongside make_pipeline)
model = Pipeline([('standardscaler', preprocessing.StandardScaler()), ('lassocv', lm.LassoCV(cv=5))])

scores = cross_val_score(estimator=model, X=X, y=y, cv=5)
print("Test r2:%.2f" % scores.mean())

Test r2:0.77


In [5]:
# Simulated regression data with many features: 100 columns, 3 of them informative
np.random.seed(42)
n_samples, n_features, n_features_info = 100, 100, 3
X = np.random.randn(n_samples, n_features)
beta = np.zeros(n_features)
beta[:n_features_info] = 1
Xbeta = X.dot(beta)
eps = np.random.randn(n_samples)
y = Xbeta + eps

# Put the inputs and the target on deliberately mismatched scales
X[:, 0] *= 1e6      # first feature: multiplied by a large factor
X[:, 1] += 1e6      # second feature: shifted by a large offset
y = 100 * y + 1000  # target: rescaled and shifted

# Strategy 1: keep the 3 best features by univariate F-test, then ordinary least squares
model = Pipeline([
    ('anova', SelectKBest(f_regression, k=3)),
    ('lm', lm.LinearRegression()),
])
scores = cross_val_score(estimator=model, X=X, y=y, cv=5)
print("Anova filter + linear regression, test r2:%.2f" % scores.mean())

# Strategy 2: standardize, then let the lasso penalty do the selection
model = Pipeline([
    ('standardscaler', preprocessing.StandardScaler()),
    ('lassocv', lm.LassoCV(cv=5)),
])
scores = cross_val_score(estimator=model, X=X, y=y, cv=5)
print("Standardize + Lasso, test r2:%.2f" % scores.mean())

Anova filter + linear regression, test r2:0.72
Standardize + Lasso, test r2:0.64


## Regression pipelines with CV for parameter selection

In [6]:
# Simulated regression data: 100 samples, 100 features, 5 of them informative
n_samples, n_features, noise_sd = 100, 100, 20
X, y, coef = datasets.make_regression(
    n_samples=n_samples,
    n_features=n_features,
    n_informative=5,
    noise=noise_sd,
    coef=True,
    random_state=42,
)

# Signal-to-noise ratio; adjust noise_sd until this falls below 5
signal_sd = np.std(np.dot(X, coef))
print("SNR:", signal_sd / noise_sd)


print("=============================")
print("== Basic linear regression ==")
print("=============================")

# Unpenalized baseline, scored by 5-fold cross-validated R2
baseline = lm.LinearRegression()
scores = cross_val_score(estimator=baseline, X=X, y=y, cv=5)
print("Test r2:%.2f" % scores.mean())


print("==============================================")
print("== Scaler + anova filter + ridge regression ==")
print("==============================================")

anova_ridge = Pipeline([
    ('standardscaler', preprocessing.StandardScaler()),
    ('selectkbest', SelectKBest(f_regression)),
    ('ridge', lm.Ridge())
])

param_grid = {'selectkbest__k':np.arange(10, 110, 10),
              'ridge__alpha':[.001, .01, .1, 1, 10, 100]}


# Expect execution in ipython, for python remove the %time
print("----------------------------")
print("-- Parallelize inner loop --")
print("----------------------------")

anova_ridge_cv = GridSearchCV(anova_ridge, cv=5, param_grid=param_grid, n_jobs=-1)
%time scores = cross_val_score(estimator=anova_ridge_cv, X=X, y=y, cv=5)
print("Test r2:%.2f" % scores.mean())
print("----------------------------")
print("-- Parallelize outer loop --")
print("----------------------------")

anova_ridge_cv = GridSearchCV(anova_ridge, cv=5, param_grid=param_grid)
%time scores = cross_val_score(estimator=anova_ridge_cv, X=X, y=y, cv=5, n_jobs=-1)
print("Test r2:%.2f" % scores.mean())


print("=====================================")
print("== Scaler + Elastic-net regression ==")
print("=====================================")
alphas = [.0001, .001, .01, .1, 1, 10, 100, 1000]
l1_ratio = [.1, .5, .9]

print("----------------------------")
print("-- Parallelize outer loop --")
print("----------------------------")
enet = Pipeline([
    ('standardscaler', preprocessing.StandardScaler()),
    ('enet', lm.ElasticNet(max_iter=10000)),
])
param_grid = {'enet__alpha':alphas ,
'enet__l1_ratio':l1_ratio}
enet_cv = GridSearchCV(enet, cv=5, param_grid=param_grid)
%time scores = cross_val_score(estimator=enet_cv, X=X, y=y, cv=5, n_jobs=-1)
print("Test r2:%.2f" % scores.mean())

print("-----------------------------------------------")
print("-- Parallelize outer loop + built-in CV--------")
print("-- Remark: scaler is only done on outer loop --")
print("-----------------------------------------------")
enet_cv = Pipeline([
    ('standardscaler', preprocessing.StandardScaler()),
    ('enet', lm.ElasticNetCV(max_iter=10000, l1_ratio=l1_ratio, alphas=alphas, cv=5)),
])
%time scores = cross_val_score(estimator=enet_cv, X=X, y=y, cv=5)
print("Test r2:%.2f" % scores.mean())

SNR: 3.2866820167645523
== Basic linear regression ==
Test r2:0.30
== Scaler + anova filter + ridge regression ==
----------------------------
-- Parallelize inner loop --
----------------------------
CPU times: user 10 s, sys: 5.49 s, total: 15.5 s
Wall time: 7.62 s
Test r2:0.86
----------------------------
-- Parallelize outer loop --
----------------------------
CPU times: user 33.2 ms, sys: 34.3 ms, total: 67.5 ms
Wall time: 9.08 s
Test r2:0.86
== Scaler + Elastic-net regression ==
----------------------------
-- Parallelize outer loop --
----------------------------
CPU times: user 10.6 ms, sys: 9.18 ms, total: 19.8 ms
Wall time: 4.63 s
Test r2:0.82
-----------------------------------------------
-- Parallelize outer loop + built-in CV--------
-- Remark: scaler is only done on outer loop --
-----------------------------------------------
CPU times: user 1.28 s, sys: 4.74 ms, total: 1.29 s
Wall time: 1.28 s
Test r2:0.82


## Classification pipelines with CV for parameter selection

In [8]:
# Simulated classification data: 100 samples, 100 features, 5 of them informative
# (noise_sd is assigned here but not passed to make_classification)
n_samples, n_features, noise_sd = 100, 100, 20
X, y = datasets.make_classification(
    n_samples=n_samples,
    n_features=n_features,
    n_informative=5,
    random_state=42,
)

def balanced_acc(estimator, X, y):
    """Balanced accuracy scorer: unweighted mean of the per-class recalls.

    Uses the scorer call signature ``(estimator, X, y)`` so it can be passed
    as ``scoring=`` to ``cross_val_score`` / ``GridSearchCV``.

    Parameters
    ----------
    estimator : fitted estimator exposing ``predict``
    X : samples to predict on
    y : true labels for ``X``

    Returns
    -------
    Mean over classes of the recall obtained on ``(X, y)``.
    """
    y_pred = estimator.predict(X)
    per_class_recall = metrics.recall_score(y, y_pred, average=None)
    return per_class_recall.mean()


print("=============================")
print("== Basic logistic regression ==")
print("=============================")
# Baseline: C=1e8 makes the L2 penalty negligible, i.e. a quasi-unpenalized fit
baseline = lm.LogisticRegression(C=1e8, class_weight='balanced', solver='lbfgs')
scores = cross_val_score(estimator=baseline, X=X, y=y, cv=5, scoring=balanced_acc)
print("Test bACC:%.2f" % scores.mean())

print("=======================================================")
print("== Scaler + anova filter + ridge logistic regression ==")
print("=======================================================")
anova_ridge = Pipeline([
    ('standardscaler', preprocessing.StandardScaler()),
    ('selectkbest', SelectKBest(f_classif)),
    ('ridge', lm.LogisticRegression(penalty='l2', class_weight='balanced', solver='lbfgs'))
])
param_grid = {'selectkbest__k':np.arange(10, 110, 10), 
              'ridge__C':[.0001, .001, .01, .1, 1, 10, 100, 1000, 10000]}

# Expect execution in ipython, for python remove the %time
print("----------------------------")
print("-- Parallelize inner loop --")
print("----------------------------")

anova_ridge_cv = GridSearchCV(anova_ridge, cv=5, param_grid=param_grid, scoring=balanced_acc, n_jobs=-1)
%time scores = cross_val_score(estimator=anova_ridge_cv, X=X, y=y, cv=5, scoring=balanced_acc)
print("Test bACC:%.2f" % scores.mean())

print("----------------------------")
print("-- Parallelize outer loop --")
print("----------------------------")

anova_ridge_cv = GridSearchCV(anova_ridge, cv=5, param_grid=param_grid, scoring=balanced_acc)
%time scores = cross_val_score(estimator=anova_ridge_cv, X=X, y=y, cv=5, scoring=balanced_acc)
print("Test bACC:%.2f" % scores.mean())


print("=======================================================")
print("== Scaler + anova filter + ridge logistic regression ==")
print("=======================================================")
anova_ridge = Pipeline([
    ('standardscaler', preprocessing.StandardScaler()),
    ('selectkbest', SelectKBest(f_classif)),
    ('ridge', lm.LogisticRegression(penalty='l2', class_weight='balanced', solver='lbfgs'))
])
param_grid = {'selectkbest__k':np.arange(10, 110, 10), 
              'ridge__C':[.0001, .001, .01, .1, 1, 10, 100, 1000, 10000]}

# Expect execution in ipython, for python remove the %time
print("----------------------------")
print("-- Parallelize inner loop --")
print("----------------------------")

anova_ridge_cv = GridSearchCV(anova_ridge, cv=5, param_grid=param_grid, scoring=balanced_acc, n_jobs=-1)
%time scores = cross_val_score(estimator=anova_ridge_cv, X=X, y=y, cv=5, scoring=balanced_acc)
print("Test bACC:%.2f" % scores.mean())

print("----------------------------")
print("-- Parallelize outer loop --")
print("----------------------------")

anova_ridge_cv = GridSearchCV(anova_ridge, cv=5, param_grid=param_grid,
                              scoring=balanced_acc)
%time scores = cross_val_score(estimator=anova_ridge_cv, X=X, y=y, cv=5, scoring=balanced_acc)
print("Test bACC:%.2f" % scores.mean())



print("========================================")
print("== Scaler + lasso logistic regression ==")
print("========================================")
Cs = np.array([.0001, .001, .01, .1, 1, 10, 100, 1000, 10000])
alphas = 1 / Cs
l1_ratio = [.1, .5, .9]


print("----------------------------")
print("-- Parallelize outer loop --")
print("----------------------------")

lasso = Pipeline([
    ('standardscaler', preprocessing.StandardScaler()),
    ('lasso', lm.LogisticRegression(penalty='l1', class_weight='balanced')),
])
param_grid = {'lasso__C':Cs}
enet_cv = GridSearchCV(lasso, cv=5, param_grid=param_grid, scoring=balanced_acc)
%time scores = cross_val_score(estimator=enet_cv, X=X, y=y, cv=5,scoring=balanced_acc)
print("Test bACC:%.2f" % scores.mean())


print("-----------------------------------------------")
print("-- Parallelize outer loop + built-in CV--------")
print("-- Remark: scaler is only done on outer loop --")
print("-----------------------------------------------")

lasso_cv = Pipeline([
    ('standardscaler', preprocessing.StandardScaler()),
    ('lasso', lm.LogisticRegressionCV(Cs=Cs)),
])
%time scores = cross_val_score(estimator=lasso_cv, X=X, y=y, cv=5)
print("Test bACC:%.2f" % scores.mean())


print("=============================================")
print("== Scaler + Elasticnet logistic regression ==")
print("=============================================")

print("----------------------------")
print("-- Parallelize outer loop --")
print("----------------------------")
enet = Pipeline([('standardscaler', preprocessing.StandardScaler()),
                 ('enet', lm.SGDClassifier(loss="log", penalty="elasticnet",
                                           alpha=0.0001, l1_ratio=0.15, class_weight='balanced')),
])
param_grid = {'enet__alpha':alphas,'enet__l1_ratio':l1_ratio}
enet_cv = GridSearchCV(enet, cv=5, param_grid=param_grid, scoring=balanced_acc)
%time scores = cross_val_score(estimator=enet_cv, X=X, y=y, cv=5,scoring=balanced_acc)
print("Test bACC:%.2f" % scores.mean())

== Basic logistic regression ==
Test bACC:0.76
== Scaler + anova filter + ridge logistic regression ==
----------------------------
-- Parallelize inner loop --
----------------------------
CPU times: user 20.3 s, sys: 8.49 ms, total: 20.3 s
Wall time: 20.4 s
Test bACC:0.78
----------------------------
-- Parallelize outer loop --
----------------------------
CPU times: user 21.3 s, sys: 6.9 ms, total: 21.3 s
Wall time: 21.3 s
Test bACC:0.78
== Scaler + anova filter + ridge logistic regression ==
----------------------------
-- Parallelize inner loop --
----------------------------
CPU times: user 20.5 s, sys: 0 ns, total: 20.5 s
Wall time: 20.5 s
Test bACC:0.78
----------------------------
-- Parallelize outer loop --
----------------------------
CPU times: user 20.8 s, sys: 3.57 ms, total: 20.8 s
Wall time: 20.8 s
Test bACC:0.78
== Scaler + lasso logistic regression ==
----------------------------
-- Parallelize outer loop --
----------------------------
CPU times: user 1.16 s, sys: 