In [107]:
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
import numpy as np
import pandas as pd
from sklearn.utils import check_random_state
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.cross_validation import KFold
from sklearn.utils import shuffle

In [128]:
# Synthetic 1-D regression data: a logarithmic trend, 50 * log(1 + x),
# plus integer noise drawn from [-10, 10) with a fixed seed.
n = 20
rs = check_random_state(0)
x = np.arange(n)
noise = rs.randint(-10, 10, size=(n,))
y = noise + 50. * np.log(1 + x)
# Store both as column vectors of shape (n, 1).
x = x.reshape(-1, 1)
y = y.reshape(-1, 1)

In [129]:
# Model selection: score every candidate polynomial degree with k-fold
# cross-validation on the training split. The test split is held out and is
# not touched in this cell.

# Imported here from model_selection (the module train_test_split already
# comes from) so this cell uses the n_splits / .split() API rather than the
# legacy sklearn.cross_validation.KFold bound in the import cell.
from sklearn.model_selection import KFold

seed = 0  # fixed so the shuffle and the train/test split are reproducible

pf = PolynomialFeatures()
lr = LinearRegression()
pl = Pipeline([('pf', pf), ('lr', lr)])

# Shuffle into new names: x and y stay untouched, so re-running this cell
# always starts from the same data and gives the same split.
x_shuf, y_shuf = shuffle(x, y, random_state=seed)
x_tr, x_test, y_tr, y_test = train_test_split(
    x_shuf, y_shuf, test_size=0.33, random_state=seed)

n_folds = 5
degrees = range(0, 10)
# Materialise the folds so `kf` is still an iterable of
# (train_indices, validation_indices) pairs.
kf = list(KFold(n_splits=n_folds).split(x_tr))

# Per-fold scores: one row per fold, one column per candidate degree.
tr_fold_scores = np.zeros((n_folds, len(degrees)))
cv_fold_scores = np.zeros((n_folds, len(degrees)))
for i, (tr_ind, cv_ind) in enumerate(kf):
    for j, d in enumerate(degrees):
        pl.set_params(pf__degree=d).fit(x_tr[tr_ind], y_tr[tr_ind])
        tr_fold_scores[i, j] = pl.score(x_tr[tr_ind], y_tr[tr_ind])
        cv_fold_scores[i, j] = pl.score(x_tr[cv_ind], y_tr[cv_ind])

# Average over folds: one mean score per candidate degree.
tr_scores = tr_fold_scores.mean(axis=0)
cv_scores = cv_fold_scores.mean(axis=0)

In [130]:
# Mean cross-validation score per candidate degree (entry j belongs to degrees[j]).
cv_scores

array([ -1.54396159e+00,   3.95091101e-01,   7.50568551e-01,
         6.76287009e-01,   6.89717985e-01,   7.35884690e-01,
         1.58794618e-01,   8.37294314e-01,  -1.24330794e+02,
        -1.04037600e+03])

In [131]:
# Refit on the whole training split with the best cross-validated degree and
# score it once on the held-out test split.
# np.argmax returns a position in cv_scores, not a degree: look the degree up
# in `degrees` so the choice stays correct if the candidate range changes.
d_ = degrees[int(np.argmax(cv_scores))]
pl.set_params(pf__degree=d_)
pl.fit(x_tr, y_tr)
score_ = pl.score(x_test, y_test)

In [132]:
# Mean score per polynomial degree: cross-validation (red) vs. training (blue),
# plus the held-out test score of the selected degree as a single marker.
plt.plot(degrees, cv_scores, 'r-', label='cross-validation')
plt.plot(degrees, tr_scores, 'b-', label='training')
plt.scatter(d_, score_, label='test (selected degree)')
plt.xlabel('polynomial degree')
plt.ylabel('score')
plt.legend()
plt.grid(True)
plt.show()

In [121]:
# Scatter of the data set: y against x, one red dot per sample.
plt.plot(x, y, 'r.')
plt.xlabel('x')
plt.ylabel('y')
plt.grid(True)
plt.show()


## Pipeline

In [1]:
from sklearn import svm
# make_classification is imported from the public sklearn.datasets namespace;
# the private samples_generator module is not available in newer releases.
from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.pipeline import Pipeline

# Generate some toy classification data to play with.
X, y = make_classification(
    n_informative=5, n_redundant=0, random_state=42)

# ANOVA SVM-C: a univariate filter that keeps the 5 best-scoring features,
# followed by a linear-kernel SVC as the final estimator.
# f_classif is the ANOVA F-test for class labels (y here is categorical).
anova_filter = SelectKBest(f_classif, k=5)
clf = svm.SVC(kernel='linear')
anova_svm = Pipeline([('anova', anova_filter), ('svc', clf)])

Pipeline is a cascade of transforms with a final estimator.

Sequentially apply a list of transforms and a final estimator. Intermediate steps of the pipeline must be ‘transforms’, that is, they must implement fit and transform methods. The final estimator only needs to implement fit.

Pipeline([('tf1', tf1), ('tf2', tf2), ('est', est)]).fit(X, y) does the following:
tf1.fit(X, y)
X1 = tf1.transform(X)
(if tf1 implements fit_transform then X1 = tf1.fit_transform(X, y))

{ t.fit_transform(X, y, \*\*fit_params) = t.fit(X, y, \*\*fit_params).transform(X) }

tf2.fit(X1, y)
X2 = tf2.transform(X1)

est.fit(X2, y)


Passing y to an intermediate transform, as in tfx.fit(X, y), may seem odd because y is the final target. Some transforms do use y (e.g., supervised feature selection), while others accept y only for API consistency and ignore it: for example, CountVectorizer learns its dictionary (vocabulary) from X alone.

The prediction step normally does not involve fitting; it applies the already-fitted transforms. In this case pipeline.predict(X) does est.predict(tf2.transform(tf1.transform(X))), i.e., it applies the transforms to X (without refitting them) and then calls est.predict on the result.



### Estimator API summary:
* est.fit(X, y): fit estimator to data X, y. Return the fitted estimator (internal parameters are fitted).
* est.transform(X): transforms X to another matrix (this matrix is not necessarily similar to y, e.g., in CountVectorizer)
* est.fit_transform(X, y) = est.fit(X, y).transform(X)
* est.predict(X): maps X to predictions that have the same form as y (predict takes only X, not y).