In [107]:
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
import numpy as np
import pandas as pd
from sklearn.utils import check_random_state
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.cross_validation import KFold
from sklearn.utils import shuffle

In [128]:
# Synthetic 1-D regression problem: a logarithmic trend plus integer noise
# drawn from a generator seeded with 0.
n = 20
rs = check_random_state(0)
x = np.arange(n)
y = 50. * np.log(1 + x) + rs.randint(-10, 10, size=n)
# Turn both into (n, 1) column arrays.
x, y = x.reshape(-1, 1), y.reshape(-1, 1)

In [129]:
# Polynomial regression: feature expansion followed by a linear fit.
pf = PolynomialFeatures()
lr = LinearRegression()
pl = Pipeline([('pf', pf), ('lr', lr)])

# Both random steps are seeded so that a fresh "Restart & Run All" reproduces
# the same train/test split and therefore the same scores.
x, y = shuffle(x, y, random_state=0)
x_tr, x_test, y_tr, y_test = train_test_split(
    x, y, test_size=0.33, random_state=0)

n_folds = 5
degrees = range(0, 10)
kf = KFold(x_tr.shape[0], n_folds)

# One row per fold, one column per candidate degree.
tr_scores = np.zeros((n_folds, len(degrees)))
cv_scores = np.zeros((n_folds, len(degrees)))
for i, (tr_ind, cv_ind) in enumerate(kf):
    # The fold's data does not depend on the degree, so slice it once.
    x_fit, y_fit = x_tr[tr_ind], y_tr[tr_ind]
    x_val, y_val = x_tr[cv_ind], y_tr[cv_ind]
    for j, d in enumerate(degrees):
        pl.set_params(pf__degree=d).fit(x_fit, y_fit)
        tr_scores[i, j] = pl.score(x_fit, y_fit)
        cv_scores[i, j] = pl.score(x_val, y_val)

# Average over folds: one training score and one validation score per degree.
tr_scores = tr_scores.mean(axis=0)
cv_scores = cv_scores.mean(axis=0)

In [130]:
# Fold-averaged validation score for each candidate degree (one entry per degree).
cv_scores

array([ -1.54396159e+00,   3.95091101e-01,   7.50568551e-01,
         6.76287009e-01,   6.89717985e-01,   7.35884690e-01,
         1.58794618e-01,   8.37294314e-01,  -1.24330794e+02,
        -1.04037600e+03])

In [131]:
# np.argmax returns a position in cv_scores, not a degree; map it back through
# `degrees` so the selection stays correct if the candidate range changes.
d_ = degrees[np.argmax(cv_scores)]
# Refit on the whole training portion with the selected degree, then score
# once on the held-out test portion.
pl.set_params(pf__degree=d_)
pl.fit(x_tr, y_tr)
score_ = pl.score(x_test, y_test)

In [132]:
# Validation vs. training score per degree, plus the test score of the
# selected model. Labels and a legend make the figure readable on its own.
fig, ax = plt.subplots()
ax.plot(degrees, cv_scores, 'r-', label='cross-validation score')
ax.plot(degrees, tr_scores, 'b-', label='training score')
ax.scatter(d_, score_, label='test score at selected degree')
ax.set_xlabel('polynomial degree')
ax.set_ylabel(r'$R^2$ score')
ax.legend()
ax.grid(True)
plt.show()

In [121]:
# Show the samples as red dots on a gridded axis.
fig, ax = plt.subplots()
ax.plot(x, y, 'r.')
ax.grid(True)
plt.show()


## Pipeline

In [1]:
from sklearn import svm
from sklearn.datasets import samples_generator
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import f_regression
from sklearn.pipeline import Pipeline
# generate some data to play with
X, y = samples_generator.make_classification(
    n_informative=5, n_redundant=0, random_state=42)
# ANOVA SVM-C
# y holds class labels, so features are ranked with the classification F-test
# (f_classif); f_regression is meant for a continuous target.
anova_filter = SelectKBest(f_classif, k=5)
clf = svm.SVC(kernel='linear')
# Keep the 5 best-scoring features, then fit a linear SVM on them.
anova_svm = Pipeline([('anova', anova_filter), ('svc', clf)])

Pipeline is a cascade of transforms with a final estimator.

Sequentially apply a list of transforms and a final estimator. Intermediate steps of the pipeline must be ‘transforms’, that is, they must implement fit and transform methods. The final estimator only needs to implement fit.

Pipeline([('tf1', tf1), ('tf2', tf2), ('est', est)]).fit(X, y) does the following:
tf1.fit(X, y)
X1 = tf1.transform(X)
(if tf1 implements fit_transform then X1 = tf1.fit_transform(X, y))

{ t.fit_transform(X, y, \*\*fit_params) = t.fit(X, y, \*\*fit_params).transform(X) }

tf2.fit(X1, y)
X2 = tf2.transform(X1)

est.fit(X2, y)


Passing y to a transform's fit, as in tfx.fit(X, y), may not seem to make sense because y is the final outcome. In many cases, however, tfx.fit(X, y) simply ignores y and fits only to X, e.g., CountVectorizer, which learns its dictionary from X alone.

The prediction step normally does not involve fitting; it applies transforms that have already been fitted. In this case pipeline.predict(X) does est.predict(tf2.transform(tf1.transform(X))), i.e., it applies the transforms to X (without fitting) and then calls est.predict on the result.



### Estimator API summary:
* est.fit(X, y): fit estimator to data X, y. Return the fitted estimator (internal parameters are fitted).
* est.transform(X): transforms X to another matrix (this matrix is not necessarily similar to y, e.g., in CountVectorizer)
* est.fit_transform(X, y) = est.fit(X, y).transform(X)
* est.predict(X): maps X to predictions that have the same form as y (note that predict takes only X, not y).

## Cross Validations

In [58]:
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn import datasets
from sklearn import svm

# Load the iris dataset and display the shapes of its data and target arrays.
iris = datasets.load_iris()
iris.data.shape, iris.target.shape


((150, 4), (150,))

In [59]:
# Hold out 40% of the samples for testing; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=0.4, random_state=0)

# Fit a linear-kernel SVM on the training portion only.
clf = svm.SVC(kernel='linear', C=1).fit(X_train, y_train)

# Score on the held-out portion. This is the cell's only displayed value: the
# bare `.shape` tuples that used to sit mid-cell were evaluated and discarded.
clf.score(X_test, y_test)


0.96666666666666667

In [114]:
# Side note: An example implementation of train_test_split
# Note: The orig implementation uses iterators and generators. 
#   Maybe better to use those.
import numpy as np
def train_test_split(*arrays, **options):
    """Randomly split every input array into a train part and a test part.

    Illustrative, simplified stand-in for scikit-learn's helper of the same name.

    Parameters
    ----------
    *arrays : array-like
        One or more arrays (or nested lists) sharing the same first
        dimension ``m``. 1-D inputs are returned as ``(rows, 1)`` columns.
    test_size : float, keyword-only, default 0.1
        Fraction of the rows, in ``[0, 1]``, put in the test part;
        ``int(test_size * m)`` rows are used.
    random_state : None, int or numpy.random.RandomState, keyword-only
        ``None`` (default) draws the permutation from NumPy's global generator;
        an int seeds a private generator; a ``RandomState`` is used as is.

    Returns
    -------
    list of numpy.ndarray
        ``[a_train, a_test, b_train, b_test, ...]`` in input order. The same
        row permutation is applied to every input, so rows stay aligned.

    Raises
    ------
    TypeError
        If an unknown keyword option is passed.
    ValueError
        If no array is given, the first dimensions differ, or ``test_size``
        lies outside ``[0, 1]``.
    """
    test_size = options.pop('test_size', None)
    if test_size is None:
        test_size = .1
    random_state = options.pop('random_state', None)
    if len(options) != 0:
        raise TypeError("Invalid parameters passed: %s" % str(options))
    if len(arrays) == 0:
        raise ValueError("At least one array required as input")
    if not 0 <= test_size <= 1:
        raise ValueError(
            "test_size must be between 0 and 1, got %r" % (test_size,))

    arrays_ = []
    m = None
    for a in arrays:
        a = np.asarray(a)  # accept lists as well as ndarrays
        if m is None:
            m = a.shape[0]
        elif a.shape[0] != m:
            raise ValueError("All arrays must have same first dimension")
        if a.ndim == 1:
            # Promote vectors to a single column so every output is >= 2-D.
            a = a.reshape(-1, 1)
        arrays_.append(a)

    if random_state is None:
        rng = np.random  # module-level functions share the global generator
    elif isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    p = rng.permutation(m)
    test_ind_max = int(test_size * m)
    test_ind = p[:test_ind_max]
    train_ind = p[test_ind_max:]

    l = []
    for a in arrays_:
        # Indexing with an index array returns a copy (unlike a basic slice,
        # which is a view), so callers can modify the outputs without
        # touching the input arrays.
        l.extend([a[train_ind], a[test_ind]])
    return l
    


In [101]:
# Try the splitter on a tiny 2x3 matrix, sending half of the rows to the test part.
x = np.array([[1, 2, 3], [4, 5, 6]])
x1 = list(train_test_split(x, test_size=.5))

The simplest way to use cross-validation is to call the cross_val_score helper function on the estimator and the dataset.

The following example demonstrates how to estimate the accuracy of a linear kernel support vector machine on the iris dataset by splitting the data, fitting a model and computing the score 5 consecutive times (with different splits each time):

In [116]:
from sklearn.cross_validation import cross_val_score
# Linear-kernel SVM scored with 5-fold cross-validation on the iris data.
clf = svm.SVC(C=1, kernel='linear')
scores = cross_val_score(clf, X=iris.data, y=iris.target, cv=5)
scores


array([ 0.96666667,  1.        ,  0.96666667,  0.96666667,  1.        ])

It is also possible to use other cross validation strategies by passing a cross validation **iterable** instead, for instance:

In [120]:
from sklearn.cross_validation import ShuffleSplit
# Pass a cross-validation iterable instead of a fold count: seeded random
# splits that each hold out 30% of the samples.
n_samples = len(iris.data)
cv = ShuffleSplit(n_samples, random_state=0, test_size=0.3)
cross_val_score(clf, iris.data, iris.target, cv=cv)

array([ 0.97777778,  0.97777778,  1.        ,  0.95555556,  1.        ,
        0.97777778,  0.97777778,  1.        ,  0.97777778,  0.97777778])

In [126]:
from sklearn.cross_validation import cross_val_predict
from sklearn import metrics
# Out-of-fold prediction for every sample, from 10-fold cross-validation.
predicted = cross_val_predict(clf, iris.data, iris.target, cv=10)
# Call form of print: valid on both Python 2 and Python 3.
print(metrics.accuracy_score(iris.target, predicted))
# For comparison: the mean of the per-fold scores from the same 10-fold scheme.
np.mean(cross_val_score(clf, iris.data, iris.target, cv=10))

0.973333333333


0.97333333333333338

**Cross Validation Iterable:**
* KFold
* LeaveOneOut
* ShuffleSplit (in later versions)
* StratifiedKFold
* GroupKFold

## Tuning the hyper-parameters of an estimator

Hyper-parameters are parameters that are not directly learnt within estimators. In scikit-learn they are passed as arguments to the constructor of the estimator classes. Typical examples include C, kernel and gamma for Support Vector Classifier, alpha for Lasso, etc.


Any parameter provided when constructing an estimator may be optimized in this manner. Specifically, to find the names and current values for all parameters for a given estimator, use: estimator.get_params()

A search consists of:

* an estimator (regressor or classifier such as sklearn.svm.SVC());
* a parameter space;
* a method for searching or sampling candidates;
* a cross-validation scheme; and
* a score function.

Two generic approaches to sampling search candidates are provided in scikit-learn: for given values, **GridSearchCV** exhaustively considers all parameter combinations, while **RandomizedSearchCV** can sample a given number of candidates from a parameter space with a specified distribution.

In [151]:
from sklearn import svm, datasets
from sklearn.grid_search import GridSearchCV
import pandas as pd
iris = datasets.load_iris()
# Search space: every combination of the kernels and C values listed here.
parameters = dict(kernel=('linear', 'rbf'), C=[.1, 10])
svr = svm.SVC()
# Run the search over the grid and show the best estimator it found.
clf = GridSearchCV(estimator=svr, param_grid=parameters).fit(
    iris.data, iris.target)
clf.best_estimator_

SVC(C=10, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [None]:
def GridSearchCV_(estimator, params_grid):
    """Sketch of a hand-rolled grid search.

    NOTE(review): this looks unfinished. As written it only allocates a zero
    array with one axis per entry of ``params_grid``; ``estimator`` is not
    used yet and nothing is returned.

    Parameters
    ----------
    estimator : object
        Currently unused.
    params_grid : dict
        Maps each parameter name to a sized collection of candidate values.
    """
    # One axis per parameter, as long as its collection of candidate values.
    grid_search_dims = [len(v) for v in params_grid.values()]
    # np.zeros takes the shape as one sequence, not as separate arguments.
    res = np.zeros(grid_search_dims)
    # TODO: iterate over the grid cells; the iterator is currently discarded.
    np.ndenumerate(res)