In [59]:
import pandas as pd
import numpy as np
from sklearn.datasets import make_regression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, TimeSeriesSplit, cross_val_score, cross_val_predict
from sklearn.linear_model import LinearRegression

### Define a regression problem

In [60]:
X, y = make_regression(n_samples=100, n_features=3, n_informative=1, n_targets=1, bias=5., noise=5.)

In [61]:
X.shape, y.shape

((100, 3), (100,))

### Build a simple pipeline

In [62]:
model = Pipeline([('scaler', StandardScaler()),
                  ('estimator', LinearRegression())])

### Define cross-validations methods

In [123]:
kfoldcv = KFold(n_splits=5)

tscv = TimeSeriesSplit(n_splits=5)

### Run *cross_validation_score* with **KFold CV**

In [64]:
scores = cross_val_score(estimator=model, X=X, y=y, scoring='r2', cv=kfoldcv, n_jobs=-1, verbose=1)

print(f"SCORES: {scores}")
print(f"AVG SCORE: {np.mean(scores):.2f} +- {np.std(scores):.2f}")

SCORES: [0.98424407 0.98396732 0.99202809 0.99122747 0.99152061]
AVG SCORE: 0.99 +- 0.00


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.0s finished


### Run *cross_validation_score* with **TimeSeriesSplit CV**

In [70]:
scores = cross_val_score(estimator=model, X=X, y=y, scoring='r2', cv=tscv, n_jobs=-1, verbose=1)

print(f"SCORES: {scores}")
print(f"AVG SCORE: {np.mean(scores):.2f} +- {np.std(scores):.2f}")

SCORES: [0.96015185 0.99052785 0.99095546 0.98974145 0.99179943]
AVG SCORE: 0.98 +- 0.01


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.0s finished


### Run *cross_validation_predict* with **KFold CV**

In [75]:
predictions = cross_val_predict(estimator=model, X=X, y=y, cv=kfoldcv, n_jobs=-1, verbose=1)

print(len(predictions))

100


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.0s finished


### Run *cross_validation_predict* with **TimeSeriesSplit CV**

In [80]:
predictions = cross_val_predict(estimator=model, X=X, y=y, cv=tscv, n_jobs=-1, verbose=1)

print(len(predictions))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.8s finished


ValueError: ignored

Original sklearn cross_val_predict doesn't work with TimeSeriesSplit cross-validation, we need to implement our own function that handles our needs

In [124]:
def cross_val_predict2(estimator, X, y, cv, method='predict', verbose=0):
    from sklearn.base import clone
    predictions = []
    for i, (train, test) in enumerate(cv.split(X, y)):
        if verbose:
            print(f'cv-iteration {i+1}/{cv.n_splits}')
        estimator2 = clone(estimator)
        estimator2.fit(X[train], y[train])
        predictions.extend(getattr(estimator2, method)(X[test]))

    return np.array(predictions)

In [125]:
predictions = cross_val_predict2(estimator=model, X=X, y=y, cv=tscv, verbose=1)

print(len(predictions))

cv-iteration 1/5
cv-iteration 2/5
cv-iteration 3/5
cv-iteration 4/5
cv-iteration 5/5
80


### Test if this method still works for other cv methods



In [126]:
pred1 = cross_val_predict(estimator=model, X=X, y=y, cv=kfoldcv, n_jobs=-1, verbose=0)
pred2 = cross_val_predict2(estimator=model, X=X, y=y, cv=kfoldcv, verbose=0)

assert np.allclose(pred1, pred2)

In [127]:
print(pred1[:10])
print(pred2[:10])

[ 10.59601114 -40.71473302  22.16841067  28.40303972  32.06705469
 -60.41445802  35.44529266  13.88954642 -34.04926085 -26.01019357]
[ 10.59601114 -40.71473302  22.16841067  28.40303972  32.06705469
 -60.41445802  35.44529266  13.88954642 -34.04926085 -26.01019357]
