http://scikit-learn.org/stable/modules/cross_validation.html#cross-validation

# cross_val_score

In [7]:
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn import datasets 

# Load the iris dataset and set up a linear-kernel support vector classifier.
iris = datasets.load_iris()
clf = SVC(C=1, kernel='linear')

# Evaluate the classifier with 5-fold cross-validation; the cell displays
# one score per fold.
scores = cross_val_score(estimator=clf, X=iris.data, y=iris.target, cv=5)
scores

array([0.96666667, 1.        , 0.96666667, 0.96666667, 1.        ])

In [8]:
# Summarize the per-fold scores by their mean and standard deviation.
scores.mean(), scores.std()

(0.9800000000000001, 0.016329931618554516)

In [10]:
# Same 5-fold evaluation, but scored with the macro-averaged F1 metric
# instead of the default scoring.
scores = cross_val_score(
    estimator=clf,
    X=iris.data,
    y=iris.target,
    scoring='f1_macro',
    cv=5,
)
scores

array([0.96658312, 1.        , 0.96658312, 0.96658312, 1.        ])

In [12]:
from sklearn.model_selection import ShuffleSplit
# Report how many samples the dataset contains.
n_samples = len(iris.data)
print(n_samples)

# Use three shuffled splits (30% held out each time, fixed seed) as the
# cross-validation strategy instead of an integer fold count.
cv = ShuffleSplit(random_state=0, test_size=0.3, n_splits=3)
cross_val_score(estimator=clf, X=iris.data, y=iris.target, cv=cv)

150


array([0.97777778, 0.97777778, 1.        ])

In [21]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

# Hold out 40% of the samples as a test set (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=0.4, random_state=0
)

# Fit the scaler on the training set only, then standardize the training set.
scaler = preprocessing.StandardScaler()
scaler.fit(X_train)
X_train_transformed = scaler.transform(X_train)
# Alternatively:
# X_train_transformed = preprocessing.StandardScaler().fit_transform(X_train)
# However, the test set must be standardized the same way as the training set,
# so it has to be transformed with the scaler that was fit on the training data.

# Train the classifier on the standardized training data.
clf = SVC(C=1)
clf.fit(X_train_transformed, y_train)

# Apply the training-set scaling to the test set before scoring.
X_test_transformed = scaler.transform(X_test)
clf.score(X_test_transformed, y_test)

0.9333333333333333

# cross_validate

In [28]:
# cross_validate

from sklearn.model_selection import cross_validate
from sklearn.metrics import recall_score
# Evaluate two metrics at once; each metric name yields a 'test_<name>'
# entry in the returned dict.
scoring = ['precision_macro', 'recall_macro']
clf = SVC(C=1, kernel='linear', random_state=0)
scores = cross_validate(
    estimator=clf,
    X=iris.data,
    y=iris.target,
    cv=5,
    scoring=scoring,
    return_train_score=False,
)
scores

{'fit_time': array([0.00048685, 0.0003674 , 0.00026011, 0.00030422, 0.00027204]),
 'score_time': array([0.00095105, 0.00067925, 0.00063014, 0.00073957, 0.00066805]),
 'test_precision_macro': array([0.96969697, 1.        , 0.96969697, 0.96969697, 1.        ]),
 'test_recall_macro': array([0.96666667, 1.        , 0.96666667, 0.96666667, 1.        ])}

In [30]:
# `make_scorer` is part of the public `sklearn.metrics` API; the private
# `sklearn.metrics.scorer` module was deprecated and later removed.
from sklearn.metrics import make_scorer

# Map custom result names to scorers: one predefined scorer string and one
# scorer built from a metric function.
# NOTE(review): the key 'rec_micro' actually labels a *macro*-averaged recall
# (average='macro'); the key is kept so result names match the recorded output.
scoring = {'prec_macro': 'precision_macro',
           'rec_micro': make_scorer(recall_score, average='macro')}

# return_train_score=True adds 'train_<name>' entries next to 'test_<name>'.
scores = cross_validate(clf, iris.data, iris.target,
                        scoring=scoring, cv=5, return_train_score=True)
scores

{'fit_time': array([0.00068212, 0.00051451, 0.00045586, 0.00052595, 0.00044179]),
 'score_time': array([0.00129223, 0.00111127, 0.00113058, 0.00111961, 0.00118947]),
 'test_prec_macro': array([0.96969697, 1.        , 0.96969697, 0.96969697, 1.        ]),
 'test_rec_micro': array([0.96666667, 1.        , 0.96666667, 0.96666667, 1.        ]),
 'train_prec_macro': array([0.97674419, 0.97674419, 0.99186992, 0.98412698, 0.98333333]),
 'train_rec_micro': array([0.975     , 0.975     , 0.99166667, 0.98333333, 0.98333333])}

In [31]:
# Single-metric evaluation: with one scorer string the result keys are the
# generic 'test_score' (and 'train_score' when train scores are returned).
# NOTE(review): cv and return_train_score are left at their defaults, which
# depend on the installed scikit-learn version -- confirm against the output.
scores = cross_validate(clf, iris.data, iris.target,
                        scoring='precision_macro')
scores

{'fit_time': array([0.00048423, 0.00031447, 0.00027585]),
 'score_time': array([0.00039864, 0.00042534, 0.00033927]),
 'test_score': array([1.        , 0.96491228, 0.98039216]),
 'train_score': array([0.98095238, 1.        , 0.99047619])}

# cross_val_predict

In [33]:
from sklearn.model_selection import cross_val_predict
from sklearn import metrics

# Collect a cross-validated prediction for every sample (10 folds), then
# compare the predictions against the true labels.
predicted = cross_val_predict(estimator=clf, X=iris.data, y=iris.target, cv=10)
metrics.accuracy_score(y_true=iris.target, y_pred=predicted)


0.9733333333333334

In [34]:
# Display the predicted class label for every sample.
predicted

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

# K-fold

In [36]:
import numpy as np
from sklearn.model_selection import KFold

# Split four samples into two folds and show the train/test indices of
# each split.
X = list('abcd')
kf = KFold(n_splits=2)
for train, test in kf.split(X):
    print("%s %s" % (train, test))

[2 3] [0 1]
[0 1] [2 3]


In [39]:
# `train` and `test` still hold the index arrays from the last iteration of
# the KFold loop in the previous cell.
train, test

(array([0, 1]), array([2, 3]))

In [43]:
# Small toy dataset: four 2-D points with binary labels.
X = np.array([
    [0., 0.],
    [1., 1.],
    [-1., -1.],
    [2., 2.],
])
y = np.array([0, 1, 0, 1])

# Use the index arrays from a cross-validation split to slice out the
# training and test portions of the features and labels.
X_train, y_train = X[train], y[train]
X_test, y_test = X[test], y[test]
X_train, X_test, y_train, y_test

(array([[0., 0.],
        [1., 1.]]), array([[-1., -1.],
        [ 2.,  2.]]), array([0, 1]), array([0, 1]))

# Repeated K-Fold

In [44]:
from sklearn.model_selection import RepeatedKFold

# Four samples are enough to show the splits.
X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])

# Repeat 2-fold cross-validation twice; the fixed seed makes the
# randomized splits repeatable.
random_state = 12883823
rkf = RepeatedKFold(random_state=random_state, n_repeats=2, n_splits=2)

for train, test in rkf.split(X):
    print("%s %s" % (train, test))

[2 3] [0 1]
[0 1] [2 3]
[0 2] [1 3]
[1 3] [0 2]


# Leave One Out (LOO)

In [45]:
from sklearn.model_selection import LeaveOneOut

# Leave-one-out: each sample is used once as a single-element test set.
X = list(range(1, 5))
loo = LeaveOneOut()

for train, test in loo.split(X):
    print(train, test)

[1 2 3] [0]
[0 2 3] [1]
[0 1 3] [2]
[0 1 2] [3]


# Leave P Out (LPO)

In [47]:
from sklearn.model_selection import LeavePOut

# Leave-P-out with p=2: each pair of samples is used once as the test set.
X = np.ones(4)
lpo = LeavePOut(2)
for train, test in lpo.split(X):
    print(train, test)


[2 3] [0 1]
[1 3] [0 2]
[1 2] [0 3]
[0 3] [1 2]
[0 2] [1 3]
[0 1] [2 3]
