**Корректность проверена на Python 3.6:**
+ numpy 1.15.4
+ sklearn 0.20.2

# Sklearn

## sklearn.model_selection

документация: http://scikit-learn.org/stable/modules/cross_validation.html

In [2]:
from sklearn import model_selection, datasets

import numpy as np

### Разовое разбиение данных на обучение и тест с помощью train_test_split

In [3]:
iris = datasets.load_iris()

In [4]:
train_data, test_data, train_labels, test_labels = model_selection.train_test_split(iris.data, iris.target, 
                                                                                     test_size = 0.3)

In [5]:
#убедимся, что тестовая выборка действительно составляет 0.3 от всех данных
float(len(test_labels))/len(iris.data)

0.3

In [6]:
print('Размер обучающей выборки: {} объектов \nРазмер тестовой выборки: {} объектов'.format(len(train_data),
                                                                                            len(test_data)))

Размер обучающей выборки: 105 объектов 
Размер тестовой выборки: 45 объектов


In [7]:
print('Обучающая выборка:\n', train_data[:5])
print('\n')
print('Тестовая выборка:\n', test_data[:5])

Обучающая выборка:
 [[5.8 4.  1.2 0.2]
 [5.4 3.4 1.7 0.2]
 [6.5 3.  5.2 2. ]
 [5.8 2.7 5.1 1.9]
 [5.5 2.4 3.7 1. ]]


Тестовая выборка:
 [[6.7 3.3 5.7 2.1]
 [6.1 3.  4.6 1.4]
 [6.4 2.8 5.6 2.2]
 [5.6 2.9 3.6 1.3]
 [5.7 2.5 5.  2. ]]


In [8]:
print('Метки классов на обучающей выборке:\n', train_labels)
print('\n')
print('Метки классов на тестовой выборке:\n', test_labels)

Метки классов на обучающей выборке:
 [0 0 2 2 1 2 0 0 2 1 0 1 0 0 0 2 2 1 1 1 2 2 1 2 1 2 2 0 0 1 2 2 1 0 0 1 2
 0 1 1 0 0 0 2 0 1 2 2 0 0 1 1 0 0 0 0 1 2 0 0 1 2 0 2 1 2 0 2 0 1 0 2 2 1
 1 0 1 1 1 1 2 2 0 2 2 0 2 1 0 1 2 2 0 1 1 2 0 2 0 2 0 1 1 1 1]


Метки классов на тестовой выборке:
 [2 1 2 1 2 0 0 0 0 2 2 0 2 1 1 2 1 0 2 1 2 0 2 2 0 0 2 1 1 2 2 2 1 1 0 1 1
 0 1 2 1 0 1 1 0]


### Стратегии проведения кросс-валидации

In [9]:
#сгенерируем короткое подобие датасета, где элементы совпадают с порядковым номером
X = range(0,10)

#### KFold

In [10]:
kf = model_selection.KFold(n_splits = 5)
for train_indices, test_indices in kf.split(X):
    print(train_indices, test_indices)

[2 3 4 5 6 7 8 9] [0 1]
[0 1 4 5 6 7 8 9] [2 3]
[0 1 2 3 6 7 8 9] [4 5]
[0 1 2 3 4 5 8 9] [6 7]
[0 1 2 3 4 5 6 7] [8 9]


In [11]:
kf = model_selection.KFold(n_splits = 2, shuffle = True)
for train_indices, test_indices in kf.split(X):
    print(train_indices, test_indices)

[1 2 4 5 8] [0 3 6 7 9]
[0 3 6 7 9] [1 2 4 5 8]


In [12]:
kf = model_selection.KFold(n_splits = 2, shuffle = True, random_state = 1)
for train_indices, test_indices in kf.split(X):
    print(train_indices, test_indices)

[1 3 5 7 8] [0 2 4 6 9]
[0 2 4 6 9] [1 3 5 7 8]


#### StratifiedKFold

In [13]:
y = np.array([0] * 5 + [1] * 5)
print(y)

skf = model_selection.StratifiedKFold(n_splits = 2, shuffle = True, random_state = 0)
for train_indices, test_indices in skf.split(X, y):
    print(train_indices, test_indices)

[0 0 0 0 0 1 1 1 1 1]
[3 4 6 8 9] [0 1 2 5 7]
[0 1 2 5 7] [3 4 6 8 9]


In [14]:
target = np.array([0, 1] * 5)
print(target)

skf = model_selection.StratifiedKFold(n_splits = 2,shuffle = True)
for train_indices, test_indices in skf.split(X, target):
    print(train_indices, test_indices)

[0 1 0 1 0 1 0 1 0 1]
[1 4 5 7 8] [0 2 3 6 9]
[0 2 3 6 9] [1 4 5 7 8]


#### ShuffleSplit

In [21]:
X = range(0,50)
ss = model_selection.ShuffleSplit(n_splits = 10, test_size = 10)
print(target)
i = 1
for train_indices, test_indices in ss.split(X):
    print(train_indices, test_indices)
    print(i, len(train_indices), len(test_indices))
    i+=1

[0 0 0 0 0 1 1 1 1 1]
[ 3  1 39 22 12 46 44  7 17  2  9 35 41 49 28 30  4 36 19 21 26  5 34 23
 20 43 40 16 32 25 11 42 48 31 27 37 29 38 45 24] [15 14 13 18 47  6  0 33 10  8]
1 40 10
[29 47 38 10 28 40 18 30 46  8 37 35  4  5 32 39 14 43  0 33 25 17 36 31
 13  2  1  6 42 44 20 11 15 45  7 12 19 23 24 48] [21 49 26 22 34  3  9 16 27 41]
2 40 10
[28 14 48 44 27 22 35 16 36 47 25  1 37 42 19 43 17 46  3 34 33 26  6 32
 13 12 24 39 45  8 15  2  0 11  7  9 31 49 30 20] [ 5 23 29 10  4 41 40 18 21 38]
3 40 10
[12 16 45 31 40 20 33 36 30 32 41  7 38  1 35 29 19 11 28 17 48 34  5 25
 13  0 47  6 46 14  8  3 37 43 24 27 39 22 44 15] [26 21 10 49 18 23  9  4 42  2]
4 40 10
[ 1 28 21 35 44 39 32 15 46 26 25 34 13  6 19 42  3 24  9 41  5 30 23 37
  4 16  8 10 33 18 48 14 17 20 43 22 36 49  2 11] [29 27  0  7 45 47 38 40 12 31]
5 40 10
[43 39 11 18 16 46 31 12 13 44  7 24 42  5  9 10 32 49  2  6 29 23 47 41
 48 40 15 26 34 35 25 22  3  8 28 36  1 37 38 14] [33 20 19 27 30  4 17 45  0 21]
6 40 10


#### StratifiedShuffleSplit

In [18]:
target = np.array([0] * 5 + [1] * 5)

sss = model_selection.StratifiedShuffleSplit(n_splits = 4, test_size = 0.2)
for train_indices, test_indices in sss.split(X, target):
    print(train_indices, test_indices)

[0 0 0 0 0 1 1 1 1 1]


ValueError: Found input variables with inconsistent numbers of samples: [50, 10]

#### Leave-One-Out

In [None]:
loo = model_selection.LeaveOneOut()

for train_indices, test_index in loo.split(X):
    print(train_indices, test_index)

Больше стратегий проведения кросс-валидации доступно здесь: http://scikit-learn.org/stable/modules/cross_validation.html#cross-validation-iterators