# Sklearn

## sklearn.model_selection

Документация: https://scikit-learn.org/stable/modules/cross_validation.html

In [1]:
from sklearn import model_selection, datasets
import numpy as np

### Разовое разбиение данных на обучение и тест с помощью train_test_split

In [2]:
# Load the Iris dataset bundled with scikit-learn; its .data and .target
# attributes are the inputs split into train and test parts below.
iris = datasets.load_iris()

In [3]:
# Hold out 30% of the samples as a test set.
# random_state pins the shuffle so the split (and the outputs derived from it)
# is the same after every Restart & Run All.
train_data, test_data, train_labels, test_labels = model_selection.train_test_split(
    iris.data,
    iris.target,
    test_size=0.3,
    random_state=0,
)

In [5]:
# Sanity check: the test split should really hold 0.3 of all samples.
len(test_labels) / len(iris.data)

0.3

In [8]:
# Peek at the first five feature rows of the train and test splits.
tuple(split[:5] for split in (train_data, test_data))

(array([[ 6.4,  2.7,  5.3,  1.9],
        [ 6.9,  3.2,  5.7,  2.3],
        [ 5.4,  3.4,  1.7,  0.2],
        [ 4.4,  2.9,  1.4,  0.2],
        [ 6.2,  2.8,  4.8,  1.8]]), array([[ 5.1,  3.5,  1.4,  0.2],
        [ 4.6,  3.4,  1.4,  0.3],
        [ 6.9,  3.1,  4.9,  1.5],
        [ 6.7,  3. ,  5.2,  2.3],
        [ 6.3,  3.3,  4.7,  1.6]]))

In [14]:
# Print the training labels, then the test labels on the following line.
print(train_labels, test_labels, sep=' \n ')

[2 2 0 0 2 2 1 2 2 0 2 0 0 2 2 1 0 2 0 1 1 1 1 1 1 0 0 1 1 1 1 1 2 1 1 2 1
 2 0 0 2 0 0 2 2 2 1 0 0 2 0 1 1 2 2 2 2 2 1 0 0 0 1 1 0 2 0 0 2 2 0 1 1 2
 0 0 0 0 0 1 2 2 0 1 2 1 1 0 0 1 1 1 2 0 0 1 2 1 2 0 2 2 1 2 1] 
 [0 0 1 2 1 0 0 0 2 2 1 2 0 1 1 2 1 1 0 1 0 1 1 0 1 0 0 1 1 2 2 2 1 2 0 2 2
 0 0 0 0 2 1 2 2]


array([0, 0, 1, 2, 1, 0, 0, 0, 2, 2, 1, 2, 0, 1, 1, 2, 1, 1, 0, 1, 0, 1, 1,
       0, 1, 0, 0, 1, 1, 2, 2, 2, 1, 2, 0, 2, 2, 0, 0, 0, 0, 2, 1, 2, 2])

### Стратегии проведения кросс-валидации

In [15]:
# Build a tiny stand-in dataset whose elements equal their own positions.
X = range(10)

#### KFold

In [17]:
# Five folds without shuffling: each fold is used as the test set exactly once.
kf = model_selection.KFold(n_splits=5)
for fold in kf.split(X):
    # Each fold is a (train indices, test indices) pair.
    print(*fold)

[2 3 4 5 6 7 8 9] [0 1]
[0 1 4 5 6 7 8 9] [2 3]
[0 1 2 3 6 7 8 9] [4 5]
[0 1 2 3 4 5 8 9] [6 7]
[0 1 2 3 4 5 6 7] [8 9]


In [18]:
# Two shuffled folds. No random_state is given, so the assignment of samples
# to folds may differ from run to run.
kf = model_selection.KFold(shuffle=True, n_splits=2)
for train_idx, test_idx in kf.split(X):
    print(train_idx, test_idx)

[0 1 3 6 7] [2 4 5 8 9]
[2 4 5 8 9] [0 1 3 6 7]


In [24]:
# Two shuffled folds again, this time with a fixed seed so the same
# partition is produced on every run.
kf = model_selection.KFold(n_splits=2, shuffle=True, random_state=1)
for split in kf.split(X):
    # Each split is a (train indices, test indices) pair.
    print(*split)

[1 3 5 7 8] [0 2 4 6 9]
[0 2 4 6 9] [1 3 5 7 8]


#### StratifiedKFold

In [28]:
# Two balanced classes: five samples of class 0 followed by five of class 1.
y = np.array([label for label in (0, 1) for _ in range(5)])
print(y)

# Stratified folds split the samples with respect to the class labels in y;
# the seed pins the shuffle.
skf = model_selection.StratifiedKFold(n_splits=2, shuffle=True, random_state=0)
for train_idx, test_idx in skf.split(X, y):
    print(train_idx, test_idx)

[0 0 0 0 0 1 1 1 1 1]
[3 4 8 9] [0 1 2 5 6 7]
[0 1 2 5 6 7] [3 4 8 9]


In [40]:
# The same two balanced classes, but interleaved (0, 1, 0, 1, ...) rather than grouped.
target = np.array([0, 1] * 5)
print(target)

# shuffle=True without a seed yields different folds on every run; fixing
# random_state keeps the printed folds reproducible.
skf = model_selection.StratifiedKFold(n_splits=2, shuffle=True, random_state=0)
for train_indices, test_indices in skf.split(X, target):
    print(train_indices, test_indices)

[0 1 0 1 0 1 0 1 0 1]
[2 3 4 5] [0 1 6 7 8 9]
[0 1 6 7 8 9] [2 3 4 5]


#### ShuffleSplit

In [43]:
# Five independent random splits, each holding out 20% of the samples.
# Test sets of different splits are drawn independently and may overlap.
# random_state pins the permutations so the printed splits are reproducible.
ss = model_selection.ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)

for train_indices, test_indices in ss.split(X):
    print(train_indices, test_indices)

[0 7 1 5 2 3 6 9] [4 8]
[6 5 1 3 4 0 9 2] [7 8]
[2 6 5 9 1 0 3 4] [7 8]
[8 7 3 6 2 5 9 4] [0 1]
[4 9 8 5 7 0 2 1] [3 6]


#### StratifiedShuffleSplit

In [49]:
# Two balanced classes: five samples of class 0 followed by five of class 1.
target = np.array([0] * 5 + [1] * 5)
print(target)

# Four random splits with 20% held out, stratified by the labels in target.
# random_state pins the randomness so the printed splits are reproducible.
sss = model_selection.StratifiedShuffleSplit(n_splits=4, test_size=0.2, random_state=0)
for train_indices, test_indices in sss.split(X, target):
    print(train_indices, test_indices)

[0 0 0 0 0 1 1 1 1 1]
[4 6 8 7 1 9 3 2] [0 5]
[2 1 9 8 7 0 5 4] [6 3]
[6 5 9 3 0 4 1 8] [7 2]
[3 2 9 1 8 5 7 4] [6 0]


#### Leave-One-Out

In [54]:
# Leave-One-Out: every sample is the test set exactly once, and the
# remaining samples form the training set.
loo = model_selection.LeaveOneOut()

for split in loo.split(X):
    # Each split is a (train indices, single test index) pair.
    print(*split)

[1 2 3 4 5 6 7 8 9] [0]
[0 2 3 4 5 6 7 8 9] [1]
[0 1 3 4 5 6 7 8 9] [2]
[0 1 2 4 5 6 7 8 9] [3]
[0 1 2 3 5 6 7 8 9] [4]
[0 1 2 3 4 6 7 8 9] [5]
[0 1 2 3 4 5 7 8 9] [6]
[0 1 2 3 4 5 6 8 9] [7]
[0 1 2 3 4 5 6 7 9] [8]
[0 1 2 3 4 5 6 7 8] [9]


Больше стратегий проведения кросс-валидации доступно здесь: https://scikit-learn.org/stable/modules/cross_validation.html#cross-validation-iterators