# Sklearn

## sklearn.model_selection

документация: http://scikit-learn.org/stable/modules/cross_validation.html

In [4]:
from sklearn import model_selection, datasets

import numpy as np

### Разовое разбиение данных на обучение и тест с помощью train_test_split

In [2]:
iris = datasets.load_iris()

In [3]:
train_data, test_data, train_labels, test_labels = model_selection.train_test_split(iris.data, iris.target, 
                                                                                     test_size = 0.3)

In [4]:
#убедимся, что тестовая выборка действительно составляет 0.3 от всех данных
float(len(test_labels))/len(iris.data)

0.3

In [6]:
print('Размер обучающей выборки: {} объектов \nРазмер тестовой выборки: {} объектов'.format(len(train_data),
                                                                                            len(test_data)))

Размер обучающей выборки: 105 объектов 
Размер тестовой выборки: 45 объектов


In [7]:
print('Обучающая выборка:\n', train_data[:5])
print('\n')
print('Тестовая выборка:\n', test_data[:5])

Обучающая выборка:
 [[7.2 3.  5.8 1.6]
 [6.9 3.2 5.7 2.3]
 [5.4 3.7 1.5 0.2]
 [5.1 3.8 1.6 0.2]
 [5.3 3.7 1.5 0.2]]


Тестовая выборка:
 [[5.6 3.  4.1 1.3]
 [7.7 2.8 6.7 2. ]
 [7.  3.2 4.7 1.4]
 [5.9 3.  4.2 1.5]
 [6.7 3.3 5.7 2.1]]


In [8]:
print('Метки классов на обучающей выборке:\n', train_labels)
print('\n')
print('Метки классов на тестовой выборке:\n', test_labels)

Метки классов на обучающей выборке:
 [2 2 0 0 0 2 2 1 1 2 0 0 1 1 2 2 1 0 0 2 0 0 1 1 0 0 0 0 2 1 1 2 1 2 2 0 1
 2 1 0 2 1 2 2 1 1 0 2 0 0 0 1 0 1 2 1 1 2 2 1 2 1 0 1 1 1 0 0 0 0 0 0 1 2
 0 0 1 2 2 1 1 2 0 2 2 0 1 2 0 1 1 2 1 0 2 1 1 2 2 0 1 2 1 2 0]


Метки классов на тестовой выборке:
 [1 2 1 1 2 1 2 1 1 2 1 0 0 1 2 1 2 0 2 0 0 2 0 1 1 1 0 0 0 2 0 2 2 2 2 2 0
 0 1 2 1 2 0 0 0]


### Стратегии проведения кросс-валидации

In [13]:
#сгенерируем короткое подобие датасета, где элементы совпадают с порядковым номером
X = range(0,50)

#### KFold

In [11]:
kf = model_selection.KFold(n_splits = 5)
for train_indices, test_indices in kf.split(X):
    print(train_indices, test_indices)

[2 3 4 5 6 7 8 9] [0 1]
[0 1 4 5 6 7 8 9] [2 3]
[0 1 2 3 6 7 8 9] [4 5]
[0 1 2 3 4 5 8 9] [6 7]
[0 1 2 3 4 5 6 7] [8 9]


In [12]:
kf

KFold(n_splits=5, random_state=None, shuffle=False)

In [14]:
kf = model_selection.KFold(n_splits = 2, shuffle = True)
for train_indices, test_indices in kf.split(X):
    print(train_indices, test_indices)

[0 2 6 8 9] [1 3 4 5 7]
[1 3 4 5 7] [0 2 6 8 9]


In [15]:
kf = model_selection.KFold(n_splits = 2, shuffle = True, random_state = 1)
for train_indices, test_indices in kf.split(X):
    print(train_indices, test_indices)

[1 3 5 7 8] [0 2 4 6 9]
[0 2 4 6 9] [1 3 5 7 8]


#### StratifiedKFold

In [17]:
y = np.array([0] * 5 + [1] * 5)
print(y)

skf = model_selection.StratifiedKFold(n_splits = 2, shuffle = True, random_state = 0)
for train_indices, test_indices in skf.split(X, y):
    print(train_indices, test_indices)

[0 0 0 0 0 1 1 1 1 1]
[3 4 8 9] [0 1 2 5 6 7]
[0 1 2 5 6 7] [3 4 8 9]


In [None]:
target = np.array([0, 1] * 5)
print target

skf = model_selection.StratifiedKFold(n_splits = 2,shuffle = True)
for train_indices, test_indices in skf.split(X, target):
    print train_indices, test_indices

#### ShuffleSplit

In [14]:
ss = model_selection.ShuffleSplit(n_splits = 10, test_size = 0.2)

for train_indices, test_indices in ss.split(X):
    print(train_indices, test_indices)

[ 7 33 21 48 30 12 26 13 24 39  5  0 23 10  3  9 49 19 22 20 44 15 31 38
 42 45 41  8 16 40 27 47 14 46 17 43 25 18 11  4] [ 2 28 34  6  1 35 29 32 37 36]
[17 47  6 20 38  7 11 28 22  9 37 23 30  2  3 18 43 36 25 26 21  8 40 39
 34 44 42 41 24 16 31  1 46 33 49 19 29  4 35 13] [ 0 12 45 14 10 27 32 48 15  5]
[49 18 21 33 44  0  4 12 22 19  8  2 36 39 40 26 38 15  5  3 42 34 35 16
 17 10 47 31 14  6 41 43 46 45 48 37 29 30  7 27] [24 23 11 25  9 32 13 20 28  1]
[ 4  1  3 21 11 28  7 33 22 35 31 44 39 17  2 13 45 40 49 48  8 19 23 18
 14 29 27 25 12  6 20 36 43 46 26 41 37 24 16 10] [42  0 15  5 30 47  9 34 38 32]
[29 20 12 21 37 17 33 10 26 43  0 16 27  7  8 46  5  3 36  1 13 40 11  6
 22  9  4 45  2 49 32 34 39 35 18 14 28 31 30 44] [42 41 23 19 47 25 24 48 15 38]
[ 2  5 18 39 11 10 30 25 48 14 35 23  1 24 33 28 49  3 31 12 41 32 46 13
 16 27 44 38 36 20 45 37 47 19 34  6 29  7 40 17] [ 4 26 21 15 43  8  9  0 42 22]
[14 35 18 30 12 36 33 25 27 38  2 19  5  3 40 13 39 49 17 32 15  7  8 

#### StratifiedShuffleSplit

In [12]:
target = np.array([0] * 5 + [1] * 5)
print(target)

sss = model_selection.StratifiedShuffleSplit(n_splits = 4, test_size = 0.2)
for train_indices, test_indices in sss.split(X, target):
    print(train_indices, test_indices)

[0 0 0 0 0 1 1 1 1 1]
[6 0 4 3 1 5 9 8] [7 2]
[0 1 2 8 5 9 7 3] [4 6]
[6 2 0 5 7 3 1 9] [4 8]
[8 1 4 2 5 6 0 9] [7 3]


#### Leave-One-Out

In [22]:
loo = model_selection.LeaveOneOut()

for train_indices, test_index in loo.split(X):
    print(train_indices, test_index)

[1 2 3 4 5 6 7 8 9] [0]
[0 2 3 4 5 6 7 8 9] [1]
[0 1 3 4 5 6 7 8 9] [2]
[0 1 2 4 5 6 7 8 9] [3]
[0 1 2 3 5 6 7 8 9] [4]
[0 1 2 3 4 6 7 8 9] [5]
[0 1 2 3 4 5 7 8 9] [6]
[0 1 2 3 4 5 6 8 9] [7]
[0 1 2 3 4 5 6 7 9] [8]
[0 1 2 3 4 5 6 7 8] [9]


Больше стратегий проведения кросс-валидации доступно здесь: http://scikit-learn.org/stable/modules/cross_validation.html#cross-validation-iterators