# Sklearn

## sklearn.cross_validation

документация: http://scikit-learn.org/stable/modules/cross_validation.html

In [25]:
from sklearn import datasets
from sklearn import model_selection 
import numpy as np

### Разовое разбиение данных на обучение и тест с помощью train_test_split

In [32]:
iris = datasets.load_iris()

In [33]:
train_data, test_data, train_labels, test_labels = model_selection.train_test_split(iris.data, iris.target, 
                                                                                     test_size = 0.3)

In [34]:
#убедимся, что тестовая выборка действительно составляет 0.3 от всех данных
float(len(test_labels))/len(iris.data)

0.3

In [35]:
print ('Размер обучающей выборки: {} объектов \nРазмер тестовой выборки: {} объектов'.format(len(train_data),
                                                                                            len(test_data)))

Размер обучающей выборки: 105 объектов 
Размер тестовой выборки: 45 объектов


In [36]:
print ('Обучающая выборка:\n', train_data[:5])
print ('\n')
print ('Тестовая выборка:\n', test_data[:5])

Обучающая выборка:
 [[6.9 3.2 5.7 2.3]
 [6.  2.2 4.  1. ]
 [5.  3.5 1.3 0.3]
 [5.1 3.8 1.9 0.4]
 [4.4 3.  1.3 0.2]]


Тестовая выборка:
 [[6.7 3.1 4.7 1.5]
 [4.5 2.3 1.3 0.3]
 [6.4 3.2 4.5 1.5]
 [4.9 3.1 1.5 0.2]
 [5.7 2.5 5.  2. ]]


In [37]:
print ('Метки классов на обучающей выборке:\n', train_labels)
print ('\n')
print ('Метки классов на тестовой выборке:\n', test_labels)

Метки классов на обучающей выборке:
 [2 1 0 0 0 0 1 0 2 1 0 2 2 1 2 0 1 1 2 2 1 1 0 1 1 0 2 2 0 1 0 1 2 0 1 1 0
 0 1 2 1 1 0 0 0 0 2 0 2 2 1 2 1 1 2 2 1 1 1 0 1 2 0 0 0 2 1 2 2 2 2 0 0 2
 2 1 2 2 2 1 2 1 0 1 1 0 1 1 1 2 1 1 2 2 2 0 0 0 1 0 1 0 1 0 2]


Метки классов на тестовой выборке:
 [1 0 1 0 2 2 1 0 1 0 2 2 0 2 2 2 0 2 1 0 2 1 2 1 2 2 0 2 0 0 1 0 0 0 1 1 2
 0 1 2 0 0 1 0 2]


### Стратегии проведения кросс-валидации

#### KFold

In [50]:
for train_indices, test_indices in model_selection.KFold(n_splits = 5).split(iris):
    print (train_indices, test_indices)

[2 3 4 5 6] [0 1]
[0 1 4 5 6] [2 3]
[0 1 2 3 5 6] [4]
[0 1 2 3 4 6] [5]
[0 1 2 3 4 5] [6]


In [51]:
for train_indices, test_indices in model_selection.KFold(n_splits = 2, shuffle = True).split(iris):
    print (train_indices, test_indices)

[1 2 3] [0 4 5 6]
[0 4 5 6] [1 2 3]


In [58]:
for train_indices, test_indices in model_selection.KFold(n_splits = 2, shuffle = True, random_state = 1).split(iris):
    print (train_indices, test_indices)

[3 4 5] [0 1 2 6]
[0 1 2 6] [3 4 5]


#### StratifiedKFold

In [56]:
target = np.array([0] * 5 + [1] * 5)
print(target)
for train_indices, test_indices in model_selection.StratifiedKFold(n_splits = 2, shuffle = True, random_state = 0).split(iris.data, target):
      print (train_indices, test_indices)

[0 0 0 0 0 1 1 1 1 1]


ValueError: Found input variables with inconsistent numbers of samples: [150, 10]

In [64]:
target = np.array([0, 1] * 5)
print(target)
for train_indices, test_indices in model_selection.StratifiedKFold(n_splits = 2,shuffle = True).split(iris.data[:10], target):
    print(train_indices, test_indices)

[0 1 0 1 0 1 0 1 0 1]
[0 1 2 3 5] [4 6 7 8 9]
[4 6 7 8 9] [0 1 2 3 5]


#### ShuffleSplit

In [63]:
for train_indices, test_indices in model_selection.ShuffleSplit(n_splits = 10, test_size = 0.2).split(iris.data[:10]):
    print(train_indices, test_indices)

[0 6 2 1 9 5 4 3] [7 8]
[2 9 3 0 7 4 8 6] [1 5]
[4 6 7 0 9 3 5 1] [8 2]
[9 4 1 8 3 5 0 6] [2 7]
[5 2 9 7 8 3 6 1] [0 4]
[0 6 5 8 4 1 3 2] [9 7]
[0 8 9 5 4 2 3 6] [7 1]
[3 6 8 1 7 9 5 2] [0 4]
[2 4 7 8 6 1 5 9] [3 0]
[4 3 6 5 1 9 8 7] [2 0]


#### StratifiedShuffleSplit

In [67]:
target = np.array([0] * 5 + [1] * 5)
for train_indices, test_indices in model_selection.StratifiedShuffleSplit(n_splits = 4, test_size = 0.2).split(iris.data[:10] ,target):
    print(train_indices, test_indices)

[4 8 2 9 5 1 7 0] [6 3]
[2 7 6 0 1 8 4 5] [9 3]
[1 9 8 5 4 0 6 2] [3 7]
[5 8 0 4 2 1 7 9] [6 3]


#### Leave-One-Out

In [73]:
for train_indices, test_index in model_selection.LeaveOneOut().split(iris.data[:10]):
    print(train_indices, test_index)

[1 2 3 4 5 6 7 8 9] [0]
[0 2 3 4 5 6 7 8 9] [1]
[0 1 3 4 5 6 7 8 9] [2]
[0 1 2 4 5 6 7 8 9] [3]
[0 1 2 3 5 6 7 8 9] [4]
[0 1 2 3 4 6 7 8 9] [5]
[0 1 2 3 4 5 7 8 9] [6]
[0 1 2 3 4 5 6 8 9] [7]
[0 1 2 3 4 5 6 7 9] [8]
[0 1 2 3 4 5 6 7 8] [9]


Больше стратегий проведения кросс-валидации доступно здесь: http://scikit-learn.org/stable/modules/cross_validation.html#cross-validation-iterators