In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import datasets

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, cross_val_score

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import mean_squared_error, r2_score


In [2]:
iris = datasets.load_iris()
iris.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename'])

In [3]:
print(iris.DESCR[18:1000])


Iris plants dataset
--------------------

**Data Set Characteristics:**

    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
                
    :Summary Statistics:

                    Min  Max   Mean    SD   Class Correlation
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)


In [4]:
data = pd.DataFrame(iris.data, columns = iris.feature_names)
data.head(-10)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
135,7.7,3.0,6.1,2.3
136,6.3,3.4,5.6,2.4
137,6.4,3.1,5.5,1.8
138,6.0,3.0,4.8,1.8


In [5]:
print(iris.target_names)
print(iris.target)

['setosa' 'versicolor' 'virginica']
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]


In [6]:
logistic_regres = LogisticRegression(max_iter=1000)

# разбивщик на 5 кросс-датасетов
cv = KFold(n_splits=5)

for split_idx, (train_idx, test_idx) in enumerate(cv.split(iris.data, iris.target)):
    x_train, x_test = iris.data  [train_idx], iris.data  [test_idx]
    y_train, y_test = iris.target[train_idx], iris.target[test_idx]
    
    # обучаем
    logistic_regres.fit(x_train, y_train)
    # проверяем
    score = logistic_regres.score(x_test, y_test)
    
    print("Split {}, score: {:.2f}".format(split_idx, score))

Split 0, score: 1.00
Split 1, score: 1.00
Split 2, score: 0.87
Split 3, score: 0.93
Split 4, score: 0.83


In [7]:
# Усреднее по полученным результатам с помощью 
# cross_val_score()

cv_score = cross_val_score(
    logistic_regres, iris.data, iris.target,
    scoring="accuracy", cv=cv
)

print("Cross value score", cv_score)
print("Mean cross val score", cv_score.mean())

Cross value score [1.         1.         0.86666667 0.93333333 0.83333333]
Mean cross val score 0.9266666666666665
