# Импортируем датасет

In [5]:
import pandas as pd

In [6]:
dataset = pd.read_csv('wine.csv')
dataset.head()

Unnamed: 0,class,Alcohol,Malicacid,Ash,Alcalinity_of_ash,Magnesium,Total_phenols,Flavanoids,Nonflavanoid_phenols,Proanthocyanins,Color_intensity,Hue,0D280_0D315_of_diluted_wines,Proline
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [7]:
from sklearn.model_selection import train_test_split

In [8]:
X = dataset.drop(columns=['class']).values
y = dataset['class'].values

# Делаем кросс-валидацию

In [12]:
from sklearn.model_selection import KFold

In [13]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)
kf.get_n_splits(X)

5

Посмотрим на фолды

In [15]:
for i, (train_index, test_index) in enumerate(kf.split(X)):
    print(f"Fold {i}:")
    print(f"  Train: index={train_index}")
    print(f"  Test:  index={test_index}")

Fold 0:
  Train: index=[  0   1   2   3   4   5   6   7   8  10  11  13  14  17  20  21  22  23
  25  26  27  28  32  33  34  35  36  37  38  39  40  43  44  46  47  48
  49  50  51  52  53  54  56  57  58  59  61  62  63  64  68  69  70  71
  72  73  74  75  76  77  78  79  80  81  83  84  85  86  87  88  89  91
  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107 108 110
 112 115 116 120 121 122 123 124 125 126 127 129 130 131 132 133 134 135
 136 137 138 139 142 143 144 146 147 148 149 151 152 153 154 155 156 157
 158 159 160 161 162 163 165 166 167 168 170 172 173 175 176 177]
  Test:  index=[  9  12  15  16  18  19  24  29  30  31  41  42  45  55  60  65  66  67
  82  90 109 111 113 114 117 118 119 128 140 141 145 150 164 169 171 174]
Fold 1:
  Train: index=[  0   1   3   5   7   8   9  10  12  13  14  15  16  17  18  19  20  21
  23  24  25  28  29  30  31  33  34  35  37  39  40  41  42  43  44  45
  46  47  48  49  50  52  53  54  55  57  58  59  60  61  62  63  64 

# Тестовое используем алгоритм kNN

In [24]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
import numpy as np

In [23]:
neigh = KNeighborsClassifier(n_neighbors=3)
cross_val = cross_val_score(neigh, X, y, cv=5, scoring='accuracy')  # передаем алгоритм, датасет, кол-во фолдов

In [22]:
cross_val  # получаем accuracy на каждом фолде

array([0.63888889, 0.66666667, 0.66666667, 0.62857143, 0.82857143])

In [25]:
np.mean(cross_val)  # усредняем

np.float64(0.6858730158730159)

# Подбираем параметр k

In [33]:
accuracy = {}
maxi = -1
k = 0

for i in range(1, 51):
  knn = KNeighborsClassifier(n_neighbors=i)
  cross_val = cross_val_score(knn, X, y, cv=5, scoring='accuracy')
  accuracy[i] = float(np.mean(cross_val))
  if accuracy[i] > maxi:
    maxi = accuracy[i]
    k = i

In [34]:
print(f"Оптимальное количество соседей {k}, accuracy = {maxi}")

Оптимальное количество соседей 32, accuracy = 0.7142857142857142


# Проводим масштабирование

In [35]:
from sklearn.preprocessing import scale

In [38]:
X_scale = scale(X)

In [39]:
X_scale

array([[ 1.51861254, -0.5622498 ,  0.23205254, ...,  0.36217728,
         1.84791957,  1.00942431],
       [ 0.24628963, -0.49941338, -0.82799632, ...,  0.40605066,
         1.1134493 ,  0.96224586],
       [ 0.19687903,  0.02123125,  1.10933436, ...,  0.31830389,
         0.78858745,  1.38685185],
       ...,
       [ 0.33275817,  1.74474449, -0.38935541, ..., -1.61212515,
        -1.48544548,  0.28602152],
       [ 0.20923168,  0.22769377,  0.01273209, ..., -1.56825176,
        -1.40069891,  0.30174767],
       [ 1.39508604,  1.58316512,  1.36520822, ..., -1.52437837,
        -1.42894777, -2.16411227]])

# Снова ищем оптимальное k, после масштабирования

In [44]:
accuracy_scale = {}
maxi = -1
k = 0

for i in range(1, 51):
  knn_scale = KNeighborsClassifier(n_neighbors=i)
  cross_val_scale = cross_val_score(knn_scale, X_scale, y, cv=5, scoring='accuracy')
  accuracy_scale[i] = float(np.mean(cross_val_scale))
  if accuracy_scale[i] > maxi:
    maxi = accuracy_scale[i]
    k = i

In [45]:
print(f"Оптимальное количество соседей (с масштабированием) {k}, accuracy = {maxi}")

Оптимальное количество соседей (с масштабированием) 7, accuracy = 0.9665079365079364
