### Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder, OneHotEncoder # used for encoding categorical data
## checking the accuracy of the model
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report

In [3]:
data = pd.read_excel('./Datas/Dry_Bean_Dataset.xlsx')
data.head()

Unnamed: 0,Area,Perimeter,MajorAxisLength,MinorAxisLength,AspectRation,Eccentricity,ConvexArea,EquivDiameter,Extent,Solidity,roundness,Compactness,ShapeFactor1,ShapeFactor2,ShapeFactor3,ShapeFactor4,Class
0,28395,610.291,208.178117,173.888747,1.197191,0.549812,28715,190.141097,0.763923,0.988856,0.958027,0.913358,0.007332,0.003147,0.834222,0.998724,SEKER
1,28734,638.018,200.524796,182.734419,1.097356,0.411785,29172,191.27275,0.783968,0.984986,0.887034,0.953861,0.006979,0.003564,0.909851,0.99843,SEKER
2,29380,624.11,212.82613,175.931143,1.209713,0.562727,29690,193.410904,0.778113,0.989559,0.947849,0.908774,0.007244,0.003048,0.825871,0.999066,SEKER
3,30008,645.884,210.557999,182.516516,1.153638,0.498616,30724,195.467062,0.782681,0.976696,0.903936,0.928329,0.007017,0.003215,0.861794,0.994199,SEKER
4,30140,620.134,201.847882,190.279279,1.060798,0.33368,30417,195.896503,0.773098,0.990893,0.984877,0.970516,0.006697,0.003665,0.9419,0.999166,SEKER


### Split X and Y

In [8]:
X = data.iloc[:, :-1]
Y = data.iloc[:, -1]
Y

0           SEKER
1           SEKER
2           SEKER
3           SEKER
4           SEKER
           ...   
13606    DERMASON
13607    DERMASON
13608    DERMASON
13609    DERMASON
13610    DERMASON
Name: Class, Length: 13611, dtype: object

### Label encoding

In [10]:
label_Y = LabelEncoder()
Y = label_Y.fit_transform(Y)
Y

array([5, 5, 5, ..., 3, 3, 3])

### Split into train test using stratify

In [12]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=0, stratify=Y,shuffle=True)
print(Y_test)

[3 1 4 ... 0 3 0]


### Scaler

In [14]:
sc_x = StandardScaler()
X_train = sc_x.fit_transform(X_train)
X_test = sc_x.transform(X_test)
print(X_test)

[[-6.92354340e-01 -8.80035541e-01 -8.03388687e-01 ...  5.86711162e-01
   1.77069632e-01  8.65923709e-01]
 [ 4.90464091e+00  3.91320975e+00  3.75640554e+00 ... -1.63224457e+00
  -3.68005947e-01 -1.10054519e+00]
 [ 4.15884831e-02  2.97718394e-01  6.69006673e-01 ... -1.18563820e+00
  -1.60070255e+00  4.75469457e-01]
 ...
 [ 4.43058258e-01  6.31751089e-01  4.80537481e-01 ... -5.31175126e-01
   7.21826536e-04  5.07039586e-01]
 [-6.35628268e-01 -7.81751856e-01 -7.03887986e-01 ...  4.15448416e-01
   6.21980274e-02  6.78217014e-01]
 [ 7.16543046e-01  1.00966853e+00  8.93490097e-01 ... -8.89740101e-01
  -4.52825337e-01  3.57830681e-01]]


#### KNN

In [16]:
import math
math.sqrt(len(Y_test))

63.90618123468183

In [18]:
classifier = KNeighborsClassifier(n_neighbors=63,p=2,metric='euclidean')
classifier.fit(X_train, Y_train)

KNeighborsClassifier(metric='euclidean', n_neighbors=63)

In [20]:
y_pred = classifier.predict(X_test)
y_pred

array([3, 1, 4, ..., 0, 3, 0])

In [22]:
cm = confusion_matrix(Y_test, y_pred)
cm

array([[348,   0,  30,   0,   1,   3,  15],
       [  0, 157,   0,   0,   0,   0,   0],
       [  8,   0, 468,   0,   5,   1,   7],
       [  0,   0,   0, 969,   1,  27,  67],
       [  0,   0,  12,   1, 553,   0,  12],
       [  7,   0,   0,  18,   0, 564,  19],
       [  2,   0,   1,  76,   8,   4, 700]])

In [24]:
class_report = classification_report(Y_test, y_pred)
print(class_report)

              precision    recall  f1-score   support

           0       0.95      0.88      0.91       397
           1       1.00      1.00      1.00       157
           2       0.92      0.96      0.94       489
           3       0.91      0.91      0.91      1064
           4       0.97      0.96      0.97       578
           5       0.94      0.93      0.93       608
           6       0.85      0.88      0.87       791

    accuracy                           0.92      4084
   macro avg       0.94      0.93      0.93      4084
weighted avg       0.92      0.92      0.92      4084



### Cross validation


In [27]:
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import cross_val_score

kf = StratifiedKFold(n_splits=5,shuffle=True, random_state=0)
cv_scores = cross_val_score(classifier,X_train,Y_train,cv=kf)
cv_scores

array([0.91080797, 0.91448059, 0.91968504, 0.93333333, 0.91286089])