**KNN - K-nearest neighbours**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Download the Iris dataset from GitHub and preview its first five rows.
iris_url = 'https://raw.githubusercontent.com/bipulshahi/Dataset/refs/heads/main/Iris.csv'
df = pd.read_csv(filepath_or_buffer=iris_url)
df.head(n=5)

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [None]:
# (number of rows, number of columns) of the Iris frame.
df.shape

(150, 6)

In [None]:
# Distinct class labels found in the target column.
species_labels = df['Species'].unique()
species_labels

array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)

In [None]:
# Work on an independent copy so that columns added for the manual KNN
# walkthrough do not end up in the original frame `df`.
df1 = df.copy(deep=True)

In [None]:
# Goal: predict the species of a flower whose sepal length, sepal width,
# petal length and petal width are known (values listed in that order).
new_feature = [
    3.1,  # sepal length
    2.5,  # sepal width
    3.4,  # petal length
    3.9,  # petal width
]

In [None]:
# Feature columns, in the same order as the values stored in new_feature.
x1 = df['SepalLengthCm']
x2 = df['SepalWidthCm']
x3 = df['PetalLengthCm']
x4 = df['PetalWidthCm']

# Euclidean distance from new_feature to every existing data point:
# square each per-feature difference, add them up, then take the square root.
# Every difference must be squared individually; squaring the whole sum
# instead lets a negative difference cancel out the other terms.
df1['distances'] = np.sqrt(
    (x1 - new_feature[0]) ** 2
    + (x2 - new_feature[1]) ** 2
    + (x3 - new_feature[2]) ** 2
    + (x4 - new_feature[3]) ** 2
)

In [None]:
# Preview the copy, which now carries the extra 'distances' column.
df1.head(n=5)

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species,distances
0,1,5.1,3.5,1.4,0.2,Iris-setosa,28.09
1,2,4.9,3.0,1.4,0.2,Iris-setosa,14.3641
2,3,4.7,3.2,1.3,0.2,Iris-setosa,14.1376
3,4,4.6,3.1,1.5,0.2,Iris-setosa,6.3504
4,5,5.0,3.6,1.4,0.2,Iris-setosa,26.2144


In [None]:
# Order all flowers from closest to farthest and show the five nearest.
df1_sorted = df1.sort_values(by='distances', ascending=True)
df1_sorted.iloc[:5]

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species,distances
57,58,4.9,2.4,3.3,1.0,Iris-versicolor,0.1296
93,94,5.0,2.3,3.3,1.0,Iris-versicolor,0.5776
60,61,5.0,2.0,3.5,1.0,Iris-versicolor,0.9409
98,99,5.1,2.5,3.0,1.1,Iris-versicolor,1.8496
8,9,4.4,2.9,1.4,0.2,Iris-setosa,4.6225


* The top 5 nearest data points to "new_feature = [3.1 , 2.5 , 3.4 , 3.9]" are shown above.
* The majority of the closest data points are 'Iris-versicolor', so "new_feature = [3.1 , 2.5 , 3.4 , 3.9]" can be classified as "Iris-versicolor".

In [None]:
# Majority vote: the most frequent species among the 5 nearest neighbours.
# mode() can return more than one value on a tie; [0] keeps the first one.
nearest_species = df1_sorted['Species'].iloc[:5]
nearest_species.mode()[0]

'Iris-versicolor'

**Scikit-Learn for KNN**

In [None]:
# Back to the original frame `df` for the scikit-learn version.
df.head(n=5)

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [None]:
# Four measurement columns are the inputs; the species label is the target.
feature_columns = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']
X = df[feature_columns]
y = df['Species']

In [None]:
from sklearn.model_selection import train_test_split

# 75% of the rows go to training, the remaining 25% to testing.
# random_state pins the shuffle so the scores and reports below are
# reproducible on every run; stratify=y keeps the species proportions the
# same in both splits.
xtrain, xtest, ytrain, ytest = train_test_split(
    X, y, train_size=0.75, random_state=42, stratify=y
)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# KNN classifier that votes among the 5 nearest training samples.
knn_settings = {'n_neighbors': 5}
modelA = KNeighborsClassifier(**knn_settings)

# Fit on the training split; the fitted estimator is the cell's output.
modelA.fit(X=xtrain, y=ytrain)

In [None]:
# Same unseen flower as in the manual walkthrough:
# sepal length, sepal width, petal length, petal width.
new_feature = [3.1 , 2.5 , 3.4 , 3.9]

# modelA was fitted on a DataFrame, so it remembers the column names.
# Wrapping the sample in a one-row DataFrame with those names avoids
# scikit-learn's "X does not have valid feature names" warning and makes
# the value-to-column mapping explicit.
new_sample = pd.DataFrame([new_feature], columns=modelA.feature_names_in_)
print(modelA.predict(new_sample))

['Iris-versicolor']


In [None]:
# Accuracy of modelA on the data it was trained on.
train_accuracy = modelA.score(xtrain, ytrain)
print(train_accuracy)

0.9642857142857143


In [None]:
# Accuracy of modelA on the held-out test split.
test_accuracy = modelA.score(xtest, ytest)
print(test_accuracy)

0.9736842105263158


**Accuracy for each category**

  * Accuracy for Iris Setosa
  * Accuracy for Iris Versicolor
  * Accuracy for Iris Virginica

In [None]:
# Predicted labels for both splits, needed for the per-class metrics below.
ytrainPred, ytestPred = (modelA.predict(split) for split in (xtrain, xtest))

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
# Cross-tabulate true vs. predicted labels on the training split.
train_confusion = confusion_matrix(ytrain, ytrainPred)
print(train_confusion)

[[36  0  0]
 [ 0 34  2]
 [ 0  2 38]]


In [None]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 on the training split.
train_report = classification_report(ytrain, ytrainPred)
print(train_report)

                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00        36
Iris-versicolor       0.94      0.94      0.94        36
 Iris-virginica       0.95      0.95      0.95        40

       accuracy                           0.96       112
      macro avg       0.96      0.96      0.96       112
   weighted avg       0.96      0.96      0.96       112



In [None]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 on the held-out test split.
test_report = classification_report(ytest, ytestPred)
print(test_report)

                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00        14
Iris-versicolor       1.00      0.93      0.96        14
 Iris-virginica       0.91      1.00      0.95        10

       accuracy                           0.97        38
      macro avg       0.97      0.98      0.97        38
   weighted avg       0.98      0.97      0.97        38



**Use the following data to predict the class of wine from the given chemical composition information**

In [None]:
# Download the wine dataset from GitHub and preview its first five rows.
wine_url = 'https://raw.githubusercontent.com/bipulshahi/Dataset/refs/heads/main/wine.csv'
df_wine = pd.read_csv(filepath_or_buffer=wine_url)
df_wine.head(n=5)

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,Target
0,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065,0
1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050,0
2,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185,0
3,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480,0
4,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735,0


In [None]:
# Distinct wine classes present in the target column.
wine_classes = df_wine['Target'].unique()
wine_classes

array([0, 1, 2])

In [None]:
# Every column except 'Target' is an input feature; 'Target' is the label.
target_column = 'Target'
yw = df_wine[target_column]
Xw = df_wine.drop(columns=[target_column])

In [None]:
# Quick look at the feature matrix (first three rows).
Xw.head(n=3)

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185


In [None]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scaling puts every feature on a comparable range, so that
# large-valued columns such as 'proline' do not dominate KNN distances.
# The scaler is fitted on the full feature matrix Xw.
scaler = MinMaxScaler()
scaler.fit(Xw)
Xw_scaled = scaler.transform(Xw)

In [None]:
# Inspect the scaled feature array (a NumPy array, not a DataFrame).
Xw_scaled

array([[0.84210526, 0.1916996 , 0.57219251, ..., 0.45528455, 0.97069597,
        0.56134094],
       [0.57105263, 0.2055336 , 0.4171123 , ..., 0.46341463, 0.78021978,
        0.55064194],
       [0.56052632, 0.3201581 , 0.70053476, ..., 0.44715447, 0.6959707 ,
        0.64693295],
       ...,
       [0.58947368, 0.69960474, 0.48128342, ..., 0.08943089, 0.10622711,
        0.39728959],
       [0.56315789, 0.36561265, 0.54010695, ..., 0.09756098, 0.12820513,
        0.40085592],
       [0.81578947, 0.66403162, 0.73796791, ..., 0.10569106, 0.12087912,
        0.20114123]])

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Split the RAW features first, then learn the scaling from the training
# rows only. Fitting the scaler on the full dataset before splitting leaks
# the test rows' min/max into training and inflates the test score.
# random_state makes the split reproducible; stratify keeps the class mix
# the same in both splits.
xtrain_raw, xtest_raw, ytrain, ytest = train_test_split(
    Xw, yw, train_size=0.75, random_state=42, stratify=yw
)

split_scaler = MinMaxScaler().fit(xtrain_raw)
xtrain = split_scaler.transform(xtrain_raw)
# Test values may fall slightly outside [0, 1]; that is expected, because the
# range was learned from the training rows alone.
xtest = split_scaler.transform(xtest_raw)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# KNN classifier for the wine data, left at scikit-learn's default settings.
modelB = KNeighborsClassifier()

# Fit on the training split; the fitted estimator is the cell's output.
modelB.fit(X=xtrain, y=ytrain)

In [None]:
# Accuracy of modelB: training split first, then the held-out test split.
for features, labels in ((xtrain, ytrain), (xtest, ytest)):
    print(modelB.score(features, labels))

0.9548872180451128
0.9777777777777777
