## KNN from scratch

In [1]:
from math import sqrt
import pandas as pd

In [2]:
# Load the Iris dataset; the relative path is resolved against the notebook's working directory.
iris_df = pd.read_csv('Iris.csv')

In [3]:
# Preview the first rows to confirm the expected columns were loaded.
iris_df.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [4]:
# Convert the DataFrame into plain-Python rows of the form
# [sepal_length, sepal_width, petal_length, petal_width, species],
# which is the layout the from-scratch KNN helpers expect (label last).
# The features are converted column-wise in one go instead of looping with
# iterrows(), which builds a Series per row and, with `for _, row in ...`,
# also rebinds IPython's special `_` variable in the notebook namespace.
feature_columns = ["SepalLengthCm", "SepalWidthCm", "PetalLengthCm", "PetalWidthCm"]
feature_rows = iris_df[feature_columns].astype(float).to_numpy().tolist()
dataset = [
    features + [label]
    for features, label in zip(feature_rows, iris_df["Species"])
]

In [None]:
# k-nearest-neighbours helpers: distance, neighbor lookup, and majority-vote prediction
from math import sqrt

# calculate the Euclidean distance between two vectors
def euclidean_distance(row1, row2):
	"""Return the Euclidean distance between two rows.

	The last position of ``row1`` is skipped (rows in this notebook keep
	their class label, or a placeholder, there), so only the first
	``len(row1) - 1`` entries of each row are compared.
	"""
	squared_total = 0.0
	for idx, value in enumerate(row1[:-1]):
		delta = value - row2[idx]
		squared_total += delta ** 2
	return sqrt(squared_total)
 
# Locate the most similar neighbors
def get_neighbors(train, test_row, num_neighbors):
	"""Return the ``num_neighbors`` rows of ``train`` closest to ``test_row``.

	Args:
		train: iterable of training rows; each one is compared to
			``test_row`` with ``euclidean_distance``.
		test_row: the row whose neighbors are wanted.
		num_neighbors: how many rows to return. If it exceeds the number
			of training rows, all rows are returned instead of raising
			``IndexError``; zero or a negative value yields an empty list.

	Returns:
		A list of training rows ordered from nearest to farthest. Rows at
		equal distance keep their relative order from ``train``.
	"""
	# sorted() is stable and evaluates the key once per row, so the ordering
	# matches an explicit (row, distance) list sorted by distance.
	ranked = sorted(train, key=lambda train_row: euclidean_distance(test_row, train_row))
	# Clamp at 0 so a negative count gives [] rather than "all but the last k".
	return ranked[:max(num_neighbors, 0)]

# Make a classification prediction with neighbors
def predict_classification(train, test_row, num_neighbors):
	"""Predict the class of ``test_row`` by majority vote of its nearest neighbors.

	Args:
		train: training rows whose last element is the class label.
		test_row: the row to classify.
		num_neighbors: number of nearest neighbors that take part in the vote.

	Returns:
		The label that occurs most often among the neighbors. When several
		labels are tied, the one belonging to the closest neighbor wins.

	Raises:
		ValueError: if there are no neighbors to vote (e.g. empty ``train``).
	"""
	neighbors = get_neighbors(train, test_row, num_neighbors)
	output_values = [row[-1] for row in neighbors]
	# Vote over the distance-ordered list rather than a set: max() returns the
	# first maximal item, so ties resolve to the nearest neighbor's label
	# instead of depending on set iteration order, which for string labels
	# can change between interpreter runs.
	return max(output_values, key=output_values.count)

In [6]:
# Query sample laid out like a dataset row: four features plus a trailing
# label slot. The label is unknown, so None fills the slot; the distance
# function skips the last position of the query row and never reads it.
test_row = [5.7, 2.9, 4.2, 1.3, None]   # SepalLength, SepalWidth, PetalLength, PetalWidth
prediction = predict_classification(dataset, test_row, num_neighbors=5)
# Print only the features (the placeholder label is sliced off).
print('Data=%s, Predicted: %s' % (test_row[:-1], prediction))

Data=[5.7, 2.9, 4.2, 1.3], Predicted: Iris-versicolor


## KNN with scikit-learn

In [7]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [8]:
# Load the same CSV into a separate frame for the scikit-learn workflow.
df = pd.read_csv("Iris.csv")

In [None]:
# Encode the label column: replace the species names with integer codes.
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
# Overwrites the string column in place. The fitted `encoder` stays available,
# so codes can be mapped back to names later (e.g. encoder.inverse_transform).
df["Species"] = encoder.fit_transform(df["Species"])
# Display the encoded column as a one-column DataFrame.
df[["Species"]]

Unnamed: 0,Species
0,0
1,0
2,0
3,0
4,0
...,...
145,2
146,2
147,2
148,2


In [10]:
# Inspect the column names of the loaded frame.
df.columns

Index(['Id', 'SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm',
       'Species'],
      dtype='object')

In [11]:
# Scaling the data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
columns_to_scale = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']

# Fit a single scaler on all feature columns at once. StandardScaler
# standardizes each column independently, so the values match a per-column
# loop, but `scaler` now holds the statistics of all four features and can be
# reused to transform new samples (a loop leaves it fitted on the last column only).
# NOTE: the scaler is fitted on the full dataset, before the train/test split
# performed later in the notebook, so test-set statistics influence the
# scaling; for an unbiased evaluation, fit it on the training split only.
df[columns_to_scale] = scaler.fit_transform(df[columns_to_scale])

df.head(5)

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,-0.900681,1.032057,-1.341272,-1.312977,0
1,2,-1.143017,-0.124958,-1.341272,-1.312977,0
2,3,-1.385353,0.337848,-1.398138,-1.312977,0
3,4,-1.506521,0.106445,-1.284407,-1.312977,0
4,5,-1.021849,1.26346,-1.341272,-1.312977,0


In [None]:
from sklearn.model_selection import train_test_split
# Splitting the data
# Features: every column except the target ("Species") and the "Id" column
# (a running row number in the preview above, so it is not used as a feature).
X = df.drop(columns=["Species", "Id"])
y = df["Species"]

# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Show the shapes of the four splits as a sanity check.
display(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(120, 4)

(30, 4)

(120,)

(30,)

In [None]:
from sklearn.neighbors import KNeighborsClassifier as KNN

# k-nearest-neighbours classifier that looks at the 3 nearest training rows.
model = KNN(n_neighbors=3)
# fitting (training)
model.fit(X_train, y_train)

# predicting the class codes of the held-out test rows
y_pred = model.predict(X_test)
# Last expression of the cell: display the predicted codes.
y_pred

array([1, 0, 2, 1, 1, 0, 1, 2, 1, 1, 2, 0, 0, 0, 0, 1, 2, 1, 1, 2, 0, 2,
       0, 2, 2, 2, 2, 2, 0, 0])

In [14]:
# Side-by-side table of predicted vs. true class codes for the test rows.
pd.DataFrame({"Predictions" :y_pred,"True values": y_test})

Unnamed: 0,Predictions,True values
73,1,1
18,0,0
118,2,2
78,1,1
76,1,1
31,0,0
64,1,1
141,2,2
68,1,1
82,1,1


In [None]:
from sklearn.metrics import accuracy_score
# Score the predictions against the true labels of the held-out test set.
accuracy_score(y_test, y_pred)

1.0