In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input directory and print the full path of every file found.
for folder, _subdirs, files_in_folder in os.walk('/kaggle/input'):
    for file_name in files_in_folder:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/iris-dataset/iris.csv


Loading the Iris dataset

In [2]:
import pandas as pd
import numpy as np

# Load the Iris CSV from the Kaggle input directory into a DataFrame.
df = pd.read_csv('/kaggle/input/iris-dataset/iris.csv')

# Quick look at the data: first rows, overall shape, and number of rows per species.
display(df.head())
print(df.shape)
print(df['species'].value_counts())

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


(150, 5)
species
setosa        50
versicolor    50
virginica     50
Name: count, dtype: int64


BASIC EDA to ensure there are no missing values in the data

In [3]:
# Number of missing values in each column (all zeros means nothing needs imputing).
print(df.isna().sum())
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
display(df.describe())

sepal_length    0
sepal_width     0
petal_length    0
petal_width     0
species         0
dtype: int64


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.054,3.758667,1.198667
std,0.828066,0.433594,1.76442,0.763161
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


Assigning numbers to labels so our code can work with them

In [4]:
# Integer code for each species name, plus the inverse lookup (code -> name).
assign_nums = {'setosa': 0, 'versicolor': 1, 'virginica': 2}
reverse_assign = {code: name for name, code in assign_nums.items()}

# Feature matrix: every column except the label, converted to floats -> shape (150, 4).
x = df.drop(columns = ['species']).to_numpy(dtype = float)
# Target vector: species names replaced by their integer codes.
y = df['species'].map(assign_nums).to_numpy()

# Sanity check: which codes occur and how many samples carry each one.
labels_seen, label_counts = np.unique(y, return_counts = True)
print((labels_seen, label_counts))

(array([0, 1, 2]), array([50, 50, 50]))


TRAIN / VALIDATION / TEST

In [5]:
# Split: 60% train, 20% validation, 20% test.
# The validation set lets us pick the best hyperparameter without touching the test set.

# Shuffle row *indices* with a fixed seed and index with them, instead of
# overwriting x and y. The source arrays stay untouched, so re-running this
# cell always reproduces exactly the same split (re-shuffling data that was
# already shuffled in place would produce a different one each time).
rng = np.random.default_rng(50)
idx = np.arange(len(x))
rng.shuffle(idx)

n = len(x)
n_train = int(0.6 * n)  # 60% train
n_valid = int(0.2 * n)  # 20% validation; whatever remains becomes the test set

train_idx = idx[:n_train]
valid_idx = idx[n_train: n_train + n_valid]
test_idx = idx[n_train + n_valid:]

x_train, y_train = x[train_idx], y[train_idx]
x_valid, y_valid = x[valid_idx], y[valid_idx]
x_test, y_test = x[test_idx], y[test_idx]

# The three parts must cover every sample exactly once.
assert len(x_train) + len(x_valid) + len(x_test) == n

x_train.shape, x_valid.shape, x_test.shape

((90, 4), (30, 4), (30, 4))

STANDARDIZE FEATURES

In [6]:
# Standardisation statistics are estimated on the TRAINING split only.
# Taking the mean over the full dataset would leak information from the
# validation and test rows into the preprocessing step.
mu = x_train.mean(axis = 0)
sigma = x_train.std(axis = 0, ddof = 0)
# Guard against division by zero: a feature that is constant in the training
# split has std 0, so it is scaled by 1.0 instead.
sigma[sigma == 0] = 1.0

def standardize(x, mu, sigma):
    """Return `x` shifted by `mu` and then divided by `sigma` (element-wise)."""
    centered = x - mu
    return centered / sigma

# Apply the same mu/sigma to every split so that all three live on one common
# scale; this keeps distances between points comparable across features.
x_train_s, x_valid_s, x_test_s = (
    standardize(split, mu, sigma) for split in (x_train, x_valid, x_test)
)

KNN Implementation

1) Distance Function

In [7]:
import numpy as np
from collections import Counter

def euclidean_distance(a, b):
    """Return the Euclidean (L2) distance between `a` and `b`.

    The squared element-wise differences are summed over all elements and the
    square root of that sum is returned.
    """
    difference = a - b
    squared_total = np.sum(difference ** 2)
    return np.sqrt(squared_total)

Single point prediction

In [8]:
def knn_single_prediction(x_train, y_train, z, k = 5):
    """Predict the label of a single query point by k-nearest-neighbour majority vote.

    Parameters
    ----------
    x_train : iterable of feature vectors
        Training points.
    y_train : iterable of labels
        Labels aligned with `x_train` (paired up with `zip`).
    z : feature vector
        The query point to classify.
    k : int, default 5
        Number of nearest neighbours that vote. Must be at least 1. If it
        exceeds the number of training points, every training point votes.

    Returns
    -------
    The most common label among the k nearest training points. When several
    labels share the top count, the one whose first voter is closest to `z`
    wins (the sort is stable and `Counter.most_common` keeps first-seen order).

    Raises
    ------
    ValueError
        If `k` is smaller than 1 or there are no training points.
    """
    if k < 1:
        # k == 0 leaves no voters, and a negative k would make the slice below
        # drop the *farthest* points instead of selecting the nearest ones.
        raise ValueError(f"k must be a positive integer, got {k}")

    # Distance from the query to every training point, paired with that point's label.
    distances = [(euclidean_distance(z, point), label) for point, label in zip(x_train, y_train)]
    if not distances:
        raise ValueError("x_train must contain at least one training point")

    # Nearest first.
    distances.sort(key=lambda pair: pair[0])

    # Labels of the k nearest neighbours.
    k_label = [label for _, label in distances[:k]]

    # Majority vote.
    return Counter(k_label).most_common(1)[0][0]
    

Now that we have a function that predicts the label of a single point, let's call it in a loop to predict labels for multiple points

In [9]:
def knn_prediction(x_train, y_train, x, k = 5):
    """Predict a label for every row of `x` with `knn_single_prediction`.

    Returns the predictions as a NumPy array, in the same order as the rows of `x`.
    """
    predictions = []
    for query_point in x:
        predictions.append(knn_single_prediction(x_train, y_train, query_point, k))
    return np.array(predictions)

Accuracy function

In [10]:
def accuracy(y_true, y_predicted):
    """Return the fraction of positions where `y_true` and `y_predicted` agree.

    Both arguments may be NumPy arrays or any array-like (e.g. plain lists);
    they are converted with `np.asarray` before comparison.

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape. Without this check NumPy
        would broadcast mismatched inputs and return a misleading score.
    """
    y_true = np.asarray(y_true)
    y_predicted = np.asarray(y_predicted)

    if y_true.shape != y_predicted.shape:
        raise ValueError(
            f"shape mismatch: y_true {y_true.shape} vs y_predicted {y_predicted.shape}"
        )

    return (y_true == y_predicted).mean()

Validation set test

In [11]:
# Candidate neighbourhood sizes to compare on the validation split.
candidates = [1, 3, 5, 7, 9, 11, 13, 15]
scores_by_k = {}

for k in candidates:
    # Fit on the training split, score on the validation split.
    acc = accuracy(y_valid, knn_prediction(x_train_s, y_train, x_valid_s, k = k))
    scores_by_k[k] = acc

    print(f"k={k}, validation accuracy={acc:.3f}")

# max() returns the first maximal candidate, so ties go to the smallest k.
best_k = max(candidates, key = scores_by_k.get)
best_acc = scores_by_k[best_k]

print("Best k found: ", best_k, "with accuracy: ", round(best_acc, 3))

k=1, validation accuracy=0.933
k=3, validation accuracy=0.933
k=5, validation accuracy=0.967
k=7, validation accuracy=1.000
k=9, validation accuracy=1.000
k=11, validation accuracy=1.000
k=13, validation accuracy=1.000
k=15, validation accuracy=0.967
Best k found:  7 with accuracy:  1.0


Evaluation on test set

In [12]:
# Score the held-out test split, using the k that was selected on the validation split.
yt_pred = knn_prediction(x_train_s, y_train, x_test_s, k = best_k)
test_acc = accuracy(y_test, yt_pred)

print("Final test accuracy: ", round(test_acc, 3))

Final test accuracy:  0.967


More Evaluation

In [13]:
def confusion_matrix(y_true, y_predicted, classes = 3):
    """Count (true label, predicted label) pairs in a `classes` x `classes` integer matrix.

    Entry [t, p] is the number of samples whose true label is `t` and whose
    predicted label is `p`, so rows are true classes and columns are predictions.
    """
    counts = np.zeros((classes, classes), dtype = int)

    for actual, guessed in zip(y_true, y_predicted):
        counts[actual, guessed] = counts[actual, guessed] + 1

    return counts

# Confusion matrix of the test-set predictions: rows are true classes, columns are predicted classes.
cm = confusion_matrix(y_test, yt_pred, classes = 3)

print("Confusion Matrix: \n", cm)

Confusion Matrix: 
 [[ 8  0  0]
 [ 0  9  0]
 [ 0  1 12]]


Precision, Recall and F1-score

In [14]:
def precision_recall_f1(cm):
    """Compute per-class precision, recall and F1 from a square confusion matrix.

    Parameters
    ----------
    cm : 2-D NumPy array
        Confusion matrix whose rows are true classes and columns are
        predicted classes.

    Returns
    -------
    dict
        Maps each class index to a dict with keys 'precision', 'recall' and
        'f1'. Values are built-in floats rounded to 3 decimals. A metric whose
        denominator is zero is reported as 0.0.
    """
    metrics = {}

    for i in range(cm.shape[0]):
        TP = cm[i, i]
        FP = cm[:, i].sum() - TP  # predicted as class i, but actually another class
        FN = cm[i, :].sum() - TP  # actually class i, but predicted as another class

        precision = TP / (TP + FP) if TP + FP > 0 else 0.0
        recall = TP / (TP + FN) if TP + FN > 0 else 0.0
        f1 = 2 * (precision * recall) / (precision + recall) if precision + recall > 0 else 0.0

        # Convert to built-in floats so the dict prints as plain numbers
        # regardless of how the installed NumPy version represents its scalars.
        metrics[i] = {
            'precision': round(float(precision), 3),
            'recall': round(float(recall), 3),
            'f1': round(float(f1), 3)
        }

    return metrics

Macro and micro averages

In [15]:
def average_scores(metrics, cm):
    """Compute macro- and micro-averaged precision, recall and F1.

    Parameters
    ----------
    metrics : dict
        Per-class metrics: each value is a dict with keys 'precision',
        'recall' and 'f1'. The macro averages are the plain mean of these
        values exactly as given (so any rounding already applied to them
        carries over).
    cm : 2-D NumPy array
        Confusion matrix (rows = true classes, columns = predicted classes)
        used for the micro averages, which pool TP/FP/FN over all classes.

    Returns
    -------
    dict
        Two entries, "Macro averages " and "Micro averages ", each a dict with
        keys 'precision', 'recall' and 'f1-score'. Values are built-in floats
        rounded to 3 decimals. A micro metric whose denominator is zero is
        reported as 0.0, matching the per-class convention.
    """
    def _macro(key):
        # Unweighted mean of one per-class metric.
        return round(float(np.mean([v[key] for v in metrics.values()])), 3)

    # Pooled counts over all classes.
    TP = np.trace(cm)
    FP = (cm.sum(axis=0) - np.diag(cm)).sum()
    FN = (cm.sum(axis=1) - np.diag(cm)).sum()

    # Guard every division so an all-zero matrix gives 0.0 instead of nan.
    micro_precision = TP / (TP + FP) if TP + FP > 0 else 0.0
    micro_recall = TP / (TP + FN) if TP + FN > 0 else 0.0
    if micro_precision + micro_recall > 0:
        micro_f1 = 2 * micro_precision * micro_recall / (micro_precision + micro_recall)
    else:
        micro_f1 = 0.0

    return {
        "Macro averages ": {
            "precision": _macro("precision"),
            "recall": _macro("recall"),
            "f1-score": _macro("f1")
        },

        "Micro averages ": {
            'precision': round(float(micro_precision), 3),
            'recall': round(float(micro_recall), 3),
            'f1-score': round(float(micro_f1), 3)
        }
    }

# Per-class metrics from the test confusion matrix, then their macro/micro aggregates.
metrics = precision_recall_f1(cm)
avg_scores = average_scores(metrics, cm)
print("Average scores: ", avg_scores)

Average scores:  {'Macro averages ': {'precision': 0.967, 'recall': 0.974, 'f1-score': 0.969}, 'Micro averages ': {'precision': 0.967, 'recall': 0.967, 'f1-score': 0.967}}


Let's map our labels 0, 1 and 2 back to the flower classes

In [16]:
# Display names keyed by integer label: the species names from the label
# encoding step, with the first letter capitalised.
class_names = {code: species.capitalize() for code, species in reverse_assign.items()}

# One line per class: display name followed by its metric dict.
for label, vals in metrics.items():
    print(f"{class_names[label]}: {vals}")

Setosa: {'precision': 1.0, 'recall': 1.0, 'f1': 1.0}
Versicolor: {'precision': 0.9, 'recall': 1.0, 'f1': 0.947}
Virginica: {'precision': 1.0, 'recall': 0.923, 'f1': 0.96}


Now let's put everything we have classified and calculated so far into a single report

In [17]:
def classification_report(cm, avg_scores, metrics, class_names):
    """Print the confusion matrix, the per-class metrics and the overall averages.

    `metrics` maps class labels to their metric dicts and `class_names` maps
    the same labels to display names. Nothing is returned.
    """
    print("Confusion Matrix:\n", cm, "\n")

    print("Per-class Metrics:")
    per_class_lines = (f"{class_names[label]} -> {vals}" for label, vals in metrics.items())
    for line in per_class_lines:
        print(line)

    print("\nOverall Averages:")
    print(avg_scores)

# Print the full report for the test-set results computed in the cells above.
classification_report(cm, avg_scores, metrics, class_names)

Confusion Matrix:
 [[ 8  0  0]
 [ 0  9  0]
 [ 0  1 12]] 

Per-class Metrics:
Setosa -> {'precision': 1.0, 'recall': 1.0, 'f1': 1.0}
Versicolor -> {'precision': 0.9, 'recall': 1.0, 'f1': 0.947}
Virginica -> {'precision': 1.0, 'recall': 0.923, 'f1': 0.96}

Overall Averages:
{'Macro averages ': {'precision': 0.967, 'recall': 0.974, 'f1-score': 0.969}, 'Micro averages ': {'precision': 0.967, 'recall': 0.967, 'f1-score': 0.967}}
