In [1]:
# ================================================================
# Project: Iris Flower Classification using K-Nearest Neighbors (KNN)
# ================================================================
"""
Problem Statement:
------------------
The Iris dataset contains 150 samples of iris flowers from three species:
Setosa, Versicolor, and Virginica. Each sample has four features:
- sepal length
- sepal width
- petal length
- petal width

The goal is to build a machine learning model that can accurately
predict the species of an iris flower given its measurements.

Methodology:
------------
We are using the K-Nearest Neighbors (KNN) algorithm:

1. KNN is a **lazy learning algorithm**, meaning it does not build a model
   during training. Instead, it stores all training samples.

2. During **training**:
   - The algorithm simply memorizes all training samples and their labels.

3. During **testing/prediction**:
   - For a new sample, KNN calculates the distance from this sample
     to all stored training samples.
   - It finds the K closest samples (neighbors).
   - The predicted class is determined by **majority vote** among these neighbors.

Key Points:
-----------
- KNN is distance-based, so **feature scaling** is important.
- KNN can predict **one sample at a time** or **multiple samples at once**.
- Evaluation metrics include **accuracy**, **precision**, **recall**, **F1-score**, and **confusion matrix**.
"""

# =======================
# 1️⃣ Import Libraries
# =======================
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# =======================
# 2️⃣ Load Dataset
# =======================
# Pull the feature matrix and the label vector out of the bundled dataset.
#   X columns: sepal length, sepal width, petal length, petal width
#   y values:  0=Setosa, 1=Versicolor, 2=Virginica
iris = load_iris()
X, y = iris.data, iris.target

# The labels above are already integers, so no encoding step is applied here.
# A LabelEncoder would only be needed if the labels were categorical strings.

# =======================
# 3️⃣ Split Dataset
# =======================
# Hold out 20% of the samples for evaluation; the remaining 80% form the
# reference set that KNN stores. random_state pins the shuffle so the split
# is reproducible from run to run.
# NOTE(review): the split is not stratified, so per-class counts in the test
# set can be uneven — consider stratify=y if balanced evaluation matters.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# =======================
# 4️⃣ Feature Scaling
# =======================
# KNN compares samples by distance, so features are standardized to keep any
# single measurement from dominating. The scaler is fitted on the training
# split only, and the test split is transformed with that same fitted scaler
# so no test-set information leaks into preprocessing.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# =======================
# 5️⃣ Initialize KNN Classifier
# =======================
# n_neighbors is K: how many of the closest training samples get a vote.
# This example uses K=3.
knn = KNeighborsClassifier(n_neighbors=3)

# =======================
# 6️⃣ Train the Model
# =======================
# "Training" KNN means memorizing the training samples and their labels;
# no parameters are optimized. (scikit-learn may additionally build a
# neighbor-search index here, depending on the `algorithm` setting.)
knn.fit(X_train, y_train)

# =======================
# 7️⃣ Predict on Test Set
# =======================
# Each test sample is classified by:
#   1. measuring its distance to every stored training sample,
#   2. selecting the K nearest ones,
#   3. assigning the class that wins the majority vote among them.
y_pred = knn.predict(X_test)

# =======================
# 8️⃣ Evaluate Model
# =======================
# Accuracy: share of test samples whose predicted label equals the true label
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy:.2f}\n")

# Classification report: per-class precision, recall and F1-score
# Precision: of the samples predicted as a class, how many truly belong to it
#            (read down a column of the confusion matrix)
# Recall:    of the samples that truly belong to a class, how many were found
#            (read across a row of the confusion matrix)
# NOTE(review): classes are reported by numeric label; passing
# target_names=iris.target_names would show species names instead.
print(
    "Classification Report:\n",
    classification_report(y_true=y_test, y_pred=y_pred),
)

# Confusion matrix: counts of every (actual, predicted) label pair
# Rows are the actual class, columns are the predicted class,
# so the diagonal holds the correct predictions.
print(
    "Confusion Matrix:\n",
    confusion_matrix(y_true=y_test, y_pred=y_pred),
)

# =======================
# 9️⃣ Predict a Single Sample
# =======================
# Classify one new flower. The measurements are shaped into a single row
# (1 sample x 4 features) and passed through the already-fitted scaler
# before they reach the classifier.
sample = np.array([5.1, 3.5, 1.4, 0.2]).reshape(1, -1)
sample_scaled = scaler.transform(sample)
pred_class = knn.predict(sample_scaled)
# pred_class holds exactly one label; use it to look up the species name
print("\nPredicted Class for Sample:", iris.target_names[pred_class[0]])

# =======================
#  🔹 Optional: Predict Multiple Samples
# =======================
# Several flowers can be classified in one call: one row per flower,
# scaled with the same fitted scaler as the training data.
samples = np.array(
    [
        [5.1, 3.5, 1.4, 0.2],
        [6.0, 2.9, 4.5, 1.5],
        [6.9, 3.1, 5.4, 2.1],
    ]
)
samples_scaled = scaler.transform(samples)
pred_classes = knn.predict(samples_scaled)
# Index the species-name array with the predicted labels to get one name per row
print(
    "\nPredicted Classes for Multiple Samples:",
    iris.target_names[pred_classes],
)


Accuracy: 1.00

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00         9
           2       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30

Confusion Matrix:
 [[10  0  0]
 [ 0  9  0]
 [ 0  0 11]]

Predicted Class for Sample: setosa

Predicted Classes for Multiple Samples: ['setosa' 'versicolor' 'virginica']
