# DATA PREPARATION

In [1]:
from collections import Counter

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
# Seed NumPy's global generator; the splits below are additionally pinned
# through their own random_state argument.
np.random.seed(42)

# Read the CSV and report its dimensions.
df = pd.read_csv('Iris.csv')
print(f"Dataset shape: {df.shape}")

# Features are the four columns at positions 1-4 (selected by position, not
# by name); the target is the 'Species' column.
X = df.iloc[:, 1:5]
y = df['Species']

# Turn the class names into integer codes.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# First split: keep 70% for training, hold out 30%, stratified by class.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y_encoded,
    test_size=0.3,
    stratify=y_encoded,
    random_state=42,
)

# Second split: divide the held-out part in half (validation / test),
# again stratified by class.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    stratify=y_temp,
    random_state=42,
)

# Each *_set is a 2-D array whose last column is the encoded label.
Train_set = np.column_stack((X_train, y_train))
Val_set = np.column_stack((X_val, y_val))
Test_set = np.column_stack((X_test, y_test))

# Report the size of every split, absolute and relative to the full table.
total_rows = len(df)
for split_name, split_rows in (
    ("Training", Train_set),
    ("Validation", Val_set),
    ("Test", Test_set),
):
    share = len(split_rows) / total_rows * 100
    print(f"{split_name}: {len(split_rows)} samples ({share:.1f}%)")

Dataset shape: (150, 6)
Training: 105 samples (70.0%)
Validation: 22 samples (14.7%)
Test: 23 samples (15.3%)


# KNN CLASSIFICATION

In [2]:
def knn_classify(train_set, sample, k=5):
    """Predict the class of ``sample`` by majority vote of its k nearest neighbours.

    Parameters
    ----------
    train_set : 2-D array-like
        One row per training example. The last column is the class label and
        every preceding column is a numeric feature.
    sample : 1-D array-like
        Laid out like a training row. Its last element is ignored, so a
        labelled validation/test row can be passed as-is.
    k : int, default 5
        Number of neighbours that vote. A value larger than the number of
        training rows is allowed and simply lets every row vote.

    Returns
    -------
    The label that occurs most often among the k training rows closest to
    ``sample`` (Euclidean distance over the feature columns). If several
    labels are tied, the tied label that appears first when walking the
    neighbours from nearest to farthest is returned.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1, or ``train_set`` is not a non-empty
        2-D array.
    """
    # A non-positive k would either select nothing (k == 0) or silently drop
    # neighbours from the far end of the ranking (negative slice bound).
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k!r}")

    train = np.asarray(train_set)
    if train.ndim != 2 or train.shape[0] == 0:
        raise ValueError("train_set must be a non-empty 2-D array of rows")

    features = train[:, :-1]
    labels = train[:, -1]
    query = np.asarray(sample)[:-1]

    # Euclidean distance from the query to every training row at once.
    distances = np.sqrt(np.sum((query - features) ** 2, axis=1))

    # A stable sort keeps equidistant rows in training-set order, so the
    # neighbour ranking (and therefore tie-breaking) is deterministic.
    nearest = np.argsort(distances, kind="stable")[:k]

    # Counter lists equally common labels in first-seen order, so a tie goes
    # to the label of the closest tied neighbour.
    votes = Counter(labels[nearest])
    return votes.most_common(1)[0][0]

def calculate_accuracy(train_set, val_set, k=5):
    """Return the percentage of rows in ``val_set`` that k-NN labels correctly.

    Parameters
    ----------
    train_set : 2-D array-like
        Labelled training rows, passed unchanged to ``knn_classify``.
    val_set : sequence of rows
        Rows to evaluate. The last element of each row is taken as its true
        label and compared with the prediction for that row.
    k : int, default 5
        Number of neighbours, forwarded to ``knn_classify``.

    Returns
    -------
    float
        Share of correctly classified rows, on a 0-100 scale.

    Raises
    ------
    ValueError
        If ``val_set`` is empty (accuracy would be a division by zero).
    """
    total = len(val_set)
    if total == 0:
        raise ValueError("val_set must contain at least one sample")

    # Count the rows whose predicted label equals the label stored in the row.
    correct = sum(
        1 for row in val_set if knn_classify(train_set, row, k) == row[-1]
    )
    return (correct / total) * 100

# KNN CLASSIFICATION RESULTS

In [11]:
# Candidate neighbourhood sizes to compare.
k_values = [1, 3, 5, 10, 15]

# Validation accuracy (in percent) for every candidate, keyed by K.
validation_accuracies = {
    candidate: calculate_accuracy(Train_set, Val_set, candidate)
    for candidate in k_values
}

# Choose the K with the highest validation accuracy; when several share the
# maximum, max() returns the first such key in insertion order.
best_k = max(validation_accuracies, key=validation_accuracies.get)

# The test split is scored once, using only the selected K.
test_accuracy = calculate_accuracy(Train_set, Test_set, best_k)

# Tabulate the validation results, then report the final test score.
print(f"K values | Validation Accuracy (%)")
print("-" * 40)
for k in k_values:
    print(f"{k} \t |\t   {validation_accuracies[k]:.2f}")

print(f"\nTest Accuracy: {test_accuracy:.2f}% with K={best_k}")

K values | Validation Accuracy (%)
----------------------------------------
1 	 |	   90.91
3 	 |	   90.91
5 	 |	   95.45
10 	 |	   90.91
15 	 |	   90.91

Test Accuracy: 100.00% with K=5
