<a href="https://colab.research.google.com/github/asrafulasf72/Ml-Algorithms/blob/main/kNN_calsification_Algorithm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# CSE 412 – Lab Activity 2: KNN Classification using scikit-learn
# ---------------------------------------------------------------
# This script (step numbers match the numbered section headers below):
# 1) Loads the Iris data and prints basic info
# 2) Splits train/test (70/30), scales features, trains KNN for k = 1..15,
#    records test accuracy, and prints the confusion matrix &
#    classification report for the best k
# 3) LAB EX-1: Uses 10-fold CV to get the average accuracy for each k, picks the best
# 4) LAB EX-2: Tests different train/test ratios and reports which works best (on Iris)
# 5) LAB EX-3: Repeats the ratio experiment on a small self-created dataset
# 6) Predicts two hand-made samples (as in the lab manual)

import numpy as np
from sklearn.datasets import load_iris, make_classification
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# -----------------------------
# 1) Load dataset & basic info
# -----------------------------
iris = load_iris()
# X: feature matrix, shape (150, 4)
# y: integer labels, 0=setosa, 1=versicolor, 2=virginica
X, y = iris.data, iris.target
target_names = iris.target_names

# Quick sanity print of what was loaded (data shape is (150, 4)).
for heading, value in (
    ("Feature names:", iris.feature_names),
    ("Target names:", target_names),
    ("Data shape:", X.shape),
):
    print(heading, value)

# ---------------------------------------------------
# 2) Train/Test split (70/30) + scaling + try k=1..15
# ---------------------------------------------------
# Stratified hold-out split: 30% of the samples go to the test set and the
# class proportions of y are preserved in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=0, stratify=y
)

range_k = range(1, 16)
test_accuracies = {}   # k -> accuracy on the held-out test set

for k in range_k:
    # The scaler is part of the pipeline, so it is fitted on X_train only
    # and then applied unchanged to X_test.
    pipe = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=k))
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    test_accuracies[k] = acc

# Pick best k on this 70/30 split (if tie, choose smallest k).
# The maximum is computed once up front rather than once per candidate k.
# NOTE: k is chosen on the same test set it is scored on, so the "best"
# accuracy reported here is an optimistic estimate; see the CV section for
# a selection that does not depend on a single split.
best_acc_split = max(test_accuracies.values())
best_k_split = min(k for k, a in test_accuracies.items() if a == best_acc_split)

print("\n=== 70/30 split results ===")
for k in range_k:
    print(f"k={k:2d}  | Test Accuracy = {test_accuracies[k]:.4f}")
print(f"Best k on this split = {best_k_split} (Accuracy={test_accuracies[best_k_split]:.4f})")

# Refit a scaler + KNN pipeline using the k selected on the 70/30 split,
# then report per-class behaviour on the held-out test set.
best_pipe_split = make_pipeline(
    StandardScaler(),
    KNeighborsClassifier(n_neighbors=best_k_split),
).fit(X_train, y_train)
y_pred_best = best_pipe_split.predict(X_test)

# Each report is printed as a heading line followed by the report itself.
print(
    "\nConfusion Matrix (best k on 70/30 split):",
    confusion_matrix(y_test, y_pred_best),
    sep="\n",
)

print(
    "\nClassification Report (best k on 70/30 split):",
    classification_report(y_test, y_pred_best, target_names=target_names),
    sep="\n",
)

# ------------------------------------------------------------
# 3) LAB EX-1: 10-fold CV accuracy for each k (average of 10)
# ------------------------------------------------------------
print("\n=== LAB EX-1: 10-fold CV average accuracy per k ===")
# One shuffled, stratified 10-fold splitter with a fixed seed is shared by
# every k, so all candidates are scored on the same folds.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
cv_means = {}   # k -> mean accuracy over the 10 folds

for k in range_k:
    # Scaling happens inside the pipeline, i.e. it is re-fitted on the
    # training part of every fold.
    pipe = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=k))
    scores = cross_val_score(pipe, X, y, cv=cv, scoring='accuracy')
    cv_means[k] = scores.mean()
    print(f"k={k:2d}  | CV mean accuracy over 10 folds = {cv_means[k]:.4f}")

# Best k = smallest k that reaches the top mean accuracy. The maximum is
# computed once up front rather than once per candidate k.
best_cv_mean = max(cv_means.values())
best_k_cv = min(k for k, a in cv_means.items() if a == best_cv_mean)
print(f"Best k by 10-fold CV = {best_k_cv} (Avg Accuracy={cv_means[best_k_cv]:.4f})")

# -------------------------------------------------------------
# 4) LAB EX-2: Best train/test ratio on Iris (repeat 10 times)
# -------------------------------------------------------------
print("\n=== LAB EX-2: Find best train/test ratio on Iris ===")
# We'll evaluate test_size from 0.2 to 0.5
ratios = [0.20, 0.30, 0.40, 0.50]
ratio_results = {}   # test_size -> (mean accuracy, std of accuracy) over the repeats

# Use the best k from CV for a fairer estimate
for r in ratios:
    accuracies = []
    for seed in range(10):  # 10 repeats, each with a different split seed
        X_tr, X_te, y_tr, y_te = train_test_split(
            X, y, test_size=r, random_state=seed, stratify=y
        )
        pipe = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=best_k_cv))
        pipe.fit(X_tr, y_tr)
        y_hat = pipe.predict(X_te)
        accuracies.append(accuracy_score(y_te, y_hat))
    # np.std uses its default ddof=0 here (population standard deviation).
    ratio_results[r] = (np.mean(accuracies), np.std(accuracies))

for r in ratios:
    mean_acc, std_acc = ratio_results[r]
    # round() instead of int(): int() truncates, and float products such as
    # 0.29 * 100 == 28.999999999999996 would be printed as 28%.
    print(f"Test size={round(r * 100)}% | Mean Acc over 10 runs = {mean_acc:.4f} ± {std_acc:.4f}")

# Highest mean accuracy wins; on a tie max() keeps the first entry, i.e.
# the smallest test size in `ratios`.
best_ratio = max(ratio_results, key=lambda rr: ratio_results[rr][0])
print(f"Best test ratio on Iris (by mean acc) = {round(best_ratio * 100)}% "
      f"(Mean Acc={ratio_results[best_ratio][0]:.4f})")

# --------------------------------------------------------------------
# 5) LAB EX-3: Create a small synthetic dataset & repeat ratio test
# --------------------------------------------------------------------
print("\n=== LAB EX-3: Self-created dataset & best ratio ===")
# 300 samples, 4 features (3 informative, 0 redundant), 3 classes; the
# fixed random_state makes the generated data reproducible.
Xs, ys = make_classification(
    n_samples=300, n_features=4, n_informative=3, n_redundant=0,
    n_classes=3, class_sep=1.5, random_state=0
)

# Same protocol as the Iris ratio experiment: for each test size in
# `ratios`, average the accuracy over 10 differently-seeded stratified splits.
# NOTE: best_k_cv was tuned on Iris; it is reused here, not re-tuned for
# the synthetic data.
ratio_results_synth = {}   # test_size -> (mean accuracy, std of accuracy)
for r in ratios:
    accuracies = []
    for seed in range(10):  # 10 repeats
        X_tr, X_te, y_tr, y_te = train_test_split(
            Xs, ys, test_size=r, random_state=seed, stratify=ys
        )
        pipe = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=best_k_cv))
        pipe.fit(X_tr, y_tr)
        y_hat = pipe.predict(X_te)
        accuracies.append(accuracy_score(y_te, y_hat))
    ratio_results_synth[r] = (np.mean(accuracies), np.std(accuracies))

for r in ratios:
    mean_acc, std_acc = ratio_results_synth[r]
    # round() instead of int(): int() truncates, and float products such as
    # 0.29 * 100 == 28.999999999999996 would be printed as 28%.
    print(f"[Synthetic] Test size={round(r * 100)}% | Mean Acc over 10 runs = {mean_acc:.4f} ± {std_acc:.4f}")

# Highest mean accuracy wins; on a tie max() keeps the first (smallest) test size.
best_ratio_synth = max(ratio_results_synth, key=lambda rr: ratio_results_synth[rr][0])
print(f"Best test ratio on synthetic data = {round(best_ratio_synth * 100)}% "
      f"(Mean Acc={ratio_results_synth[best_ratio_synth][0]:.4f})")

# ----------------------------------------------------
# 6) Predict hand-made samples (from the lab manual)
# ----------------------------------------------------
print("\n=== Predict hand-made samples ===")
# Train on all Iris data with best k from CV (more data = usually better)
final_pipe = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=best_k_cv))
final_pipe.fit(X, y)

# Label -> name map derived from the dataset's own target_names (the same
# source the classification report uses) instead of a hand-typed copy that
# could drift from the data.
classes = dict(enumerate(target_names))
# Two raw (unscaled) samples; columns follow iris.feature_names. The
# pipeline applies the fitted scaler before the KNN lookup.
x_new = np.array([[1, 1, 1, 1], [4, 3, 1.3, 0.2]])
y_new = final_pipe.predict(x_new)
for i, pred in enumerate(y_new):
    print(f"x_new[{i}] -> class {pred} -> {classes[pred]}")
