# In [None]:
# =================================================================================
# Import Libraries (All Levels)
# =================================================================================
import matplotlib.pyplot as plt  # Visualization
import numpy as np  # Numerical computing
import pandas as pd  # Data manipulation
from matplotlib.colors import ListedColormap  # Custom color maps for accessibility
from sklearn import datasets  # Built-in datasets
from sklearn.metrics import accuracy_score, classification_report  # Evaluation metrics
from sklearn.model_selection import train_test_split, cross_val_score  # Data splitting/validation
from sklearn.naive_bayes import GaussianNB  # Naive Bayes algorithm
from sklearn.neighbors import KNeighborsClassifier  # KNN algorithm
from sklearn.pipeline import make_pipeline  # Chains preprocessing + model into one estimator
from sklearn.preprocessing import StandardScaler  # Feature scaling

# =================================================================================
# 1. Load Data (Novice-Friendly)
# =================================================================================
# [Novice]: Iris is a small, classic dataset: 150 flowers from 3 species, each
# described by four measurements. It is a gentle starting point for classification.
iris = datasets.load_iris()
# X holds the features (sepal length/width, petal length/width);
# y holds the integer labels (0=setosa, 1=versicolor, 2=virginica).
X, y = iris.data, iris.target

# =================================================================================
# 2. Split Data (Intermediate)
# =================================================================================
# [Intermediate]: Hold out 20% of the samples for testing. Stratifying on the labels
# keeps the class proportions the same in the train and test sets, and the fixed
# random_state makes the split reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# =================================================================================
# 3. Feature Scaling (Expert Note)
# =================================================================================
# [Expert]: KNN compares samples by distance, so features on larger scales would
# dominate unless every feature is standardized (mean=0, std=1).
# The scaler learns its statistics from the training split only; the test split is
# transformed with those same statistics so no test information leaks into training.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)  # Standardize the training split
X_test_scaled = scaler.transform(X_test)  # Reuse the training statistics

# =================================================================================
# 4. K-Nearest Neighbors (KNN) (All Levels)
# =================================================================================
# [Novice]: KNN labels a new flower by a majority vote among its closest neighbors.
# [Intermediate]: n_neighbors=3 means the 3 closest training points cast the votes.
# [Expert]: A brute-force query costs O(nd), where n=training samples, d=features.
# fit() returns the estimator itself, so construction and training can be chained.
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train_scaled, y_train)
y_pred_knn = knn.predict(X_test_scaled)  # Test data must be scaled like the training data

# Report hold-out accuracy followed by per-class precision/recall/F1
print(f"KNN Accuracy: {accuracy_score(y_test, y_pred_knn):.4f}")
print(
    "Classification Report:\n",
    classification_report(y_true=y_test, y_pred=y_pred_knn),
)

# =================================================================================
# 5. Gaussian Naive Bayes (Technical Deep Dive)
# =================================================================================
# [Novice]: Picks the most probable class; fast, but treats features as independent.
# [Expert]: Bayes' theorem with a per-class Gaussian likelihood for each feature:
# P(y|x) ∝ P(x|y) * P(y), where P(x|y) ~ N(μ_y, σ_y²)
# The model is trained on the raw (unscaled) features; fit() returns the estimator,
# so construction and training are chained.
naive_bayes = GaussianNB().fit(X_train, y_train)
y_pred_nb = naive_bayes.predict(X_test)

# Report hold-out accuracy followed by per-class precision/recall/F1
print(f"Naive Bayes Accuracy: {accuracy_score(y_test, y_pred_nb):.4f}")
print(
    "Classification Report:\n",
    classification_report(y_true=y_test, y_pred=y_pred_nb),
)

# =================================================================================
# 6. Cross-Validation (Intermediate/Expert)
# =================================================================================
# [Intermediate]: 5-fold CV averages performance across different data splits.
# [Expert]: CV gives a more reliable estimate of generalization than one hold-out
# split (it does not by itself prevent overfitting). For classifiers, an integer
# `cv` makes scikit-learn use StratifiedKFold, so folds keep the class proportions.
#
# KNN was trained on standardized features above, so it must be cross-validated the
# same way. Wrapping the scaler and the model in a pipeline re-fits the scaler on
# each training fold only, which avoids leaking validation-fold statistics.
# cross_val_score clones the estimator for every fold, so the already-fitted `knn`
# is not modified; reusing it keeps the hyperparameters in sync.
knn_cv_model = make_pipeline(StandardScaler(), knn)
cv_knn = cross_val_score(knn_cv_model, X, y, cv=5, scoring='accuracy').mean()
# Naive Bayes was trained on raw features, so it is validated on raw features too.
cv_nb = cross_val_score(naive_bayes, X, y, cv=5, scoring='accuracy').mean()
print(f"\nCross-Validation Scores:\nKNN: {cv_knn:.4f}, Naive Bayes: {cv_nb:.4f}")

# =================================================================================
# 7. Visualization (Accessibility-First)
# =================================================================================
# [Novice]: We visualize decision boundaries to see how models separate classes.
# [Accessibility]: The palette avoids a red-green pairing; the legend has a
# near-opaque background and a black border for contrast.

# Prepare 2D data (first two features only, so boundaries can be drawn on a plane)
X_train_2d = X_train[:, :2]  # Sepal length/width only
X_test_2d = X_test[:, :2]

# Re-train models on the 2D data (for visualization only). The features are left
# unscaled here so both axes stay in centimetres.
knn_2d = KNeighborsClassifier(n_neighbors=3).fit(X_train_2d, y_train)
nb_2d = GaussianNB().fit(X_train_2d, y_train)

# Create a dense grid covering the training data plus a 1 cm margin on every side
x_min, x_max = X_train_2d[:, 0].min() - 1, X_train_2d[:, 0].max() + 1
y_min, y_max = X_train_2d[:, 1].min() - 1, X_train_2d[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.02), np.arange(y_min, y_max, 0.02))

# Flatten the grid once into (n_points, 2) and classify every point with each model
grid_points = np.c_[xx.ravel(), yy.ravel()]
Z_knn = knn_2d.predict(grid_points).reshape(xx.shape)
Z_nb = nb_2d.predict(grid_points).reshape(xx.shape)

# Accessibility: palette without a red-green pairing (blue, orange, green)
cmap = ListedColormap(['#1f77b4', '#ff7f0e', '#2ca02c'])  # Blue, Orange, Green
cmap_light = ListedColormap(['#a1c9f4', '#ffd699', '#b2e2b2'])  # Light versions

# On-screen figure at 100 dpi; the saved PNG below is rendered at 300 dpi
fig, axes = plt.subplots(1, 2, figsize=(14, 6), dpi=100)

# One panel per model: shaded decision regions with the training points on top.
# Both panels get identical axis labels so each can be read on its own.
panels = [
    ("KNN Decision Boundaries (2D)", Z_knn),
    ("Naive Bayes Decision Boundaries (2D)", Z_nb),
]
for ax, (panel_title, Z) in zip(axes, panels):
    ax.contourf(xx, yy, Z, alpha=0.4, cmap=cmap_light)
    scatter = ax.scatter(
        X_train_2d[:, 0], X_train_2d[:, 1],
        c=y_train, cmap=cmap, edgecolor='k', s=50,
    )
    ax.set_title(panel_title, fontsize=14, pad=20)
    ax.set_xlabel('Sepal Length (cm)', fontsize=12)
    ax.set_ylabel('Sepal Width (cm)', fontsize=12)

# Shared legend, drawn on the right-hand panel. The handles come from the last
# scatter; both panels use the same color mapping, so it applies to both.
handles, _ = scatter.legend_elements(prop="colors")  # One handle per class color
axes[-1].legend(
    handles=handles,
    title='Species',
    labels=['Setosa', 'Versicolor', 'Virginica'],  # Custom labels for clarity
    loc='lower right',
    framealpha=0.9,  # Near-opaque background keeps text readable over the plot
    edgecolor='black'  # Border for better visibility
)

fig.tight_layout()

# Save the plot as a PNG file (high resolution, white background)
fig.savefig(
    'KNN vs Naive.png',  # File name
    dpi=300,                    # High resolution for clarity
    bbox_inches='tight',        # Prevent cropping
    facecolor='white'           # Accessible background
)

plt.show()  # Display the plot