In [1]:
# knn_pima_diabetes

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

# -------------------------------
# Dataset: raw CSV fetched from GitHub
# -------------------------------
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"

# The file is read with header=None, so the column names are supplied here;
# the last column ("outcome") is the classification target.
columns = [
    "pregnancies",
    "glucose",
    "blood_pressure",
    "skin_thickness",
    "insulin",
    "bmi",
    "diabetes_pedigree",
    "age",
    "outcome",
]

data = pd.read_csv(url, names=columns, header=None)

# -------------------------------
# Separate the target from the predictors
# -------------------------------
y = data["outcome"]
X = data.drop(columns="outcome")

# -------------------------------
# Hold out 25% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
# -------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

# -------------------------------
# Standardise features. k-NN is distance-based, so unscaled features with
# large ranges would dominate. The scaler is fitted on the training rows
# only and then applied to both splits, so no test-set statistics leak in.
# -------------------------------
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# -------------------------------
# Fit and evaluate one k-NN classifier per neighbourhood size
# -------------------------------
k_values = [3, 5, 7]

for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train_scaled, y_train)
    y_pred = knn.predict(X_test_scaled)

    acc = accuracy_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)

    # One report per k: accuracy to 4 decimals, the confusion matrix,
    # and a 40-dash separator line.
    print(
        f"k = {k}",
        f"Accuracy: {acc:.4f}",
        "Confusion matrix:",
        cm,
        "-" * 40,
        sep="\n",
    )


k = 3
Accuracy: 0.6979
Confusion matrix:
[[98 25]
 [33 36]]
----------------------------------------
k = 5
Accuracy: 0.6823
Confusion matrix:
[[94 29]
 [32 37]]
----------------------------------------
k = 7
Accuracy: 0.6979
Confusion matrix:
[[96 27]
 [31 38]]
----------------------------------------


## k-NN Classification on Pima Indians Diabetes Dataset

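The table below shows the accuracy and confusion matrices for different values of `k` (in each matrix, rows are the true classes and columns are the predicted classes, in the order 0, 1):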

| k | Accuracy | Confusion Matrix |
|---|----------|----------------|
| 3 | 0.698    | [[98 25], [33 36]] |
| 5 | 0.682    | [[94 29], [32 37]] |
| 7 | 0.698    | [[96 27], [31 38]] |

**Observations:**

- Accuracy is moderate, around **68–70%**.
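- Misclassifications fall disproportionately on **class 1 (diabetes-positive)**: roughly 45–48% of true positives are missed (31–33 of 69), compared with about 20–24% of true negatives (25–29 of 123).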
- Changing `k` affects performance:
  - `k=3` and `k=7` have similar accuracies.
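  - `k=5` is slightly lower, but the gap is only 3 of 192 test samples (131 vs. 134 correct), so it is more likely noise from this particular split than a systematic effect of smoothing over more neighbors (`k=7` smooths more and still matches `k=3`).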
- Small `k` captures local patterns but can be noisy; large `k` smooths predictions but may underfit.
- Overall, k-NN provides a reasonable baseline, though more advanced models may improve sensitivity for positive cases.
