# Week 8 – K-Nearest Neighbors on CKD Dataset

This notebook applies **K-Nearest Neighbors (KNN)** to the Chronic Kidney Disease dataset for:

- **Classification**: Predicting the binary `Diagnosis` label  
- **Regression**: Predicting the continuous lab value `SerumCreatinine`  

The steps follow the Milestone Two summary document and are intended for use as appendix evidence.


In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report, confusion_matrix
from sklearn.metrics import mean_absolute_error, mean_squared_error

import matplotlib.pyplot as plt

# Location of the raw CKD table, resolved relative to the working directory.
DATA_PATH = "Chronic_Kidney_Dsease_data.csv"

# Read the whole table once; later cells derive features and targets from `df`.
df = pd.read_csv(filepath_or_buffer=DATA_PATH)

# Report (rows, columns), then show the first five records as the cell output.
print(df.shape)
df.head(n=5)

## Basic Cleaning and Feature Setup

In [None]:
# Remove identifier-like / non-informative columns when they are present.
df = df.drop(columns=["PatientID", "DoctorInCharge"], errors="ignore")

# Targets: `Diagnosis` for classification, `SerumCreatinine` for regression.
y_clf, y_reg = df["Diagnosis"], df["SerumCreatinine"]

# Every remaining non-target column is a candidate feature.
X = df.drop(columns=["Diagnosis", "SerumCreatinine"], errors="ignore")

# Partition the feature columns by dtype: numeric columns on one side and
# everything else (treated as categorical) on the other, both in column order.
numeric_features = X.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = X.select_dtypes(exclude=[np.number]).columns.tolist()

numeric_features, categorical_features

## Preprocessing Pipelines

In [None]:
# Numeric branch: fill missing values with the column median, then standardize.
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent category, then
# one-hot encode; categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Route each column group through its own branch. Columns not listed in either
# group fall under ColumnTransformer's default `remainder` handling.
preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
])

# Train-test split shared by both tasks.
#
# Features, classification labels and regression targets are split in ONE call so
# that all three stay row-aligned. Splitting the regression target in a separate,
# non-stratified call would shuffle the rows differently from the stratified
# split (even with the same random_state), pairing each feature row with another
# patient's SerumCreatinine value.
(
    X_train, X_test,
    y_train_clf, y_test_clf,
    y_train_reg, y_test_reg,
) = train_test_split(
    X, y_clf, y_reg, test_size=0.2, random_state=42, stratify=y_clf
)

# Guard against any future change that breaks the row alignment between the
# feature matrix and either target.
assert X_train.index.equals(y_train_clf.index) and X_train.index.equals(y_train_reg.index)
assert X_test.index.equals(y_test_clf.index) and X_test.index.equals(y_test_reg.index)

## KNN Classification – Predicting Diagnosis

In [None]:
# Classification pipeline: shared preprocessing followed by a KNN classifier.
knn_clf = Pipeline([
    ("preprocess", preprocessor),
    ("model", KNeighborsClassifier()),
])

# Hyper-parameters explored by the grid search; the "model__" prefix routes each
# entry to the pipeline step named "model".
param_grid_clf = {
    "model__n_neighbors": [3, 5, 7, 9, 11, 15],
    "model__weights": ["uniform", "distance"],
    "model__p": [1, 2],  # Manhattan vs Euclidean
}

# Cross-validated search (cv=5) ranked by F1, using all available cores.
grid_clf = GridSearchCV(
    estimator=knn_clf,
    param_grid=param_grid_clf,
    scoring="f1",
    cv=5,
    n_jobs=-1,
)
grid_clf.fit(X_train, y_train_clf)

print("Best params (classification):", grid_clf.best_params_)
print("Best CV F1:", grid_clf.best_score_)

# Evaluate the best pipeline found by the search on the held-out test split.
best_clf = grid_clf.best_estimator_
y_pred_clf = best_clf.predict(X_test)
# Second probability column; presumably the positive class -- confirm via classes_.
y_proba_clf = best_clf.predict_proba(X_test)[:, 1]

test_accuracy = accuracy_score(y_test_clf, y_pred_clf)
test_f1 = f1_score(y_test_clf, y_pred_clf)
test_roc_auc = roc_auc_score(y_test_clf, y_proba_clf)
clf_report = classification_report(y_test_clf, y_pred_clf)

print("\nTest Accuracy:", test_accuracy)
print("Test F1:", test_f1)
print("Test ROC-AUC:", test_roc_auc)
print("\nClassification report:\n", clf_report)

cm = confusion_matrix(y_test_clf, y_pred_clf)
print("\nConfusion matrix:\n", cm)

## KNN Regression – Predicting SerumCreatinine

In [None]:
# Regression pipeline: shared preprocessing followed by a KNN regressor.
# NOTE(review): this cell relies on y_train_reg / y_test_reg being row-aligned
# with X_train / X_test -- verify how the split cell produces them.
knn_reg = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("model", KNeighborsRegressor())
])

# Hyper-parameters explored by the grid search; the "model__" prefix routes each
# entry to the pipeline step named "model".
param_grid_reg = {
    "model__n_neighbors": [3, 5, 7, 9, 11, 15],
    "model__weights": ["uniform", "distance"],
    "model__p": [1, 2]  # Manhattan vs Euclidean
}

# Cross-validated search (cv=5). The scorer is *negative* MAE because
# GridSearchCV maximizes its score, so larger (closer to zero) is better.
grid_reg = GridSearchCV(
    knn_reg,
    param_grid_reg,
    cv=5,
    scoring="neg_mean_absolute_error",
    n_jobs=-1
)

grid_reg.fit(X_train, y_train_reg)

print("Best params (regression):", grid_reg.best_params_)
# Flip the sign back so the reported value is a regular (positive) MAE.
print("Best CV MAE:", -grid_reg.best_score_)

# Evaluate the best pipeline found by the search on the held-out test split.
best_reg = grid_reg.best_estimator_
y_pred_reg = best_reg.predict(X_test)

mae = mean_absolute_error(y_test_reg, y_pred_reg)
# RMSE is computed as sqrt(MSE) explicitly: the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6, so
# taking the square root ourselves works on every version.
rmse = np.sqrt(mean_squared_error(y_test_reg, y_pred_reg))

print("\nTest MAE:", mae)
print("Test RMSE:", rmse)

### True vs. Predicted Plot for Regression

This quick plot compares the KNN regression predictions with the true `SerumCreatinine` values on the test set; points close to the diagonal line indicate accurate predictions.


In [None]:
# Scatter of true vs predicted SerumCreatinine on the regression test set.
fig, ax = plt.subplots(figsize=(5, 5))
ax.scatter(y_test_reg, y_pred_reg, alpha=0.5)
ax.set_xlabel("True SerumCreatinine")
ax.set_ylabel("Predicted SerumCreatinine")
ax.set_title("KNN Regression – True vs Predicted")

# Reference diagonal (predicted == true) spanning the observed range of the
# true values; points on this line are exact predictions.
true_range = [y_test_reg.min(), y_test_reg.max()]
ax.plot(true_range, true_range)
plt.show()