# Week 4 — Logistic Regression & Feature Scaling (CKD)
**Integrated Capstone — Week 4 Notebook**  

This notebook creates a **binary CKD label from GFR** and compares **Logistic Regression** with and without **feature scaling**.

We will:
- Build a binary target: **CKD_flag = 1 if GFR < 60, else 0**
- Compare pipelines (no scaling vs. StandardScaler)
- Tune regularization via **LogisticRegressionCV**
- Evaluate with **Accuracy, Precision, Recall, F1, ROC-AUC**
- Plot ROC curves and show a confusion matrix
- Inspect top positive/negative coefficients (log-odds)

**Dataset:** `Chronic_Kidney_Dsease_data.csv`  
**Original target:** `GFR`  → **Derived label:** `CKD_flag`

In [None]:
# 0) Configuration
# Every tunable setting used by the cells below is defined here.
# NOTE(review): the "Dsease" spelling presumably matches the real file name — confirm before "fixing" it.
DATA_PATH = "Chronic_Kidney_Dsease_data.csv"   # keep relative so peers can run from GitHub
GFR_COL = "GFR"  # column from which the binary CKD label is derived
CKD_THRESHOLD = 60.0  # GFR < 60 → CKD present
TEST_SIZE = 0.2  # fraction of rows held out by train_test_split
CV_FOLDS = 5  # number of folds passed to LogisticRegressionCV
RANDOM_STATE = 42  # seed used for reproducibility

# If class imbalance is strong, using class_weight='balanced' can help recall at some cost to precision
USE_CLASS_WEIGHT = True  # set False if you don't want automatic balancing

# Results folder
# Created on import of the notebook's setup cell; CSV outputs are written into it.
OUT_DIR = "week4_outputs_ckd"

## 1) Imports

In [None]:
import numpy as np
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
    classification_report, confusion_matrix, RocCurveDisplay
)

# Show up to 200 columns when a wide DataFrame is displayed.
pd.options.display.max_columns = 200
# Make sure the results folder exists; an already-existing folder is fine.
Path(OUT_DIR).mkdir(exist_ok=True)

## 2) Load data & create binary label (CKD_flag)

In [None]:
df = pd.read_csv(DATA_PATH)
# Raise explicitly instead of `assert`, which is stripped when Python runs with -O.
if GFR_COL not in df.columns:
    raise ValueError(f"Column '{GFR_COL}' not found. Available: {df.columns.tolist()[:25]}…")

# A missing GFR would compare as False and be silently labelled "no CKD",
# so rows without a GFR value are removed before the label is derived.
gfr = df[GFR_COL].astype(float)
missing_gfr = gfr.isna()
if missing_gfr.any():
    print(f"Dropping {int(missing_gfr.sum())} row(s) with missing {GFR_COL}: label cannot be derived.")
    df = df.loc[~missing_gfr].copy()
    gfr = gfr.loc[~missing_gfr]

# Create binary label: 1 = CKD present (GFR < 60), 0 = no CKD
df["CKD_flag"] = (gfr < CKD_THRESHOLD).astype(int)

print("Loaded:", df.shape)
display(df.head(3))
print("\nClass balance (CKD_flag):\n", df["CKD_flag"].value_counts(normalize=True).rename(lambda x: f"class_{x}"))

# Define X, y
# GFR must NOT be a feature: CKD_flag is a deterministic function of it, so
# keeping it would leak the target and make every metric trivially perfect.
# NOTE(review): if the CSV also carries identifier columns or another diagnosis
# label, they should probably be excluded too — confirm against the data dictionary.
y = df["CKD_flag"]
X = df.drop(columns=["CKD_flag", GFR_COL])

# Identify types
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = X.select_dtypes(exclude=[np.number]).columns.tolist()
print(f"Numerical: {len(num_cols)} | Categorical: {len(cat_cols)}")

## 3) Train / Test split (stratified)

In [None]:
# Hold out TEST_SIZE of the rows; stratifying on y keeps the CKD share
# the same in both splits.
split_kwargs = dict(test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_kwargs)

print("Train:", X_train.shape, "| Test:", X_test.shape)
print("Train class balance:", y_train.mean().round(3), "(1=CKD)")

## 4) Preprocessing pipelines (with and without scaling)

In [None]:
# Building blocks reused by both preprocessors
num_impute = SimpleImputer(strategy="median")
cat_impute = SimpleImputer(strategy="most_frequent")
onehot = OneHotEncoder(handle_unknown="ignore", sparse_output=False)


def _make_preprocessor(numeric_steps):
    """Return a ColumnTransformer applying ``numeric_steps`` to num_cols and impute + one-hot to cat_cols."""
    categorical_branch = Pipeline([("imputer", cat_impute), ("onehot", onehot)])
    return ColumnTransformer([
        ("num", Pipeline(numeric_steps), num_cols),
        ("cat", categorical_branch, cat_cols),
    ])


# A) Numeric columns are imputed only (no scaling)
pre_no_scale = _make_preprocessor([("imputer", num_impute)])

# B) Numeric columns are imputed, then standardized
pre_scaled = _make_preprocessor([("imputer", num_impute), ("scaler", StandardScaler())])

# None leaves classes unweighted; "balanced" is requested via USE_CLASS_WEIGHT.
if USE_CLASS_WEIGHT:
    class_weight = "balanced"
else:
    class_weight = None

# CV-based logistic regression (strongly recommended over fixed C)
def make_logregcv_pipeline(preprocessor):
    """Build a preprocess → LogisticRegressionCV pipeline.

    Parameters
    ----------
    preprocessor : transformer
        Fitted inside the pipeline before the model (here a ColumnTransformer).

    Returns
    -------
    Pipeline
        Unfitted pipeline with steps named "preprocess" and "model".

    The model searches 10 values of C on a log grid from 1e-3 to 1e3 using
    CV_FOLDS-fold cross-validation scored by ROC-AUC, with an L2 penalty and
    the liblinear solver.
    """
    model = LogisticRegressionCV(
        Cs=np.logspace(-3, 3, 10),
        cv=CV_FOLDS,
        penalty="l2",
        solver="liblinear",
        class_weight=class_weight,
        scoring="roc_auc",
        max_iter=1000,
        n_jobs=None,
        # Seed the solver so repeated runs of the notebook give the same fit.
        random_state=RANDOM_STATE,
    )
    return Pipeline([("preprocess", preprocessor), ("model", model)])


pipe_no_scale = make_logregcv_pipeline(pre_no_scale)
pipe_scaled = make_logregcv_pipeline(pre_scaled)

## 5) Fit models & evaluate (Accuracy, Precision, Recall, F1, ROC-AUC)

In [None]:
def evaluate_classifier(name, pipe, X_tr, X_te, y_tr, y_te):
    """Fit ``pipe`` on the training split and score it on both splits.

    Parameters
    ----------
    name : str
        Label stored under the "Model" key of the returned row.
    pipe : estimator
        Object exposing ``fit``, ``predict`` and ``predict_proba``; it is
        fitted in place.
    X_tr, X_te : feature tables for the train and test splits.
    y_tr, y_te : true labels for the train and test splits.

    Returns
    -------
    (dict, estimator)
        A flat row with "Model" plus ``train_*`` and ``test_*`` entries for
        Accuracy, Precision, Recall, F1 and ROC_AUC, and the fitted ``pipe``.
    """
    pipe.fit(X_tr, y_tr)

    def _score(features, target):
        # Hard labels drive the threshold-based metrics.
        labels = pipe.predict(features)
        # ROC-AUC is computed from the probability in column index 1.
        positive_proba = pipe.predict_proba(features)[:, 1]
        return {
            "Accuracy": accuracy_score(target, labels),
            "Precision": precision_score(target, labels, zero_division=0),
            "Recall": recall_score(target, labels, zero_division=0),
            "F1": f1_score(target, labels, zero_division=0),
            "ROC_AUC": roc_auc_score(target, positive_proba),
        }

    row = {"Model": name}
    # Train metrics come first, then test metrics, each prefixed by its split.
    for prefix, features, target in (("train", X_tr, y_tr), ("test", X_te, y_te)):
        for metric, value in _score(features, target).items():
            row[f"{prefix}_{metric}"] = value
    return row, pipe

# Fit and score both candidates on the same train/test split.
row_ns, fitted_ns = evaluate_classifier("LogRegCV — No Scaling", pipe_no_scale, X_train, X_test, y_train, y_test)
row_sc, fitted_sc = evaluate_classifier("LogRegCV — Scaled", pipe_scaled, X_train, X_test, y_train, y_test)

# One row per model, in the order they were evaluated.
rows = [row_ns, row_sc]
results = pd.DataFrame(rows)
display(results)

# Persist the comparison table next to the other outputs.
results_path = Path(OUT_DIR) / "week4_classification_results.csv"
results.to_csv(results_path, index=False)
print("Saved:", results_path.resolve())

## 6) Classification report & confusion matrix (best model on test set)

In [None]:
# Pick best by test ROC-AUC; tie-breaker test F1.
# Sorting on both columns actually applies the tie-breaker, and taking the
# first row positionally avoids mixing index labels with .iloc positions.
# NOTE: choosing the model on test metrics makes the reported test scores
# optimistic; a validation split or the CV scores would be a cleaner basis.
ranked = results.sort_values(["test_ROC_AUC", "test_F1"], ascending=False)
best_idx = ranked.index[0]
best_row = ranked.iloc[0]
best_name = best_row["Model"]

# Look the fitted pipeline up by its exact model name rather than by substring.
fitted_by_name = {row_ns["Model"]: fitted_ns, row_sc["Model"]: fitted_sc}
best_pipe = fitted_by_name[best_name]

y_pred = best_pipe.predict(X_test)
# Probability from column index 1, used for the ROC curve below.
y_proba = best_pipe.predict_proba(X_test)[:, 1]

print(f"Best model: {best_name}\n")
print("Classification Report (Test):\n")
print(classification_report(y_test, y_pred, zero_division=0))

cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix (Test):\n", cm)

# Simple ROC curve
RocCurveDisplay.from_predictions(y_test, y_proba)
plt.title(f"ROC Curve — {best_name}")
plt.show()

## 7) Coefficient inspection (log-odds, top positive/negative)

In [None]:
def get_feature_names(preprocessor, num_cols, cat_cols):
    """Return output feature names of a fitted preprocessor, numeric first.

    Parameters
    ----------
    preprocessor : fitted transformer
        Must expose ``named_transformers_`` with a "num" entry and, when
        ``cat_cols`` is non-empty, a "cat" entry whose ``named_steps``
        contains an "onehot" step.
    num_cols, cat_cols : sequences of input column names.

    Returns
    -------
    numpy.ndarray
        Numeric feature names followed by the one-hot feature names. Empty
        when both column lists are empty.

    A branch is skipped when its column list is empty, because that
    transformer may never have been fitted and cannot report names.
    """
    fitted = preprocessor.named_transformers_
    parts = []

    # numeric
    if len(num_cols) > 0:
        num_transformer = fitted["num"]
        if hasattr(num_transformer, "get_feature_names_out"):
            num_names = num_transformer.get_feature_names_out(num_cols)
        else:
            # Fall back to the input names when the transformer cannot report them.
            num_names = num_cols
        parts.append(np.asarray(num_names, dtype=object))

    # categorical (from OneHot)
    if len(cat_cols) > 0:
        encoder = fitted["cat"].named_steps["onehot"]
        if hasattr(encoder, "get_feature_names_out"):
            parts.append(np.asarray(encoder.get_feature_names_out(cat_cols), dtype=object))

    if not parts:
        return np.array([], dtype=object)
    return np.concatenate(parts)

# Use the best fitted pipeline
pre = best_pipe.named_steps["preprocess"]
model = best_pipe.named_steps["model"]
feat_names = get_feature_names(pre, num_cols, cat_cols)
coefs = model.coef_.ravel()

coef_df = pd.DataFrame({"feature": feat_names, "coef": coefs})
coef_df["abs_coef"] = coef_df["coef"].abs()
coef_df = coef_df.sort_values("abs_coef", ascending=False)

display(coef_df.head(20))

coef_df.to_csv(Path(OUT_DIR) / "week4_best_model_coefficients.csv", index=False)
print("Saved:", (Path(OUT_DIR) / "week4_best_model_coefficients.csv").resolve())

# Plot top +/- coefficients
topk = coef_df.head(15)
plt.figure(figsize=(9,6))
plt.barh(topk["feature"], topk["coef"])
plt.gca().invert_yaxis()
plt.title(f"Top Coefficients (log-odds) — {best_name}")
plt.xlabel("Coefficient")
plt.tight_layout()
plt.show()

## 8) Notes on interpretation
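- Coefficients are changes in **log-odds** per one-unit increase in a feature (per one standard deviation for standardized numeric features): positive values increase the odds of CKD (label=1), negative values decrease them.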
- With scaling enabled, numeric features are on comparable scales, which often improves convergence and interpretability.
- If the class distribution is imbalanced, `class_weight='balanced'` boosts recall for the minority class, sometimes reducing precision.

## 9) Takeaways (fill in before submitting)
- **Which pipeline performed best (by test ROC-AUC / F1)?** Did scaling help?
- **Class imbalance:** Did `class_weight='balanced'` change Recall vs Precision?
- **Most influential features:** Which top coefficients made clinical sense?
- **Confusion matrix & ROC:** Any threshold tuning needed (precision–recall trade-off)?
- **Next steps:** Calibrated probabilities, threshold tuning, or trying non-linear classifiers (trees/GBM/SVM).