# Week 2 — Linear Regression II on CKD Dataset
**Regularization: Ridge, Lasso, Elastic Net**  
**Created:** 2025-09-22 00:29

This notebook applies regularized linear models to the uploaded **Chronic_Kidney_Dsease_data.csv** to predict **GFR** (glomerular filtration rate), treated as a continuous regression target.

**Models**
- OLS baseline
- RidgeCV
- LassoCV
- ElasticNetCV

It includes cross‑validated metrics, residual diagnostics, and coefficient inspection. Fill in the **Takeaways** at the end for peer review.


## 0) Configuration

In [None]:
# --- Data source ---------------------------------------------------------
# NOTE(review): absolute path; point this at your own copy of the CSV when
# running outside the environment the file was uploaded to.
DATA_PATH = "/mnt/data/Chronic_Kidney_Dsease_data.csv"
# Identifier / leakage columns removed right after loading.
DROP_COLS = ["PatientID"]

# --- Target --------------------------------------------------------------
# Continuous column the regressors are trained to predict.
TARGET = "GFR"
# When True the models are fit on log1p(target) and metrics are reported after
# mapping back with expm1. Flip only after inspecting the target histogram
# for heavy right skew.
LOG_TARGET = False

# --- Evaluation protocol -------------------------------------------------
TEST_SIZE = 0.2      # passed to train_test_split as the hold-out share
CV_FOLDS = 5         # folds used by cross_val_score and the *CV estimators
RANDOM_STATE = 42    # seed for the split and the CV estimators that accept one


## 1) Imports

In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV, ElasticNetCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

import matplotlib.pyplot as plt
from pathlib import Path
from IPython.display import display

# Let pandas render up to 200 columns so wide previews are not truncated.
pd.set_option("display.max_columns", 200)


## 2) Load & preview data

In [None]:
# Read the CSV, failing fast when the configured file is missing.
path = Path(DATA_PATH)
assert path.exists(), f"Not found: {path}"
df = pd.read_csv(path)

# Remove identifier/leakage columns; names absent from the file are skipped.
df = df.drop(columns=DROP_COLS, errors="ignore")

print("✅ Loaded:", df.shape)
display(df.head(3))

# Missing-value counts, worst columns first.
null_counts = df.isna().sum().sort_values(ascending=False)
print("\nNulls per column (top 20):")
print(null_counts.head(20))

# Histogram of the target (missing values excluded) to judge skew.
assert TARGET in df.columns, f"TARGET '{TARGET}' not found."
fig, ax = plt.subplots(figsize=(6, 4))
ax.hist(df[TARGET].dropna(), bins=30)
ax.set_title(f"Target distribution: {TARGET}")
ax.set_xlabel(TARGET)
ax.set_ylabel("Count")
plt.show()


## 3) Define features & target

In [None]:
# Rows without a target value cannot be used for supervised fitting (NaN in y
# makes the estimators' fit fail), so they are filtered out before X and y
# are built.
has_target = df[TARGET].notna()
n_missing_target = int((~has_target).sum())
if n_missing_target:
    print(f"Dropping {n_missing_target} rows with missing {TARGET}")
labeled = df.loc[has_target]

y_raw = labeled[TARGET].astype(float)
X = labeled.drop(columns=[TARGET])

# Route columns by dtype: numeric columns are imputed and scaled, everything
# else is treated as categorical and one-hot encoded.
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = X.select_dtypes(exclude=[np.number]).columns.tolist()

print(f"Numerical: {len(num_cols)} | Categorical: {len(cat_cols)}")

if LOG_TARGET:
    # log1p yields NaN/-inf for values <= -1; surface that here instead of
    # letting it fail later inside model fitting.
    if (y_raw <= -1).any():
        raise ValueError(f"LOG_TARGET=True requires {TARGET} > -1 for every row.")
    y = np.log1p(y_raw)
else:
    y = y_raw


## 4) Preprocessing pipelines

In [None]:
# Numeric branch: fill gaps with the column median, then standardize.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
numeric_pre = Pipeline(numeric_steps)

# Categorical branch: fill gaps with the most frequent level, then one-hot
# encode into a dense array; levels unseen during fit are ignored at predict time.
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
]
categorical_pre = Pipeline(categorical_steps)

# Send each column group through its branch; columns in neither list are dropped.
column_routes = [
    ("num", numeric_pre, num_cols),
    ("cat", categorical_pre, cat_cols),
]
preprocess = ColumnTransformer(column_routes, remainder="drop")


## 5) Train / Test split

In [None]:
# Hold out TEST_SIZE of the rows for final evaluation; seeded for repeatability.
split_parts = train_test_split(X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE)
X_train, X_test, y_train, y_test = split_parts
print("Train:", X_train.shape, "| Test:", X_test.shape)


## 6) Helpers: metrics & residuals

In [None]:
def evaluate_predictions(y_true, y_pred, log_target=False):
    """Compute MAE, RMSE and R² for a set of predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Model predictions aligned with ``y_true``.
    log_target : bool, default False
        When True, both inputs are taken to be on the ``log1p`` scale and are
        mapped back with ``expm1`` so the metrics are on the original scale.

    Returns
    -------
    tuple
        ``(mae, rmse, r2)``.
    """
    if log_target:
        y_true = np.expm1(y_true)
        y_pred = np.expm1(y_pred)

    mae = mean_absolute_error(y_true, y_pred)
    # Take the square root explicitly: the `squared=False` shortcut of
    # mean_squared_error was deprecated in scikit-learn 1.4 and later removed.
    rmse = float(np.sqrt(mean_squared_error(y_true, y_pred)))
    r2 = r2_score(y_true, y_pred)
    return mae, rmse, r2

def residual_plot(y_true, y_pred, title="Residuals vs Predictions"):
    """Scatter residuals (observed minus predicted) against the predictions.

    A dashed horizontal line marks zero residual. The figure is shown
    immediately and nothing is returned.
    """
    errors = y_true - y_pred
    fig, ax = plt.subplots(figsize=(6, 4))
    ax.scatter(y_pred, errors, alpha=0.5)
    ax.axhline(0, linestyle="--")
    ax.set_xlabel("Predictions")
    ax.set_ylabel("Residuals")
    ax.set_title(title)
    plt.show()


## 7) Baseline: OLS

In [None]:
# Unregularized reference model on top of the shared preprocessing.
ols = Pipeline(steps=[
    ("preprocess", preprocess),
    ("model", LinearRegression())
])
ols.fit(X_train, y_train)

# The scorer returns negated MSE per fold: flip the sign of the mean and take
# the square root to report an RMSE.
ols_cv = cross_val_score(ols, X_train, y_train, cv=CV_FOLDS, scoring="neg_mean_squared_error")
cv_rmse = (-ols_cv.mean())**0.5
print(f"OLS CV RMSE: {cv_rmse:.4f} (log-space if LOG_TARGET=True)")

pred_tr = ols.predict(X_train)
pred_te = ols.predict(X_test)

# Evaluate each split once and unpack, rather than recomputing per metric.
mae_tr, rmse_tr, r2_tr = evaluate_predictions(y_train, pred_tr, LOG_TARGET)
mae_te, rmse_te, r2_te = evaluate_predictions(y_test, pred_te, LOG_TARGET)
ols_results = dict(
    Model="OLS",
    MAE_train=mae_tr,
    RMSE_train=rmse_tr,
    R2_train=r2_tr,
    MAE_test=mae_te,
    RMSE_test=rmse_te,
    R2_test=r2_te,
)
pd.DataFrame([ols_results])


## 8) Regularized models (CV)

In [None]:
# Candidate regularization strengths shared by Ridge, Lasso and ElasticNet:
# 60 log-spaced values from 1e-3 to 1e3.
alphas = np.logspace(start=-3, stop=3, num=60)


### 8.1 RidgeCV

In [None]:
# L2-penalized regression; alpha is chosen by CV_FOLDS-fold CV on MSE.
ridge = Pipeline(steps=[
    ("preprocess", preprocess),
    ("model", RidgeCV(alphas=alphas, cv=CV_FOLDS, scoring="neg_mean_squared_error"))
])
ridge.fit(X_train, y_train)
print("Best alpha (Ridge):", ridge.named_steps["model"].alpha_)

pred_tr = ridge.predict(X_train)
pred_te = ridge.predict(X_test)

# Evaluate each split once and unpack, rather than recomputing per metric.
mae_tr, rmse_tr, r2_tr = evaluate_predictions(y_train, pred_tr, LOG_TARGET)
mae_te, rmse_te, r2_te = evaluate_predictions(y_test, pred_te, LOG_TARGET)
ridge_results = dict(
    Model="RidgeCV",
    MAE_train=mae_tr,
    RMSE_train=rmse_tr,
    R2_train=r2_tr,
    MAE_test=mae_te,
    RMSE_test=rmse_te,
    R2_test=r2_te,
)
pd.DataFrame([ols_results, ridge_results])


### 8.2 LassoCV

In [None]:
# L1-penalized regression; alpha is chosen by CV_FOLDS-fold CV over the shared grid.
# NOTE(review): max_iter=20000 is presumably set to avoid convergence
# warnings at small alphas — confirm it is still needed.
lasso = Pipeline(steps=[
    ("preprocess", preprocess),
    ("model", LassoCV(alphas=alphas, cv=CV_FOLDS, random_state=RANDOM_STATE, max_iter=20000))
])
lasso.fit(X_train, y_train)
print("Best alpha (Lasso):", lasso.named_steps["model"].alpha_)

pred_tr = lasso.predict(X_train)
pred_te = lasso.predict(X_test)

# Evaluate each split once and unpack, rather than recomputing per metric.
mae_tr, rmse_tr, r2_tr = evaluate_predictions(y_train, pred_tr, LOG_TARGET)
mae_te, rmse_te, r2_te = evaluate_predictions(y_test, pred_te, LOG_TARGET)
lasso_results = dict(
    Model="LassoCV",
    MAE_train=mae_tr,
    RMSE_train=rmse_tr,
    R2_train=r2_tr,
    MAE_test=mae_te,
    RMSE_test=rmse_te,
    R2_test=r2_te,
)
pd.DataFrame([ols_results, ridge_results, lasso_results])


### 8.3 ElasticNetCV

In [None]:
# Combined L1/L2 penalty; both alpha and the L1 share (l1_ratio) are chosen by
# CV. The l1_ratio grid leans towards L1 and includes 1.0, the pure-Lasso case.
elastic = Pipeline(steps=[
    ("preprocess", preprocess),
    ("model", ElasticNetCV(alphas=alphas, l1_ratio=[.2,.4,.6,.8,.9,.95,.99,1.0],
                           cv=CV_FOLDS, random_state=RANDOM_STATE, max_iter=20000))
])
elastic.fit(X_train, y_train)
print("Best alpha (ElasticNet):", elastic.named_steps["model"].alpha_)
print("Best l1_ratio:", elastic.named_steps["model"].l1_ratio_)

pred_tr = elastic.predict(X_train)
pred_te = elastic.predict(X_test)

# Evaluate each split once and unpack, rather than recomputing per metric.
mae_tr, rmse_tr, r2_tr = evaluate_predictions(y_train, pred_tr, LOG_TARGET)
mae_te, rmse_te, r2_te = evaluate_predictions(y_test, pred_te, LOG_TARGET)
elastic_results = dict(
    Model="ElasticNetCV",
    MAE_train=mae_tr,
    RMSE_train=rmse_tr,
    R2_train=r2_tr,
    MAE_test=mae_te,
    RMSE_test=rmse_te,
    R2_test=r2_te,
)

# Side-by-side comparison of all four models; also written to disk later.
results = pd.DataFrame([ols_results, ridge_results, lasso_results, elastic_results])
results


## 9) Residual diagnostics (test set)

In [None]:
# One residual plot per fitted pipeline, all on the held-out test set.
fitted_models = {"OLS": ols, "Ridge": ridge, "Lasso": lasso, "ElasticNet": elastic}
for name, pipe in fitted_models.items():
    preds = pipe.predict(X_test)
    plot_title = f"Residuals vs Predictions — {name} (log-target={LOG_TARGET})"
    residual_plot(y_test, preds, title=plot_title)


## 10) Coefficient inspection

In [None]:
def get_feature_names(preprocessor, num_cols, cat_cols):
    """Return the output feature names of the fitted preprocessor.

    Names are collected per branch ("num" first, then "cat") so they line up
    with the column order the preprocessor feeds to the model.

    Parameters
    ----------
    preprocessor : fitted transformer exposing ``named_transformers_``
        Must contain "num" and/or "cat" entries that implement
        ``get_feature_names_out``.
    num_cols, cat_cols : sequence of str
        Input column names routed to each branch; either may be empty.

    Returns
    -------
    numpy.ndarray
        1-D array of feature names (empty when both column lists are empty).
    """
    name_groups = []
    for branch_key, cols in (("num", num_cols), ("cat", cat_cols)):
        # A branch that received no columns may never have been fitted, so it
        # must not be asked for feature names.
        if len(cols) == 0:
            continue
        branch = preprocessor.named_transformers_[branch_key]
        # Ask the whole branch (not one inner step) so every step in it can
        # adjust the names it passes on.
        name_groups.append(np.asarray(branch.get_feature_names_out(cols), dtype=object))

    if not name_groups:
        return np.array([], dtype=object)
    return np.concatenate(name_groups)

def extract_coefficients(pipeline, label):
    """Build a two-column table of feature names and fitted coefficients.

    Parameters
    ----------
    pipeline : fitted Pipeline
        Must have a "preprocess" step and a "model" step exposing ``coef_``.
    label : str
        Name of the coefficient column in the returned frame.

    Returns
    -------
    pandas.DataFrame
        Columns ``"feature"`` and ``label``, one row per model input feature.

    Raises
    ------
    ValueError
        If the model step has no ``coef_`` attribute (e.g. it was not fitted).
    """
    pre = pipeline.named_steps["preprocess"]
    model = pipeline.named_steps["model"]
    # Fail loudly: a missing coef_ would otherwise become a column of None
    # that downstream fillna(0) reports as zero coefficients.
    if not hasattr(model, "coef_"):
        raise ValueError(
            f"{type(model).__name__} has no coef_ attribute; fit the pipeline first."
        )
    feats = get_feature_names(pre, num_cols, cat_cols)
    # Flatten so a (1, n_features) coefficient array is accepted too.
    coefs = np.ravel(model.coef_)
    return pd.DataFrame({"feature": feats, label: coefs})

# One coefficient table per fitted model.
coef_ols = extract_coefficients(ols, "OLS")
coef_ridge = extract_coefficients(ridge, "Ridge")
coef_lasso = extract_coefficients(lasso, "Lasso")
coef_elastic = extract_coefficients(elastic, "ElasticNet")

# Outer-join the tables on the feature name, then fill features missing from
# any model with 0.
coef_df = coef_ols
for other_coefs in (coef_ridge, coef_lasso, coef_elastic):
    coef_df = coef_df.merge(other_coefs, on="feature", how="outer")
coef_df = coef_df.fillna(0)

# Rank by ElasticNet coefficient magnitude and keep the 20 largest.
coef_df["abs_elastic"] = coef_df["ElasticNet"].abs()
topk = coef_df.sort_values("abs_elastic", ascending=False).head(20)
display(topk)

# Horizontal bars of the signed coefficients, largest magnitude at the top.
fig, ax = plt.subplots(figsize=(8, 6))
ax.barh(topk["feature"], topk["ElasticNet"])
ax.invert_yaxis()
ax.set_title("Top ElasticNet Coefficients (absolute)")
ax.set_xlabel("Coefficient")
fig.tight_layout()
plt.show()


## 11) Save artifacts

In [None]:
# Write the metrics table and the merged coefficient table as CSVs into a
# folder relative to the working directory (created on first run).
out_dir = Path("week2_outputs_ckd")
out_dir.mkdir(exist_ok=True)

artifacts = (
    (results, "model_results.csv"),
    (coef_df, "coefficients_all_models.csv"),
)
for frame, filename in artifacts:
    frame.to_csv(out_dir / filename, index=False)

print("Saved:", list(out_dir.glob("*.csv")))


## 12) Takeaways (fill in before submitting)
- **Which model generalized best** on **test RMSE** and **R²**? Why might that be?
- **Regularization effects:** Did Lasso shrink any coefficients to exactly 0 (i.e., drop features)? Did Ridge/ElasticNet shrink or stabilize the weights relative to OLS?
- **Overfitting controls:** CV results vs. test set — any gaps?
- **Residuals:** Any patterns (nonlinearity/heteroscedasticity) to address next?
- **Next steps:** Feature engineering ideas (e.g., interactions, nonlinearity), or alternative models.
