# Week 3 — Linear Regression III on CKD Dataset
**Forward/Backward Selection, PCR, and PLSR**  

This notebook extends Week 2 by adding:
- Forward & backward stepwise selection
- PCR (Principal Components Regression)
- PLSR (Partial Least Squares Regression)

Dataset: **Chronic_Kidney_Dsease_data.csv**  
Target: **GFR** (set `LOG_TARGET = True` in the configuration cell to model `log1p(GFR)` instead of the raw value)


In [None]:
# --- Notebook configuration: every tunable value lives in this cell ---

# Input file and the column to predict.
DATA_PATH = "Chronic_Kidney_Dsease_data.csv"
TARGET = "GFR"

# When True the target is modelled as log1p(TARGET); metrics are then computed
# after mapping values back with expm1.
LOG_TARGET = False

# Resampling setup: hold-out fraction, number of CV folds, and the split seed.
TEST_SIZE = 0.2
CV_FOLDS = 5
RANDOM_STATE = 42

In [None]:
import numpy as np
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA
from sklearn.cross_decomposition import PLSRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# SequentialFeatureSelector may be unavailable in the installed scikit-learn, so
# it is imported optionally and the stepwise cell is gated on SKLEARN_SFS.
# Only ImportError is caught: a bare `except:` would also swallow
# KeyboardInterrupt and unrelated failures, silently disabling the stepwise cell.
try:
    from sklearn.feature_selection import SequentialFeatureSelector
    SKLEARN_SFS = True
except ImportError:
    SKLEARN_SFS = False

# Show up to 200 columns when displaying wide frames.
pd.set_option("display.max_columns", 200)

In [None]:
# Load the raw table and take a first look at it.
df = pd.read_csv(DATA_PATH)
print("Loaded:", df.shape)
print(df.head())

# Rows without a target value cannot be used for fitting or scoring
# (scikit-learn regressors reject NaN in y), so drop them here instead of
# failing later at .fit(). With no missing targets this is a no-op.
labelled = df.dropna(subset=[TARGET])
n_dropped = len(df) - len(labelled)
if n_dropped:
    print(f"Dropped {n_dropped} rows with missing {TARGET}")

# Separate the target from the predictors.
y_raw = labelled[TARGET].astype(float)
X = labelled.drop(columns=[TARGET])

# Column lists by dtype; they select the numeric / categorical preprocessing branches.
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = X.select_dtypes(exclude=[np.number]).columns.tolist()

# Optionally model log(1 + target) instead of the raw target.
y = np.log1p(y_raw) if LOG_TARGET else y_raw

In [None]:
# Numeric branch: fill gaps with the column median, then standardise.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
numeric_pre = Pipeline(steps=numeric_steps)

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode into a dense array, ignoring categories not seen during fit.
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
]
categorical_pre = Pipeline(steps=categorical_steps)

# Route each column list through its own branch.
preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_pre, num_cols),
        ("cat", categorical_pre, cat_cols),
    ]
)

In [None]:
# Hold out TEST_SIZE of the rows for final evaluation; the fixed seed keeps the
# split reproducible across runs.
split_options = {"test_size": TEST_SIZE, "random_state": RANDOM_STATE}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [None]:
def evaluate(y_true, y_pred, log_target=False):
    """Return ``(MAE, RMSE, R2)`` for predictions against the true values.

    When ``log_target`` is true, both inputs are mapped back with ``expm1``
    before scoring, so the metrics are computed on the un-logged scale.
    """
    if log_target:
        y_true, y_pred = np.expm1(y_true), np.expm1(y_pred)

    mse = mean_squared_error(y_true, y_pred)
    return (
        mean_absolute_error(y_true, y_pred),
        mse ** 0.5,
        r2_score(y_true, y_pred),
    )

def summarize(name, model):
    """Build one results-table row for a fitted model.

    Predicts on the notebook-level ``X_train`` / ``X_test`` splits and scores
    each against ``y_train`` / ``y_test`` via ``evaluate`` (passing
    ``LOG_TARGET``). Returns a dict holding the model name plus MAE, RMSE and
    R2 for the train split followed by the test split.
    """
    row = {"Model": name}
    for split, features, target in (("train", X_train, y_train),
                                    ("test", X_test, y_test)):
        mae, rmse, r2 = evaluate(target, model.predict(features), LOG_TARGET)
        row[f"MAE_{split}"] = mae
        row[f"RMSE_{split}"] = rmse
        row[f"R2_{split}"] = r2
    return row

In [None]:
# Baseline: ordinary least squares on all preprocessed features.
ols_steps = [
    ("preprocess", preprocess),
    ("model", LinearRegression()),
]
ols = Pipeline(steps=ols_steps).fit(X_train, y_train)

# One-row results table for the baseline.
ols_results = summarize("OLS", ols)
pd.DataFrame([ols_results])

In [None]:
# Stepwise feature selection (forward and backward)
N_STEPWISE_FEATURES = 5  # size of the feature subset each search ends with


def run_stepwise(direction):
    """Fit a sequential feature search in the given direction on the training split.

    The selector runs on the output of the shared ``preprocess`` transformer, so
    it sees the same imputed, scaled and one-hot-encoded features as the other
    models. Feeding it raw ``pd.get_dummies`` output instead would skip
    imputation, and LinearRegression rejects NaN inputs.

    Parameters
    ----------
    direction : {"forward", "backward"}
        Passed straight to ``SequentialFeatureSelector``.

    Returns
    -------
    (selector, names)
        The fitted ``SequentialFeatureSelector`` and the list of selected
        post-preprocessing feature names.
    """
    search = Pipeline([
        ("preprocess", preprocess),
        ("sfs", SequentialFeatureSelector(LinearRegression(),
                                          n_features_to_select=N_STEPWISE_FEATURES,
                                          direction=direction, cv=CV_FOLDS)),
    ])
    search.fit(X_train, y_train)
    selector = search.named_steps["sfs"]
    # Names of the columns produced by preprocessing, aligned with the support mask.
    all_names = search.named_steps["preprocess"].get_feature_names_out()
    return selector, all_names[selector.get_support()].tolist()


if SKLEARN_SFS:
    sfs_fwd, fwd_features = run_stepwise("forward")
    print("Forward features:", fwd_features[:10])

    # Backward elimination starts from all features, so it is the slower search.
    sfs_bwd, bwd_features = run_stepwise("backward")
    print("Backward features:", bwd_features[:10])
else:
    # Say so explicitly instead of leaving the cell silently empty.
    print("SequentialFeatureSelector is unavailable; skipping stepwise selection.")

In [None]:
# PCR: preprocess -> PCA -> OLS, with the number of components tuned by CV.
pcr_pipe = Pipeline(steps=[
    ("preprocess", preprocess),
    ("pca", PCA()),
    ("model", LinearRegression()),
])

# Candidate component counts: 2 up to (but excluding) min(15, raw column count).
pcr_stop = min(15, X_train.shape[1])
param = {"pca__n_components": list(range(2, pcr_stop))}

pcr_gs = GridSearchCV(
    estimator=pcr_pipe,
    param_grid=param,
    scoring="neg_mean_squared_error",
    cv=CV_FOLDS,
)
pcr_gs.fit(X_train, y_train)

# Score the best pipeline found and show it next to the OLS baseline.
pcr_results = summarize("PCR", pcr_gs.best_estimator_)
pd.DataFrame([ols_results, pcr_results])

In [None]:
# PLSR: preprocess -> partial least squares, with the number of components tuned by CV.
pls = Pipeline(steps=[
    ("preprocess", preprocess),
    ("model", PLSRegression()),
])

# Candidate component counts: 2 up to (but excluding) min(15, raw column count).
pls_stop = min(15, X_train.shape[1])
param = {"model__n_components": list(range(2, pls_stop))}

pls_gs = GridSearchCV(
    estimator=pls,
    param_grid=param,
    scoring="neg_mean_squared_error",
    cv=CV_FOLDS,
)
pls_gs.fit(X_train, y_train)

# Score the best pipeline found and compare all three models side by side.
pls_results = summarize("PLSR", pls_gs.best_estimator_)
pd.DataFrame([ols_results, pcr_results, pls_results])

## Takeaways
- Which model generalized best?  
- Did PCR/PLSR help?  
- Did stepwise improve interpretability?  
- Any residual patterns?  
- Next steps (Week 4+: classification, nonlinear models).
