# Week 2 — Linear Regression 2 

This notebook applies Linear Regression (OLS, Ridge, Lasso, ElasticNet) to the CKD dataset and explores degree-2 polynomial features.


## 1) Setup & Data Load

In [2]:
import pandas as pd, numpy as np
from sklearn.model_selection import train_test_split, cross_validate, KFold
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV, ElasticNetCV
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import matplotlib.pyplot as plt

DATA_PATH = 'Chronic_Kidney_Dsease_data.csv'
TARGET_COL = 'SerumCreatinine'  

# Load
df = pd.read_csv(DATA_PATH)
df.columns = [str(c).strip() for c in df.columns]

def coerce_numeric(s):
    return pd.to_numeric(s.replace({'?': np.nan, 'NA': np.nan, 'None': np.nan, 'na': np.nan, '': np.nan}), errors='coerce')

# Convert object columns that are mostly numeric-looking
for c in df.columns:
    if df[c].dtype == object:
        z = coerce_numeric(df[c])
        if np.isfinite(z).mean() > 0.6:
            df[c] = z

# Split features/target
assert TARGET_COL in df.columns, f"'{TARGET_COL}' not found in dataframe."
X = df.drop(columns=[TARGET_COL])
y = df[TARGET_COL]

# Column typing
num_cols = [c for c in X.columns if np.issubdtype(X[c].dtype, np.number)]
cat_cols = [c for c in X.columns if c not in num_cols]

# Preprocessing
numeric_pre = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

categorical_pre = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(handle_unknown='ignore')),
])

pre = ColumnTransformer([
    ('num', numeric_pre, num_cols),
    ('cat', categorical_pre, cat_cols),
])

# CV & metrics
cv = KFold(n_splits=5, shuffle=True, random_state=42)
scoring = {
    'rmse': 'neg_root_mean_squared_error',
    'mae' : 'neg_mean_absolute_error',
    'r2'  : 'r2'
}

print('Shape:', df.shape)
df.head(8)

Shape: (1659, 54)


Unnamed: 0,PatientID,Age,Gender,Ethnicity,SocioeconomicStatus,EducationLevel,BMI,Smoking,AlcoholConsumption,PhysicalActivity,...,Itching,QualityOfLifeScore,HeavyMetalsExposure,OccupationalExposureChemicals,WaterQuality,MedicalCheckupsFrequency,MedicationAdherence,HealthLiteracy,Diagnosis,DoctorInCharge
0,1,71,0,0,0,2,31.069414,1,5.128112,1.67622,...,7.556302,76.0768,0,0,1,1.018824,4.966808,9.871449,1,Confidential
1,2,34,0,0,1,3,29.692119,1,18.609552,8.377574,...,6.836766,40.128498,0,0,0,3.923538,8.189275,7.161765,1,Confidential
2,3,80,1,1,0,1,37.394822,1,11.882429,9.607401,...,2.144722,92.872842,0,1,1,1.429906,7.624028,7.354632,1,Confidential
3,4,40,0,2,0,1,31.32968,0,16.020165,0.408871,...,7.077188,90.080321,0,0,0,3.226416,3.282688,6.629587,1,Confidential
4,5,43,0,1,1,2,23.726311,0,7.944146,0.780319,...,3.553118,5.258372,0,0,1,0.285466,3.849498,1.437385,1,Confidential
5,6,22,0,0,0,1,39.155643,0,4.243608,8.847245,...,8.685695,12.79411,0,0,0,0.358613,5.766704,2.066944,1,Confidential
6,7,41,0,1,0,1,35.040487,0,18.224708,8.155523,...,1.995016,38.72303,0,0,1,2.744605,5.51576,3.856676,1,Confidential
7,8,72,1,0,1,3,30.76044,1,18.662717,6.179345,...,2.178408,92.939765,0,0,0,1.828836,8.620466,9.95467,0,Confidential


## 2) Baselines: OLS, Ridge, Lasso, ElasticNet (5-fold CV)

In [3]:
import numpy as np

results = {}

def eval_model(name, model):
    pipe = Pipeline([('pre', pre), ('model', model)])
    cvres = cross_validate(pipe, X, y, scoring=scoring, cv=cv, return_train_score=False)
    results[name] = {
        'RMSE_mean': -cvres['test_rmse'].mean(),
        'RMSE_std' :  cvres['test_rmse'].std(),
        'MAE_mean' : -cvres['test_mae'].mean(),
        'MAE_std'  :  cvres['test_mae'].std(),
        'R2_mean'  :  cvres['test_r2'].mean(),
        'R2_std'   :  cvres['test_r2'].std(),
    }

ridge_alphas = np.logspace(-3, 3, 25)

eval_model('LinearRegression', LinearRegression())
eval_model('RidgeCV', RidgeCV(alphas=ridge_alphas))
eval_model('LassoCV', LassoCV(cv=cv, random_state=42, max_iter=10000))
eval_model('ElasticNetCV', ElasticNetCV(l1_ratio=[.2,.4,.6,.8,.9,.95,.99,1.0], cv=cv, random_state=42, max_iter=10000))

baseline_df = pd.DataFrame(results).T.sort_values('RMSE_mean')
baseline_df.round(4)

Unnamed: 0,RMSE_mean,RMSE_std,MAE_mean,MAE_std,R2_mean,R2_std
ElasticNetCV,1.2918,0.0404,1.1207,0.0431,0.036,0.0104
LassoCV,1.2918,0.0404,1.1207,0.0431,0.036,0.0104
RidgeCV,1.3034,0.0386,1.132,0.0427,0.0184,0.0069
LinearRegression,1.3128,0.0379,1.1282,0.0423,0.0041,0.0109


## 3) Polynomial Features (degree 2, numeric only)

In [4]:
numeric_poly = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('poly', PolynomialFeatures(degree=2, include_bias=False))
])

pre_poly = ColumnTransformer([
    ('num', numeric_poly, num_cols),
    ('cat', categorical_pre, cat_cols),
])

poly_results = {}

def eval_model_poly(name, model):
    pipe = Pipeline([('pre', pre_poly), ('model', model)])
    cvres = cross_validate(pipe, X, y, scoring=scoring, cv=cv, return_train_score=False)
    poly_results[name] = {
        'RMSE_mean': -cvres['test_rmse'].mean(),
        'RMSE_std' :  cvres['test_rmse'].std(),
        'MAE_mean' : -cvres['test_mae'].mean(),
        'MAE_std'  :  cvres['test_mae'].std(),
        'R2_mean'  :  cvres['test_r2'].mean(),
        'R2_std'   :  cvres['test_r2'].std(),
    }


eval_model_poly('LinearRegression+Poly2', LinearRegression())
eval_model_poly('RidgeCV+Poly2', RidgeCV(alphas=ridge_alphas))
eval_model_poly('LassoCV+Poly2', LassoCV(cv=cv, random_state=42, max_iter=10000))
eval_model_poly('ElasticNetCV+Poly2', ElasticNetCV(l1_ratio=[.2,.4,.6,.8,.9,.95,.99,1.0], cv=cv, random_state=42, max_iter=10000))

poly_df = pd.DataFrame(poly_results).T.sort_values('RMSE_mean')
poly_df.round(4)

combined = pd.concat([baseline_df, poly_df]).sort_values('RMSE_mean')
combined.round(4)

Unnamed: 0,RMSE_mean,RMSE_std,MAE_mean,MAE_std,R2_mean,R2_std
LassoCV+Poly2,1.291,0.0401,1.1196,0.0418,0.0371,0.0116
ElasticNetCV+Poly2,1.291,0.0401,1.1196,0.0418,0.0371,0.0116
ElasticNetCV,1.2918,0.0404,1.1207,0.0431,0.036,0.0104
LassoCV,1.2918,0.0404,1.1207,0.0431,0.036,0.0104
RidgeCV,1.3034,0.0386,1.132,0.0427,0.0184,0.0069
LinearRegression,1.3128,0.0379,1.1282,0.0423,0.0041,0.0109
RidgeCV+Poly2,1.4133,0.0339,1.193,0.0504,-0.1557,0.0553
LinearRegression+Poly2,5.6193,0.6023,4.4588,0.5108,-17.6972,5.3291


## 4) Holdout & Residual Diagnostics

In [5]:
best_name = combined.index[0]
use_poly = 'Poly2' in best_name

if use_poly:
    use_pre = pre_poly
else:
    use_pre = pre

if 'Ridge' in best_name:
    best_est = RidgeCV(alphas=ridge_alphas)
elif 'Lasso' in best_name:
    best_est = LassoCV(cv=cv, random_state=42, max_iter=10000)
elif 'ElasticNet' in best_name:
    best_est = ElasticNetCV(l1_ratio=[.2,.4,.6,.8,.9,.95,.99,1.0], cv=cv, random_state=42, max_iter=10000)
else:
    best_est = LinearRegression()

X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=7)
pipe_best = Pipeline([('pre', use_pre), ('model', best_est)]).fit(X_tr, y_tr)
y_hat = pipe_best.predict(X_te)

rmse = mean_squared_error(y_te, y_hat, squared=False)
mae  = mean_absolute_error(y_te, y_hat)
r2   = r2_score(y_te, y_hat)

print('Best by CV:', best_name)
print(f'Holdout RMSE: {rmse:.4f}')
print(f'Holdout MAE : {mae:.4f}')
print(f'Holdout R^2 : {r2:.4f}')

# Residuals vs Fitted
resid = y_te - y_hat
plt.figure()
plt.scatter(y_hat, resid, alpha=0.7)
plt.axhline(0, linestyle='--')
plt.xlabel('Fitted values')
plt.ylabel('Residuals')
plt.title('Residuals vs Fitted — ' + best_name)
plt.show()

# Simple Q–Q without seaborn/scipy
res_sorted = np.sort(resid)
n = len(res_sorted)
probs = (np.arange(1, n+1) - 0.5) / n
theoretical = np.sqrt(2) * np.erfinv(2*probs - 1)
z_res = (res_sorted - res_sorted.mean()) / res_sorted.std(ddof=1)

plt.figure()
plt.scatter(theoretical, z_res, alpha=0.7)
mn, mx = theoretical.min(), theoretical.max()
plt.plot([mn, mx], [mn, mx], linestyle='--')
plt.xlabel('Theoretical Quantiles (Z)')
plt.ylabel('Standardized Residual Quantiles')
plt.title('Q–Q Plot — ' + best_name)
plt.show()

TypeError: got an unexpected keyword argument 'squared'

## 5) Notes (edit this block)

- **Best model (by CV RMSE):** _(fill from `combined` table)_  
- **Holdout metrics:** RMSE≈…, MAE≈…, R²≈…  
- **Polynomial features:** Helped/hurt depending on interaction strength; regularization controlled variance.  
- **Assumptions:** Residuals vs fitted suggest _[approx/non]_ constant variance; Q–Q suggests _[approx/non]_ normality.  
- **Next steps:** Try log transforms for skewed labs, targeted interactions, and compare to tree-based models in Week 6.  
