
# Project 1 — Parts c)–f) (Using Your Modules)
This notebook solves **c, d, e, f** using the provided modules in this repo:
`data.py`, `grad.py`, `metrics.py`, `models.py`, `plots.py`, `resampling.py`, `utils.py`.


In [1]:

import sys, types, os, importlib
sys.path.insert(0, "/mnt/data")

# Create a small on-disk package "Code" so models.py's import works
code_pkg = "/mnt/data/Code"
os.makedirs(code_pkg, exist_ok=True)
# Mirror metrics into Code/metrics.py if not already present
if not os.path.exists(os.path.join(code_pkg, "metrics.py")):
    with open("/mnt/data/metrics.py","r") as src, open(os.path.join(code_pkg,"metrics.py"),"w") as dst:
        dst.write(src.read())
with open(os.path.join(code_pkg, "__init__.py"),"w") as f:
    f.write("")

# Now import the provided modules (hard fail if anything is missing)
import data, grad, metrics, models, plots, resampling, utils
from importlib import reload
for m in (data, grad, metrics, models, plots, resampling, utils):
    reload(m)
print("Modules loaded:", [m.__name__ for m in (data, grad, metrics, models, plots, resampling, utils)])


OSError: [Errno 30] Read-only file system: '/mnt'

## Data & Features (from `data.py`)

In [None]:

import numpy as np
import matplotlib.pyplot as plt

# Generate data and polynomial features
x, y = data.make_data(n=600, noise_sd=0.05, seed=12)
degree = 15
X_full = data.build_features(x, degree=degree, include_bias=False)

# Split + scale (fit scaler only on train) and center y
X_tr_s, X_te_s, y_tr_c, y_te, scaler, y_mean = data.split_and_scale(X_full, y, test_size=0.3, random_state=1, center_y=True)

# We'll consistently use centered y for training and add back y_mean on predictions.
print("Shapes:", X_tr_s.shape, X_te_s.shape)



## c) Gradient Descent for OLS and Ridge (using `grad.py`)

We minimize MSE with:
- **OLS**: gradient `grad.grad_ols`  
- **Ridge**: gradient `grad.grad_ridge` with λ  
and the plain GD loop `grad.gd`.


In [None]:

def predict_from_theta(X, theta, y_mean):
    return (X @ theta).ravel() + y_mean

# Use all p features created (no bias term since we standardized features)
p = X_tr_s.shape[1]

# OLS via gradient descent
theta_ols = grad.gd(X_tr_s, y_tr_c, eta=5e-3, iters=6000, theta0=np.zeros(p), lam=None)
yhat_te_ols = predict_from_theta(X_te_s, theta_ols, y_mean)
mse_ols = metrics.mse(y_te, yhat_te_ols)
r2_ols  = metrics.r2(y_te, yhat_te_ols)

# Ridge via gradient descent (λ given)
lam = 1e-3
theta_ridge = grad.gd(X_tr_s, y_tr_c, eta=5e-3, iters=6000, theta0=np.zeros(p), lam=lam)
yhat_te_ridge = predict_from_theta(X_te_s, theta_ridge, y_mean)
mse_ridge = metrics.mse(y_te, yhat_te_ridge)
r2_ridge  = metrics.r2(y_te, yhat_te_ridge)

print(f"OLS  — Test MSE: {mse_ols:.5f}, R²: {r2_ols:.4f}")
print(f"Ridge (λ={lam}) — Test MSE: {mse_ridge:.5f}, R²: {r2_ridge:.4f}")



## d) Optimizers: Momentum, AdaGrad, RMSProp, Adam

We keep the same objectives but change the update rule. We'll reuse `grad.grad_ridge` for the gradient and implement the optimizers here.


In [None]:

def run_optimizer(X, y, lam, eta, iters, optimizer="sgd",
                  beta=0.9, eps=1e-8, rho=0.99, b1=0.9, b2=0.999, seed=0):
    n, p = X.shape
    rng = np.random.default_rng(seed)
    theta = np.zeros(p, float)
    v = np.zeros_like(theta)
    s = np.zeros_like(theta)
    m = np.zeros_like(theta)
    t = 0

    for k in range(iters):
        g = grad.grad_ridge(X, y, theta, lam) if lam is not None else grad.grad_ols(X, y, theta)
        t += 1
        if optimizer == "sgd":
            theta -= eta * g
        elif optimizer == "momentum":
            v = beta * v + (1 - beta) * g
            theta -= eta * v
        elif optimizer == "adagrad":
            s += g * g
            theta -= (eta / (np.sqrt(s) + eps)) * g
        elif optimizer == "rmsprop":
            s = rho * s + (1 - rho) * (g * g)
            theta -= (eta / (np.sqrt(s) + eps)) * g
        elif optimizer == "adam":
            m = b1 * m + (1 - b1) * g
            s = b2 * s + (1 - b2) * (g * g)
            m_hat = m / (1 - b1**t)
            s_hat = s / (1 - b2**t)
            theta -= eta * m_hat / (np.sqrt(s_hat) + eps)
        else:
            raise ValueError("Unknown optimizer")
    return theta

opts = ["sgd","momentum","adagrad","rmsprop","adam"]
results = {}
lam_d = 1e-3
for opt in opts:
    th = run_optimizer(X_tr_s, y_tr_c, lam=lam_d, eta=5e-3, iters=5000, optimizer=opt)
    yhat = predict_from_theta(X_te_s, th, y_mean)
    results[opt] = (metrics.mse(y_te, yhat), metrics.r2(y_te, yhat))

for k,(mse_v,r2_v) in results.items():
    print(f"{k:8s}  MSE={mse_v:.5f}  R²={r2_v:.4f}")


In [None]:

# Simple bar plot of test MSE across optimizers
vals = [results[k][0] for k in opts]
import matplotlib.pyplot as plt
plt.figure()
plt.bar(opts, vals)
plt.ylabel("Test MSE")
plt.title("Ridge + different optimizers (GD variants)");



## e) LASSO via Proximal Gradient (ISTA)

LASSO isn't implemented in the provided modules, so we implement the solver here and evaluate with `metrics.py`.


In [None]:

def soft_threshold(u, tau):
    return np.sign(u) * np.maximum(np.abs(u) - tau, 0.0)

def ista_lasso(X, y, lam=1e-3, iters=6000, eta=None, seed=0):
    n, p = X.shape
    if eta is None:
        # conservative step via power iteration for spectral norm of X
        v = np.random.default_rng(seed).normal(size=p)
        for _ in range(20):
            v = X.T @ (X @ v)
            v /= (np.linalg.norm(v) + 1e-12)
        L = (2.0 / n) * (np.linalg.norm(X @ v)**2)
        eta = 1.0 / (L + 1e-12)
    theta = np.zeros(p)
    for _ in range(iters):
        g = (X.T @ (X @ theta - y)) * (2.0 / n)
        theta = soft_threshold(theta - eta * g, eta * lam)
    return theta

lam_l1 = 1e-3
theta_lasso = ista_lasso(X_tr_s, y_tr_c, lam=lam_l1, iters=8000)
yhat_lasso = predict_from_theta(X_te_s, theta_lasso, y_mean)
mse_lasso = metrics.mse(y_te, yhat_lasso)
r2_lasso  = metrics.r2(y_te, yhat_lasso)
print(f"LASSO (ISTA, λ={lam_l1}) — Test MSE: {mse_lasso:.5f}, R²: {r2_lasso:.4f}")



## f) Stochastic Gradient Descent (mini-batch) variants

Mini-batch versions of the optimizers for Ridge. We reuse `grad.grad_ridge`.


In [None]:

def sgd_minibatch(X, y, lam=1e-3, epochs=40, batch_size=64, eta=1e-2, optimizer="adam",
                  beta=0.9, eps=1e-8, rho=0.99, b1=0.9, b2=0.999, seed=0):
    n, p = X.shape
    rng = np.random.default_rng(seed)
    theta = np.zeros(p, float)
    v = np.zeros_like(theta)
    s = np.zeros_like(theta)
    m = np.zeros_like(theta)
    t = 0
    for ep in range(epochs):
        idx = rng.permutation(n)
        for start in range(0, n, batch_size):
            b = idx[start:start+batch_size]
            Xb, yb = X[b], y[b]
            # gradient on batch
            nb = Xb.shape[0]
            r = Xb @ theta - yb
            g = (Xb.T @ r) * (2.0 / nb) + lam * 2.0 * theta
            t += 1
            if optimizer == "sgd":
                theta -= eta * g
            elif optimizer == "momentum":
                v = beta * v + (1 - beta) * g
                theta -= eta * v
            elif optimizer == "adagrad":
                s += g * g
                theta -= (eta / (np.sqrt(s) + eps)) * g
            elif optimizer == "rmsprop":
                s = rho * s + (1 - rho) * (g * g)
                theta -= (eta / (np.sqrt(s) + eps)) * g
            elif optimizer == "adam":
                m = b1 * m + (1 - b1) * g
                s = b2 * s + (1 - b2) * (g * g)
                m_hat = m / (1 - b1**t)
                s_hat = s / (1 - b2**t)
                theta -= eta * m_hat / (np.sqrt(s_hat) + eps)
            else:
                raise ValueError("Unknown optimizer")
    return theta

theta_full = run_optimizer(X_tr_s, y_tr_c, lam=1e-3, eta=5e-3, iters=4000, optimizer="adam")
theta_sgd  = sgd_minibatch(X_tr_s, y_tr_c, lam=1e-3, epochs=25, batch_size=64, eta=1e-2, optimizer="adam")

yhat_full = predict_from_theta(X_te_s, theta_full, y_mean)
yhat_sgd  = predict_from_theta(X_te_s, theta_sgd,  y_mean)

mse_full = metrics.mse(y_te, yhat_full)
mse_sgd  = metrics.mse(y_te, yhat_sgd)

import matplotlib.pyplot as plt
plt.figure()
plt.bar(["Full-batch Adam", "Mini-batch Adam"], [mse_full, mse_sgd])
plt.ylabel("Test MSE"); plt.title("Ridge: Full-batch vs Mini-batch");
print("MSE (full-batch Adam):", mse_full, "  MSE (mini-batch Adam):", mse_sgd)



### Notes
- `data.py` handles **feature construction**, **train/test split**, **scaling**, and **centering**.  
- `grad.py` supplies the **OLS/Ridge gradients** and a **plain GD loop** used in part (c).  
- `metrics.py` provides **MSE** and **R²** used across parts (c)–(f).  
- `models.py` is kept for reference (closed-form fits) but the GD/SGD experiments here intentionally use gradients.  
- `plots.py` has helpers for specific sweeps; we did ad‑hoc plots tailored to GD/SGD results.
