In [2]:
# ---- imports & data ----
# Setup cell: imports, project paths, the shared random generator, and the
# single train/test split reused by the cells below.
import numpy as np
import matplotlib.pyplot as plt
import sys, pathlib

# Project-1 folder (parent of this Notebooks/ folder).
# NOTE: this is derived from the current working directory, so it assumes the
# kernel was started inside Notebooks/ — TODO confirm for other launch setups.
proj_dir = pathlib.Path.cwd().parent
# Put the project root on sys.path so the `Code` package imports below resolve.
# The membership check keeps re-running this cell from adding duplicates.
if str(proj_dir) not in sys.path:
    sys.path.insert(0, str(proj_dir))

# Figures folder (created on first run; exist_ok makes re-runs harmless)
fig_dir = proj_dir / "Figures"
fig_dir.mkdir(parents=True, exist_ok=True)

# Project-local helpers; these must come after the sys.path tweak above.
from Code.data import make_data, build_features, split_and_scale
from Code.metrics import mse, r2

# Reproducibility: one module-level generator. sgd_minibatch draws its batches
# from this object, so its results depend on how often rng was used before.
rng = np.random.default_rng(2025)

# Data + features once
# (presumably n samples with additive noise of s.d. noise_sd — verify in Code.data)
X_raw, y_raw = make_data(n=200, noise_sd=0.1, seed=2025)
Phi = build_features(X_raw, degree=20, include_bias=False)   # big basis; we'll slice p later

# One split/scale (returns scaled features and centered y).
# Only the training target carries the "_c" (centered) suffix; y_mean is
# presumably the offset needed to un-center predictions — verify in Code.data.
X_tr_s, X_te_s, y_tr_c, y_te, scaler, y_mean = split_and_scale(Phi, y_raw, test_size=0.2, random_state=1, center_y=True)

# Number of training rows and number of feature columns in the full basis.
n_tr, p_full = X_tr_s.shape
# With the settings above the recorded output is (160, 20) (40, 20).
print(X_tr_s.shape, X_te_s.shape)


(160, 20) (40, 20)


In [3]:
def grad_ols(X, y_c, theta):
    """Gradient of the half mean-squared-error objective w.r.t. θ.

    For the cost (1/2n)·||y_c − Xθ||² the gradient is Xᵀ(Xθ − y_c)/n,
    where n is the number of rows of X.

    Parameters
    ----------
    X : array whose rows are samples (n = X.shape[0]).
    y_c : targets aligned with the rows of X.
    theta : current parameter vector.

    Returns
    -------
    The gradient, shaped like ``X.T @ (X @ theta - y_c)``.
    """
    n_rows = X.shape[0]
    residual = X @ theta - y_c        # prediction minus target
    return (X.T @ residual) / n_rows

def grad_ridge(X, y_c, theta, lam):
    """Ridge gradient: the OLS gradient plus the penalty term λ·θ.

    Every entry of θ is penalised; this relies on the intercept not being
    part of θ (the target is centered instead).

    Parameters
    ----------
    X, y_c, theta : forwarded unchanged to ``grad_ols``.
    lam : ridge strength λ multiplying θ.

    Returns
    -------
    ``grad_ols(X, y_c, theta) + lam * theta``.
    """
    data_term = grad_ols(X, y_c, theta)
    penalty_term = lam * theta
    return data_term + penalty_term


In [4]:
def gd_full(X, y_c, theta0, eta, iters, grad_fn, **grad_kw):
    """Full-batch gradient descent with a constant step size.

    Parameters
    ----------
    X : design matrix handed to ``grad_fn`` and used for the loss.
    y_c : targets aligned with the rows of X.
    theta0 : starting parameters; never modified (a private copy is updated).
        Integer/boolean starts are promoted to float so the in-place update
        cannot fail on dtype casting; floating dtypes are kept as given.
    eta : step size.
    iters : number of updates to perform.
    grad_fn : callable ``grad_fn(X, y_c, theta, **grad_kw)`` returning the gradient.
    **grad_kw : extra keyword arguments forwarded to ``grad_fn`` (e.g. ``lam``).

    Returns
    -------
    thetas : np.ndarray
        θ after each update, stacked along axis 0 (the start point is not included).
    losses : np.ndarray
        ``0.5 * mean((y_c - X @ theta)**2)`` after each update. This is the plain
        half-MSE; any penalty that ``grad_fn`` adds is not part of the recorded value.
    """
    # np.array always copies, so the caller's theta0 is left untouched.
    theta = np.array(theta0)
    if not np.issubdtype(theta.dtype, np.inexact):
        # `theta -= float_array` raises a casting error on integer arrays.
        theta = theta.astype(float)

    thetas, losses = [], []
    for _ in range(iters):
        theta -= eta * grad_fn(X, y_c, theta, **grad_kw)
        thetas.append(theta.copy())   # snapshot: theta keeps being mutated in place
        losses.append(0.5*np.mean((y_c - X @ theta)**2))
    return np.array(thetas), np.array(losses)

def gd_full_opt(X, y_c, theta0, iters, grad_fn, opt="momentum", eta=1e-2, **kw):
    """Full-batch gradient descent with a selectable update rule.

    Parameters
    ----------
    X : design matrix handed to ``grad_fn`` and used for the loss.
    y_c : targets aligned with the rows of X.
    theta0 : starting parameters; never modified (a private copy is updated).
        Integer/boolean starts are promoted to float so the in-place state
        updates cannot fail on dtype casting; floating dtypes are kept as given.
    iters : number of updates to perform.
    grad_fn : callable ``grad_fn(X, y_c, theta, **kw)`` returning the gradient.
    opt : one of ``"momentum"``, ``"adagrad"``, ``"rmsprop"``, ``"adam"``.
        Any other value falls through to plain gradient descent.
    eta : base step size.
    **kw : extra keyword arguments forwarded to ``grad_fn``.

    Returns
    -------
    thetas : np.ndarray
        θ after each update, stacked along axis 0 (the start point is not included).
    losses : np.ndarray
        ``0.5 * mean((y_c - X @ theta)**2)`` after each update (plain half-MSE;
        any penalty that ``grad_fn`` adds is not part of the recorded value).
    """
    # np.array always copies, so the caller's theta0 is left untouched.
    theta = np.array(theta0)
    if not np.issubdtype(theta.dtype, np.inexact):
        # Integer theta would make `theta -= step` raise, and the state buffers
        # below inherit theta's dtype, so `s += g*g` would raise as well.
        theta = theta.astype(float)
    v = np.zeros_like(theta)         # momentum velocity / adam first moment
    s = np.zeros_like(theta)         # adagrad accumulator / rmsprop EMA / adam second moment
    thetas, losses = [], []

    eps = 1e-8                       # keeps the adaptive denominators away from zero
    beta1, beta2 = 0.9, 0.999        # adam decay rates for first / second moment
    # t starts at 1 so adam's bias-correction terms (1 - beta**t) are non-zero.
    for t in range(1, iters+1):
        g = grad_fn(X, y_c, theta, **kw)

        if opt == "momentum":
            # velocity accumulates gradients with decay factor 0.9
            v = 0.9*v + g
            step = eta * v

        elif opt == "adagrad":
            # running sum of squared gradients; per-coordinate steps only shrink
            s += g*g
            step = eta * g / (np.sqrt(s) + eps)

        elif opt == "rmsprop":
            # exponential moving average of squared gradients (decay 0.9)
            s = 0.9*s + 0.1*(g*g)
            step = eta * g / (np.sqrt(s) + eps)

        elif opt == "adam":
            v = beta1*v + (1-beta1)*g
            s = beta2*s + (1-beta2)*(g*g)
            # undo the bias toward zero caused by the zero-initialised moments
            v_hat = v / (1 - beta1**t)
            s_hat = s / (1 - beta2**t)
            step = eta * v_hat / (np.sqrt(s_hat) + eps)

        else:  # plain GD
            step = eta * g

        theta -= step
        thetas.append(theta.copy())   # snapshot: theta keeps being mutated in place
        losses.append(0.5*np.mean((y_c - X @ theta)**2))
    return np.array(thetas), np.array(losses)

def sgd_minibatch(X, y_c, theta0, eta, iters, batch_size, grad_fn, **grad_kw):
    """Mini-batch SGD with a constant step size (no momentum).

    Each iteration draws ``batch_size`` distinct row indices from the
    module-level generator ``rng``, so results depend on that generator's
    state at call time (re-seed it before the call for repeatable runs).

    Parameters
    ----------
    X : design matrix; rows are sampled to form each mini-batch.
    y_c : targets aligned with the rows of X.
    theta0 : starting parameters; never modified (a private copy is updated).
        Integer/boolean starts are promoted to float so the in-place update
        cannot fail on dtype casting; floating dtypes are kept as given.
    eta : step size.
    iters : number of mini-batch updates.
    batch_size : rows per batch. Sampling is without replacement, so the
        generator raises if this exceeds ``X.shape[0]``.
    grad_fn : callable ``grad_fn(Xb, yb, theta, **grad_kw)`` returning the gradient.
    **grad_kw : extra keyword arguments forwarded to ``grad_fn``.

    Returns
    -------
    thetas : np.ndarray
        θ after each update, stacked along axis 0 (the start point is not included).
    losses : np.ndarray
        ``0.5 * mean((y_c - X @ theta)**2)`` on the full training set after each
        update (plain half-MSE, without any penalty ``grad_fn`` may add).
    """
    # np.array always copies, so the caller's theta0 is left untouched.
    theta = np.array(theta0)
    if not np.issubdtype(theta.dtype, np.inexact):
        # `theta -= float_array` raises a casting error on integer arrays.
        theta = theta.astype(float)

    n = X.shape[0]
    thetas, losses = [], []
    for _ in range(iters):
        idx = rng.choice(n, size=batch_size, replace=False)
        Xb, yb = X[idx], y_c[idx]
        theta -= eta * grad_fn(Xb, yb, theta, **grad_kw)
        thetas.append(theta.copy())   # snapshot: theta keeps being mutated in place
        losses.append(0.5*np.mean((y_c - X @ theta)**2))  # evaluate on full train
    return np.array(thetas), np.array(losses)
