<a href="https://colab.research.google.com/github/asupraja3/ml-ng-notebooks/blob/main/FeatureScaling_and_LearningRate.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


# Optional Lab: Feature Scaling and Learning Rate (Multi‑variable)

This self‑contained notebook shows:
- How to **scale features** (min‑max, mean normalization, **z‑score**)
- How scaling affects **gradient descent convergence**
- How to choose/tune the **learning rate (α)** with visual diagnostics

Libraries: `numpy`, `matplotlib` (no external datasets required).


In [None]:

import numpy as np
import matplotlib.pyplot as plt

# Print arrays with 3 decimal places and suppress scientific notation
# so the weight vectors shown below are easy to compare.
np.set_printoptions(precision=3, suppress=True)


## 1) Create a synthetic multi‑feature dataset (house pricing style)

In [None]:

# Column layout of the design matrix: [size(sqft), bedrooms, age(years), floors]
rng = np.random.default_rng(0)   # fixed seed -> identical data on every run
m = 300                          # number of training examples

# Draw each feature independently; the ranges differ by orders of magnitude
# on purpose, which is what makes scaling matter later on.
size = rng.uniform(low=600, high=3500, size=m)    # large numeric range
beds = rng.integers(low=1, high=6, size=m)        # integers 1..5 (high is exclusive)
age = rng.uniform(low=0, high=40, size=m)         # medium range
floors = rng.integers(low=1, high=4, size=m)      # integers 1..3 (high is exclusive)

# Stack the 1-D feature vectors as columns of an (m, 4) float matrix.
X_train = np.column_stack((size, beds, age, floors)).astype(float)

# Ground-truth linear model used to generate the targets (unknown to the learner)
true_w = np.array([220.0, 30.0, -2.0, 15.0])
true_b = 50_000.0

# Targets = linear model + Gaussian noise.
noise = rng.normal(loc=0, scale=25_000, size=m)
y_train = X_train @ true_w + true_b + noise

X_features = ["size(sqft)", "bedrooms", "age", "floors"]
# Notebook cell output: peek at the first three examples and targets.
X_train[:3], y_train[:3]


## 2) Helper for visualizing feature distributions (used before/after scaling)

In [None]:

def norm_plot(ax, x, bins=30):
    """Draw a density histogram of ``x`` on ``ax`` with a matching normal curve.

    The overlaid curve is the normal pdf that has the same mean and standard
    deviation as the sample, evaluated on 200 points spanning four standard
    deviations either side of the mean.

    Args:
        ax: Axes-like object providing ``hist``, ``plot`` and ``grid``.
        x: 1-D sample to plot.
        bins: Number of histogram bins.

    Returns:
        Tuple ``(mu, sigma)``: the sample mean and standard deviation.
    """
    mu = np.mean(x)
    sigma = np.std(x)

    ax.hist(x, bins=bins, alpha=0.6, density=True)

    # A (near-)constant sample has no usable bell curve, so skip the overlay
    # rather than divide by a vanishing sigma.
    if sigma > 1e-12:
        grid = np.linspace(mu - 4*sigma, mu + 4*sigma, 200)
        z = (grid - mu) / sigma
        density = (1.0 / (sigma * np.sqrt(2*np.pi))) * np.exp(-0.5 * z**2)
        ax.plot(grid, density, linewidth=2)

    ax.grid(True, alpha=0.3)
    return mu, sigma


## 3) Scaling utilities (min‑max, mean normalization, z‑score)

In [None]:

def minmax_scale(X):
    """Rescale every column of ``X`` to the range [0, 1] (min-max scaling).

    Each column is transformed as ``(x - min) / (max - min)``. A constant
    column (max == min) is divided by 1 instead, so it maps to all zeros
    rather than producing NaN.

    Args:
        X: 2-D array-like of shape (m, n); any input accepted by
            ``np.asarray`` (ndarray, nested lists, ...) works.

    Returns:
        Tuple ``(X_s, X_min, X_max)``: the scaled array and the per-column
        minima and maxima that were used.
    """
    # asarray (rather than X.astype) lets plain Python sequences through too.
    X = np.asarray(X, dtype=float)
    X_min = X.min(axis=0)
    X_max = X.max(axis=0)
    span = X_max - X_min
    # avoid division by zero on constant columns
    denom = np.where(span == 0, 1.0, span)
    X_s = (X - X_min) / denom
    return X_s, X_min, X_max

def mean_normalize(X):
    """Mean-normalize every column of ``X``.

    Each column is transformed as ``(x - mean) / (max - min)``. A constant
    column (max == min) is divided by 1 instead, so it maps to all zeros
    rather than producing NaN.

    Args:
        X: 2-D array-like of shape (m, n); any input accepted by
            ``np.asarray`` (ndarray, nested lists, ...) works.

    Returns:
        Tuple ``(X_s, mu, (X_min, X_max))``: the normalized array, the
        per-column means, and a pair holding the per-column minima and maxima.
    """
    # asarray (rather than X.astype) lets plain Python sequences through too.
    X = np.asarray(X, dtype=float)
    mu = X.mean(axis=0)
    X_min = X.min(axis=0)
    X_max = X.max(axis=0)
    span = X_max - X_min
    # avoid division by zero on constant columns
    denom = np.where(span == 0, 1.0, span)
    X_s = (X - mu) / denom
    return X_s, mu, (X_min, X_max)

def zscore_normalize(X):
    """Standardize every column of ``X`` to zero mean and unit variance.

    Each column is transformed as ``(x - mean) / std`` using the population
    standard deviation (``ddof=0``). A constant column (std == 0) is divided
    by 1 instead, so it maps to all zeros rather than producing NaN.

    Args:
        X: 2-D array-like of shape (m, n); any input accepted by
            ``np.asarray`` (ndarray, nested lists, ...) works.

    Returns:
        Tuple ``(X_s, mu, sigma)``: the standardized array, the per-column
        means, and the per-column divisors actually used (zeros replaced
        by 1.0).
    """
    # asarray (rather than X.astype) lets plain Python sequences through too.
    X = np.asarray(X, dtype=float)
    mu = X.mean(axis=0)
    sigma = X.std(axis=0, ddof=0)
    # avoid division by zero on constant columns
    sigma = np.where(sigma == 0, 1.0, sigma)
    X_s = (X - mu) / sigma
    return X_s, mu, sigma


In [None]:

# Histogram each raw feature side by side to expose the very different ranges.
fig, ax = plt.subplots(1, X_train.shape[1], figsize=(14, 3))
for axis, column, label in zip(ax, X_train.T, X_features):
    norm_plot(axis, column)
    axis.set_xlabel(label)
ax[0].set_ylabel("density")
fig.suptitle("Distributions BEFORE normalization")
plt.show()

# Standardize the features; mu and sigma are kept for later cells.
X_norm, mu, sigma = zscore_normalize(X_train)

# Same histograms on the standardized features: every column now shares
# a comparable scale.
fig, ax = plt.subplots(1, X_norm.shape[1], figsize=(14, 3))
for axis, column, label in zip(ax, X_norm.T, X_features):
    norm_plot(axis, column)
    axis.set_xlabel(label)
ax[0].set_ylabel("density")
fig.suptitle("Distributions AFTER z-score normalization")
plt.show()


## 4) Linear regression helpers (cost, gradient, GD)

In [None]:

def predict(X, w, b):
    """Return the linear model output ``X @ w + b``.

    Args:
        X: Feature matrix (or single feature vector).
        w: Weight vector.
        b: Scalar bias added to every prediction.
    """
    weighted_sum = X @ w
    return weighted_sum + b

def compute_cost(X, y, w, b):
    """Return the halved mean squared error of the linear model on (X, y).

    Computes ``sum((X @ w + b - y)**2) / (2 * m)`` where ``m`` is the number
    of rows in ``X``.

    Args:
        X: Feature matrix with one example per row.
        y: Target vector, one entry per row of ``X``.
        w: Weight vector.
        b: Scalar bias.
    """
    n_examples = X.shape[0]
    residuals = (X @ w + b) - y
    # Dot product of the residual vector with itself = sum of squared errors.
    sum_squared = residuals @ residuals
    return sum_squared / (2 * n_examples)

def compute_gradient(X, y, w, b):
    """Return the gradient of the halved-MSE cost with respect to w and b.

    Args:
        X: Feature matrix with one example per row, shape (m, n).
        y: Target vector, shape (m,).
        w: Weight vector, shape (n,).
        b: Scalar bias.

    Returns:
        Tuple ``(dj_dw, dj_db)``: the gradient for the weights, shape (n,),
        and the scalar gradient for the bias.
    """
    n_examples = X.shape[0]
    residuals = (X @ w + b) - y                  # (m,)
    grad_w = (X.T @ residuals) / n_examples      # (n,)
    grad_b = residuals.sum() / n_examples        # scalar
    return grad_w, grad_b

def gradient_descent(X, y, w_init, b_init, alpha, num_iters, trace_limit=1000):
    """Run batch gradient descent for a linear model and record diagnostics.

    Every iteration computes the gradient with ``compute_gradient`` over the
    full data set, takes one step of size ``alpha``, and evaluates the cost
    with ``compute_cost`` at the updated parameters.

    Args:
        X: Feature matrix with one example per row.
        y: Target vector.
        w_init: Initial weights (array-like); it is copied, never modified.
        b_init: Initial bias.
        alpha: Learning rate (step size).
        num_iters: Number of update steps to perform.
        trace_limit: Maximum number of leading iterations whose
            ``(cost, weights, bias)`` snapshot is stored in the trace.
            Defaults to 1000 to keep memory use light; pass ``None`` to
            store every iteration.

    Returns:
        Tuple ``(w, b, J_hist, trace)``: final weights, final bias, an array
        with the cost after each iteration, and the list of
        ``(cost, w_copy, b)`` snapshots for plotting diagnostics.
    """
    # One fresh float copy, so the in-place updates below never touch w_init.
    w = np.array(w_init, dtype=float)
    b = float(b_init)
    J_hist = []
    trace = []   # (cost, w.copy(), b) snapshots for plotting diagnostics
    for t in range(1, num_iters + 1):
        dj_dw, dj_db = compute_gradient(X, y, w, b)
        w -= alpha * dj_dw
        b -= alpha * dj_db
        J = compute_cost(X, y, w, b)
        J_hist.append(J)
        if trace_limit is None or t <= trace_limit:
            # Snapshot a copy of w: it keeps being mutated in place.
            trace.append((J, w.copy(), b))
    return w, b, np.array(J_hist), trace


## 5) Train **without** scaling (needs a tiny α to stay stable, so convergence is slow)

In [None]:

# Start from all-zero parameters (also reused by the scaled run).
n = X_train.shape[1]
w0 = np.zeros(n)
b0 = 0.0

# On raw features the step must be tiny, otherwise the large size(sqft)
# column makes the updates diverge.
alpha = 1e-7
iters = 200

w_ns, b_ns, J_ns, tr_ns = gradient_descent(
    X_train, y_train, w_init=w0, b_init=b0, alpha=alpha, num_iters=iters
)

print("Unscaled   → final cost:", J_ns[-1])
print("Unscaled   → w:", w_ns, "b:", round(b_ns, 2))

# Learning curve for the unscaled run.
fig_ns, ax_ns = plt.subplots(figsize=(5, 3))
ax_ns.plot(J_ns)
ax_ns.set_xlabel("iteration")
ax_ns.set_ylabel("cost")
ax_ns.set_title("Cost vs iteration (UNSCALED)")
ax_ns.grid(True, alpha=0.3)
plt.show()


## 6) Train **with z‑score scaling** (can use a much larger α)

In [None]:

# With z-scored features a step size several orders of magnitude larger
# than the unscaled run is usable.
alpha_s = 5e-2
iters_s = 200

w_s, b_s, J_s, tr_s = gradient_descent(
    X_norm, y_train, w_init=w0, b_init=b0, alpha=alpha_s, num_iters=iters_s
)

print("Scaled (z) → final cost:", J_s[-1])
print("Scaled (z) → w:", w_s, "b:", round(b_s, 2))

# Learning curve for the scaled run.
fig_s, ax_s = plt.subplots(figsize=(5, 3))
ax_s.plot(J_s)
ax_s.set_xlabel("iteration")
ax_s.set_ylabel("cost")
ax_s.set_title("Cost vs iteration (SCALED, z‑score)")
ax_s.grid(True, alpha=0.3)
plt.show()


## 7) Targets vs predictions (using **normalized** model)

In [None]:

# The model was trained on standardized inputs, so predict from X_norm ...
yp = predict(X_norm, w_s, b_s)

# ... but plot against the raw feature values so the x-axes stay readable.
fig, ax = plt.subplots(1, X_norm.shape[1], figsize=(14,3), sharey=True)
for axis, raw_column, label in zip(ax, X_train.T, X_features):
    axis.scatter(raw_column, y_train, label="target")
    axis.scatter(raw_column, yp, marker='x', label="predict")
    axis.set_xlabel(label)
ax[0].set_ylabel("Price")
ax[0].legend()
fig.suptitle("Target vs Prediction using z‑score normalized model")
plt.show()


## 8) Learning rate sweeps & parameter trace diagnostics

In [None]:

def plot_cost_and_param_trace(trace, param_index=0, title_suffix=""):
    """Plot (left) cost vs iteration and (right) one weight's evolution.

    Args:
        trace: Sequence of ``(cost, weights, bias)`` snapshots, one per
            iteration, as produced by ``gradient_descent``. The bias entry
            is not plotted.
        param_index: Index of the weight whose trajectory is shown.
        title_suffix: Text appended to the figure title, e.g. to label the
            learning rate used for this run.
    """
    costs = [snapshot[0] for snapshot in trace]
    w_series = [snapshot[1][param_index] for snapshot in trace]

    # Create both axes explicitly instead of relying on pyplot's implicit
    # "current figure" state.
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 3))

    ax1.plot(costs)
    ax1.set_xlabel("iteration")
    ax1.set_ylabel("cost")
    ax1.grid(True, alpha=0.3)

    ax2.plot(w_series)
    ax2.set_xlabel("iteration")
    ax2.set_ylabel(f"w[{param_index}]")
    ax2.grid(True, alpha=0.3)

    fig.suptitle(f"Convergence diagnostics{title_suffix}")
    plt.show()

# Learning rates to compare, from aggressive to conservative.
alphas = [0.5, 0.1, 0.05, 0.01, 0.005]
iters = 120

# One fresh run from zero per learning rate; only the trace is needed here.
for a in alphas:
    w_start = np.zeros(X_norm.shape[1])
    tr = gradient_descent(X_norm, y_train, w_start, 0.0, a, iters)[3]
    plot_cost_and_param_trace(tr, 0, title_suffix=f"  (alpha={a})")


## 9) Optional: Convert normalized weights back to original feature scale

In [None]:

# The scaled model predicts
#   y ≈ ((x - mu) / sigma)·w_s + b_s
# Collecting the terms in x gives the equivalent model on raw features:
#   y ≈ x·(w_s / sigma) + (b_s - mu·(w_s / sigma))
w_back = np.divide(w_s, sigma)
# The mean shift folded into the intercept.
mean_offset = mu @ w_back
b_back = b_s - mean_offset

print("Recovered weights in ORIGINAL scale:")
print("w_back:", w_back)
print("b_back:", round(b_back, 2))
