<a href="https://colab.research.google.com/github/alisony755/DS4400/blob/main/HW2/DS4400_HW2_Problem6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Problem 6

### 6.2 Ridge regression via gradient descent

In [59]:
# Import numpy, pandas, and sklearn
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

In [60]:
def add_bias(X):
  """ Prepends an intercept column of 1s to a feature matrix

  Args:
    X (numpy.ndarray): Feature matrix of shape (n_samples, n_features)

  Returns:
    numpy.ndarray: New matrix of shape (n_samples, n_features + 1) whose
      first column is all 1s, followed by the columns of X

  """

  # One intercept entry per row of X
  n_rows = X.shape[0]
  intercept = np.ones((n_rows, 1))

  # Join column-wise, with the intercept column placed first
  return np.concatenate((intercept, X), axis=1)

In [61]:
def ridge_gradient_descent(X, y, alpha, num_iters, lam):
  """ Trains ridge regression using batch gradient descent

  Runs num_iters gradient steps, starting from theta = 0, on the objective

      (1 / n_samples) * (||X_b @ theta - y||^2 + lam * ||theta[1:]||^2)

  where X_b is X with a leading column of 1s. The intercept theta[0] is
  not penalized.

  Args:
    X (numpy.ndarray): Feature matrix of shape (n_samples, n_features)
    y (numpy.ndarray): Target vector of shape (n_samples,)
    alpha (float): Learning rate
    num_iters (int): Number of iterations
    lam (float): Regularization parameter lambda

  Returns:
    theta (numpy.ndarray): Learned parameter vector of shape
      (n_features + 1,), intercept first

  """

  # Add bias column to features
  X_b = add_bias(X)

  # Get number of samples and number of parameters (features + intercept)
  n_samples, n_features = X_b.shape

  # Initialize parameters to 0
  theta = np.zeros(n_features)

  # Gradient descent loop
  for _ in range(num_iters):

    # Prediction errors under the current parameters
    error = X_b @ theta - y

    # Penalized coefficients: theta with the intercept entry zeroed out
    reg = np.r_[0, theta[1:]]

    # The penalty is scaled by 1 / n_samples exactly like the data term, so
    # lam keeps its usual meaning in ||X_b theta - y||^2 + lam ||theta[1:]||^2
    # and the step size that is stable for the data term stays usable as
    # lam grows relative to n_samples
    gradient = (2 / n_samples) * (X_b.T @ error) + (2 * lam / n_samples) * reg

    # Update parameters
    theta = theta - alpha * gradient

  return theta

### 6.3 Linear vs. ridge regression on simulated data

In [62]:
# Seed NumPy's global RNG so the simulated data are reproducible
np.random.seed(42)

# Sample size
N = 1000

# Draw the single feature uniformly from [-2, 2] as an (N, 1) column
X_sim = np.random.uniform(low=-2, high=2, size=(N, 1))

# Response: Y = 1 + 2X + e, with e ~ Normal(mean 0, variance 2)
y_sim = 1 + 2 * X_sim.ravel() + np.random.normal(loc=0, scale=np.sqrt(2), size=N)

# Standardize the feature for stable gradient descent
scaler_sim = StandardScaler()
X_sim_scaled = scaler_sim.fit_transform(X_sim)

In [63]:
def add_bias(X):
  """ Returns X with a leading column of 1s (the intercept / bias term)

  NOTE: redefines the add_bias from section 6.2 with the same behavior.

  Args:
    X (numpy.ndarray): Feature matrix of shape (n_samples, n_features)

  Returns:
    numpy.ndarray: New matrix of shape (n_samples, n_features + 1); column 0
      is all 1s and the remaining columns are those of X

  """

  # Bias column: one 1 per sample
  bias_column = np.ones((X.shape[0], 1))

  # Concatenate along the column axis, bias first
  with_bias = np.concatenate([bias_column, X], axis=1)

  return with_bias

In [64]:
def gradient_descent(X, y, alpha, num_iters):
  """ Fits linear regression by batch gradient descent on mean squared error

  Starts from theta = 0 and takes num_iters full-batch gradient steps.

  Args:
    X (numpy.ndarray): Feature matrix of shape (n_samples, n_features)
    y (numpy.ndarray): Target vector of shape (n_samples,)
    alpha (float): Learning rate
    num_iters (int): Number of iterations

  Returns:
    numpy.ndarray: Learned parameter vector theta, intercept first

  """

  # Design matrix: features with a leading column of 1s
  design = add_bias(X)
  n_samples, n_params = design.shape

  # Start from the zero vector
  theta = np.zeros(n_params)

  for _ in range(num_iters):

    # Residuals of the current fit
    residual = design @ theta - y

    # Gradient of (1 / n_samples) * ||design @ theta - y||^2
    step_direction = (2 / n_samples) * (design.T @ residual)

    # Move against the gradient
    theta = theta - alpha * step_direction

  return theta

In [65]:
def ridge_gradient_descent(X, y, alpha, num_iters, lam):
  """ Fits ridge (L2-penalized) linear regression by batch gradient descent

  Starts from theta = 0 and takes num_iters full-batch gradient steps.
  The intercept theta[0] is excluded from the penalty, and both the data
  term and the penalty term of the gradient are divided by n_samples.

  Args:
      X (numpy.ndarray): Feature matrix of shape (n_samples, n_features)
      y (numpy.ndarray): Target vector of shape (n_samples,)
      alpha (float): Learning rate
      num_iters (int): Number of iterations
      lam (float): Regularization parameter lambda

  Returns:
      theta (np.ndarray): Learned parameter vector theta, intercept first

  """

  # Design matrix: features with a leading column of 1s
  design = add_bias(X)
  n_samples, n_params = design.shape

  # Start from the zero vector
  theta = np.zeros(n_params)

  for _ in range(num_iters):

    # Residuals of the current fit
    residual = design @ theta - y

    # Coefficients subject to the penalty: theta with the intercept replaced by 0
    penalized = np.concatenate(([0.0], theta[1:]))

    # Gradient = data-fit term + ridge penalty term
    data_term = (2 / n_samples) * (design.T @ residual)
    penalty_term = (2 * lam / n_samples) * penalized
    gradient = data_term + penalty_term

    # Move against the gradient
    theta = theta - alpha * gradient

  return theta

In [66]:
# Gradient-descent hyperparameters: learning rate and iteration count
alpha, num_iters = 0.01, 500

# Design matrix (bias column + standardized feature) used for predictions
X_b_sim = add_bias(X_sim_scaled)

# Fit unregularized linear regression on the standardized feature
theta_linear = gradient_descent(X_sim_scaled, y_sim, alpha, num_iters)

# In-sample predictions and fit metrics
y_pred_linear = X_b_sim @ theta_linear
mse_linear = mean_squared_error(y_sim, y_pred_linear)
r2_linear = r2_score(y_sim, y_pred_linear)

# Report slope (coefficient on the standardized feature), MSE, and R^2
print(
    "Linear Regression:",
    f"  Slope: {theta_linear[1]:.4f}, MSE: {mse_linear:.4f}, R2: {r2_linear:.4f}\n",
    sep="\n",
)

Linear Regression:
  Slope: 2.2719, MSE: 1.9499, R2: 0.7258



In [67]:
# Regularization strengths to compare, from weak (1) to very strong (10,000)
lambdas = [1, 10, 100, 1_000, 10_000]

# One [lambda, slope, MSE, R^2] row is collected per fitted model
ridge_results = []

for lam in lambdas:
    # Fit ridge with the same learning rate and iteration count as the linear fit
    theta_ridge = ridge_gradient_descent(
        X_sim_scaled, y_sim, alpha=alpha, num_iters=num_iters, lam=lam
    )

    # In-sample predictions and fit metrics for this lambda
    y_pred_ridge = X_b_sim @ theta_ridge
    mse_ridge = mean_squared_error(y_sim, y_pred_ridge)
    r2_ridge = r2_score(y_sim, y_pred_ridge)

    # theta_ridge[1] is the coefficient on the (standardized) feature
    ridge_results += [[lam, theta_ridge[1], mse_ridge, r2_ridge]]

In [68]:
# Tabulate the per-lambda rows and print the table under a heading
ridge_df = pd.DataFrame(data=ridge_results, columns=["Lambda", "Slope", "MSE", "R2"])
print("Ridge Regression Results:", ridge_df, sep="\n")

Ridge Regression Results:
   Lambda     Slope       MSE        R2
0       1  2.269651  1.949941  0.725823
1      10  2.249435  1.950446  0.725752
2     100  2.065436  1.992610  0.719823
3    1000  1.136007  3.240447  0.544368
4   10000  0.206547  6.216088  0.125969
