**Submission By: Ayesh Ahmad**

**CMS: 365966**

In [95]:
import numpy as np
import time

# 1. Linear Regression Single Prediction

In [46]:
def linear_regression_single_prediction(w, x):
  """
  Predicts the target of one example as the dot product of weights and features.

  Args:
    w (ndarray): Weight vector (must be 1-D)
    x (ndarray): Feature vector (must be 1-D)

  Returns:
    y_pred (float): Linear regression prediction for the given input.

  Raises:
    ValueError: If either 'w' or 'x' is not a 1-D array.
  """
  # Guard clause: reject anything that is not a pair of rank-1 arrays.
  if w.ndim != 1 or x.ndim != 1:
    raise ValueError("'w' and 'x' must be vectors.")
  return np.dot(w, x)

# Quick check: 1*1 + 2*2 + 3*3 should print 14.
w, x = np.array([1, 2, 3]), np.array([1, 2, 3])
test = linear_regression_single_prediction(w, x)
print(test)

14


# 2. Linear Regression Vector Prediction

In [65]:
def linear_regression_vector_prediction(w, x):
  """
  Computes linear regression predictions for a batch of examples.

  Args:
    w (ndarray): Weight vector (must be 1-D)
    x (ndarray): Feature matrix, one example per row

  Returns:
    y_pred (ndarray): Linear regression predictions, one per row of 'x'.

  Raises:
    ValueError: If 'w' is not a 1-D array or 'x' is not a 'numpy.ndarray'.
  """
  # isinstance() is needed for a real type check: comparing 'x' to a string and
  # calling type() on the result always yields a truthy class object, which
  # validates nothing.
  if len(w.shape) != 1 or not isinstance(x, np.ndarray):
    raise ValueError("'w' must be a vector and 'x' must be a 'numpy.ndarray'.")
  # Each row of 'x' is dotted with 'w'; the transpose aligns the feature axis.
  return np.dot(w, x.T)

# Three examples (one per row) scored with the same weight vector.
w = np.array([1, 2, 3])
x = np.array([
  [1, 2, 3],
  [4, 5, 6],
  [7, 8, 9],
])
test = linear_regression_vector_prediction(w, x)
print(test)

[14 32 50]


# 3. Mean Squared Error

In [49]:
def mean_squared_error(y, y_pred):
  """
  Averages the squared differences between predictions and true targets.

  Args:
    y (ndarray): True targets associated with the given predictions.
    y_pred (ndarray): Predicted targets generated by the regression model.

  Returns:
    mse (float): The computed Mean Squared Error.
  """
  residuals = y_pred - y
  return np.mean(residuals ** 2)

# Residuals are -1, -0.5, -0.5, -0.5, so the mean of their squares is 0.4375.
y, y_pred = np.array([2, 2.5, 3.5, 4.5]), np.array([1, 2, 3, 4])
print(mean_squared_error(y, y_pred))

0.4375


# 4. Mean Squared Error Gradient

In [50]:
def mean_squared_error_gradient(x, y, y_pred):
  """
  Computes the gradient of the mean squared error with respect to the weights.

  Args:
    x (ndarray (m,n)): Feature matrix, m examples and n training features
    y (ndarray (m,)): True targets associated with the given predictions
    y_pred (ndarray (m,)): Predicted targets generated by the regression model, m examples
  Returns:
    mse_gradient (ndarray (n, )): The gradient of the cost w.r.t. the parameter w
  """
  residuals = y_pred - y
  num_examples = len(y_pred)
  # Gradient is (2 / m) * sum_i residual_i * x_i; the dot product does the sum.
  return (2 * np.dot(residuals, x)) / num_examples

# Three examples with two features each.
x = np.array([
  [1, 2],
  [2, 3],
  [3, 4],
])
y_pred, y = np.array([4.5, 5, 6]), np.array([5, 5.5, 6])
print(mean_squared_error_gradient(x, y, y_pred))

[-1.         -1.66666667]


# 5i. Vanilla Gradient Descent Algorithm


In [52]:
def gradient_descent(w, x, y, epochs = 100, learning_rate = 0.01):
  """
  Runs full-batch gradient descent and prints a table of the loss per epoch.

  Args:
    w (ndarray): Weight vector
    x (ndarray): Feature matrix
    y (ndarray): True target values
    epochs (int): Number of iterations over the entire dataset during training. Default is 100.
    learning_rate (float): Step size for weight updates in each iteration. Default is 0.01.
  Returns:
    w (ndarray): Optimized weight vector
  """
  border = "+---------------+-------------------------------+"
  print(border)
  print(f"|\tEpoch\t|\t\tLoss\t\t|")
  print(border)
  for epoch in range(1, epochs + 1):
    predictions = linear_regression_vector_prediction(w, x)
    # The loss reported for an epoch is measured before that epoch's update.
    loss = mean_squared_error(y, predictions)
    step = learning_rate * mean_squared_error_gradient(x, y, predictions)
    w = w - step
    print(f"|\t{epoch}\t|\t{loss}\t|")
  print(border)
  return w

# Toy problem: start from zero weights and take ten full-batch steps.
w = np.array([0, 0, 0])
x = np.array([
  [1, 2, 3],
  [4, 5, 6],
  [7, 8, 9],
])
y = np.array([0.85, 2, 3])
# Not passed to gradient_descent below; predictions are recomputed from 'w' and 'x'.
y_pred = np.array([1, 3, 5])

updated_w = gradient_descent(w, x, y, epochs=10, learning_rate=0.01)
print("Updated Weights: ", updated_w)

+---------------+-------------------------------+
|	Epoch	|		Loss		|
+---------------+-------------------------------+
|	1	|	4.574166666666667	|
|	2	|	3.5254466666666673	|
|	3	|	1.3806259306666675	|
|	4	|	1.376739359505067	|
|	5	|	0.539851000357772	|
|	6	|	0.6931694666200549	|
|	7	|	0.33459876093368407	|
|	8	|	0.4581345981024986	|
|	9	|	0.29117782528901537	|
|	10	|	0.3668312458997769	|
+---------------+-------------------------------+
Updated Weights:  [0.17165322 0.14815694 0.12466066]


# 5ii. Stochastic Gradient Descent Algorithm

In [58]:
def stochastic_mean_squared_error_gradient(x, y, y_pred):
  """
  Computes the squared-error gradient contributed by one training example.

  Args:
    x (ndarray (n,)): Feature vector for a single example
    y (float): True target for the example
    y_pred (float): Predicted target for the example
  Returns:
    mse_gradient (ndarray (n, )): The gradient of the cost w.r.t. the parameter w for a single example
  """
  error = y_pred - y
  return 2 * error * x

# Print the per-example gradient for each of the three training examples.
x = np.array([
  [1, 2],
  [2, 3],
  [3, 4],
])
y_pred, y = np.array([4.5, 5, 6]), np.array([5, 5.5, 6])
N = len(y)

for i, (features, target, prediction) in enumerate(zip(x, y, y_pred)):
  gradient = stochastic_mean_squared_error_gradient(features, target, prediction)
  print(f"Training Example {i+1}: {gradient}")

################################################################################

def stochastic_gradient_descent(w, x, y, epochs=100, learning_rate=0.01):
  """
  Runs stochastic gradient descent, updating the weights after every example,
  and prints a table of the average per-example loss for each epoch.

  Args:
    w (ndarray): Weight vector
    x (ndarray): Feature matrix
    y (ndarray): True target values
    epochs (int): Number of passes over the entire dataset during training. Default is 100.
    learning_rate (float): Step size for weight updates in each iteration. Default is 0.01.
  Returns:
    w (ndarray): Optimized weight vector
  """
  N = len(y)
  border = "+---------------+-------------------------------+"
  print("\n+---------------+-------------------------------+")
  print(f"|\tEpoch\t|\t\tLoss\t\t|")
  print(border)
  for epoch in range(epochs):
    squared_error_sum = 0
    # Examples are visited in their stored order; each one triggers an update.
    for i, target in enumerate(y):
      prediction = np.dot(w, x[i])
      # Loss is accumulated with the weights as they were before this update.
      squared_error_sum += (prediction - target)**2
      w = w - learning_rate * stochastic_mean_squared_error_gradient(x[i], target, prediction)
    print(f"|\t{epoch+1}\t|\t{squared_error_sum / N}\t|")
  print(border)
  return w

# Toy problem: zero initial weights, ten passes with per-example updates.
w = np.array([0, 0, 0])
x = np.array([
  [1, 2, 3],
  [4, 5, 6],
  [7, 8, 9],
])
y = np.array([0.85, 2, 3])

updated_w = stochastic_gradient_descent(w, x, y, epochs=10, learning_rate=0.001)
print("Updated Weights:", updated_w)


Training Example 1: [-1. -2.]
Training Example 2: [-2. -3.]
Training Example 3: [0. 0.]

+---------------+-------------------------------+
|	Epoch	|		Loss		|
+---------------+-------------------------------+
|	1	|	3.4875982009523203	|
|	2	|	0.8962700205458672	|
|	3	|	0.2342247756430644	|
|	4	|	0.06443068464300385	|
|	5	|	0.02055274294055513	|
|	6	|	0.009044324548857117	|
|	7	|	0.0059377085445734635	|
|	8	|	0.0050520169782082005	|
|	9	|	0.004773255245258796	|
|	10	|	0.004670000067606186	|
+---------------+-------------------------------+
Updated Weights: [0.10240394 0.12466721 0.14693049]


# 6. Application of Linear Regression

## i. Loading dataset and normalizing

In [59]:
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
# Fetch the California housing dataset through scikit-learn.
california_housing = datasets.fetch_california_housing()
# X: feature matrix, y: target values (median house values)
X, y = california_housing.data, california_housing.target

def normalize_data(X):
  """
  Scales the features with a freshly fitted StandardScaler and prepends a
  column of ones that acts as the bias term.

  Args:
    X (ndarray): Feature matrix
  Returns:
    X_scaled_with_bias (ndarray): Normalized feature matrix with a bias column added
    scaler (StandardScaler): StandardScaler object fitted to the input data
  """
  scaler = StandardScaler()
  X_standardized = scaler.fit_transform(X)
  # The bias column goes first, so index 0 of the weight vector is the intercept.
  bias_column = np.ones((X_standardized.shape[0], 1))
  X_scaled_with_bias = np.concatenate((bias_column, X_standardized), axis=1)
  return X_scaled_with_bias, scaler

## ii. Creating a Linear Regression Model

In [80]:
def linear_regression(X, y, epochs = 20, learning_rate = 0.1, gd_type = "vanilla"):
  """
  Normalizes the features, then fits the weights with the chosen gradient
  descent variant starting from an all-zero weight vector.

  Args:
    X (ndarray): Feature matrix
    y (ndarray): True target values
    epochs (int): Number of passes over the entire dataset during training. Default is 20.
    learning_rate (float): Step size for weight updates. Default is 0.1.
    gd_type (str): Type of gradient descent to use. 'vanilla'/'stochastic'. Default is 'vanilla'.
  Returns:
    w (ndarray): Optimized weight vector (first entry corresponds to the bias column)
  Raises:
    ValueError: If 'gd_type' is neither 'vanilla' nor 'stochastic'.
  """
  # The fitted scaler is not needed here, only the normalized features.
  features, _ = normalize_data(X)
  initial_weights = np.zeros(features.shape[1])

  if gd_type == 'vanilla':
    return gradient_descent(initial_weights, features, y, epochs, learning_rate)
  if gd_type == 'stochastic':
    return stochastic_gradient_descent(initial_weights, features, y, epochs, learning_rate)
  raise ValueError("Invalid gd_type. Choose 'vanilla' or 'stochastic'.")

## iii. Linear Regression using Vanilla Gradient Descent

In [101]:
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() is a wall clock that can jump if the system time
# is adjusted. The measured span covers normalization, training and the
# per-epoch table printing done inside linear_regression().
start_time = time.perf_counter()
linear_regression_vanilla = linear_regression(X, y, epochs = 1, learning_rate = 0.1, gd_type='vanilla')
end_time = time.perf_counter()
execution_time = end_time - start_time

print("Optimized weights:", linear_regression_vanilla)
print("Execution time:", execution_time, "seconds")

+---------------+-------------------------------+
|	Epoch	|		Loss		|
+---------------+-------------------------------+
|	1	|	5.610483198987253	|
+---------------+-------------------------------+
Optimized weights: [ 0.41371163  0.15879788  0.02437637  0.03506748 -0.01077781 -0.00568879
 -0.00547825 -0.03327012 -0.01060843]
Execution time: 0.013284921646118164 seconds


## iv. Linear Regression using Stochastic Gradient Descent

In [102]:
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() is a wall clock that can jump if the system time
# is adjusted. The measured span covers normalization, training and the
# per-epoch table printing done inside linear_regression().
start_time = time.perf_counter()
linear_regression_stochastic = linear_regression(X, y, epochs = 1, learning_rate = 0.0001, gd_type='stochastic')
end_time = time.perf_counter()
execution_time = end_time - start_time

print("Optimized weights:", linear_regression_stochastic)
print("Execution time:", execution_time, "seconds")


+---------------+-------------------------------+
|	Epoch	|		Loss		|
+---------------+-------------------------------+
|	1	|	1.1828672141252614	|
+---------------+-------------------------------+
Optimized weights: [ 1.92453705e+00  7.85976652e-01  2.38430376e-01 -3.18194993e-04
 -1.75919313e-02  2.60736288e-02  4.08018064e-03 -2.06013965e-01
 -2.38376653e-01]
Execution time: 0.12233567237854004 seconds


# 7. Results
| Gradient Descent Type | Step Size | Epochs | Loss after Training | Execution Time (s) |
|-------------|-----------|---------|------------|----------|
| Vanilla     | 0.1       | 20      | 0.599      |  0.052   |
| Vanilla     | 0.1       | 1       | 5.610      |  0.013   |
| Stochastic  | 0.0001    | 20      | 0.515      |  2.844   |
| Stochastic  | 0.0001    | 1       | 1.182      |  0.122   |


## Vanilla Gradient Descent
Vanilla GD generally requires more epochs to do well and performs very poorly with just 1 epoch. However, vanilla GD is **extremely fast**, even with a high number of epochs, because the dataset is relatively small and each epoch is a single batch update over the whole dataset.

## Stochastic Gradient Descent
Stochastic GD performs well overall, with either 1 or 20 epochs. It also required only a thousandth of the learning rate used for vanilla GD. The only downside is that it is noticeably slower.

## Conclusion
Given the dataset we were tasked to work on, Stochastic Gradient Descent is a good choice for us. Its loss is significantly better than vanilla GD's at lower epochs, and comparable yet still better at higher epochs. The only downside is its execution time, which is not of much concern in this application, as under 3 seconds is a bearable amount of time.

If the dataset were significantly larger, or if we were to increase the number of epochs significantly, the execution time would become too large to dismiss, and Vanilla GD would be the better option.