# ___Gradient Descent For Multiple Linear Regression___
--------------

In [1]:
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# model parameters

### ___$= [w_1~~w_2~~w_3 \cdots w_n], b ~~\text{alternatively,}~~ = \overrightarrow{w}, b$___

In [None]:
# model

### ___$= f_{w,b}(x) = w_1x_1 + w_2x_2 + w_3x_3 + \cdots + w_nx_n + b ~~\text{alternatively,}~~ f_{w,b}(x) = \overrightarrow{x} \cdot \overrightarrow{w} + b$___

In [None]:
# cost function

### ___$= j(w_1, w_2, w_3, \cdots, w_n, b) ~~\text{alternatively,}~~ = j(\overrightarrow{w}, b)$___

In [2]:
# how does a least squares cost function operate on a multiple predictor dataset?
# N - number of records in the training dataset.

## ___Univariate $\Rightarrow$___
## ___$j(w, b) = \frac{1}{2N}\sum_{i=0}^{N}(f_{w,b}(x_i) - y_i)^2$___
## ___$j(w, b) = \frac{1}{2N}\sum_{i=0}^{N}(wx_i + b - y_i)^2$___

## ___Multiple variables $\Rightarrow$___
## ___$j(\overrightarrow{w}, b) = \frac{1}{2N}\sum_{i=0}^{N}(f_{w,b}(\overrightarrow{x_i}) - y_i)^2$___
## ___$j(\overrightarrow{w}, b) = \frac{1}{2N}\sum_{i=0}^{N}(\overrightarrow{w} \cdot \overrightarrow{x_i} + b - y_i)^2$___

In [None]:
# for multiple linear regression, there is still a single cost function, but we take a separate partial derivative of it with respect to each weight,
# each one measuring how the cost (the deviation of the predictions from the actual values) changes with that weight alone (hence the use of partial derivatives!)
# for a dataset with n predictors, we'll have n such partial derivatives (for w_1 through w_n), plus one for the bias term b.

In [3]:
# here's what the gradient descent looks like,
# repeat until convergence [W_I DENOTES THE COEFFICIENT OF THE ITH VARIABLE OR PREDICTOR]

## ___$w_i = w_i - \alpha \cdot \frac{\partial{j(\overrightarrow{w}, b)}}{\partial{w_i}}$___
## ___$b = b - \alpha \cdot \frac{\partial{j(\overrightarrow{w}, b)}}{\partial{b}}$___

## ___HERE'S A DEEP DIVE INTO THE WEIGHTS UPDATE___
------------------------

In [8]:
# for univariate linear regression, we had the following rules to get the derivative of the cost function with respect to w

## ___$ \frac{\partial}{\partial{w}}j(w, b) = \frac{\partial}{\partial{w}} \frac{1}{2N} \sum_{i=0}^{N} (f_{w, b}(x_i) - y_i)^2$___
## ___$~~~~~~~~~~~~~~~~~= \frac{\partial}{\partial{w}} \frac{1}{2N} \sum_{i=0}^{N} (wx_i + b - y_i)^2$___
## ___$~~~~~~~~~~~~~~~~~= \frac{1}{N} \sum_{i=0}^{N} (wx_i + b - y_i) \times x_i$___

In [2]:
# for multiple linear regression,
# REMEMBER Y AND B ARE ALWAYS SCALARS!
# the derivative of the cost function with respect to the whole weight vector is not a single number; it is a gradient, i.e. a vector of partial derivatives
# DERIVATIVE OF THE COST FUNCTION WITH RESPECT TO EACH WEIGHT MUST BE COMPUTED INDIVIDUALLY AS PARTIAL DERIVATIVES

## ___$\frac{\partial}{\partial{w_1}}j(\overrightarrow{w}, b) = \frac{\partial}{\partial{w_1}} \frac{1}{2N} \sum_{i=0}^{N} (f_{w, b}(\overrightarrow{x_i}) - y_i)^2$___
## ___$~~~~~~~~~~~~~~~~~= \frac{\partial}{\partial{w_1}} \frac{1}{2N} \sum_{i=0}^{N} (\overrightarrow{w}\cdot\overrightarrow{x_i} + b - y_i)^2$___
## ___$~~~~~~~~~~~~~~~~~= \frac{1}{N} \sum_{i=0}^{N} (\overrightarrow{w}\cdot\overrightarrow{x_i} + b - y_i) \times x_{1i}$___

## ___$~~~~~~~~~~~~~~~~~~~~\vdots$___

## ___$\frac{\partial}{\partial{w_n}}j(\overrightarrow{w}, b) = \frac{\partial}{\partial{w_n}} \frac{1}{2N} \sum_{i=0}^{N} (f_{w, b}(\overrightarrow{x_i}) - y_i)^2$___
## ___$~~~~~~~~~~~~~~~~~= \frac{\partial}{\partial{w_n}} \frac{1}{2N} \sum_{i=0}^{N} (\overrightarrow{w}\cdot\overrightarrow{x_i} + b - y_i)^2$___
## ___$~~~~~~~~~~~~~~~~~= \frac{1}{N} \sum_{i=0}^{N} (\overrightarrow{w}\cdot\overrightarrow{x_i} + b - y_i) \times x_{ni}$___


In [4]:
# for a dataset with n predictors, we need to repeat this for all n weights!
# NOTE THAT THE X RESULTING FROM THE DERIVATION REPRESENTS THE X VALUE FOR THE SELECTED WEIGHT i.e the first predictor, NOT THE WHOLE ROW VECTOR!
# IT IS A SCALAR!

In [5]:
# for the bias term in univariate linear regression,
# the derivative of the cost function with respect to the bias term is,

## ___$ \frac{\partial}{\partial{b}}j(w, b) = \frac{\partial}{\partial{b}} \frac{1}{2N} \sum_{i=0}^{N} (f_{w, b}(x_i) - y_i)^2$___
## ___$~~~~~~~~~~~~~~~~~= \frac{\partial}{\partial{b}} \frac{1}{2N} \sum_{i=0}^{N} (wx_i + b - y_i)^2$___
## ___$~~~~~~~~~~~~~~~~~= \frac{1}{N} \sum_{i=0}^{N} (wx_i + b - y_i)$___

In [6]:
# good thing, even in multiple linear regression, we have only one bias term!

## ___$\frac{\partial}{\partial{b}}j(\overrightarrow{w}, b) = \frac{\partial}{\partial{b}} \frac{1}{2N} \sum_{i=0}^{N} (f_{w, b}(\overrightarrow{x_i}) - y_i)^2$___
## ___$~~~~~~~~~~~~~~~~~= \frac{\partial}{\partial{b}} \frac{1}{2N} \sum_{i=0}^{N} (\overrightarrow{w}\cdot\overrightarrow{x_i} + b - y_i)^2$___
## ___$~~~~~~~~~~~~~~~~~= \frac{1}{N} \sum_{i=0}^{N} (\overrightarrow{w}\cdot\overrightarrow{x_i} + b - y_i)$___

In [None]:
# IN GRADIENT DESCENT THESE PARTIAL DERIVATIVES NEED TO BE MULTIPLIED BY THE ALPHA AND SUBTRACTED FROM THE INITIAL COGNATE PARAMETERS
# I.E. SIMULTANEOUS UPDATE OF EACH WEIGHT AND THE BIAS TERM!