# Linear Regression implementation using Matrix Algebra

1. It assumes a linear model/relationship between input and output variables.

2. The cost function being minimized is the mean squared error (MSE).

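3. Time complexity is $O(np^2 + p^3)$, where $n$ is the number of observations and $p$ is the number of features: forming $X^TX$ costs $O(np^2)$ and inverting the resulting $p \times p$ matrix costs $O(p^3)$.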

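4. The coefficients are given by the normal equation
${\beta} = (X^TX)^{-1}(X^TY)$, where ${\beta}$ is a vector of dimension ($num\_features \times 1$), $X$ is a matrix of dimension ($num\_observations \times num\_features$), and $Y$ is a vector of dimension ($num\_observations \times 1$). In the implementation below, a column of ones is prepended to $X$ for the intercept, so $num\_features$ here counts that extra column.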

In [20]:
import numpy as np

class LinearRegression:
    """Ordinary least-squares linear regression solved in closed form.

    A column of ones is prepended to the design matrix, so ``beta[0]`` is the
    intercept and ``beta[1:]`` are the per-feature coefficients.

    Attributes:
        X: Design matrix including the leading column of ones.
        y: Target values exactly as passed to the constructor.
        beta: Fitted coefficients; zeros of shape ``(n_features + 1, 1)``
            until ``fit`` has been called.
        r2: Coefficient of determination on the training data; ``None``
            until ``fit`` has been called.
    """

    def __init__(self, X, y):
        """Store the training data and prepend the intercept column to ``X``.

        Args:
            X: 2-D array-like of shape ``(n_observations, n_features)``.
            y: Array-like of targets with ``n_observations`` entries.
        """
        self.X = self._add_intercept(X)
        self.y = y
        self.beta = np.zeros((self.X.shape[1], 1))
        self.r2 = None

    @staticmethod
    def _add_intercept(X):
        """Return ``X`` as an array with a leading column of ones."""
        X = np.asarray(X)
        ones = np.ones((X.shape[0], 1))
        return np.append(ones, X, axis=1)

    def fit(self, verbose=True):
        """Estimate the coefficients by least squares and compute R^2.

        Args:
            verbose: When True (the default), print the coefficients and the
                training R^2.

        Returns:
            The fitted coefficient array (also stored on ``self.beta``). Its
            shape follows ``y``: ``(p,)`` for 1-D targets, ``(p, 1)`` for
            column-vector targets, where ``p = n_features + 1``.
        """
        y = np.asarray(self.y)
        # Solve min ||X beta - y||^2 directly instead of forming X^T X:
        # the explicit normal equations square the condition number of X.
        # For rank-deficient X, lstsq returns the minimum-norm solution,
        # which is what pinv(X^T X) @ X^T y computes as well.
        self.beta = np.linalg.lstsq(self.X, y, rcond=None)[0]

        yhat = self.X @ self.beta
        rss = np.sum(np.square(y - yhat))
        tss = np.sum(np.square(y - np.mean(y)))
        # R^2 is undefined when the target has no variance; report NaN
        # rather than dividing by zero.
        self.r2 = 1 - rss / tss if tss > 0 else float("nan")

        if verbose:
            print("Beta = ", self.beta)
            print("R2 = ", self.r2)
        return self.beta

    def predict(self, X):
        """Predict targets for ``X`` using the current coefficients.

        Args:
            X: 2-D array-like of shape ``(n_observations, n_features)``,
                without the intercept column (it is added here).

        Returns:
            Array of predictions, one per row of ``X``.
        """
        return self._add_intercept(X) @ self.beta

## Model diabetes dataset

In [8]:
import pandas as pd
from sklearn import datasets

# Load the diabetes dataset; with as_frame=True the features and the target
# are available together as a DataFrame under data["frame"].
data = datasets.load_diabetes(as_frame=True)

In [10]:
# Preview the first five rows of the combined feature/target frame.
data['frame'].head()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,target
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019908,-0.017646,151.0
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.06833,-0.092204,75.0
2,0.085299,0.05068,0.044451,-0.005671,-0.045599,-0.034194,-0.032356,-0.002592,0.002864,-0.02593,141.0
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022692,-0.009362,206.0
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031991,-0.046641,135.0


In [11]:
# Count missing values in each column.
data['frame'].isna().sum()

age       0
sex       0
bmi       0
bp        0
s1        0
s2        0
s3        0
s4        0
s5        0
s6        0
target    0
dtype: int64

In [12]:
# Separate the response column from the predictors.
y = data["frame"]["target"]
X = data["frame"].drop(columns="target")

In [21]:
# Fit the normal-equation model defined above; fit() prints Beta and R2 and
# returns the coefficient vector, which is shown as the cell output.
lr = LinearRegression(X, y)
lr.fit()

Beta =  [ 152.13348416  -10.01219782 -239.81908937  519.83978679  324.39042769
 -792.18416163  476.74583782  101.04457032  177.06417623  751.27932109
   67.62538639]
R2 =  0.5177494254132935


array([ 152.13348416,  -10.01219782, -239.81908937,  519.83978679,
        324.39042769, -792.18416163,  476.74583782,  101.04457032,
        177.06417623,  751.27932109,   67.62538639])

## compare with sklearn

In [22]:
# Import sklearn's estimator under an alias so it does not shadow the
# LinearRegression class defined earlier in this notebook; otherwise
# re-running the earlier fit cell would call sklearn's constructor instead.
from sklearn.linear_model import LinearRegression as SklearnLinearRegression

model = SklearnLinearRegression()
model.fit(X, y)
print("Coeff =", model.coef_)
print("Intercept =", model.intercept_)

Coeff = [ -10.01219782 -239.81908937  519.83978679  324.39042769 -792.18416163
  476.74583782  101.04457032  177.06417623  751.27932109   67.62538639]
Intercept = 152.1334841628965


# In our implementation `beta[0]` is the intercept and `beta[1:]` are the feature coefficients, so our results match sklearn's.