In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import random

# 1. Part 1 -- DataGeneration function  

In [2]:
## function to create the dataset to work on . 
## std_dev - Stand. Deviation , n - Size of xi and m - size of Beta  
def generate_dataset(m, n, std_dev):
    # We calculate the number of predictors, and create a beta_matrix
    # With `m+1`(beta_matx) rows and 1 column, for matrix multiplication
    beta_matx = np.random.rand(m+1,1)
    # Similar as before, but with `n` rows and `beta_matrix` columns this time
    x = np.random.random_sample((n,m))
    x = np.hstack((np.matrix(np.ones(x.shape[0])).T, x))
    e = np.random.normal(0, std_dev, (n,1))
    # Since x is a n*beta_matrix, and coefficients is a beta*1 matrix
    # we can use matrix multiplication to get the value of y for each
    # set of values x1, x2 .. xp
    # We need to transpose it to get a 1*n array from a n*1 matrix to use in the regression model
    #print(beta_matx.shape,x.shape)
    y = np.matmul(x, beta_matx)  + e
      
    return beta_matx,x, y,e

# 2. Part 2 -- Gradient Dec. function  

In [3]:
# We are using it to calculate cost 
def MSE_cost(Y_hat,Y,n):
    cost = 1/(2*n) * np.dot((Y-Y_hat).T, (Y-Y_hat))
    return cost 


In [4]:
def MSE_grad(X,Y_hat,Y,n):
    grad = 1/n * np.dot(X.T,(Y_hat - Y))
    return grad 

In [10]:
# calculating multivarient linear_regression
def LR(X,Y,epochs,threshold,lr):
    cost_list = []
    n = X.shape[0]
    m = X.shape[1]
    beta = np.ones((m,1), dtype = int) 
    prev_cost = float('inf')
    for i in range(epochs):
        Y_hat = np.dot(X,beta)
        #print(beta.shape)
        cost = MSE_cost(Y_hat,Y,n)
        cost_list.append(cost)
        # if it reaches the threshold it will come out of the loop...
        if(prev_cost - cost <= threshold):
            break  
        prev_cost = cost
        grad = MSE_grad(X,Y_hat,Y,n)
        beta = beta - lr * grad 
    return np.squeeze(cost), beta,cost_list

In [11]:
# We now have an array of coefficients, instead of a single one
m = 2  # size of beta 
n = 100 # size for xi
std_dev = .1
Original_beta,X, y,e = generate_dataset(m, n, std_dev)

In [15]:
# Shape of the input data 
print(Original_beta.shape,X.shape,y.shape,e.shape)

(3, 1) (100, 3) (100, 1) (100, 1)


In [16]:
print("Original_beta = {}, X = {}, y = {}, e = {}".format(Original_beta,X,y,e))

Original_beta = [[0.17984201]
 [0.54165945]
 [0.12460277]], X = [[1.         0.84727365 0.70371933]
 [1.         0.92980007 0.8496215 ]
 [1.         0.57715976 0.24120899]
 [1.         0.17433266 0.78310061]
 [1.         0.18885338 0.55385844]
 [1.         0.44627179 0.24209172]
 [1.         0.96665739 0.28612196]
 [1.         0.06822171 0.67981514]
 [1.         0.51442318 0.44282098]
 [1.         0.2260873  0.99106646]
 [1.         0.9684999  0.22504111]
 [1.         0.07490763 0.34747127]
 [1.         0.79825753 0.21086097]
 [1.         0.50064334 0.47294256]
 [1.         0.61290018 0.3271913 ]
 [1.         0.57844012 0.32629558]
 [1.         0.79842513 0.7653811 ]
 [1.         0.36422214 0.01970919]
 [1.         0.35953773 0.81006502]
 [1.         0.99989533 0.91306543]
 [1.         0.15091937 0.2354647 ]
 [1.         0.36085548 0.96707901]
 [1.         0.85425807 0.1372076 ]
 [1.         0.9907277  0.49013619]
 [1.         0.60048746 0.36260567]
 [1.         0.17416312 0.10002682]


# Testing the grad_fun - Test 1 

In [12]:
epochs = 10000
threshold = 0.00001
lr = 0.001 
cost,beta,cost_list = LR(X,y,epochs,threshold,lr)

In [18]:
print("Cost - {}, \n beta - {} \n Original_beta - {}".format(cost,beta,Original_beta))

Cost - [[0.01373251]], 
 beta - [[0.08866592]
 [0.52907867]
 [0.48463565]] 
 Original_beta - [[0.17984201]
 [0.54165945]
 [0.12460277]]


# Conclusion -- 

## If the size of data is less then < 100  then the cost is large and prediction is not at all accurate. 

# Test - 2 

## changing the Input value .. 

In [58]:

m = 3  # size of beta 
n = 150 # size for xi
std_dev = .1
Original_beta,X, y,e = generate_dataset(m, n, std_dev)

epochs = 10000
threshold = 0.00001
lr = 0.0001 
cost,beta,cost_list = LR(X,y,epochs,threshold,lr)

In [59]:
print("Cost - {}, \n beta - {} \n Original_beta - {}".format(cost,beta,Original_beta))

Cost - [[0.03601662]], 
 beta - [[0.47652296]
 [0.71925274]
 [0.73750976]
 [0.74033006]] 
 Original_beta - [[0.17173212]
 [0.66258196]
 [0.80781447]
 [0.84051408]]


# Currently we can't figure out using this data as it is not constant. However while increasing the number of parameters the cost increased and kept on increasing. Moreover while reducing its parameters we don't get proper results as it cannot establish any relation. 