# Linear Regression

A module to implement the gradient descent algorithm for linear regression with multiple variables.

In [1]:
import pandas as pd
import numpy as np

In [2]:
class LinReg:
    def __init__(self, x, y):
        """
        x: a dataframe of the independent variables
        y: the dependent variable
        """
        # randomly initialise the coefficients
        self.theta = np.array([0,0])
        
        # prepend column of ones onto the vector x to allow vectorization
        self.x = x
        self.x.insert(0, 'x0', 1)
        self.y = y
        
    def predict(self, x=None):
        """
        returns the prediction for input vector x
        """
        if x is None:   
            x = self.x
            
        return x.dot(self.theta)
    
    def compute_cost(self, i):
        """
        compute cost based on current value of theta
        """
        print(f'Iteration {i}: {(self.predict()**2).sum()/2*len(self.x)}')
        
    def gradient_descent(self, lr, num_iters):
        for i in range(num_iters):
            # gradient descent update rule
            gradient = (self.predict()-self.y).T.dot(self.x)
            temp = lr*(1/len(self.x))*gradient
            return temp
        
            #self.theta = self.theta - np.array(temp).reshape((len(temp),1))
            #self.compute_cost(i)
            

## Data Preparation


In [3]:
path = 'data/restaurant.csv'

In [4]:
data = pd.read_csv(f'{path}')
data.head()

Unnamed: 0,population,profit
0,6.1101,17.592
1,5.5277,9.1302
2,8.5186,13.662
3,7.0032,11.854
4,5.8598,6.8233


Separate data into training and validation sets

In [5]:
# randomly shuffle dataframe
data = data.sample(frac=1)
data.reset_index(drop=True, inplace=True)

train_ratio = 0.80
train_size = int(train_ratio*len(data))
train = data.iloc[:train_size].copy()
valid = data.iloc[train_size:].copy()

train_y = train.profit
train_x = train.drop(labels='profit', axis=1)

valid_y = valid.profit
valid_x = valid.drop(labels='profit', axis=1)
valid_x.insert(0, 'x0', 1)

In [6]:
model = LinReg(train_x, train_y)

In [7]:
model.gradient_descent(0.01, 5000)

Iteration 0: 0    79456.827154
1    79456.827154
dtype: float64


ValueError: cannot reshape array of size 154 into shape (77,1)

In [8]:
model.theta

array([[0.05330785, 0.05330785],
       [0.59328541, 0.59328541]])

In [None]:
model.theta.shape

In [None]:
model.predict()

In [None]:
preds

In [None]:
valid_y

In [None]:
model.theta

In [None]:
valid_x