# Linear Regression

A module to implement the gradient descent algorithm for linear regression with multiple variables.

In [1]:
import pandas as pd
import numpy as np

In [2]:
class LinReg:
    """
    Linear regression with multiple variables, fitted by batch gradient descent.

    Attributes:
        x: the design matrix (a dataframe whose first column 'x0' is all ones)
        y: the dependent variable
        theta: the coefficient vector, one entry per column of ``x``
        cost_history: the cost recorded after every gradient descent step
    """

    def __init__(self, x, y):
        """
        x: a dataframe of the independent variables
        y: the dependent variable

        Note: ``x`` is modified in place (a leading 'x0' column of ones is
        inserted) and the model keeps a reference to it rather than a copy.
        """
        # prepend column of ones onto x to allow vectorization; the guard lets a
        # model be rebuilt from a frame that already carries the column instead
        # of raising "cannot insert x0, already exists"
        if 'x0' not in x.columns:
            x.insert(0, 'x0', 1)
        self.x = x
        self.y = y

        # start from all-zero coefficients, one per column of x (bias included),
        # so the model works for any number of features
        self.theta = np.zeros(self.x.shape[1])
        self.cost_history = []

    def predict(self, x=None):
        """
        returns the prediction for input x (defaults to the training data)

        x must already contain the leading 'x0' column of ones; it is not
        added here.
        """
        if x is None:
            x = self.x

        return x.dot(self.theta)

    def compute_cost(self, i=None):
        """
        returns the squared-error cost, 1/(2m) * sum((prediction - y)^2), for
        the current value of theta on the training data

        i: iteration number; accepted for backward compatibility and unused
        """
        residuals = self.predict() - self.y
        # divide by 2m; the parentheses matter, since 1/2*m evaluates to m/2
        return (residuals ** 2).sum() / (2 * len(self.x))

    def gradient_descent(self, lr, num_iters):
        """
        runs num_iters batch gradient descent steps with learning rate lr,
        updating theta in place and appending each step's cost to cost_history
        """
        step = lr / len(self.x)
        for i in range(num_iters):
            # gradient descent update rule
            residuals = self.predict() - self.y
            gradient = residuals.dot(self.x)

            self.theta = self.theta - step * np.asarray(gradient, dtype=float)
            self.cost_history.append(self.compute_cost(i))
            

## Data Preparation


In [3]:
# Location of the input CSV, relative to the notebook's working directory.
path = 'data/restaurant.csv'

In [4]:
# Read the CSV into a dataframe and preview its first rows.
data = pd.read_csv(path)
data.head()

Unnamed: 0,population,profit
0,6.1101,17.592
1,5.5277,9.1302
2,8.5186,13.662
3,7.0032,11.854
4,5.8598,6.8233


Separate data into training and validation sets

In [5]:
# Shuffle the rows with a fixed seed so that the train/validation split, and
# every number computed from it, is the same on each Restart & Run All.
data = data.sample(frac=1, random_state=42).reset_index(drop=True)

# The first 80% of the shuffled rows train the model; the rest validate it.
train_ratio = 0.80
train_size = int(train_ratio * len(data))
train = data.iloc[:train_size].copy()
valid = data.iloc[train_size:].copy()

train_y = train.profit
train_x = train.drop(columns='profit')

valid_y = valid.profit
valid_x = valid.drop(columns='profit')
# LinReg.predict does not add the bias column itself, so the validation
# features need the leading column of ones before they can be scored.
valid_x.insert(0, 'x0', 1)

In [15]:
# Build the model on the training split. Note that LinReg inserts the 'x0'
# column of ones into train_x itself, modifying that frame in place.
model = LinReg(train_x, train_y)

In [16]:
# Fit the coefficients: learning rate 0.02 for 13000 gradient descent steps.
model.gradient_descent(lr=0.02, num_iters=13000)

In [17]:
# Fitted coefficients: the weight of the 'x0' column (the intercept) comes
# first, followed by one weight per feature column.
model.theta

array([-3.57336581,  1.17387157])

In [18]:
# valid_x already carries the leading 'x0' column of ones that predict() needs.
preds = model.predict(valid_x)

In [19]:
# Predicted profit for each validation row.
preds

77     2.360085
78     2.906875
79    20.304356
80     4.075464
81     3.048092
82     3.007710
83    10.160932
84     3.157614
85    22.490105
86     9.288745
87     6.424733
88     2.959112
89     4.753023
90     3.151979
91     5.275982
92     5.277861
93     2.808857
94     3.342851
95     7.355613
96     2.338838
dtype: float64

In [20]:
# Actual profit for the same validation rows, for comparison with preds.
valid_y

77     3.81660
78     0.15200
79    20.99200
80     5.34360
81     0.71618
82     3.39280
83     8.00430
84     3.15510
85    24.14700
86     7.04670
87     4.24150
88     3.08250
89     1.04630
90     0.47953
91     3.88450
92     6.75040
93     0.61705
94     1.84950
95     3.96240
96     5.70140
Name: profit, dtype: float64

In [14]:
# Rebind train_x to a new frame without the bias column instead of dropping it
# in place: LinReg keeps a reference to the frame it was given, so an in-place
# drop would also strip 'x0' from model.x and break model.predict().
# errors='ignore' keeps this cell safe to re-run once the column is gone.
train_x = train_x.drop(columns='x0', errors='ignore')