# Multivariate Linear Regression

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Data collection

In [2]:
nrows = 100
data = pd.read_csv("./datasets/student_performance.csv", usecols=["study_hours_per_day", "sleep_hours", "attendance_percentage", "exam_score"], nrows=nrows)
data.rename({
  "study_hours_per_day": "study_hours",
  "sleep_hours": "sleep_hours",
  "attendance_percentage": "attendance",
  "exam_score": "score"
}, axis=1, inplace=True)
data.shape

(100, 4)

In [3]:
data.head(7)

Unnamed: 0,study_hours,attendance,sleep_hours,score
0,0.0,85.0,8.0,56.2
1,6.9,97.3,4.6,100.0
2,1.4,94.8,8.0,34.3
3,1.0,71.0,9.2,26.8
4,5.0,90.9,4.9,66.4
5,7.2,82.9,7.4,100.0
6,5.6,85.8,6.5,89.8


In [4]:
data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   study_hours  100 non-null    float64
 1   attendance   100 non-null    float64
 2   sleep_hours  100 non-null    float64
 3   score        100 non-null    float64
dtypes: float64(4)
memory usage: 3.3 KB


In [5]:
featuresMatrix = data[["study_hours", "sleep_hours"]]
targetArray = data["score"]

## Algorithm implemetation

In [6]:
def gradient_descent(w_now, X_mod, y_tar, lr=0.05):
  ''' 
  w_now : current weights array of order (1, n+1)
  w_new : new weights array of order (1, n+1)
  X_mod : modified features matrix of order (m, n+1), a column of ones is added.
  y_hat : prediction array of order (m, 1)
  y_tar : target array of of order (m, 1)
  lr    : learning rate
  m     : no. of records
  n     : no. of features (including bias)
  '''
  m, n = X_mod.shape
  y_hat = (X_mod @ w_now.T)
  dw = (2 / m) * ((y_hat.T - y_tar.T) @ X_mod)
  w_new = w_now - lr * dw
  return w_new

class MultivariateLinearRegression():
  def __init__(self):
    pass
  def fit(self, X, y, epochs=100) -> None:
    m, n = X.shape

    if isinstance(X, pd.DataFrame):
      X = X.values # DataFrame to numpy array conversion.
    # make sure to not to change original dataframe
    X_mod = np.hstack((np.ones((m, 1)), X.copy())) 
    
    if isinstance(y, pd.Series):
      y = y.values # series to numpy conversion
    # make sure to not to change original series
    y_tar = y.copy().reshape(m, 1)

    self.w = np.zeros((1, n + 1))
    for i in range(epochs):
      self.w = gradient_descent(self.w, X_mod, y_tar, lr=0.0001)
    
    self.Xfit = X
    self.yfit = X_mod @ self.w.T

## Trying out with 2 features

In [7]:
model2d = MultivariateLinearRegression()
model2d.fit(data[["study_hours", "sleep_hours"]], data["score"], epochs=1000)

pd.DataFrame({
  "study_hours": data['study_hours'],
  "sleep_hours": data['sleep_hours'],
  "target_score": data['score'],
  "predicted_score": model2d.yfit.reshape(-1),
})

Unnamed: 0,study_hours,sleep_hours,target_score,predicted_score
0,0.0,8.0,56.2,53.952023
1,6.9,4.6,100.0,78.162730
2,1.4,8.0,34.3,63.402702
3,1.0,9.2,26.8,68.596968
4,5.0,4.9,66.4,67.310424
...,...,...,...,...
95,1.7,6.0,50.2,52.270414
96,6.0,7.1,87.9,88.534086
97,4.1,5.4,80.9,64.524347
98,2.4,4.8,63.9,49.101293


## Trying out with 3 features

In [8]:
model3d = MultivariateLinearRegression()
model3d.fit(data[["study_hours", "sleep_hours", "attendance"]], data['score'], epochs=1000)

pd.DataFrame({
  "study_hours": data['study_hours'],
  "sleep_hours": data['sleep_hours'],
  "attendance": data['attendance'],
  "target_score": data['score'],
  "predicted_score": model2d.yfit.reshape(-1),
})

Unnamed: 0,study_hours,sleep_hours,attendance,target_score,predicted_score
0,0.0,8.0,85.0,56.2,53.952023
1,6.9,4.6,97.3,100.0,78.162730
2,1.4,8.0,94.8,34.3,63.402702
3,1.0,9.2,71.0,26.8,68.596968
4,5.0,4.9,90.9,66.4,67.310424
...,...,...,...,...,...
95,1.7,6.0,97.5,50.2,52.270414
96,6.0,7.1,92.3,87.9,88.534086
97,4.1,5.4,90.4,80.9,64.524347
98,2.4,4.8,92.2,63.9,49.101293
