# Linear Regression from scratch

In [1]:
import csv
import random
import math
import numpy as np

## load train and test data

In [None]:
def loadDataset(filename):
    with open(filename, 'r') as csvfile:
        lines = csv.reader(csvfile)
        dataset = list(lines)
        headers = dataset[0]
        dataset = dataset[1: len(dataset)]
        return dataset, headers
    
train_data, train_headers = loadDataset('./data/train.csv')
test_data, test_headers = loadDataset('./data/test.csv')

train_data = np.array(train_data, dtype=float)
test_data = np.array(test_data, dtype=float)
train_data = np.nan_to_num(train_data, nan=-1)
test_data = np.nan_to_num(test_data, nan=-1)



In [14]:
import pandas as pd
from preprocessing import trainTestClean

df_train, df_test = trainTestClean()


In [15]:
class Zscore:
    def __init__(self, arr):
        self.arr = arr.copy().astype(float)
        self.col_mean = np.mean(arr, axis=0)
        self.col_std = np.std(arr, axis=0)
        self.normalized = np.array([])

    def normalize(self):
        self.normalized = (self.arr-self.col_mean)/(self.col_std+1e-15)
        return self.normalized

    def deNormalize(self, normalized):
        normal = normalized * self.col_std + self.col_mean
        return normal
 
zscore_train = Zscore(df_train.values)
train_data = zscore_train.normalize()
zscore_test = Zscore(df_test.values)
test_data = zscore_test.normalize()


## X_train without id and price and y is target so = price column

In [16]:
X_train = train_data[:, 1:-1]
y_train = train_data[:, -1]

X_test = test_data[:, 1:-1]
y_test = test_data[:, -1]


## Add column for prices filled with ones

In [17]:
one = np.ones((len(X_train), 1))
X_train = np.append(one, X_train, axis=1)

# reshape y_train to a column vector
y_train = np.array(y_train).reshape((len(y_train),1))

# lets look these :

# do the same with test :
one = np.ones((len(X_test), 1))
X_test = np.append(one, X_test, axis=1)

# reshape y_train to a column vector
y_test = np.array(y_test).reshape((len(y_test),1))


## find beta with normal equation:
beta = (X(transposed)*X)^-1 * (X(transposed)*y) with * as dot product

In [22]:

def normal_equation(X, y):
    X = X + 1e-15
    y = y + 1e-15
    beta = np.dot(np.linalg.inv(np.dot(X.T, X)), np.dot(X.T, y))
    return beta

def predict(X_test, beta):
    return np.dot(X_test, beta)

In [23]:
beta = normal_equation(X_train, y_train)
predictions = predict(X_test, beta)



In [24]:
print(X_train.dtype)
print(y_train.dtype)

float64
float64


## Check accuracy

In [25]:
def metrics(predictions, y_test):

    # MAE (mean absolute error)
    MAE = np.mean(np.abs(predictions - y_test))

    # MAPE
    epsilon = 1e-8  # Small constant to avoid division by zero
    MAPE = np.mean(np.abs((y_test - predictions) / (y_test + epsilon))) * 100

    # RMSE (root mean squared error)
    MSE = np.square(np.subtract(y_test,predictions)).mean() 
    RMSE = math.sqrt(MSE)

    # r_square
    rss = np.sum(np.square(predictions - y_test))
    mean = np.mean(y_test)
    sst = np.sum(np.square(y_test-mean))
    r_square = 1 - (rss/sst)

    return MAE, MAPE, RMSE, r_square

In [28]:
predictions = zscore_test.deNormalize(predictions)
ytest = zscore_test.deNormalize(y_test)
MAE, MAPE, RMSE, r_square = metrics(predictions, y_test)

In [29]:
print(f"MAE = {MAE}, MAPE = {MAPE}, RMSE = {RMSE}, r_square = {r_square}")

MAE = 211371.05549666632, MAPE = 374413330.63513756, RMSE = 2057398.8714183937, r_square = -406357451146912.3
