## An Object Oriented Approach for Linear Regression using Ordinary Least Squares

In [1]:
import os
import numpy as np
import pandas as pd
import scipy
from sklearn.model_selection import train_test_split

In [2]:
# Directory that holds the data files read below via relative file names.
# Set the OLS_DATA_DIR environment variable to point somewhere else instead of
# editing this cell; the original location is kept as the default.
path = os.environ.get('OLS_DATA_DIR', r'E:\1MS\Projects\LinearRegression_using_OLS')
os.chdir(path)

In [3]:
def read_approximation_files(fname, N, M):
    """
        Read a whitespace-separated numeric file and split it into inputs and targets.

        Every value in the file is parsed as a float and the flat list is
        reshaped into rows of N + M values. The first N columns of a row are
        returned as inputs and the remaining M columns as targets.

        Parameters:
            fname: path of the file to read.
            N: number of input columns per row.
            M: number of target columns per row.

        Returns:
            x: array of shape (Nv, N) with the inputs.
            t: array of shape (Nv, M) with the targets.
            Nv: number of rows read.

        Raises:
            SystemExit: (exit code 1) if the file cannot be opened.
            ValueError: if a token is not a number, or the number of values
                        is not a multiple of N + M.
    """

    # Local import: sys is needed for the exit below and the notebook's
    # import cell does not provide it.
    import sys

    try:
        with open(fname) as f:
            tokens = f.read().split()

    except IOError:
        print("Cannot open file")
        sys.exit(1)

    width = N + M
    values = [float(tok) for tok in tokens]

    if len(values) % width != 0:
        raise ValueError(
            f'{fname} holds {len(values)} values, which is not a multiple of N + M = {width}'
        )

    data = np.reshape(values, (len(values)//width, width))
    x = data[:, :N]
    t = data[:, N:]
    Nv = len(x)

    return x, t, Nv

    
class Standardize():
    """
        Scales data column-wise to zero mean and unit variance.

        fit() stores the per-column mean and standard deviation of the data it
        is given (attributes `mean` and `std`); transform() applies them to any
        data with the same columns. Columns whose stored standard deviation is
        exactly zero (constant features) are only centred, not divided, so they
        come out as zeros instead of NaN/inf.
    """

    def fit(self, x):
        """ Store the mean and standard deviation of x along axis 0. Returns None. """
        self.mean = x.mean(axis=0)
        self.std = x.std(axis=0)
        return

    def transform(self, x):
        """ Return (x - mean) / std using the statistics stored by fit(). """
        # Dividing by a zero std would turn a constant column into NaN/inf;
        # use 1 as the divisor for such columns. self.std itself is untouched.
        scale = np.where(self.std == 0, 1.0, self.std)
        x_t = x - self.mean
        x_t = x_t / scale
        return x_t

    def fit_transform(self, x):
        """ Fit on x and return the transformed x. """
        self.fit(x)
        x_t = self.transform(x)
        return x_t


def add_constant(x):
    """ Prepend a column of ones (the intercept/constant term) to x and return the result. """

    row_count = np.shape(x)[0]
    intercept_col = np.ones((row_count, 1), dtype=float)
    return np.concatenate((intercept_col, x), axis=1)
    

def calc_error(y, y_pred, metric='mse'):
    """ 
        Calculates the error between actual and predicted values.

        Parameters:
            y: actual values, shape (Nv,) or (Nv, n_outputs).
            y_pred: predicted values with the same shape as y (or a scalar).
            metric: 'mse' (default) or 'sse'.
                'mse' - mean of the squared errors over all elements.
                'sse' - squared errors summed over all elements and divided by
                        the number of rows Nv, i.e. the per-output mean squared
                        errors added together. For a single output this
                        equals 'mse'.

        Returns:
            The error as a scalar.

        Raises:
            ValueError: if metric is not supported, or if y and y_pred are
                        arrays of different shapes.
    """

    # Work on plain arrays so lists and DataFrames behave like ndarrays.
    y = np.asarray(y)
    y_pred = np.asarray(y_pred)

    # Arrays such as (Nv,) and (Nv, 1) would broadcast to (Nv, Nv) and give a
    # meaningless error, so mismatched shapes are rejected. A scalar
    # prediction is still allowed.
    if y_pred.ndim and y.shape != y_pred.shape:
        raise ValueError(f'Shape mismatch: y has shape {y.shape}, y_pred has shape {y_pred.shape}')

    Nv_ = np.shape(y)[0]
    loss = y - y_pred

    if metric == 'sse':
        error = np.sum(loss**2)/Nv_

    elif metric == 'mse':
        error = np.mean(loss**2)

    else:
        raise ValueError(f'Metric {metric} not supported. Expected one of sse, mse')

    return error


#Helper functions    
def get_source(lib):
    ''' Print the source code of the given object (module, class, function, ...). '''

    from inspect import getsource

    source_text = getsource(lib)
    print(source_text)
    
    
def gs(x, r=False):
    """ gs: get shape - return the shape of x when r is truthy, otherwise print it """

    dims = np.shape(x)
    if not r:
        print(dims)
        return None
    return dims
    
    
def df(x):
    """ df - wrap the input in a pandas DataFrame and return it """

    frame = pd.DataFrame(x)
    return frame

class NotFittedError(AttributeError):
    """ Exception signalling that a model was asked to predict before being fitted. """

## Class OLS_LR 

In [4]:
class OLS_LR:
    """ 
        Ordinary Least Squares - Linear Regression
    
        Performs OLS using three methods: 
            1. Directly solving Normal Equations (NE).
            2. QR decomposition (QR).
            3. Singular Value Decomposition (SVD).
            
        By default, the method is set to SVD.

        The design matrix is used as given: no intercept column is added, so
        include a column of ones in X if a constant term is wanted.

        Attributes:
            method: name of the solver, one of 'NE', 'QR', 'SVD'.
            coef_: fitted coefficients, available after fit_OLS.
            check_fit_flag: True once fit_OLS has completed on this instance.
    """
    
    # Class-level default; fit_OLS sets it to True on the instance.
    check_fit_flag = False
    
    def __init__(self, method='SVD'):
        # The method name is validated when fit_OLS is called.
        self.method = method
    
    
    def fit_OLS(self, X, y):
        """ 
            Fit OLS: compute the coefficients with the selected method and store them in coef_.

            Raises:
                ValueError: if self.method is not one of NE, QR, SVD.
        """

        if self.method == 'NE':
            self.coef_ = self.ne_OLS(X, y)
            
        elif self.method == 'QR':
            self.coef_ = self.qr_OLS(X, y)
            
        elif self.method == 'SVD':
            self.coef_ = self.svd_OLS(X, y)
            
        else:
            msg = f'The method {self.method} is not supported. Expected one of: NE, QR, SVD'
            raise ValueError(msg)
        
        self.check_fit_flag = True
        
        return
    
    
    def predict(self, X):
        """ 
            Predict values as the product of X and the fitted coefficients.

            Raises:
                NotFittedError: if fit_OLS has not been called successfully yet.
        """

        if not self.check_fit_flag:
            msg = 'OLS_LR is not fitted yet. Call \'fit_OLS\' first.'
            raise NotFittedError(msg)
        
        return  np.dot(X, self.coef_)
        
        
    def ne_OLS(self, x, y):
        """ 
            Returns coefficients by solving the normal equations (x.T x) w = x.T y directly.

            Raises numpy.linalg.LinAlgError if x.T x is singular.
        """
    
        Nv = np.shape(x)[0]
        R = np.dot(x.T, x)/Nv
        C = np.dot(x.T, y)/Nv

        if R.ndim < 2:
            # x is 1-D, so R is a scalar and the system reduces to a division.
            coef = C/R
        else:
            # Solve the linear system instead of forming the explicit inverse.
            coef = np.linalg.solve(R, C)

        return coef
        

    def qr_OLS(self, x, y):
        """ 
            Returns coefficients using the QR decomposition x = QR, by solving R w = Q.T y.

            Raises numpy.linalg.LinAlgError if R is singular or not square.
        """

        Q, R = np.linalg.qr(x) #factorization of x into Q and R
        C = np.dot(Q.T, y)

        # Solve the linear system instead of forming the explicit inverse of R.
        coef = np.linalg.solve(R, C)

        return coef
        
        
    def svd_OLS(self, x, y):
        """ 
            Returns coefficients using the SVD-based pseudoinverse of x.

            Singular values at or below a relative cutoff
            (max(x.shape) * machine epsilon * largest singular value) are
            treated as zero, so a rank-deficient x gives the minimum-norm
            solution instead of coefficients blown up by inverting round-off.
        """

        u, s, vt = np.linalg.svd(x, full_matrices=False)

        # Singular values of a rank-deficient matrix are rarely exactly zero
        # in floating point, hence a relative cutoff rather than `s > 0`.
        if s.size:
            cutoff = max(np.shape(x)) * np.finfo(s.dtype).eps * s.max()
        else:
            cutoff = 0.0
        keep = s > cutoff

        # Invert the retained singular values; the discarded ones stay at zero.
        s_inv = np.zeros_like(s)
        s_inv[keep] = 1.0 / s[keep]

        # vt.T * s_inv scales the columns of vt.T, i.e. vt.T @ diag(s_inv).
        pseudo_inv = (vt.T * s_inv) @ u.T

        coef = np.dot(pseudo_inv, y)

        # One can directly use scipy or numpy's linalg.pinv(x) 
        # coef = np.dot( np.linalg.pinv(x), y)

        return coef
    

----

Loading the data file

In [5]:
fname = 'Twod.tra'
N, M = 8, 7

In [6]:
x, y, Nv = read_approximation_files(fname, N, M)

In [7]:
gs(x)
gs(y)

(1768, 8)
(1768, 7)


Splitting the data into training and validation

In [8]:
# Hold out 25% of the rows for validation; random_state=3 fixes the shuffle so the split is reproducible.
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.25, random_state=3, shuffle=True)

In [9]:
gs(x_train)
gs(y_train)
gs(x_val)
gs(y_val)

(1326, 8)
(1326, 7)
(442, 8)
(442, 7)


In [10]:
df(x_train).describe()

Unnamed: 0,0,1,2,3,4,5,6,7
count,1326.0,1326.0,1326.0,1326.0,1326.0,1326.0,1326.0,1326.0
mean,-15.689281,-15.857155,-19.715648,-20.965447,-22.497912,-25.395404,-25.437874,-31.304241
std,3.45074,3.443045,2.648086,2.592289,2.584743,2.587408,2.954021,2.883025
min,-25.5526,-25.7402,-27.5819,-29.0151,-32.4309,-36.4308,-36.5809,-43.9265
25%,-18.141025,-18.3049,-21.47145,-22.534475,-24.02805,-26.892475,-27.196575,-32.85505
50%,-15.8931,-16.06635,-19.8323,-21.05965,-22.39165,-24.99605,-25.38965,-30.802
75%,-13.269175,-13.4516,-17.95875,-19.266625,-20.89,-23.749375,-23.725775,-29.527925
max,-6.57133,-6.75256,-12.5587,-13.9876,-15.5273,-19.0372,-17.2063,-24.2229


Scaling the data

In [11]:
train_scale = Standardize()
x_train_scaled = train_scale.fit_transform(x_train)

In [12]:
df(x_train_scaled).describe()

Unnamed: 0,0,1,2,3,4,5,6,7
count,1326.0,1326.0,1326.0,1326.0,1326.0,1326.0,1326.0,1326.0
mean,-1.035002e-14,-1.160124e-15,4.947274e-15,-9.380129e-15,1.081019e-14,6.446996e-15,2.151455e-15,-3.248616e-15
std,1.000377,1.000377,1.000377,1.000377,1.000377,1.000377,1.000377,1.000377
min,-2.859399,-2.87152,-2.971664,-3.106401,-3.84438,-4.266647,-3.773579,-4.379782
25%,-0.7107663,-0.7111927,-0.663296,-0.6054958,-0.5922116,-0.5788168,-0.5955831,-0.5381132
50%,-0.05908771,-0.06078163,-0.04406811,-0.03635347,0.04112692,0.1544036,0.01633087,0.1742722
75%,0.7015941,0.6989344,0.66371,0.6555838,0.6223129,0.6364092,0.579801,0.6163619
max,2.643315,2.645342,2.703707,2.692786,2.697847,2.458291,2.787617,2.457146


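We can observe that the training data now has (numerically) zero mean and unit variance. The `std` row shows 1.000377 rather than exactly 1 because `describe()` reports the sample standard deviation (ddof=1), whereas `Standardize` scales by NumPy's population standard deviation (ddof=0); the ratio is sqrt(1326/1325) ≈ 1.000377.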

In [13]:
x_val_scaled = train_scale.transform(x_val)

In [14]:
# Adding constant/intercept to the standardized features, so the scaling
# computed above is actually used for training and validation.
x_train_ = add_constant(x_train_scaled)
x_val_ = add_constant(x_val_scaled)

In [15]:
gs(x_train_)
gs(x_val_)

(1326, 9)
(442, 9)


----
## Training

#### 1. OLS - directly solving Normal Equations

In [16]:
ols_ = OLS_LR(method='NE')
ols_.method

'NE'

In [17]:
# Fitting the training data to the OLS estimator
ols_.fit_OLS(x_train_, y_train)

In [18]:
# Predicting values for the validation set
preds_ols = ols_.predict(x_val_)

In [19]:
# Calculating error between validation targets and predicted values
calc_error(y_val, preds_ols, metric='mse')

0.04806708111992421

----
#### 2. OLS using QR decomposition

In [20]:
ols_ = OLS_LR(method='QR')
ols_.method

'QR'

In [21]:
ols_.fit_OLS(x_train_, y_train)
preds_ols = ols_.predict(x_val_)
calc_error(y_val, preds_ols, metric='mse')

0.04806708112046315

---
#### 3. OLS using SVD and PseudoInverse

In [22]:
ols_ = OLS_LR(method='SVD')
ols_.method

'SVD'

In [23]:
ols_.fit_OLS(x_train_, y_train)
preds_ols = ols_.predict(x_val_)
calc_error(y_val, preds_ols, metric='mse')

0.0480670811204622

---
### Comparing with sklearn's LinearRegression

In [24]:
from sklearn.linear_model import LinearRegression

In [25]:
lr = LinearRegression(fit_intercept=False)
lr.fit(x_train_, y_train)

LinearRegression(fit_intercept=False)

In [26]:
preds_lr = lr.predict(x_val_)

In [27]:
error_lr = calc_error(y_val, preds_lr, metric='mse')
error_lr

0.04806708112046317

---
## Testing

- Fitting on the complete training data (`x`) and testing on the actual test data (`Twod.tst`)

In [28]:
fname_test = 'Twod.tst'
N, M = 8, 7

In [29]:
x_test, y_test, Nv_test = read_approximation_files(fname_test, N, M)

In [30]:
gs(x_test)
gs(y_test)

(1000, 8)
(1000, 7)


#### Processing data like earlier

In [31]:
# NOTE: despite its name, test_scale is fitted on the complete training data (x),
# not on the test data; the test inputs are only transformed with it later.
test_scale = Standardize()
x_scaled = test_scale.fit_transform(x) # x is the complete training data loaded earlier

In [32]:
gs(x)
gs(y)

(1768, 8)
(1768, 7)


In [33]:
x_test_scaled = test_scale.transform(x_test)

In [34]:
x_ = add_constant(x_scaled)
x_test_ = add_constant(x_test_scaled)

#### OLS - SVD

In [35]:
ols_test = OLS_LR() #No parameter 'method' passed, should select method 'SVD' by default.
ols_test.method

'SVD'

In [36]:
ols_test.fit_OLS(x_, y)
preds_ols_test = ols_test.predict(x_test_)
calc_error(y_test, preds_ols_test, metric='mse')

0.04987707320312501

#### sklearn's Linear Regression

In [37]:
lr_x = LinearRegression(fit_intercept=False)
lr_x.fit(x_, y)

LinearRegression(fit_intercept=False)

In [38]:
preds_lr_test = lr_x.predict(x_test_)

In [39]:
error_lr_test = calc_error(y_test, preds_lr_test, metric='mse')
error_lr_test

0.04987707320312508

This notebook demonstrates how the OLS implementation from the previous notebook can be written in an object-oriented manner.