# Regression Analyses (Practical)

In [1]:
import pandas as pd
import numpy as np
from scipy.stats import f as fdist
from scipy.stats import t as tdist
from scipy.stats import chi2 as chisq
from sklearn.base import BaseEstimator, ClassifierMixin
inv = np.linalg.inv


## Implement the Linear Regression model

In [54]:
class LinearReg(BaseEstimator, ClassifierMixin):
    ''' Linear Regression Model Class
    
    Parameters
    ----------
    fit_intercept: bool, optional (default=True)
        Whether to fit the intercept
    '''
    
    def __init__(self, fit_intercept=True):
        self.fit_intercept = fit_intercept
    
    @staticmethod
    def _remake_X(X):
        if not np.all(X == 0):
            return np.hstack((np.ones((X.shape[0], 1)), X))
        else:
            return np.ones((X.shape[0], 1))

    @staticmethod
    def _remake_y(y):
        return y.reshape(-1, 1) # Check before reshaping?

    
    def fit(self, X, y):
        
        # Make proper X and y matrix
        if self.fit_intercept:
            if X is not None:
                X = self._remake_X(X)
            else:
                X = np.ones((y.shape[0], 1))
            
        y = self._remake_y(y)
        
        # Store n and k
        n, k = X.shape
        
        # Obtain estimate of beta =  beta_hat
        try:
            A = inv(X.T@X)
        except np.linalg.LinAlgError:
            A = np.zeros([X.shape[1]]*2)
            
        beta_hat = A@X.T@y
        
        # Obtain estimate of y = y_hat
        y_hat = X@beta_hat
        
        # Compute residual sum of squares
        u_hat = y - y_hat
        rss = (u_hat.T @ u_hat).ravel()[0]
        
        # Estimate the variance of the error terms
        u_var_hat = rss/n-k
        
        # Estimate the variance of beta_hat
        beta_hat_var = u_var_hat*A
        
        # Calculate tss, ess and rss
        tss = y.var()*n
        ess = tss - rss
        
        # Calculate R-squared and Adjusted R-squared
        rsq = ess/tss
        rsq_adj = 1 - (rss/(n-k))/(tss/(n-1))
        
        # Estimated Variance of y_hat
        y_hat_var = self._calc_y_hat_var(X, beta_hat_var)
        
        # Estimated Variance of u_hat
        u_hat_var = y_hat_var + u_var_hat
        
        # Perform F-test for overall significance
        if k != 1:
            dfn, dfd = k-1, n-k
            self.fstat = (ess/(dfn))/(rss/(dfd))
            self.ftest_pval =  1 - fdist.cdf(self.fstat, dfn, dfd)

        # store
        self.n = n
        self.k = k
        self.beta_hat = beta_hat
        self.rss = rss
        self.ess = ess
        self.tss = tss
        self.beta_hat_var = beta_hat_var
        self.u_hat = u_hat
        self.u_var_hat = u_var_hat
        self.rsq = rsq
        self.rsq_adj = rsq_adj
        self.aic = rss/n + 2*k/n
        self.bic = rss/n + 2*k*np.log(n)/n
        self.y_hat_var = y_hat_var
        self.u_hat_var = u_hat_var

        return self
        
    def predict(self, X):
        if self.fit_intercept:
            X = self._remake_X(X)
            
        return (X@self.beta_hat)
    
    @staticmethod
    def _calc_y_hat_var(X, beta_hat_var):
        return np.diag(X@beta_hat_var@X.T).reshape(-1, 1)
    
    def _interval_prediction(self, X, interval_type, conf=0.95):
        res = {}
        res['est'] = self.predict(X).ravel()
        if self.fit_intercept:
            X = self._remake_X(X)
            
        y_hat_var = self._calc_y_hat_var(X, self.beta_hat_var)
        if interval_type == 'prediction':
            var = y_hat_var
        elif interval_type == 'confidence':
            var = y_hat_var + self.u_hat_var
            
        width = (tdist.ppf((1+conf)/2, self.n - self.k)*var).ravel()
        res['low'] = res['est'] - width
        res['high'] = res['est'] + width
        
        return res
        
    def prediction_interval(self, X, conf=0.95):
        return self._interval_prediction(X, 'prediction', conf)
        
    def confidence_interval(self, X, conf=0.95):
        return self._interval_prediction(X, 'confidence', conf)
    
    def score(self, X, y):
        lm = LinearReg(self.fit_intercept)
        lm.fit(X, y)
        return lm.rsq_adj


## Load Data

In [3]:
df = pd.read_csv('data/EXAM_REGRESSION.csv')
df.head()

Unnamed: 0,CS ID,Y,X2,X3,X4,X5
0,1,21.0,160.0,110,3.9,16.46
1,2,21.0,160.0,110,3.9,17.02
2,3,22.8,108.0,93,3.85,18.61
3,4,21.4,258.0,110,3.08,19.44
4,5,18.7,360.0,175,3.15,17.02


In [4]:
def extract_X(X_names=["X2", "X3", "X4", "X5"]):
    X = df[X_names].values    
    if len(X_names) == 1:
        X = X.reshape(-1, 1)
    return X

In [5]:
def extract_y(y_name="Y"):
    y = df[y_name].values.reshape(-1, 1)
    
    return y

In [6]:
X = df.drop(["CS ID", "Y"], axis=1).values
y = df["Y"].values.reshape(-1, 1)

## Specific to general approach

**Specific model:** $ y_{i} = \beta_{1} + \beta_{2}X_{2i} + u_{i}$

In [7]:
X = df.loc[:, "X2"].values.reshape(-1, 1)
m1 = LinearReg().fit(X, y)

In [8]:
m1

LinearReg(fit_intercept=True)

* **Model 1:** $ y_{i} = \beta_{1} + \beta_{2}X_{2i} + u_{i}$
* **Model 2:** $ y_{i} = \beta_{1} + \beta_{2}X_{2i} +  \beta_{3}X_{3i} + u_{i}$
* **Model 3:** $ y_{i} = \beta_{1} + \beta_{2}X_{2i} + \beta_{3}X_{3i} + \beta_{4}X_{4i} + u_{i}$
* **Model 4:** $ y_{i} = \beta_{1} + \beta_{2}X_{2i} + \beta_{3}X_{3i} + \beta_{4}X_{4i} + \beta_{5}X_{5i} + u_{i}$


In [9]:
regrsrs = ["X3", "X4", "X5"]

models = [m1] + [None]*len(regrsrs)

model_regrsrs = [["X2"]] + [""]*len(regrsrs)
lst_regr = ["X2"]
for i, regr in enumerate(regrsrs):
    lst_regr = lst_regr + [regr]
    model_regrsrs[i+1] = lst_regr
    X = df.loc[:, lst_regr].values
    models[i+1] = LinearReg().fit(X, y)
    
    
    

In [10]:
rsq_adj = [m.rsq_adj for m in models]

In [11]:
model_regrsrs

[['X2'], ['X2', 'X3'], ['X2', 'X3', 'X4'], ['X2', 'X3', 'X4', 'X5']]

In [12]:
model_regrsrs_names = list(map(lambda x: ', '.join(x), model_regrsrs))

In [13]:
model_regrsrs_names

['X2', 'X2, X3', 'X2, X3, X4', 'X2, X3, X4, X5']

In [14]:
rsq_adj_df = pd.DataFrame({'rsq_adj': rsq_adj, 'regressors': model_regrsrs_names})

In [15]:
rsq_adj_df

Unnamed: 0,rsq_adj,regressors
0,0.708955,X2
1,0.730877,"X2, X3"
2,0.750907,"X2, X3, X4"
3,0.74354,"X2, X3, X4, X5"


So we select $X_{2}, X_{3}, X_{4} $ as the optimum regressors according to the adjusted R squared values

$ y_{i} = \beta_{1} + \beta_{2}X_{2i} + \beta_{3}X_{3i} + \beta_{4}X_{4i} + u_{i}$

In [16]:
max_index = rsq_adj_df.rsq_adj.argmax()
model = models[max_index]
X = df.loc[:, model_regrsrs[max_index]].values
y = df.loc[:, "Y"].values.reshape(-1, 1)

## Check for heteroskedasticity

In [17]:
from scipy.stats import f as fdist
from scipy.stats import t as tdist
from scipy.stats import chi2 as chisq

class IncompatibleLinearModelError(Exception):
    pass

def Ftest_LinearReg(lm1: LinearReg, lm2: LinearReg, sig):
    '''Perform the Ftest for two linear model fits of class LinearReg
    
    The restricted and unrestricted models are automatically assigned
    by accessing the k attribute of lm1 and lm2.
    
    Parameters
    ----------
    
    lm1, lm2: LinearReg
        fitted models of class LinearReg
        
    sig: int, optional (default=0.05)
        the significance level of the test
        
    Return -> True if the test is significant
    
    '''
    if (lm1.n == lm2.n) and (lm1.k != lm2.k): # NOTE: Python 3.8 assign n to a var 
        n = lm1.n
    else:
        raise(IncompatibleLinearModelError(
            'The linear models were fitted on datasets with varying observations')
             )
    
    lms = [lm1, lm2]
    r = np.argmin([lm.k for lm in lms])
    ur = [i for i in range(len(lms)) if i != r][0]
    
    lm_r, lm_ur = lms[r], lms[ur]
    
    rrss, urss = lm_r.rss, lm_ur.rss
    dfn = lm_ur.k - lm_r.k
    dfd = n - lm_ur.k
    
    F_stat = ((rrss - urss)/dfn)/(urss/dfd)
    critical_val = fdist.ppf(1 - sig, dfn, dfd)
    pval = 1 - fdist.cdf(F_stat, dfn, dfd)

    print('F statistic: {:.4f} with dfn: {}, and dfd: {}'.format(F_stat, dfn, dfd))
    print('Critical value: {:.4f}'.format(critical_val))
    print('p-value: {:.4f}'.format(pval))
    
    if F_stat > critical_val:
        return True
    
    return False


def breusch_pagan(X: np.array, y: np.array, sig):
    ''' Breush Pagan test for heteroskedasticity
    
    Parameters
    ----------
    X: np.array
        Regressor matrix
    
    y: np.array
        Regressand matrix
        
    '''
    lm = LinearReg().fit(X, y)

    # residual
    res = y - lm.predict(X)

    # residual squared
    res_sq = res**2

    # Null Hypothesis
    lm_H0 = LinearReg().fit(None, res_sq)

    # Alternate Hypothesis
    # Now regress the res_sq on X
    lm_H1 = LinearReg().fit(X, res_sq)
    
    return(Ftest_LinearReg(lm_H0, lm_H1, sig))

Conducting Breusch pagan test at $\alpha = 0.05$ significance level

$u_{i}^{2} = \alpha_{1} + \alpha_{2}X_{2i} + \alpha_{3}X_{3i} + + \alpha_{4}X_{4i} + e_{i}$ where $e_{i} \sim N(0, \sigma_{e}^{2})$

$H_{0}: \alpha_{2} = \alpha_{3} = \alpha_{4} = 0$ <br />
$H_{1}: atleast \ one \ \alpha_{i} \neq 0  \ \ \forall \ i = 2, 3, 4$

In [18]:
breusch_pagan(X, y, 0.05)

F statistic: 1.1571 with dfn: 3, and dfd: 28
Critical value: 2.9467
p-value: 0.3436


False

As pvalue > 0.05, so Heteroskedasticity does not exist in our specified model.

## Joint hypothesis testing

**NOTE**: $u_{i} \sim N(0, \sigma^{2})$

$H_{0}: \beta_{2} = 0$ <br/>
$H_{1}: \beta_{2} \lt 0$ 

**Restricted Model:** <br/> <br/>
$ y^{*}_{i} = \beta_{1} + \beta_{3}X_{3i} + \beta_{4}X_{4i} + u_{i}$ 
<br/> 
where $y^{*}_{i} = y_{i}$
<br/>

**Unrestricted Model:** <br/> <br/> 
$ y_{i} = \beta_{1} + \beta_{2}X_{2i} + \beta_{3}X_{3i} + \beta_{4}X_{4i} + u_{i}$
<br/> <br/>

In [19]:
X = extract_X()
y = extract_y()

In [20]:
lm_ur = model # The optimum model obtained above is the unrestricted model

In [21]:
X_star= extract_X(["X3", "X4"])
y_star = y

In [22]:
lm_r = LinearReg().fit(X_star, y_star)

In [23]:
(lm_ur.beta_hat[1] - 0)/(lm_ur.beta_hat_var[1, 1]**(1/2))

array([-3.11905337])

In [24]:
Ftest_LinearReg(lm_r, lm_ur, 0.05)

F statistic: 4.2116 with dfn: 1, and dfd: 28
Critical value: 4.1960
p-value: 0.0496


True

Conducting t-test

In [25]:
sig = 0.05
t_stat = (lm_ur.beta_hat[1] - 0)/(lm_ur.beta_hat_var[1, 1]**(1/2))
n, k = lm_ur.n, lm_ur.k
t_critical = tdist.ppf(q=[sig], df=n-k) # left tail probability
pval = 1 - tdist.cdf(abs(t_stat), n-k)

In [26]:
t_critical

array([-1.70113093])

In [27]:
if abs(t_stat) > abs(t_critical):
    print('The result is significant at alpha = {:.3f}'.format(sig))
    print('Reject Null in favour of the alternate')

The result is significant at alpha = 0.050
Reject Null in favour of the alternate


In [28]:
t_critical, t_stat, pval

(array([-1.70113093]), array([-3.11905337]), array([0.00208777]))

$p-value < 0.05 \implies$ Reject $H_{0}$ in favour of the alternate i.e. $H_{1}$

$H_{0}: \beta_{3} = 4$ <br/>
$H_{1}: \beta_{3} \neq 4$ 

**Restricted Model:** <br/> <br/>
$ y^{*}_{i} = \beta_{1}  + \beta_{2}X_{2i} + \beta_{4}X_{4i} + u_{i}$
<br/>
where $y^{*}_{i} = y_{i} - 4X_{3i}$
<br/>

**Unrestricted Model:** <br/> <br/> 
$ y_{i} = \beta_{1} + \beta_{2}X_{2i} + \beta_{3}X_{3i} + \beta_{4}X_{4i} + u_{i}$
<br/> <br/>

In [29]:
X_star = extract_X(["X2", "X4"])
y_star = y - 4*extract_X(["X3"])

In [30]:
lm_r = LinearReg().fit(X_star, y_star)

Ftest_LinearReg(lm_r, lm_ur, 0.05)

F statistic: 91251.4180 with dfn: 1, and dfd: 28
Critical value: 4.1960
p-value: 0.0000


True

$p-value < 0.05 \implies$ Reject $H_{0}$ in favour of the alternate i.e. $H_{1}$

$H_{0}: 5\beta_{2} + 4\beta_{3} = 10$ <br/>
$H_{1}: 5\beta_{2} + 4\beta_{3} \neq 10$ <br/>

**Restricted Model:** <br/> <br/>
$ y^{*}_{i} = \beta_{1} + \beta_{2}X^{*}_{2i} + \beta_{4}X_{4i} + u_{i}$
<br/>
where $y^{*}_{i} = y_{i} - \frac{10}{4}X_{3i}$ and $X^{*}_{2i} = X_{2i} - \frac{5}{4}X_{3i}$
<br/>

**Unrestricted Model:** <br/> <br/> 
$ y_{i} = \beta_{1} + \beta_{2}X_{2i} + \beta_{3}X_{3i} + \beta_{4}X_{4i} + u_{i}$
<br/> <br/>

In [31]:
X_star = extract_X(["X2", "X4"])
X_star[:, 0] = X_star[:, 0] - (5/4)*extract_X(["X3"]).ravel()
y_star = y - (10/4)*extract_X(["X3"])

lm_r = LinearReg().fit(X_star, y_star)

In [32]:
Ftest_LinearReg(lm_r, lm_ur, 0.05)

F statistic: 80983.3929 with dfn: 1, and dfd: 28
Critical value: 4.1960
p-value: 0.0000


True

$p-value < 0.05 \implies$ Reject $H_{0}$ in favour of the alternate i.e. $H_{1}$

$H_{0}: \beta_{2} = \beta_{3} = 0$ <br/>
$H_{1}: Atleast \ one \ \beta_{i} \neq 0$ 

**Restricted Model:** <br/> <br/>
$ y^{*}_{i} = \beta_{1} + \beta_{4}X_{4i} + u_{i}$
<br/>
where $y^{*}_{i} = y_{i}$
<br/>

**Unrestricted Model:** <br/> <br/> 
$ y_{i} = \beta_{1} + \beta_{2}X_{2i} + \beta_{3}X_{3i} + \beta_{4}X_{4i} + u_{i}$
<br/> <br/>

In [33]:
X_star = extract_X(["X4"])
y_star = y

lm_r = LinearReg().fit(X_star, y_star)

In [34]:
Ftest_LinearReg(lm_r, lm_ur, 0.05)

F statistic: 19.3533 with dfn: 2, and dfd: 28
Critical value: 3.3404
p-value: 0.0000


True

$p-value < 0.05 \implies$ Reject $H_{0}$ in favour of the alternate i.e. $H_{1}$

## Network of friends

In [35]:
friends_graph = {'A': ['E', 'B'],
                 'B': ['A', 'C'],
                 'C': ['B', 'E', 'D'],
                 'D': ['C', 'E'],
                 'E': ['A', 'D', 'C']}

people = list(friends_graph.keys())
n = len(friends_graph.keys())

In [36]:
def are_friends(p1, p2):
    '''
    '''
    if p2 in friends_graph[p1]:
        return True
    
    return False

def get_person(index):
    return people_dict[index]

def get_index(person):
    return [i for i in range(n) if get_person(i) == person][0]

In [37]:
people_dict = dict(zip(range(n), people))

In [38]:
people_dict

{0: 'A', 1: 'B', 2: 'C', 3: 'D', 4: 'E'}

In [39]:
friend = np.zeros((n, n))

In [40]:
friend

array([[0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.]])

If two persons are friends, put 1 in the matrix $Friend$

In [41]:
for i in range(n):
    for j in range(n):
        if i == j:
            friend[i][j] = 0
        else:
            person1, person2 = get_person(i), get_person(j)
            if are_friends(person1, person2):
                friend[i][j] = 1
    

In [42]:
friend

array([[0., 1., 0., 0., 1.],
       [1., 0., 1., 0., 0.],
       [0., 1., 0., 1., 1.],
       [0., 0., 1., 0., 1.],
       [1., 0., 1., 1., 0.]])

$Friend^{2}$ matrix indicates the number of mutual friends. Diagonal elements indicate the number of friends a person has.

In [43]:
friend_sq = friend@friend

In [44]:
friend_sq

array([[2., 0., 2., 1., 0.],
       [0., 2., 0., 1., 2.],
       [2., 0., 3., 1., 1.],
       [1., 1., 1., 2., 1.],
       [0., 2., 1., 1., 3.]])

In [45]:
def get_number_mutual(p1, p2):
    i, j = get_index(p1), get_index(p2)
    
    return int(friend_sq[i][j])
    

In [46]:
get_number_mutual('C', 'E')

1