In [1]:
# Import package for getting dataset example
import wooldridge as woo

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import scipy.stats as stats
import statsmodels.api as sm
import statsmodels.formula.api as smf

import scipy.stats as stats

import math

  from pandas.core import (


# Introduction

### Classical Linear Model (CLM) Assumptions

1. The true model follows:
> $$
y = \beta_0 + \beta_1 x_1 + \beta_2 x_2 + \beta_3 x_3 + \cdots + \beta_k x_k + \text{e}
$$


2. There are random sample of n observations from population {$(x_{i,1}, x_{i,2}, ..., x_{i,k}, y_i) : i = 1, 2, 3, ..., n$}

3. No perfect collinearity 
> It allows the regressors to be correlated, they just cannot be perfectly linear correlated.

4. Zero conditional mean
> The error term $\text{e}$ has an expected value of zero given any values of the regressors:
> $$ E (e | x_1, x_2, ..., x_k) = 0 $$

5. Homoskedasticity (**It doesn't make the estomator bias**, but it makes less precise estimator and less accurate hypothesis tests)
> $$ Var(e | x_1, x_2, ..., x_k) = \sigma $$

6. Normality (**It doesn't make the estimator being bias**, it makes the sampling distributions of the estimator $\hat{\beta}_j$ traceable)
> The population error $\text{e}$ is independent of the explanatory variables $x_1, x_2, \dots, x_k$ and is normally distributed with $\text{e} \sim \text{Normal}(0, \sigma^2)$

NOTE:
- It very often happens that the error term from regression model have a distribution that is not excessively skewed and with no outliers.
- If the model is homoskedasticity, then the Central Limit Theorem (CLM) ensures that the error term asymptotically normal when the sample is large enough (commonly minimum 30 size samples).

### Theorem: Normal Sampling Distributions
> Under the CLM assumptions, conditional on the sample values of the independent variables,
$$
\hat{\beta}_{j} \sim \text{Normal}(\beta_{j}, \text{Var}(\hat{\beta}_{j}))
$$

# The t-test

![image](images/4_t-diagram.png)

![image](images/4_t-table1.png)

![image](images/4_t-table2.png)

### Confidence Interval

$$
\hat{\beta}_j \pm \text{t}_{crit} \text{se}({\hat{\beta}_j})
$$

where,
- $ \text{se}(\hat{\beta}_j) = \frac{\hat{\sigma}_{rg}}{\sqrt{n} \sqrt{1 - R_j^2} \, sd(x_j)} $
- $sd(x_j) = \sqrt{1/n \sum_{i=1}^{n} (x_{i, j} - \bar{x})^2}$

### Case Estimating t-value and p-value

![image](images/Example_4-1.png)

In [2]:
# Using summary model
wage1 = woo.dataWoo('wage1')

model = smf.ols(formula='np.log(wage) ~ educ + exper + tenure', data=wage1).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:           np.log(wage)   R-squared:                       0.316
Model:                            OLS   Adj. R-squared:                  0.312
Method:                 Least Squares   F-statistic:                     80.39
Date:                Sat, 31 Aug 2024   Prob (F-statistic):           9.13e-43
Time:                        16:12:26   Log-Likelihood:                -313.55
No. Observations:                 526   AIC:                             635.1
Df Residuals:                     522   BIC:                             652.2
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.2844      0.104      2.729      0.0

In [70]:
# Calculating manual (This model assume it's two-tail hypothesis testing)

regressors = ['educ', 'exper', 'tenure']
target = 'wage'
n = int(wage1.shape[0])
df = n - len(regressors) - 1

params = {}
se_params = {}
t_values = {}
p_values = {}

# Extract X and y
X = pd.DataFrame(np.ones(n, dtype=int))
X = pd.concat([X, wage1[regressors]], axis=1)
y = wage1[target].apply(lambda x: np.log(x))

# Parameters estimates
X = np.array(X)
y = np.array(y).reshape(n, 1)
b = np.linalg.inv(X.T@X) @ X.T @ y

# Store params
for i in range(b.shape[0]):
    if i == 0:
        params['intercept'] = b[i, 0]
    else:
        params[f'{regressors[i - 1]}'] = b[i, 0]
print(f"Beta: \n{params}")

# Residuals, estiamted variance of residuals and SER
residuals = y - X @ b
var_residuals = (residuals.T @ residuals) / df
SER = np.sqrt(var_residuals)[0][0]

# Estimated variance of the parameters estiamtors and SE
var_beta = var_residuals * np.linalg.inv(X.T @ X)
std_error_beta = np.sqrt(np.diagonal(var_beta))

# Store std error beta
for i in range(len(std_error_beta)):
    if i == 0:
        se_params['intercept'] = std_error_beta[i]
    else:
        se_params[f'{regressors[i - 1]}'] = std_error_beta[i]
    
print(f"Std Error Params: \n", se_params)

# Regressors
for variable in ['intercept'] + regressors:
    t_value_temp = params[f'{variable}'] / se_params[f"{variable}"]
    t_values[f'{variable}'] = t_value_temp
    p_values[f'{variable}'] = 2 * (1 - stats.t.cdf(t_value_temp, df))
    
    
print("t-values:\n", t_values)
print("p-values:\n", p_values)

Beta: 
{'intercept': 0.2843595552360746, 'educ': 0.0920289867692827, 'exper': 0.004121109045609959, 'tenure': 0.022067217434724347}
Std Error Params: 
 {'intercept': 0.10419037797067718, 'educ': 0.0073299232744946695, 'exper': 0.0017232772008454382, 'tenure': 0.0030936491910178287}
t-values:
 {'intercept': 2.7292304795756026, 'educ': 12.555245576649948, 'exper': 2.3914371080799692, 'tenure': 7.133070387810876}
p-values:
 {'intercept': 0.006562462394036572, 'educ': 0.0, 'exper': 0.01713562312471195, 'tenure': 3.2944758032726895e-12}


### Case Confidence Interval

![image](images/Example_4-8.png)

In [108]:
rdchem = woo.dataWoo('rdchem')
model = smf.ols(formula='np.log(rd) ~ np.log(sales) + profmarg', data=rdchem).fit()

print(model.summary())

                            OLS Regression Results                            
Dep. Variable:             np.log(rd)   R-squared:                       0.918
Model:                            OLS   Adj. R-squared:                  0.912
Method:                 Least Squares   F-statistic:                     162.2
Date:                Sun, 01 Sep 2024   Prob (F-statistic):           1.79e-16
Time:                        00:11:07   Log-Likelihood:                -22.511
No. Observations:                  32   AIC:                             51.02
Df Residuals:                      29   BIC:                             55.42
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
Intercept        -4.3783      0.468     -9.355

In [114]:
# Calculate manual

rdchem = woo.dataWoo('rdchem')
regressors = ['sales', 'profmarg']
target = 'rd'
n = rdchem.shape[0]
df = n - len(regressors) - 1
alpha = np.array([0.05, 0.01])

# Storage
params = {}
se_params = {}
CI_params = {}


# Calculate critical values
c_values = stats.t.ppf(1 - alpha / 2, df)

# Extract X and y
X = pd.DataFrame(np.ones(n, dtype=int))
X = pd.concat([X, rdchem[regressors]], axis=1)
X['sales'] = X['sales'].apply(lambda x: np.log(x))

y = rdchem[target].apply(lambda x: np.log(x))

# Parameters estimates
X = np.array(X)
y = np.array(y).reshape(-1, 1)
b = np.linalg.inv(X.T @ X) @ X.T @ y
# Store params
for i in range(b.shape[0]):
    if i == 0:
        params['intercept'] = b[i, 0]
    else:
        params[f'{regressors[i - 1]}'] = b[i, 0]

# Estimate std error parameters
residuals = y - X @ b
var_residuals = (residuals.T @ residuals) / df
var_beta = var_residuals * np.linalg.inv(X.T @ X)
std_error_beta = np.sqrt(np.diagonal(var_beta))

# Store std error beta
for i in range(len(std_error_beta)):
    if i == 0:
        se_params['intercept'] = std_error_beta[i]
    else:
        se_params[f'{regressors[i - 1]}'] = std_error_beta[i]

# Calculate Confidence Interval
for i in range(len(c_values)):
    temp = {}
    c = c_values[i]
    for v in ['intercept'] + regressors:
        temp[v] = (params[v] - c * se_params[v], params[v] + c * se_params[v])
    CI_params[alpha[i]] = temp

print(CI_params)

{0.05: {'intercept': (-5.335478449854297, -3.421068063286227), 'sales': (0.9611072560097991, 1.2073324801318692), 'profmarg': (-0.004487721638015394, 0.04779910329394195)}, 0.01: {'intercept': (-5.668312696315954, -3.0882338168245704), 'sales': (0.9182992000644719, 1.2501405360771964), 'profmarg': (-0.013578168544385521, 0.05688955020031207)}}


# The F-Test 
> It usually used for testing multiple linear regression. 

NOTE: In this case, we use Right-tailed test.

**Define Problem**

Let we define:
- Unrestricted model with $k$ independent: $y_{ur} = \beta^{ur}_{0} + \beta^{ur}_{1}x_1 + \dots + \beta^{ur}_{k}x_k + e_{ur}$
- Restricted model with $q$ exclusion restrictions: $y_{r} = \beta^{r}_{0} + \beta^{r}_{1}x_1 + \dots + \beta^{r}_{k-q}x_{k-q} + e_{r}$ 

Then we have F statistic:
$$
F \equiv \frac{(SSR_r - SSR_{ur}) / q}{SSR_{ur} / (n - k - 1)}
$$

where,
- $SSR_r = \text{Sum of Square Residuals Restricted Model}$ 
- $SSR_{ur} = \text{Sum of Square Residuals Unrestricted Model}$
- $q = df_r - df_{ur}$
- $df_r = \text{Degrees of Freedom Restricted Model}$
- $df_{ur} = \text{Degrees of Freedom Unrestricted Model}$ 
- $n = \text{Number of observations}$ 
- $k = \text{Number of regressors}$


NOTE:
- $SSR_r \ge SSR_{ur}$ it means the results never negative. The minimum result is zero.


**Hypothesis Testing**

![image](images/4_f-table1.png)

### Case
- Unrestricted model:
> np.log(salary) = years + gamesyr + bavg +  hrunsyr + rbisyr + $e_{ur}$
- Restricted model:
> np.log(salary) = years + gamesyr + $e_r$

- Restricted regressors: bavg, hrunsyr, rbisyr

In [119]:
mlb1 = woo.dataWoo('mlb1')
n = mlb1.shape[0]
q = 3
alpha = .01

# unrestricted OLS regression
model_ur = smf.ols(formula='np.log(salary) ~ years + gamesyr + bavg + hrunsyr + rbisyr', 
                   data=mlb1).fit()

ssr_ur = np.sum(np.power(model_ur.resid, 2))

# restricted OLS regression
model_r = smf.ols(formula='np.log(salary) ~ years + gamesyr',
                   data=mlb1).fit()
ssr_r = np.sum(np.power(model_r.resid, 2))

# F-testing
F_score = (ssr_r - ssr_ur) / ssr_ur * (n - 5 - 1) / (q)
F_critical = stats.f.ppf(1 - alpha, q, n - 5 - 1)
print(f"F_score: {round(F_score, 3)}")
print(f"F_critical: {round(F_critical, 3)}")
print(f"F_score > F_critical = {F_score > F_critical}\n")

# P-value
p_value = 1 - stats.f.cdf(F_score, q, n - 5 - 1)
print(f"p-value: {round(p_value, 3)}")
print(f"alpha: {alpha}")
print(f"p_value <= alpha = {p_value < alpha}")

F_score: 9.55
F_critical: 3.839
F_score > F_critical = True

p-value: 0.0
alpha: 0.01
p_value <= alpha = True


In [124]:
# Using built in package "automatic"

mlb1 = woo.dataWoo('mlb1')

# OLS regression
model = smf.ols(formula='np.log(salary) ~ years + gamesyr + bavg + hrunsyr + rbisyr',
               data=mlb1).fit()

# Automated F test:
hypotheses = ['bavg = 0', 'hrunsyr = 0', 'rbisyr = 0']
ftest = model.f_test(hypotheses)
fstat = ftest.statistic
fpval = ftest.pvalue

print(f'fstat: {fstat}\n')
print(f'fpval: {fpval}\n')

fstat: 9.550253521951765

fpval: 4.47370813983966e-06

