# LPM-модель: t-тест

In [1]:
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
from statsmodels.iolib.summary2 import summary_params
from scipy.stats import t # t-распределение

## t-тест: Значимость коэффициентов

Тестируем гипотезу $$H_0:\beta=0$$

Тестовая статистика $$t=\frac{\hat{\beta}}{s.e.(\beta)}$$

Критическое $$t_{cr}=t_{df=n-k-1}(\alpha)$$

Гипотеза отвергается если $|t|>t_{cr}$ или $P<\alpha$

### approve equation 1

Для датасета `loanapp` рассморим регрессию **approve на mortno, unem, dep, male, married, yjob, self**

In [2]:
# подключим датасет loanapp по ссылке 
loanapp_df = pd.read_csv('https://raw.githubusercontent.com/artamonoff/econometrica/main/econometrica2/data-csv/loanapp.csv', na_values=(' ', '', '  '))
loanapp_df.shape

(1989, 59)

In [3]:
#зададим спецификацию модели через формулу
mod_lpm = smf.ols(formula='approve~mortno+unem+dep+male+married+yjob+self', data=loanapp_df)

In [4]:
# подгонка модели с поправкой на гетероскедастичность
res_lpm_hc = mod_lpm.fit(cov_type='HC3')
print(res_lpm_hc.summary(slim=True))

                            OLS Regression Results                            
Dep. Variable:                approve   R-squared:                       0.020
Model:                            OLS   Adj. R-squared:                  0.017
No. Observations:                1971   F-statistic:                     5.849
Covariance Type:                  HC3   Prob (F-statistic):           9.77e-07
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.8642      0.023     37.135      0.000       0.819       0.910
mortno         0.0733      0.015      4.886      0.000       0.044       0.103
unem          -0.0064      0.004     -1.605      0.108      -0.014       0.001
dep           -0.0185      0.008     -2.429      0.015      -0.033      -0.004
male           0.0019      0.021      0.089      0.929      -0.040       0.044
married        0.0459      0.019      2.458      0.0

### Тестируем гипотезу на уровне значимости 1\% (т.е. $\alpha = 0.01$)

## робастные t-статистики для каждого коэффициента

In [5]:
# робастные t-статистики для каждого коэффициента с округлением до 3-х десятичных знаков
res_lpm_hc.tvalues.round(3)

Intercept    37.135
mortno        4.886
unem         -1.605
dep          -2.429
male          0.089
married       2.458
yjob         -0.107
self         -1.464
dtype: float64

In [6]:
# Число наблюдений в модели, число регрессоров и степени свободы для t_cr
res_lpm_hc.nobs, res_lpm_hc.df_model, res_lpm_hc.df_resid

(1971.0, 7.0, 1963.0)

In [7]:
# Результаты t-теста для коэффициентов (робастные s.e.)
summary_params(res_lpm_hc, alpha=0.01)

Unnamed: 0,Coef.,Std.Err.,t,P>|t|,[0.005,0.995]
Intercept,0.864212,0.023272,37.134809,7.710752999999999e-302,0.804266,0.924157
mortno,0.073251,0.014993,4.885535,1.031484e-06,0.03463,0.111872
unem,-0.006434,0.004008,-1.605254,0.1084379,-0.016758,0.00389
dep,-0.018472,0.007605,-2.429081,0.01513714,-0.038061,0.001116
male,0.001907,0.021351,0.089319,0.9288281,-0.053089,0.056903
married,0.045946,0.018692,2.458098,0.0139675,-0.002201,0.094093
yjob,-0.000662,0.006189,-0.10705,0.9147495,-0.016603,0.015278
self,-0.03612,0.024671,-1.464087,0.1431703,-0.099667,0.027427


In [8]:
# 1%-критическое значение t-распределения
t_cr = np.round(t.ppf(q=1-0.01/2, df=res_lpm_hc.df_resid), 3)
t_cr

2.578

In [9]:
# проверим значимость коэффициентов используя P-value
df_hc = np.round(summary_params(res_lpm_hc, alpha=0.01), 3)
df_hc['significance'] = df_hc.apply(lambda x: 'Значим' if x['P>|t|']<0.01 else 'Незначим', axis=1)
df_hc

Unnamed: 0,Coef.,Std.Err.,t,P>|t|,[0.005,0.995],significance
Intercept,0.864,0.023,37.135,0.0,0.804,0.924,Значим
mortno,0.073,0.015,4.886,0.0,0.035,0.112,Значим
unem,-0.006,0.004,-1.605,0.108,-0.017,0.004,Незначим
dep,-0.018,0.008,-2.429,0.015,-0.038,0.001,Незначим
male,0.002,0.021,0.089,0.929,-0.053,0.057,Незначим
married,0.046,0.019,2.458,0.014,-0.002,0.094,Незначим
yjob,-0.001,0.006,-0.107,0.915,-0.017,0.015,Незначим
self,-0.036,0.025,-1.464,0.143,-0.1,0.027,Незначим


In [16]:
# проверим значимость коэффициентов используя t_cr
df_hc = np.round(summary_params(res_lpm_hc, alpha=0.01), 3)
df_hc['significance'] = df_hc.apply(lambda x: 'Значим' if np.abs(x['t'])>t_cr else 'Незначим', axis=1)
df_hc

Unnamed: 0,Coef.,Std.Err.,t,P>|t|,[0.005,0.995],significance
Intercept,0.864,0.023,37.135,0.0,0.804,0.924,Значим
mortno,0.073,0.015,4.886,0.0,0.035,0.112,Значим
unem,-0.006,0.004,-1.605,0.108,-0.017,0.004,Незначим
dep,-0.018,0.008,-2.429,0.015,-0.038,0.001,Незначим
male,0.002,0.021,0.089,0.929,-0.053,0.057,Незначим
married,0.046,0.019,2.458,0.014,-0.002,0.094,Незначим
yjob,-0.001,0.006,-0.107,0.915,-0.017,0.015,Незначим
self,-0.036,0.025,-1.464,0.143,-0.1,0.027,Незначим


**ВЫВОД**: На уровне значимости 1% значим коэффициент `mortno`

## неробастные t-статистики для каждого коэффициента

In [11]:
# подгонка модели
res_lpm_ols = mod_lpm.fit(cov_type='nonrobust')

In [12]:
# Результаты t-теста для коэффициентов (неробастные s.e.)
summary_params(res_lpm_ols, alpha=0.01)

Unnamed: 0,Coef.,Std.Err.,t,P>|t|,[0.005,0.995]
Intercept,0.864212,0.02191,39.444211,4.3833729999999995e-251,0.807721,0.920702
mortno,0.073251,0.016,4.578293,4.981615e-06,0.031999,0.114503
unem,-0.006434,0.003463,-1.85797,0.06332293,-0.015363,0.002495
dep,-0.018472,0.007187,-2.570198,0.01023721,-0.037003,5.8e-05
male,0.001907,0.020314,0.093879,0.9252152,-0.050469,0.054283
married,0.045946,0.017644,2.604031,0.009282926,0.000453,0.091439
yjob,-0.000662,0.006686,-0.099092,0.9210751,-0.0179,0.016575
self,-0.03612,0.022289,-1.620504,0.1052847,-0.093589,0.021349


In [13]:
# проверим значимость коэффициентов используя P-value
df_ols = np.round(summary_params(res_lpm_ols, alpha=0.01), 3)
df_ols['significance'] = df_ols.apply(lambda x: 'Значим' if x['P>|t|']<0.01 else 'Незначим', axis=1)
df_ols

Unnamed: 0,Coef.,Std.Err.,t,P>|t|,[0.005,0.995],significance
Intercept,0.864,0.022,39.444,0.0,0.808,0.921,Значим
mortno,0.073,0.016,4.578,0.0,0.032,0.115,Значим
unem,-0.006,0.003,-1.858,0.063,-0.015,0.002,Незначим
dep,-0.018,0.007,-2.57,0.01,-0.037,0.0,Незначим
male,0.002,0.02,0.094,0.925,-0.05,0.054,Незначим
married,0.046,0.018,2.604,0.009,0.0,0.091,Значим
yjob,-0.001,0.007,-0.099,0.921,-0.018,0.017,Незначим
self,-0.036,0.022,-1.621,0.105,-0.094,0.021,Незначим


In [17]:
# проверим значимость коэффициентов используя t_cr
df_ols = np.round(summary_params(res_lpm_ols, alpha=0.01), 3)
df_ols['significance'] = df_ols.apply(lambda x: 'Значим' if np.abs(x['t'])>t_cr else 'Незначим', axis=1)
df_ols

Unnamed: 0,Coef.,Std.Err.,t,P>|t|,[0.005,0.995],significance
Intercept,0.864,0.022,39.444,0.0,0.808,0.921,Значим
mortno,0.073,0.016,4.578,0.0,0.032,0.115,Значим
unem,-0.006,0.003,-1.858,0.063,-0.015,0.002,Незначим
dep,-0.018,0.007,-2.57,0.01,-0.037,0.0,Незначим
male,0.002,0.02,0.094,0.925,-0.05,0.054,Незначим
married,0.046,0.018,2.604,0.009,0.0,0.091,Значим
yjob,-0.001,0.007,-0.099,0.921,-0.018,0.017,Незначим
self,-0.036,0.022,-1.621,0.105,-0.094,0.021,Незначим


**ВЫВОД**: На уровне значимости 1% значимы коэффициенты: `mortno` и `married`

### Значимость выбранных коэффициентов

Тестируем значимость $\beta_{mortno}$ и $\beta_{male}$

In [15]:
# робастный t-тест
res_lpm_hc.t_test('mortno=0, male=0')

<class 'statsmodels.stats.contrast.ContrastResults'>
                             Test for Constraints                             
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
c0             0.0733      0.015      4.886      0.000       0.044       0.103
c1             0.0019      0.021      0.089      0.929      -0.040       0.044