# LPM-модель: t-тест

In [19]:
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf

from scipy.stats import t # критические значения t-распределения

In [3]:
# импорт датасета
df = pd.read_csv('loanapp.csv')
df

Unnamed: 0,occ,loanamt,action,msa,suffolk,appinc,typur,unit,married,dep,...,approve,mortno,mortperf,mortlat1,mortlat2,chist,multi,loanprc,thick,white
0,1,89,1,1120,0,72,0,1.0,0.0,0.0,...,1,0,1,0,0,1,0.0,0.754237,0.0,1
1,1,128,3,1120,0,74,0,1.0,1.0,1.0,...,0,0,1,0,0,1,0.0,0.800000,1.0,1
2,1,128,1,1120,0,84,3,1.0,0.0,0.0,...,1,0,1,0,0,1,0.0,0.895105,1.0,1
3,1,66,1,1120,0,36,0,1.0,1.0,0.0,...,1,0,1,0,0,0,0.0,0.600000,0.0,1
4,1,120,1,1120,0,59,8,1.0,1.0,0.0,...,1,0,1,0,0,1,0.0,0.895522,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1984,1,158,1,1120,0,96,0,1.0,1.0,0.0,...,1,0,1,0,0,1,0.0,0.897727,0.0,1
1985,1,35,1,1120,0,169,1,1.0,1.0,0.0,...,1,1,0,0,0,1,0.0,0.111111,0.0,1
1986,2,225,1,1120,0,49,0,2.0,1.0,0.0,...,1,0,1,0,0,1,1.0,1.000000,0.0,1
1987,1,98,1,1120,1,110,1,1.0,0.0,0.0,...,1,1,0,0,0,1,0.0,0.455814,0.0,1


## t-тест: значимость коэффициентов
Для датасета `loanapp` рассморим регрессию **approve на mortno, unem, dep, male, married, yjob, self**

In [5]:
mod = smf.ols(formula='approve~mortno+unem+male+married+yjob+self', data=df)
res_hc = mod.fit(cov_type='HC3') # подгонка модели с поправкой на гетероскедастиность
res_hc.summary(slim=True)

0,1,2,3
Dep. Variable:,approve,R-squared:,0.017
Model:,OLS,Adj. R-squared:,0.014
No. Observations:,1971,F-statistic:,6.345
Covariance Type:,HC3,Prob (F-statistic):,1.26e-06

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,0.8625,0.023,36.897,0.000,0.817,0.908
mortno,0.0693,0.015,4.729,0.000,0.041,0.098
unem,-0.0066,0.004,-1.644,0.100,-0.014,0.001
male,-0.0002,0.021,-0.010,0.992,-0.042,0.042
married,0.0319,0.018,1.789,0.074,-0.003,0.067
yjob,0.0003,0.006,0.054,0.957,-0.012,0.012
self,-0.0373,0.025,-1.511,0.131,-0.086,0.011


In [21]:
# тестовые t-статистики для кажлого коэффциента с округленим
res_hc.tvalues.round(3)

Intercept    36.897
mortno        4.729
unem         -1.644
male         -0.010
married       1.789
yjob          0.054
self         -1.511
dtype: float64

In [24]:
# P-значения для t-статистик с округленим
res_hc.pvalues.round(4)

Intercept    0.0000
mortno       0.0000
unem         0.1002
male         0.9923
married      0.0735
yjob         0.9573
self         0.1308
dtype: float64

Пусть <span style="color: blue">уровень значимости $\alpha=1\%$</span>. Какие коэффиценты значимы? Те, для которых $P<\alpha$: только *mortno*

Число наблюдений, по которым была оценена модель

In [25]:
res_hc.nobs

1971.0

Степени свободы для t-распределения $df=n-k-1$

In [26]:
res_hc.df_resid

1964.0

In [28]:
# критическое значение t-распределения с уровнем значимости 1%=0.01
t.ppf(q=1-0.01/2, df=res_hc.df_resid)

2.5783349293635474

In [29]:
# Неправильный (неробастный) t-тест с настройками по умолчанию
res_ols = mod.fit()
res_ols.summary(slim=True)

0,1,2,3
Dep. Variable:,approve,R-squared:,0.017
Model:,OLS,Adj. R-squared:,0.014
No. Observations:,1971,F-statistic:,5.65
Covariance Type:,nonrobust,Prob (F-statistic):,7.88e-06

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.8625,0.022,39.329,0.000,0.820,0.906
mortno,0.0693,0.016,4.347,0.000,0.038,0.101
unem,-0.0066,0.003,-1.900,0.058,-0.013,0.000
male,-0.0002,0.020,-0.010,0.992,-0.040,0.040
married,0.0319,0.017,1.900,0.058,-0.001,0.065
yjob,0.0003,0.007,0.049,0.961,-0.013,0.013
self,-0.0373,0.022,-1.671,0.095,-0.081,0.006


Какие коэффициенты значимы?