# LPM-модель: F-тест

In [1]:
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf

from scipy.stats import f # F-distribution, used below for critical values

# Suppress FutureWarning messages so they do not clutter the cell outputs
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
# Load the dataset from loanapp.csv (relative path) and display it
df = pd.read_csv('loanapp.csv')
df

Unnamed: 0,occ,loanamt,action,msa,suffolk,appinc,typur,unit,married,dep,...,approve,mortno,mortperf,mortlat1,mortlat2,chist,multi,loanprc,thick,white
0,1,89,1,1120,0,72,0,1.0,0.0,0.0,...,1,0,1,0,0,1,0.0,0.754237,0.0,1
1,1,128,3,1120,0,74,0,1.0,1.0,1.0,...,0,0,1,0,0,1,0.0,0.800000,1.0,1
2,1,128,1,1120,0,84,3,1.0,0.0,0.0,...,1,0,1,0,0,1,0.0,0.895105,1.0,1
3,1,66,1,1120,0,36,0,1.0,1.0,0.0,...,1,0,1,0,0,0,0.0,0.600000,0.0,1
4,1,120,1,1120,0,59,8,1.0,1.0,0.0,...,1,0,1,0,0,1,0.0,0.895522,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1984,1,158,1,1120,0,96,0,1.0,1.0,0.0,...,1,0,1,0,0,1,0.0,0.897727,0.0,1
1985,1,35,1,1120,0,169,1,1.0,1.0,0.0,...,1,1,0,0,0,1,0.0,0.111111,0.0,1
1986,2,225,1,1120,0,49,0,2.0,1.0,0.0,...,1,0,1,0,0,1,1.0,1.000000,0.0,1
1987,1,98,1,1120,1,110,1,1.0,0.0,0.0,...,1,1,0,0,0,1,0.0,0.455814,0.0,1


## F-тест: значимость регрессии
Для датасета `loanapp` рассмотрим регрессию **approve на unem, male, yjob, self**

In [3]:
# Linear probability model: approve regressed on unem, male, yjob and self
mod = smf.ols('approve~unem+male+yjob+self', data=df)
# Fit with the HC3 covariance estimator (correction for heteroskedasticity)
res_hc = mod.fit(cov_type='HC3')
res_hc.summary(slim=True)

0,1,2,3
Dep. Variable:,approve,R-squared:,0.004
Model:,OLS,Adj. R-squared:,0.002
No. Observations:,1974,F-statistic:,1.677
Covariance Type:,HC3,Prob (F-statistic):,0.153

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,0.8914,0.023,39.586,0.000,0.847,0.936
unem,-0.0074,0.004,-1.838,0.066,-0.015,0.000
male,0.0209,0.020,1.056,0.291,-0.018,0.060
yjob,0.0013,0.006,0.210,0.834,-0.011,0.013
self,-0.0298,0.025,-1.205,0.228,-0.078,0.019


Тестируем значимость регрессии, т.е. гипотезу $H_0:\beta_{unem}=\beta_{male}=\beta_{yjob}=\beta_{self}=0$.
<span style="color: blue">Уровень значимости выберем $\alpha=10\%$</span>

Робастная F-статистика и её P-значение

In [4]:
# Robust F-statistic for the significance of the regression and its p-value
res_hc.fvalue, res_hc.f_pvalue

(1.6774796752587282, 0.15250462834383727)

**Вывод**: регрессия незначима ($P>\alpha$)

Число наблюдений, по которым была оценена модель

In [5]:
# Number of observations the model was estimated on
res_hc.nobs

1974.0

Степени свободы для F-распределения: $df_1=k, df_2=n-k-1$

In [6]:
# Degrees of freedom for the F-distribution: df_model is passed as the
# numerator (dfn) and df_resid as the denominator (dfd) in the next cell
res_hc.df_model, res_hc.df_resid

(4.0, 1969.0)

In [7]:
# Critical value of the F-distribution at the 10% = 0.1 significance level,
# i.e. the (1 - 0.1)-quantile
f.ppf(
    1 - 0.1,
    dfn=res_hc.df_model,  # numerator degrees of freedom
    dfd=res_hc.df_resid,  # denominator degrees of freedom
)

1.9477166136395

In [8]:
# Incorrect (non-robust) F-test obtained with the default fit settings.
# 'nonrobust' is the covariance type a bare mod.fit() uses; it is spelled
# out here so the contrast with the HC3 fit is explicit.
res_ols = mod.fit(cov_type='nonrobust')
res_ols.summary(slim=True)

0,1,2,3
Dep. Variable:,approve,R-squared:,0.004
Model:,OLS,Adj. R-squared:,0.002
No. Observations:,1974,F-statistic:,2.029
Covariance Type:,nonrobust,Prob (F-statistic):,0.0878

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.8914,0.021,42.284,0.000,0.850,0.933
unem,-0.0074,0.003,-2.122,0.034,-0.014,-0.001
male,0.0209,0.019,1.094,0.274,-0.017,0.058
yjob,0.0013,0.007,0.194,0.846,-0.012,0.014
self,-0.0298,0.022,-1.334,0.182,-0.074,0.014


Какие можно сделать выводы?

## F-тест: совместная значимость
Для датасета `loanapp` рассмотрим регрессию **approve на appinc, appinc^2, mortno, unem, dep, male, married, yjob, self**

In [9]:
# Linear probability model with a quadratic in applicant income (appinc)
mod = smf.ols(
    'approve~appinc+I(appinc**2)+mortno+unem+dep+male+married+yjob+self',
    data=df,
)
# Fit with the HC3 covariance estimator (correction for heteroskedasticity)
res_hc = mod.fit(cov_type='HC3')
res_hc.summary(slim=True)

0,1,2,3
Dep. Variable:,approve,R-squared:,0.025
Model:,OLS,Adj. R-squared:,0.021
No. Observations:,1971,F-statistic:,5.032
Covariance Type:,HC3,Prob (F-statistic):,9.81e-07

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,0.8420,0.027,31.003,0.000,0.789,0.895
appinc,0.0005,0.000,1.958,0.050,-4.48e-07,0.001
I(appinc ** 2),-1.007e-06,4.24e-07,-2.374,0.018,-1.84e-06,-1.76e-07
mortno,0.0660,0.015,4.321,0.000,0.036,0.096
unem,-0.0061,0.004,-1.515,0.130,-0.014,0.002
dep,-0.0171,0.007,-2.280,0.023,-0.032,-0.002
male,-0.0029,0.021,-0.135,0.893,-0.045,0.039
married,0.0433,0.019,2.309,0.021,0.007,0.080
yjob,-0.0009,0.006,-0.140,0.889,-0.013,0.011


Число наблюдений, по которым была оценена модель

In [10]:
# Number of observations the model was estimated on
res_hc.nobs

1971.0

Тестируйте значимость влияния дохода, т.е. $H_0:\beta_{appinc}=\beta_{appinc^2}=0$.
<span style="color: blue">Уровень значимости выберем $\alpha=5\%$</span>.

In [11]:
# Joint test of H0: the coefficients on appinc and I(appinc ** 2) are both zero.
# use_f=True reports the Wald test as an F test; it is run on the HC3 fit.
res_hc.wald_test('appinc=0, I(appinc ** 2)=0', use_f=True)

<class 'statsmodels.stats.contrast.ContrastResults'>
<F test: F=array([[2.96075542]]), p=0.052011304302157044, df_denom=1.96e+03, df_num=2>

**Вывод**: влияние незначимо ($P>\alpha$)

Тестовая статистика и её P-значение

In [12]:
# Keep the test result so its statistic, p-value and degrees of freedom
# can be read individually
res_F_test = res_hc.wald_test(
    'appinc=0, I(appinc ** 2)=0',
    use_f=True,
)
(res_F_test.fvalue, res_F_test.pvalue)

(array([[2.96075542]]), array(0.0520113))

Степени свободы для F-распределения: $df_1=2, df_2=n-k-1$

In [13]:
# Numerator and denominator degrees of freedom of the F test
res_F_test.df_num, res_F_test.df_denom

(2.0, 1961.0)

In [14]:
# Critical value of the F-distribution at the 5% = 0.05 significance level,
# i.e. the (1 - 0.05)-quantile
f.ppf(
    1 - 0.05,
    dfn=res_F_test.df_num,    # numerator degrees of freedom
    dfd=res_F_test.df_denom,  # denominator degrees of freedom
)

3.0003133845755365

In [15]:
# Incorrect (non-robust) F-test obtained with the default fit settings.
# 'nonrobust' is the covariance type a bare mod.fit() uses; it is spelled
# out here so the contrast with the HC3 fit is explicit.
res_ols = mod.fit(cov_type='nonrobust')
res_ols.wald_test(
    'appinc=0, I(appinc ** 2)=0',
    use_f=True,
)

<class 'statsmodels.stats.contrast.ContrastResults'>
<F test: F=array([[5.1487468]]), p=0.005885429653243116, df_denom=1.96e+03, df_num=2>

Какой можно сделать вывод?