# logit-регрессия: Wald-тест, совместная значимость

In [21]:
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
from scipy.stats import chi2 # критические значения chi2-распределения

# Do not show FutureWarning messages in the cell outputs.
# NOTE(review): the filter is installed for the whole session, so deprecation
# notices from the libraries used below are hidden as well — consider
# re-enabling them when upgrading dependencies.
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [22]:
# Load the SwissLabor dataset from the course repository on GitHub.
# Alternative sources, kept for reference:
#   older mirror: https://raw.githubusercontent.com/artamonoff/econometrica/main/econometrica2/data-csv/SwissLabor.csv
#   local file:   pd.read_csv('SwissLabor.csv')
DATA_URL = 'https://raw.githubusercontent.com/artamonoff/econometrica-spring23/master/python-notebooks/data-csv/SwissLabor.csv'
df = pd.read_csv(DATA_URL)
df

Unnamed: 0,participation,income,age,education,youngkids,oldkids,foreign
0,no,10.787497,3.0,8,1,1,no
1,yes,10.524251,4.5,8,0,1,no
2,no,10.968578,4.6,9,0,0,no
3,no,11.104999,3.1,11,2,0,no
4,no,11.108470,4.4,12,0,2,no
...,...,...,...,...,...,...,...
867,no,10.597393,2.4,4,2,1,yes
868,yes,10.377773,4.8,8,0,0,yes
869,no,10.133740,2.4,6,2,0,yes
870,yes,10.932351,4.1,10,0,1,yes


In [23]:
# Print the frame summary: number of entries, column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 872 entries, 0 to 871
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   participation  872 non-null    object 
 1   income         872 non-null    float64
 2   age            872 non-null    float64
 3   education      872 non-null    int64  
 4   youngkids      872 non-null    int64  
 5   oldkids        872 non-null    int64  
 6   foreign        872 non-null    object 
dtypes: float64(2), int64(3), object(2)
memory usage: 47.8+ KB


In [24]:
# Recode the yes/no columns `participation` and `foreign` as 0/1 integers.
# The explicit integer cast matters: a bare `.replace('yes', 1)` yields a
# numeric column only through pandas' implicit downcasting of object columns,
# which is deprecated (the corresponding FutureWarning is silenced in the
# imports cell). Without the cast the columns may stay `object` dtype, and the
# formula interface would then treat them as categorical instead of numeric.
# The cell is safe to re-run: already recoded values pass through unchanged.
yes_no_codes = {'yes': 1, 'no': 0}
for column in ['participation', 'foreign']:
    df[column] = df[column].replace(yes_no_codes).astype(int)
df

Unnamed: 0,participation,income,age,education,youngkids,oldkids,foreign
0,0,10.787497,3.0,8,1,1,0
1,1,10.524251,4.5,8,0,1,0
2,0,10.968578,4.6,9,0,0,0
3,0,11.104999,3.1,11,2,0,0
4,0,11.108470,4.4,12,0,2,0
...,...,...,...,...,...,...,...
867,0,10.597393,2.4,4,2,1,1
868,1,10.377773,4.8,8,0,0,1
869,0,10.133740,2.4,6,2,0,1
870,1,10.932351,4.1,10,0,1,1


## Спецификация и подгонка модели
Для датасета `SwissLabor`
рассмотрим logit-регрессию **participation на income, income^2, age, age^2, youngkids, oldkids, foreign**

In [25]:
# Logit specification: participation on income and age (each with a squared
# term), youngkids, oldkids and the foreign indicator.
logit_formula = 'participation~income+I(income**2)+age+I(age**2)+youngkids+oldkids+foreign'
mod = smf.logit(formula=logit_formula, data=df)  # model specification
res = mod.fit()  # model fitting
res.summary()

Optimization terminated successfully.
         Current function value: 0.583749
         Iterations 6


0,1,2,3
Dep. Variable:,participation,No. Observations:,872.0
Model:,Logit,Df Residuals:,864.0
Method:,MLE,Df Model:,7.0
Date:,"Fri, 24 Nov 2023",Pseudo R-squ.:,0.1539
Time:,17:05:54,Log-Likelihood:,-509.03
converged:,True,LL-Null:,-601.61
Covariance Type:,nonrobust,LLR p-value:,1.579e-36

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-9.4763,17.245,-0.549,0.583,-43.277,24.324
income,1.8753,3.266,0.574,0.566,-4.526,8.277
I(income ** 2),-0.1377,0.155,-0.887,0.375,-0.442,0.166
age,3.4025,0.687,4.955,0.000,2.057,4.748
I(age ** 2),-0.4846,0.085,-5.692,0.000,-0.651,-0.318
youngkids,-1.1813,0.172,-6.858,0.000,-1.519,-0.844
oldkids,-0.2471,0.084,-2.932,0.003,-0.412,-0.082
foreign,1.0728,0.187,5.737,0.000,0.706,1.439


## Тест Вальда: совместная значимость
Тестируем значимость влияния дохода, т.е. $$H_0:\beta_{income}=\beta_{income^2}=0.$$


## Уровень значимости
Пусть уровень значимости $\alpha=5\%=0.05$

In [26]:
# Wald test statistic and its p-value for H0: beta_income = beta_income^2 = 0.
# scalar=True makes the statistic a plain float instead of a 1x1 array, which
# is the behaviour statsmodels announces through a FutureWarning.
# NOTE(review): the `scalar` argument requires statsmodels >= 0.14 — confirm
# the installed version.
res.wald_test('income=I(income ** 2)=0', scalar=True)

<class 'statsmodels.stats.contrast.ContrastResults'>
<Wald test (chi2): statistic=[[24.4405137]], p-value=4.929579593913696e-06, df_denom=2>

### Степени свободы для $\chi^2$-распределения

In [27]:
# Degrees of freedom = number of restrictions in H0
# (two coefficients are set to zero: income and income^2).
2

2

### Критическое значение $\chi_{df}^2(\alpha)$

In [28]:
# Critical value: the (1 - sign_level) quantile of the chi2 distribution
# with 2 degrees of freedom, rounded to three decimals.
sign_level = 0.05  # significance level
np.round(chi2.ppf(1 - sign_level, df=2), 3)

5.991

### Значимо ли влияние дохода?

`влияние дохода значимо` $(Chisq>\chi_{df}^2(\alpha))$: $24.44>5.991$, поэтому $H_0$ отвергается на уровне значимости $5\%$.