# probit-регрессия: Качество подгонки и Сравнение моделей

In [1]:
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
from statsmodels.iolib.summary2 import summary_col # вывод результатов нескольких регрессий

# Не показывать FutureWarnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
# подключим датасет mroz_Greene по ссылке 
df = pd.read_csv('../../datasets/TableF5-1.csv')
#подключим датасет mroz_Greene из локального файла
#df = pd.read_csv('TableF5-1.csv')
df.head()

Unnamed: 0,LFP,WHRS,KL6,K618,WA,WE,WW,RPWG,HHRS,HA,HE,HW,FAMINC,MTR,WMED,WFED,UN,CIT,AX
0,1,1610,1,0,32,12,3.354,2.65,2708,34,12,4.0288,16310,0.7215,12,7,5.0,0,14
1,1,1656,0,2,30,12,1.3889,2.65,2310,30,9,8.4416,21800,0.6615,7,7,11.0,1,5
2,1,1980,1,3,35,12,4.5455,4.04,3072,40,12,3.5807,21040,0.6915,12,7,5.0,0,15
3,1,456,0,3,34,12,1.0965,3.25,1920,53,10,3.5417,7300,0.7815,7,7,5.0,0,6
4,1,1568,1,2,31,14,4.5918,3.6,2000,32,12,10.0,27300,0.6215,12,14,9.5,1,7


## Спецификация и подгонка

In [3]:
# model 1
mod_1 = smf.probit(formula = 'LFP~WA+I(WA**2)+WE+KL6+K618+CIT+UN+np.log(FAMINC)', data = df)
res_1 = mod_1.fit(disp=False)

In [4]:
# model 2
mod_2 = smf.probit(formula = 'LFP~WE+KL6+K618+CIT+UN+np.log(FAMINC)', data = df)
res_2 = mod_2.fit(disp=False)

In [5]:
# model 3
mod_3 = smf.probit(formula = 'LFP~WA+I(WA**2)+WE+KL6+np.log(FAMINC)', data = df)
res_3 = mod_3.fit(disp=False)

In [6]:
# model 4
mod_4 = smf.probit(formula = 'LFP~WA+I(WA**2)+WE+KL6', data = df)
res_4 = mod_4.fit(disp=False)

In [7]:
# Сравнение моделей
# Имена моделей
mod_names = ['Модель 1', 'Модель 2', 'Модель 3', 'Модель 4']
# порядок регрессоров в таблице
reg_order = ['Intercept', 'WA', 'I(WA ** 2)', 'WE', 'KL6', 'K618', 'CIT','UN', 'np.log(FAMINC)']
# Зависимая переменная LFP
summary_col([res_1, res_2, res_3, res_4], model_names=mod_names, stars=True, regressor_order=reg_order, float_format='%.3f')

0,1,2,3,4
,Модель 1,Модель 2,Модель 3,Модель 4
Intercept,-2.005,-2.673***,-1.435,-0.281
,(1.705),(0.957),(1.667),(1.503)
WA,0.008,,-0.018,-0.006
,(0.070),,(0.069),(0.068)
I(WA ** 2),-0.001,,-0.000,-0.000
,(0.001),,(0.001),(0.001)
WE,0.109***,0.124***,0.109***,0.123***
,(0.024),(0.024),(0.024),(0.022)
KL6,-0.851***,-0.621***,-0.847***,-0.855***


## Качество подгонки. Базовые показатели

### McFadden's $R^2$
$$ 
	R^2_{pseudo}=1-\frac{\log L_{full}}{\log L_{null}}
$$

In [8]:
# pseudoR2 for model 1
res_1.prsquared.round(3)

np.float64(0.102)

In [9]:
# pseudoR2 for model 2
res_2.prsquared.round(3)

np.float64(0.076)

In [10]:
# pseudoR2 for model 3
res_3.prsquared.round(3)

np.float64(0.097)

In [11]:
# pseudoR2 for model 4
res_4.prsquared.round(3)

np.float64(0.095)

### McFadden’s Adjusted $R^2$
$$ 
	R^2_{adj}=1-\frac{\log L_{full}-k-1}{\log L_{null}}
$$

In [12]:
# pseudoR2.adj for model 1
(1-(res_1.llf-res_1.df_model-1)/res_1.llnull).round(3)

np.float64(0.085)

In [13]:
# pseudoR2.adj for model 2
(1-(res_2.llf-res_2.df_model-1)/res_2.llnull).round(3)

np.float64(0.062)

In [14]:
# pseudoR2.adj for model 3
(1-(res_3.llf-res_3.df_model-1)/res_3.llnull).round(3)

np.float64(0.086)

In [15]:
# pseudoR2.adj for model 4
(1-(res_4.llf-res_4.df_model-1)/res_4.llnull).round(3)

np.float64(0.085)

### Cox & Snell $R^2$
$$
	R^2_{C\& S}=1-\left(\frac{L_{null}}{L_{full}}\right)^{2/n}=1-\left(\frac{\exp(\log L_{null})}{\exp(\log L_{full})}\right)^{2/n}=
	1-\exp\left(\frac{2}{n}(\log L_{null}-\log L_{full})\right)=1-\exp\left(-\frac{LR}{n}\right)
$$

In [16]:
# Cox.Snell.R2 for model 1
(1-np.exp(-res_1.llr/res_1.nobs)).round(3)

np.float64(0.13)

In [17]:
# Cox.Snell.R2 for model 2
(1-np.exp(-res_2.llr/res_2.nobs)).round(3)

np.float64(0.099)

In [18]:
# Cox.Snell.R2 for model 3
(1-np.exp(-res_3.llr/res_3.nobs)).round(3)

np.float64(0.125)

In [19]:
# Cox.Snell.R2 for model 4
(1-np.exp(-res_4.llr/res_4.nobs)).round(3)

np.float64(0.122)

### Nagelkerke / Cragg & Uhler $R^2$
$$
	R^2_{N,C\& U}=\frac{1-\left(\frac{L_{null}}{L_{full}}\right)^{2/n}}{1-L_{null}^{2/n}}=
	\frac{1-\exp\left(-\frac{LR}{n}\right)}{1-\exp(2\log L_{null}/n)}
$$

In [20]:
# Nagelkerke.R2 for model 1
((1-np.exp(-res_1.llr/res_1.nobs))/(1-np.exp(2*res_1.llnull/res_1.nobs))).round(3)

np.float64(0.175)

In [21]:
# Nagelkerke.R2 for model 2
((1-np.exp(-res_2.llr/res_2.nobs))/(1-np.exp(2*res_2.llnull/res_2.nobs))).round(3)

np.float64(0.133)

In [22]:
# Nagelkerke.R2 for model 3
((1-np.exp(-res_3.llr/res_3.nobs))/(1-np.exp(2*res_3.llnull/res_3.nobs))).round(3)

np.float64(0.167)

In [23]:
# Nagelkerke.R2 for model 4
((1-np.exp(-res_4.llr/res_4.nobs))/(1-np.exp(2*res_4.llnull/res_4.nobs))).round(3)

np.float64(0.163)

### Efron's $R^2$
$$
	R^2_{Efron}=1-\frac{\sum(y_i-\hat{P}_i)^2}{\sum(y_i-\bar{y})^2}=1-\frac{\sum(y_i-\hat{P}_i)^2}{n Var(y)}
$$

In [24]:
# Efron.R2 for model 1
(1-(np.sum(res_1.resid_response**2))/(res_1.nobs*np.var(mod_1.endog))).round(3)

np.float64(0.133)

In [25]:
# Efron.R2 for model 2
(1-(np.sum(res_2.resid_response**2))/(res_2.nobs*np.var(mod_2.endog))).round(3)

np.float64(0.1)

In [26]:
# Efron.R2 for model 3
(1-(np.sum(res_3.resid_response**2))/(res_3.nobs*np.var(mod_3.endog))).round(3)

np.float64(0.127)

In [27]:
# Efron.R2 for model 4
(1-(np.sum(res_4.resid_response**2))/(res_4.nobs*np.var(mod_4.endog))).round(3)

np.float64(0.123)

### McKelvey & Zavoina's $R^2$

\begin{align*}
	R^2_{logit}&=\frac{Var(\hat{P})}{Var(\hat{P})+\pi^2/3} & R^2_{probit}&=\frac{Var(\hat{P})}{Var(\hat{P})+1}
\end{align*}


In [28]:
#McKelvey.Zavoina.R2 for model 1
y_prob_1 = res_1.predict(mod_1.exog, transform=False)

# probit
(np.var(y_prob_1)/(np.var(y_prob_1)+1)).round(3)

np.float64(0.031)

In [29]:
#McKelvey.Zavoina.R2 for model 2
y_prob_2 = res_2.predict(mod_2.exog, transform=False)

# probit
(np.var(y_prob_2)/(np.var(y_prob_2)+1)).round(3)

np.float64(0.024)

In [30]:
#McKelvey.Zavoina.R2 for model 3
y_prob_3 = res_3.predict(mod_3.exog, transform=False)

# probit
(np.var(y_prob_3)/(np.var(y_prob_3)+1)).round(3)

np.float64(0.03)

In [31]:
#McKelvey.Zavoina.R2 for model 4
y_prob_4 = res_4.predict(mod_4.exog, transform=False)

# probit
(np.var(y_prob_4)/(np.var(y_prob_4)+1)).round(3)

np.float64(0.029)