In [239]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
import statsmodels.api as sm
import scipy.stats as st


def ess(estimate_v, sample_v):
    sample_mean = np.mean(sample_v)
    return sum((estimate_v[i] - sample_mean)**2 for i in range(len(estimate_v)))


def rss(estimate_v, sample_v):
    return sum((estimate_v[i] - sample_v[i])**2 for i in range(len(estimate_v)))


def tss(sample_v):
    sample_mean = np.mean(sample_v)
    return sum((sample_v[i] - sample_mean)**2 for i in range(len(sample_v)))

In [197]:
data = pd.read_csv("data.csv", usecols=[
                   'class1_v10', 'class2_v10', 'class3_v10', 'sex_v10', 'survived_v10'])
data.rename(inplace=True, columns={'class1_v10': 'class1',
                                   'class2_v10': 'class2',
                                   'class3_v10': 'class3',
                                   'sex_v10': 'sex',
                                   'survived_v10': 'survived'})
data = data[pd.notnull(data['survived'])]
data['intercept'] = 1

Имеется выборка из взрослых пассажиров титаника, у каждого следующие характеристики:  
survived - выжил ли пассажир (1, если да, иначе 0)  
sex - пол (1 для мужчин, 0 для женщин)  
class1 - 1 для пассажиров 1 класса, иначе 0  
class2 - 1 для пассажиров 2 класса, иначе 0  
class3 - 1 для пассажиров 3 класса, иначе 0  

# Часть I
### Оценим модель логит-регрессии для вероятности выжить в зависимости от пола и класса:  
$\frac{P(survived=1)}{P(survived=0)}=exp(\beta_1 + \beta2\cdot sex + \beta_3\cdot class1+\beta_4\cdot class2)$

In [330]:
# Третий регрессор class3 не рассматривается, т.к. между
# class1, class2 и class3 существует мультиколлинеарность.
logit_model = smf.logit(formula="survived ~ sex + class1 + class2", data=data)
logit_result = logit_model.fit()
logit_fitted_list = [1 if f > 0 else 0 for f in logit_result.fittedvalues]
logit_result.summary()

Optimization terminated successfully.
         Current function value: 0.468990
         Iterations 6


0,1,2,3
Dep. Variable:,survived,No. Observations:,1131.0
Model:,Logit,Df Residuals:,1127.0
Method:,MLE,Df Model:,3.0
Date:,"Tue, 06 Jun 2017",Pseudo R-squ.:,0.2849
Time:,13:25:45,Log-Likelihood:,-530.43
converged:,True,LL-Null:,-741.76
,,LLR p-value:,2.7369999999999997e-91

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,0.3926,0.138,2.855,0.004,0.123,0.662
sex,-2.5395,0.159,-15.978,0.000,-2.851,-2.228
class1,1.6668,0.182,9.134,0.000,1.309,2.024
class2,0.5534,0.196,2.824,0.005,0.169,0.937


Проверим значимость модели регресси в целом.  
$H_0: \beta_2=\beta_3=\beta_4=0$  
Для этого ипользуем тест отношения правдоподобия:  
$LR = -2\log(\frac{L\:|\: H_0}{L\:|\:H_A}) \sim \chi^2_k$, где $L$ — значение функции правдоподобия модели  
$L = \prod_{i=1}^{n}p^{Y_i}(1-p)^{1-Y_i} = p^{\sum_{i=1}^nY_i}(1-p)^{n-\sum_{i=1}^nY_i}$

In [307]:
n = len(data['survived'])
k = 4
LR = -2*(logit_result.llnull - logit_result.llf)
LR_crit = st.chi2.ppf(0.95, k)
print("LR =", LR)
print("LR_crit =", LR_crit)

LR = 422.65656195
LR_crit = 9.48772903678


$LR > LR_{crit}$, значит, отвергаем нулевую гипотезу.
$\:$  
$\:$  
$\:$  
$\:$  
$\:$  
$\:$  
### Далее оценим модель пробит-регрессии:  
$P(survived=1) = Ф(\beta_1 + \beta_2\cdot sex + \beta_3\cdot class1+\beta_4\cdot class2)$,  
где Ф - функция нормального распределения  
$Ф^{-1}(P(survived=1)) = \beta_1 + \beta_2\cdot sex + \beta_3\cdot class1+\beta_4\cdot class2$

In [329]:
probit_model = smf.probit(formula="survived ~ sex + class1 + class2", data=data)
probit_result = probit_model.fit()
probit_fitted_list = [1 if f > 0 else 0 for f in probit_result.fittedvalues]
probit_result.summary()

Optimization terminated successfully.
         Current function value: 0.469734
         Iterations 5


0,1,2,3
Dep. Variable:,survived,No. Observations:,1131.0
Model:,Probit,Df Residuals:,1127.0
Method:,MLE,Df Model:,3.0
Date:,"Tue, 06 Jun 2017",Pseudo R-squ.:,0.2838
Time:,13:25:39,Log-Likelihood:,-531.27
converged:,True,LL-Null:,-741.76
,,LLR p-value:,6.338e-91

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,0.2835,0.082,3.439,0.001,0.122,0.445
sex,-1.5190,0.091,-16.689,0.000,-1.697,-1.341
class1,0.9567,0.104,9.167,0.000,0.752,1.161
class2,0.2738,0.113,2.418,0.016,0.052,0.496


### Оценим линейную модель:
$P(survived=1) = \beta_1 + \beta_2\cdot sex + \beta_3\cdot class1+\beta_4\cdot class2$

In [336]:
lin_model = smf.ols(formula="survived ~ sex + class1 + class2", data=data)
lin_result = lin_model.fit()
lin_fitted_list = [1 if f > 0.5 else 0 for f in lin_result.fittedvalues]
lin_result.summary()

0,1,2,3
Dep. Variable:,survived,R-squared:,0.349
Model:,OLS,Adj. R-squared:,0.348
Method:,Least Squares,F-statistic:,201.8
Date:,"Tue, 06 Jun 2017",Prob (F-statistic):,9.590000000000001e-105
Time:,13:51:07,Log-Likelihood:,-534.45
No. Observations:,1131,AIC:,1077.0
Df Residuals:,1127,BIC:,1097.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.6091,0.024,25.447,0.000,0.562,0.656
sex,-0.5091,0.025,-20.602,0.000,-0.558,-0.461
class1,0.2802,0.028,9.994,0.000,0.225,0.335
class2,0.0844,0.030,2.822,0.005,0.026,0.143

0,1,2,3
Omnibus:,64.857,Durbin-Watson:,1.679
Prob(Omnibus):,0.0,Jarque-Bera (JB):,74.91
Skew:,0.626,Prob(JB):,5.41e-17
Kurtosis:,3.151,Cond. No.,4.24


### Сравним модели логит, пробит и линейную:
Проверяем, все ли прогнозы трёх моделей одинаковые, и выводим первые 30 результатов

In [354]:

survived_list = list(data['survived'])
sex_list = list(data['sex'])
class1_list = list(data['class1'])
class2_list = list(data['class2'])

different = False
for i in range(100):
    if(0 < logit_fitted_list[i] + probit_fitted_list[i] + lin_fitted_list[i] < 3):
        different = True

print("Models offer different results:",different)
print("Fitted values:")
print("Sex\t\tClass1\t\tClass2\t\tSurvived\t\tLogit\t\tProbit\t\tLinear")
for i in range(60):
    print(sex_list[i], class1_list[i], class2_list[i],
          survived_list[i], logit_fitted_list[i], probit_fitted_list[i], lin_fitted_list[i], sep="\t\t")

Models offer different results: False
Fitted values:
Sex		Class1		Class2		Survived		Logit		Probit		Linear
1.0		0.0		0.0		0.0		0		0		0
1.0		1.0		0.0		1.0		0		0		0
1.0		1.0		0.0		1.0		0		0		0
1.0		1.0		0.0		1.0		0		0		0
1.0		0.0		0.0		1.0		0		0		0
1.0		0.0		0.0		0.0		0		0		0
1.0		1.0		0.0		0.0		0		0		0
0.0		1.0		0.0		1.0		1		1		1
0.0		1.0		0.0		1.0		1		1		1
1.0		0.0		0.0		0.0		0		0		0
1.0		1.0		0.0		0.0		0		0		0
1.0		0.0		0.0		1.0		0		0		0
1.0		0.0		0.0		0.0		0		0		0
0.0		1.0		0.0		1.0		1		1		1
1.0		0.0		0.0		0.0		0		0		0
1.0		1.0		0.0		0.0		0		0		0
1.0		0.0		0.0		0.0		0		0		0
1.0		0.0		0.0		0.0		0		0		0
1.0		0.0		1.0		0.0		0		0		0
1.0		0.0		1.0		0.0		0		0		0
0.0		1.0		0.0		1.0		1		1		1
0.0		0.0		1.0		1.0		1		1		1
0.0		1.0		0.0		1.0		1		1		1
1.0		1.0		0.0		0.0		0		0		0
1.0		0.0		0.0		0.0		0		0		0
0.0		1.0		0.0		1.0		1		1		1
1.0		0.0		1.0		0.0		0		0		0
1.0		0.0		0.0		0.0		0		0		0
0.0		0.0		1.0		1.0		1		1		1
1.0		1.0		0.0		0.0		0		0		0
0.0		1.0		0.0		1.0		1		1		1
0.0		0.0		1.0		1.0		1		1		

Итак, расхождений между прогнозами нет вообще. Проверим, сколько прогнозов неверны:

In [353]:
print(str(int(rss(logit_fitted_list,survived_list))) + " / " + str(n))

238 / 1131


Итак, из 1131 элемента выборки результаты 238 предсказаны неверно. Хороший результат.

In [224]:
def predict(class1, class2, class3, sex):
    return np.exp(logit_coefs[0] +
                  class1 * logit_coefs[1] +
                  class2 * logit_coefs[2] +
                  sex * logit_coefs[3])


temp = np.exp(logit_result.fittedvalues)
print('survived\t\tprediction')
i = 0
for i in range(len(lst)):
    print(lst[i], results_estimate[i], sep='\t\t')

survived		prediction
0.0		0
1.0		0
1.0		0
1.0		0
1.0		0
0.0		0
0.0		0
1.0		1
1.0		1
0.0		0
0.0		0
1.0		0
0.0		0
1.0		1
0.0		0
0.0		0
0.0		0
0.0		0
0.0		0
0.0		0
1.0		1
1.0		1
1.0		1
0.0		0
0.0		0
1.0		1
0.0		0
0.0		0
1.0		1
0.0		0
1.0		1
1.0		1
1.0		0
1.0		0
0.0		0
0.0		1
1.0		0
0.0		0
1.0		1
1.0		1
0.0		1
0.0		0
0.0		0
0.0		0
0.0		0
0.0		0
0.0		0
1.0		0
0.0		0
0.0		0
0.0		0
1.0		1
0.0		0
0.0		1
0.0		0
0.0		0
0.0		0
1.0		0
0.0		0
0.0		0
0.0		0
1.0		0
0.0		1
0.0		0
1.0		1
0.0		0
0.0		0
0.0		0
1.0		0
0.0		0
1.0		1
1.0		0
0.0		0
0.0		0
0.0		0
1.0		1
0.0		0
0.0		0
0.0		0
0.0		0
0.0		0
0.0		0
0.0		0
0.0		0
1.0		1
1.0		1
0.0		1
1.0		1
1.0		0
0.0		0
0.0		0
1.0		0
0.0		0
1.0		1
1.0		1
0.0		0
1.0		1
0.0		0
1.0		1
0.0		0
0.0		0
0.0		0
0.0		0
1.0		0
0.0		0
0.0		0
1.0		1
0.0		1
0.0		0
1.0		1
0.0		0
1.0		0
1.0		1
0.0		1
1.0		0
1.0		0
0.0		0
1.0		1
0.0		0
1.0		1
1.0		0
1.0		1
0.0		0
0.0		0
0.0		0
1.0		1
0.0		0
0.0		0
1.0		0
0.0		0
0.0		0
0.0		0
0.0		0
1.0		1
1.0		1
1.0		1
0.0		0
1.0		0
1.0		0
0.0		0