In [1]:
import numpy as np
import pandas as pd
import sklearn.linear_model as lm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.metrics import accuracy_score

### Load the data

In [17]:
# Load the Titanic passenger table (bz2-compressed CSV) into a DataFrame.
data = pd.read_csv('titanic.csv.bz2')
# Ground-truth survival column; used as the reference when scoring predictions.
Y = data['survived']

In [18]:
# Preview the first five rows to check the column names and value formats.
data.head(n=5)

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


### Estimate an LPM (linear probability model)

In [20]:
# Linear probability model: OLS of the `survived` column on passenger class
# (wrapped in C() so it enters as a categorical) plus sex, additive effects only.
mod = smf.ols('survived ~ C(pclass) + sex', data=data)
res = mod.fit()
# NOTE(review): the summary reports nonrobust standard errors; with a binary
# outcome, heteroskedasticity-robust errors may be preferable -- confirm.
print(res.summary())

                            OLS Regression Results                            
Dep. Variable:               survived   R-squared:                       0.341
Model:                            OLS   Adj. R-squared:                  0.340
Method:                 Least Squares   F-statistic:                     225.5
Date:                Wed, 11 Apr 2018   Prob (F-statistic):          7.79e-118
Time:                        19:34:14   Log-Likelihood:                -639.22
No. Observations:                1309   AIC:                             1286.
Df Residuals:                    1305   BIC:                             1307.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
Intercept          0.8990      0.025     35.

### Use the model to predict the survival

In [21]:
# In-sample predictions: one fitted value per row of `data`.
ypred = res.predict(exog=data)
ypred

0       0.898984
1       0.394113
2       0.898984
3       0.394113
4       0.898984
5       0.394113
6       0.898984
7       0.394113
8       0.898984
9       0.394113
10      0.394113
11      0.898984
12      0.898984
13      0.898984
14      0.394113
15      0.394113
16      0.394113
17      0.898984
18      0.898984
19      0.394113
20      0.394113
21      0.898984
22      0.394113
23      0.898984
24      0.898984
25      0.394113
26      0.394113
27      0.898984
28      0.898984
29      0.394113
          ...   
1279    0.606349
1280    0.101478
1281    0.101478
1282    0.101478
1283    0.101478
1284    0.101478
1285    0.101478
1286    0.606349
1287    0.101478
1288    0.101478
1289    0.101478
1290    0.606349
1291    0.101478
1292    0.101478
1293    0.101478
1294    0.101478
1295    0.101478
1296    0.101478
1297    0.101478
1298    0.101478
1299    0.101478
1300    0.606349
1301    0.101478
1302    0.101478
1303    0.101478
1304    0.606349
1305    0.606349
1306    0.1014

### Analyze your predicted survival values. Find maximum and minimum. Is any value equal to zero or 1?

In [22]:
# Largest predicted value -- used to check whether any prediction reaches 1.
ypred.max()

0.8989839322447712

In [23]:
# Smallest predicted value -- used to check whether any prediction reaches 0.
ypred.min()

0.10147811470359103

None of the values are equal to 0 or 1. The maximum value is approximately 0.899 and the minimum value is approximately 0.101.

### Re-compute the survival

In [24]:
# Turn the predicted values into class labels: 1.0 (survived) when the
# prediction is at least 0.5, otherwise 0.0.
# The vectorized comparison replaces the element-by-element loop, so it does
# not depend on the index labels being exactly 0..n-1, and the result stays a
# float Series under the same name.
ypred = (ypred >= 0.5).astype(float)

### Accuracy Score

In [25]:
# Percentage of passengers whose thresholded prediction matches the recorded outcome.
100 * accuracy_score(y_true=Y, y_pred=ypred)

77.99847211611917

### Estimate a similar model but now allowing class effect to differ between men and women

In [28]:
# Same linear probability model, but `C(pclass)*sex` adds the class-by-sex
# interaction terms on top of the main effects, so the class effect may
# differ between men and women.
mod_interaction = smf.ols('survived ~ C(pclass)*sex', data=data)
res_new = mod_interaction.fit()
print(res_new.summary())

                            OLS Regression Results                            
Dep. Variable:               survived   R-squared:                       0.370
Model:                            OLS   Adj. R-squared:                  0.368
Method:                 Least Squares   F-statistic:                     153.2
Date:                Wed, 11 Apr 2018   Prob (F-statistic):          4.02e-128
Time:                        19:35:06   Log-Likelihood:                -609.86
No. Observations:                1309   AIC:                             1232.
Df Residuals:                    1303   BIC:                             1263.
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                                 coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------------
Intercept           

### Interpret the coefficients

Here, when we consider the interaction between passenger class and sex, we can see a big difference in the survival rate. The highest survival rate is for females in first class. Females in third class have a much lower survival rate than females in first class.

Also, the R-squared value has improved marginally (from 0.341 to 0.370), indicating that the model fits the data slightly better once the interaction is included.

### Calculate the accuracy

In [29]:
# Called without arguments, so the predictions are for the data the model was
# fitted on; the displayed result further down is a plain NumPy array.
ypred_interaction = res_new.predict()

In [30]:
# Turn the predicted values into class labels: 1.0 (survived) when the
# prediction is at least 0.5, otherwise 0.0.
# One vectorized comparison replaces the per-element Python loop; the
# container type and float dtype of the predictions are preserved.
ypred_interaction = (ypred_interaction >= 0.5).astype(float)

ypred_interaction

array([1., 0., 1., ..., 0., 0., 0.])

In [31]:
# Percentage of passengers whose thresholded interaction-model prediction
# matches the recorded outcome.
100 * accuracy_score(y_true=Y, y_pred=ypred_interaction)

78.30404889228419

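The accuracy has improved only slightly, by about 0.3 percentage points (from 78.0% to 78.3%). Note that both accuracies are computed on the same data used to fit the models, so they measure in-sample fit; whether the interaction model generalizes better would have to be checked on held-out data.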