# Model: Logistic Regression with Age

https://www.kaggle.com/c/titanic/overview

The model__logreg did not include age as a feature.  This model will include age.  Age with NaN in the entry will be replaced with the mean age of all passengers. 

**Initialization**

In [1]:
%run init.ipynb

In [2]:
from data.data import ExtractData
from models import predict_model as pm
from zeetle.data import eda

from sklearn.linear_model import LogisticRegression 
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import scale

RANDOM_STATE = 42

## Extract Clean Data

**Separate data into X (features) and y (label)**

In [3]:
data = ExtractData('../data/raw/train.csv', drop_columns=['cabin', 'name', 'ticket'])
Xy = data.Xy

Xy.age = Xy.age.fillna(value=Xy.age.mean())

In [24]:
NUMERICAL_COLUMNS = ['age', 'sibsp', 'parch', 'fare']

X_numerical = pd.DataFrame(Xy[NUMERICAL_COLUMNS], index=Xy.index, columns=NUMERICAL_COLUMNS )
X_numerical

X_cat_encoded =  pd.get_dummies(Xy[['pclass', 'sex', 'embarked']], drop_first=True)
X_cat_encoded

Unnamed: 0_level_0,age,sibsp,parch,fare
passengerid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,22.0,1,0,7.2500
2,38.0,1,0,71.2833
3,26.0,0,0,7.9250
4,35.0,1,0,53.1000
5,35.0,0,0,8.0500
...,...,...,...,...
886,39.0,0,5,29.1250
887,27.0,0,0,13.0000
888,19.0,0,0,30.0000
890,26.0,0,0,30.0000


Unnamed: 0_level_0,pclass_2,pclass_3,sex_male,embarked_Q,embarked_S
passengerid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0,1,1,0,1
2,0,0,0,0,0
3,0,1,0,0,1
4,0,0,0,0,1
5,0,1,1,0,1
...,...,...,...,...,...
886,0,1,0,1,0
887,1,0,1,0,1
888,0,0,0,0,1
890,0,0,1,0,0


**Verify that age has no NaN**

In [25]:
X_numerical[X_numerical.age.isna()]

Unnamed: 0_level_0,age,sibsp,parch,fare
passengerid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1


## Train Test Split Data

In [26]:
X = X_numerical.join(X_cat_encoded)
X.shape
X

y = Xy['survived']
y.shape

(712, 9)

Unnamed: 0_level_0,age,sibsp,parch,fare,pclass_2,pclass_3,sex_male,embarked_Q,embarked_S
passengerid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,22.0,1,0,7.2500,0,1,1,0,1
2,38.0,1,0,71.2833,0,0,0,0,0
3,26.0,0,0,7.9250,0,1,0,0,1
4,35.0,1,0,53.1000,0,0,0,0,1
5,35.0,0,0,8.0500,0,1,1,0,1
...,...,...,...,...,...,...,...,...,...
886,39.0,0,5,29.1250,0,1,0,1,0
887,27.0,0,0,13.0000,1,0,1,0,1
888,19.0,0,0,30.0000,0,0,0,0,1
890,26.0,0,0,30.0000,0,0,1,0,0


(712,)

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)

X_train

y_test = y_test.to_frame()
y_test

Unnamed: 0_level_0,age,sibsp,parch,fare,pclass_2,pclass_3,sex_male,embarked_Q,embarked_S
passengerid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
473,33.0,1,2,27.7500,1,0,0,0,1
433,42.0,1,0,26.0000,1,0,0,0,1
667,25.0,0,0,13.0000,1,0,1,0,1
31,40.0,0,0,27.7208,0,0,1,0,0
292,19.0,1,0,91.0792,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...
94,26.0,1,2,20.5750,0,1,1,0,1
136,23.0,0,0,15.0458,1,0,1,0,0
339,45.0,0,0,8.0500,0,1,1,0,1
550,8.0,1,1,36.7500,1,0,1,0,1


Unnamed: 0_level_0,survived
passengerid,Unnamed: 1_level_1
642,1
497,1
263,0
312,1
552,0
...,...
363,0
57,1
138,0
652,1


In [28]:
print(f'Number of sample in training data = {len(X_train)}')
print(f'Number of sample in test data = {len(X_test)}')

Number of sample in training data = 569
Number of sample in test data = 143


### Logistic Regression with Age

In [29]:
logreg = LogisticRegression()
logreg.fit(X_train, y_train) 

y_pred = pd.Series(logreg.predict(X_test), 
                   index=y_test.index, name='survived_pred').to_frame()
y_pred

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

Unnamed: 0_level_0,survived_pred
passengerid,Unnamed: 1_level_1
642,1
497,1
263,0
312,1
552,0
...,...
363,0
57,1
138,0
652,1


## Calculate Metrics

In [30]:
Xy_test = pm.concat_to_create_xy_test(X_test, y_test, y_pred)
Xy_test

Unnamed: 0_level_0,age,sibsp,parch,fare,pclass_2,pclass_3,sex_male,embarked_Q,embarked_S,survived,survived_pred,is_prediction_correct
passengerid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
642,24.0,0,0,69.3000,0,0,0,0,0,1,1,True
497,54.0,1,0,78.2667,0,0,0,0,0,1,1,True
263,52.0,1,1,79.6500,0,0,1,0,1,0,0,True
312,18.0,2,2,262.3750,0,0,0,0,0,1,1,True
552,27.0,0,0,26.0000,1,0,1,0,1,0,0,True
...,...,...,...,...,...,...,...,...,...,...,...,...
363,45.0,0,1,14.4542,0,1,0,0,0,0,0,True
57,21.0,0,0,10.5000,1,0,0,0,1,1,1,True
138,37.0,1,0,53.1000,0,0,1,0,1,0,0,True
652,18.0,0,1,23.0000,1,0,0,0,1,1,1,True


In [31]:
metrics = pm.calc_metrics(Xy_test)

metrics

{'log_loss': 6.762876477199374, 'accuracy': 0.8041958041958042}

# Drill Down

In [58]:
Xy2 = Xy.join(Xy_test[['survived_pred', 'is_prediction_correct']], how='right')

In [66]:
Xy2.groupby(['sex'])[['survived','survived_pred']].mean()

Unnamed: 0_level_0,survived,survived_pred
sex,Unnamed: 1_level_1,Unnamed: 2_level_1
female,0.764706,0.843137
male,0.26087,0.065217
