### Feature engineering

##### Changes from the previous version
- Missing values of `Age` are replaced with the maximum observed age (`max()`).

### Predictions
- The prediction set has one missing value in the `Fare` column; it is replaced with the mean fare. A model-based imputation might be worth trying instead.

In [1]:
import os

import numpy as np
import pandas as pd
import pandas_profiling
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_curve, roc_auc_score, classification_report, accuracy_score, confusion_matrix 
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the labelled training data, indexed by passenger id
data_pd = pd.read_csv('Datasets/train.csv', index_col='PassengerId')

In [3]:
# Load the unlabelled data to predict on; PassengerId stays a regular column here
pred_pd = pd.read_csv(filepath_or_buffer='Datasets/test.csv')

## Feature engineering

In [4]:
# One-hot encode the Embarked column (training data)
enriched_pd = pd.get_dummies(data_pd, columns=['Embarked'])

# pred: same encoding applied to the prediction data
pred_pd = pd.get_dummies(pred_pd, columns=['Embarked'])

# Preview the first two encoded training rows
enriched_pd.head(n=2)

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked_C,Embarked_Q,Embarked_S
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,0,0,1
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,1,0,0


In [5]:
# Encode Sex as a boolean Female flag, then drop Name, Sex, Cabin and Ticket
dropped_cols = ['Name', 'Sex', 'Cabin', 'Ticket']
enriched_pd = (
    enriched_pd
    .assign(Female=enriched_pd.Sex.eq('female'))
    .drop(columns=dropped_cols)
)

# pred: identical transformation on the prediction data
pred_pd = (
    pred_pd
    .assign(Female=pred_pd.Sex.eq('female'))
    .drop(columns=dropped_cols)
)

In [6]:
# Check the result of the Sex encoding and column drop
enriched_pd.head(n=2)

Unnamed: 0_level_0,Survived,Pclass,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S,Female
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,0,3,22.0,1,0,7.25,0,0,1,False
2,1,1,38.0,1,0,71.2833,1,0,0,True


In [7]:
# Age: fill missing values with the maximum age observed in the training data
max_train_age = enriched_pd.Age.max()
enriched_pd['Age'] = enriched_pd['Age'].fillna(max_train_age)

# pred: use the same training-derived maximum
pred_pd['Age'] = pred_pd['Age'].fillna(max_train_age)

In [8]:
# Fare: replace missing values in the prediction data with the average fare.
# The mean is taken from the training data (as done for Age) so that the
# preprocessing applied to the prediction set does not depend on the
# prediction set itself.
pred_pd.loc[pred_pd.Fare.isna(), 'Fare'] = enriched_pd.Fare.mean()

## Split

In [9]:
# Separate the target (Survived) from the feature columns
y = enriched_pd['Survived']
X = enriched_pd.drop(columns=['Survived'])

# Seed NumPy's global RNG right before splitting so the split is repeatable
np.random.seed(16)
X_train, X_test, y_train, y_test = train_test_split(X, y)

# Peek at the last two rows of the training split
X_train.tail(n=2)

Unnamed: 0_level_0,Pclass,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S,Female
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
239,2,19.0,0,0,10.5,0,0,1,False
682,1,27.0,0,0,76.7292,1,0,0,False


In [10]:
# Non-null count per feature in the training split
X_train.notna().sum()

Pclass        668
Age           668
SibSp         668
Parch         668
Fare          668
Embarked_C    668
Embarked_Q    668
Embarked_S    668
Female        668
dtype: int64

## Model building and training

In [11]:
# Pipeline: standardize the features, then fit a logistic regression
scaler = StandardScaler()
lr = LogisticRegression()
pipeline_steps = [
    ('standardize', scaler),
    ('log_reg', lr),
]
model1 = Pipeline(steps=pipeline_steps)

In [12]:
# Fit scaler and classifier on the training split only
model1.fit(X=X_train, y=y_train)

## Train Score

In [13]:
# Class predictions and positive-class probabilities on the training split
y_train_hat = model1.predict(X_train)
y_train_hat_probs = model1.predict_proba(X_train)[:, 1]

# Accuracy expressed as a percentage
train_accuracy = 100 * accuracy_score(y_train, y_train_hat)
train_confusion = confusion_matrix(y_train, y_train_hat)

print('Confusion matrix:\n', train_confusion)
print('Training accuracy: %.4f %%' % train_accuracy)

Confusion matrix:
 [[356  56]
 [ 72 184]]
Training accuracy: 80.8383 %


## Test Score

In [14]:
# Class predictions and positive-class probabilities on the held-out split
y_test_hat = model1.predict(X_test)
y_test_hat_probs = model1.predict_proba(X_test)[:, 1]

# Accuracy expressed as a percentage
test_accuracy = 100 * accuracy_score(y_test, y_test_hat)
test_confusion = confusion_matrix(y_test, y_test_hat)

print('Confusion matrix:\n', test_confusion)
print('Testing accuracy: %.4f %%' % test_accuracy)

Confusion matrix:
 [[119  18]
 [ 31  55]]
Testing accuracy: 78.0269 %


## Cross-Validation

In [31]:
# 5-fold cross-validation of the full pipeline on all labelled data (X, y)
scores = cross_validate(estimator=model1, X=X, y=y, cv=5,
                        scoring=['accuracy'], return_train_score=True)

# NOTE(review): this rebinds train_accuracy / test_accuracy, which the
# hold-out scoring cells set to scalar percentages; here they become
# per-fold arrays of fractions.
train_accuracy, test_accuracy = scores['train_accuracy'], scores['test_accuracy']

print(f'Avg train accuracy: {np.mean(train_accuracy)}')
print(f'Train accuracy: {train_accuracy}')

print(f'Avg test accuracy: {np.mean(test_accuracy)}')
print(f'Test accuracy: {test_accuracy}')

Avg train accuracy: 0.8055557306522527
Train accuracy: [0.80617978 0.80785414 0.80925666 0.80224404 0.80224404]
Avg test accuracy: 0.793528340970435
Test accuracy: [0.75977654 0.79775281 0.79213483 0.79775281 0.82022472]


## Submission file

In [15]:
# Feature columns the model was trained on (to compare with the prediction data)
X_train.columns

Index(['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked_C', 'Embarked_Q',
       'Embarked_S', 'Female'],
      dtype='object')

In [16]:
# Columns of the prediction data; PassengerId is still a regular column here
pred_pd.columns

Index(['PassengerId', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked_C',
       'Embarked_Q', 'Embarked_S', 'Female'],
      dtype='object')

In [17]:
# Fraction of missing values per column in the prediction data
pred_pd.isna().mean()

PassengerId    0.0
Pclass         0.0
Age            0.0
SibSp          0.0
Parch          0.0
Fare           0.0
Embarked_C     0.0
Embarked_Q     0.0
Embarked_S     0.0
Female         0.0
dtype: float64

In [18]:
# predict
# Select exactly the training feature columns, in training order. This keeps
# the cell re-runnable (the 'Survived' column added below is not fed back
# into the model) and guards against a column-order mismatch with X_train.
feature_cols = list(X_train.columns)
prediction = model1.predict(X=pred_pd.set_index('PassengerId')[feature_cols])
pred_pd['Survived'] = prediction
pred_pd.head()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S,Female,Survived
0,892,3,34.5,0,0,7.8292,0,1,0,False,0
1,893,3,47.0,1,0,7.0,0,0,1,True,0
2,894,2,62.0,0,0,9.6875,0,1,0,False,0
3,895,3,27.0,0,0,8.6625,0,0,1,False,0
4,896,3,22.0,1,1,12.2875,0,0,1,True,1


In [19]:
# Rows and columns of the prediction frame after adding the Survived column
pred_pd.shape

(418, 11)

In [20]:
# save to csv
# to_csv does not create missing directories, so make sure the output
# folder exists before writing the gzip-compressed submission file.
submission_path = 'Datasets/Output/submission_01.csv.gz'
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
pred_pd[['PassengerId', 'Survived']].to_csv(submission_path,
                                            index=False, compression='gzip')