In [17]:
import os

import numpy as np
import pandas as pd
import pandas_profiling
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_curve, roc_auc_score, classification_report, accuracy_score, confusion_matrix 
import matplotlib.pyplot as plt
import seaborn as sns

In [36]:
# Read the training CSV and use the PassengerId column as the row index.
data_pd = pd.read_csv('Datasets/train.csv', index_col='PassengerId')

## Feature engineering

In [41]:
# One-hot encode the Embarked column: one indicator column per distinct value.
enriched_pd = pd.get_dummies(data_pd, columns=['Embarked'])
enriched_pd.head(n=2)

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked_C,Embarked_Q,Embarked_S
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,0,0,1
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,1,0,0


In [42]:
# Turn Sex into a boolean Female flag, then discard Sex and the free-text columns.
# NOTE: this cell consumes the Sex column, so re-running it on its own fails;
# re-run from the one-hot encoding cell instead.
enriched_pd = (
    enriched_pd
    .assign(Female=lambda frame: frame.Sex == 'female')
    .drop(columns=['Name', 'Sex', 'Cabin', 'Ticket'])
)

In [43]:
# Preview the first two rows after the column clean-up.
enriched_pd.iloc[:2]

Unnamed: 0_level_0,Survived,Pclass,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S,Female
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,0,3,22.0,1,0,7.25,0,0,1,False
2,1,1,38.0,1,0,71.2833,1,0,0,True


In [50]:
# Age: fill missing values with the column mean.
# NOTE(review): the mean is taken over every row, including the rows that the
# train/test split below later holds out, so the held-out rows influence this
# feature. Consider imputing inside the model pipeline instead.
enriched_pd['Age'] = enriched_pd.Age.fillna(enriched_pd.Age.mean())

## Train/test split

In [51]:
# Separate the feature columns from the Survived target.
X = enriched_pd.drop(columns='Survived')
y = enriched_pd.Survived

# Pass the seed to train_test_split directly so the split no longer depends on
# the state of NumPy's global generator at the moment this cell runs.
# The global seed is still set so that any later code drawing from np.random
# remains seeded.
SEED = 16
np.random.seed(SEED)

# test_size=0.25 is the library default, spelled out so the 75/25 split is
# visible to the reader.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=SEED
)
X_train.tail(2)

Unnamed: 0_level_0,Pclass,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S,Female
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
239,2,19.0,0,0,10.5,0,0,1,False
682,1,27.0,0,0,76.7292,1,0,0,False


In [52]:
# Non-null cells per column; equal totals across columns mean nothing is missing.
X_train.notna().sum()

Pclass        668
Age           668
SibSp         668
Parch         668
Fare          668
Embarked_C    668
Embarked_Q    668
Embarked_S    668
Female        668
dtype: int64

## Model building and training

In [53]:
# Two-stage model: standardize every feature, then fit a logistic regression.
# The individual steps stay bound to their own names so they can be inspected.
scaler, lr = StandardScaler(), LogisticRegression()
model1 = Pipeline(steps=[
    ('standardize', scaler),
    ('log_reg', lr),
])

In [54]:
# Fit the scaler and the classifier on the training split only.
model1.fit(X=X_train, y=y_train)

## Train Score

In [56]:
# Score the model on the data it was fitted on.
# The second predict_proba column is the probability of the class listed second
# in the model's classes_ (presumably Survived == 1 -- confirm).
y_train_hat_probs = model1.predict_proba(X_train)[:, 1]
y_train_hat = model1.predict(X_train)

train_confusion = confusion_matrix(y_train, y_train_hat)
train_accuracy = 100 * accuracy_score(y_train, y_train_hat)

print('Confusion matrix:\n', train_confusion)
print('Training accuracy: %.4f %%' % train_accuracy)

Confusion matrix:
 [[356  56]
 [ 69 187]]
Training accuracy: 81.2874 %


## Test Score

In [57]:
# Score the model on the held-out split.
# The second predict_proba column is the probability of the class listed second
# in the model's classes_ (presumably Survived == 1 -- confirm).
y_test_hat_probs = model1.predict_proba(X_test)[:, 1]
y_test_hat = model1.predict(X_test)

test_confusion = confusion_matrix(y_test, y_test_hat)
test_accuracy = 100 * accuracy_score(y_test, y_test_hat)

print('Confusion matrix:\n', test_confusion)
print('Testing accuracy: %.4f %%' % test_accuracy)

Confusion matrix:
 [[117  20]
 [ 32  54]]
Testing accuracy: 76.6816 %
