# titanic

https://www.kaggle.com/competitions/titanic


## Setup

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report

import os
from pathlib import Path

%matplotlib inline

In [554]:
random_seed = 42

comp_name = 'titanic'
is_kaggle = os.environ.get('KAGGLE_KERNEL_RUN_TYPE', '')

if is_kaggle: 
    path = Path('../input/' + comp_name)
else:
    path = Path('./data')
    if not path.exists():
        import zipfile, kaggle
        kaggle.api.competition_download_cli(comp_name)
        zipfile.ZipFile(f'{comp_name}.zip').extractall(path)

In [555]:
train_data = pd.read_csv(path/'train.csv')
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [556]:
test_data = pd.read_csv(path/"test.csv")
test_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


## Data Exploration

https://www.kaggle.com/competitions/titanic/data

In [557]:
train_data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [558]:
test_data.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,418.0,418.0,332.0,418.0,418.0,417.0
mean,1100.5,2.26555,30.27259,0.447368,0.392344,35.627188
std,120.810458,0.841838,14.181209,0.89676,0.981429,55.907576
min,892.0,1.0,0.17,0.0,0.0,0.0
25%,996.25,1.0,21.0,0.0,0.0,7.8958
50%,1100.5,3.0,27.0,0.0,0.0,14.4542
75%,1204.75,3.0,39.0,1.0,0.0,31.5
max,1309.0,3.0,76.0,8.0,9.0,512.3292


In [559]:
train_data.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [560]:
test_data.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [561]:
print("% of men that survived:", 100 * np.mean(train_data['Survived'][train_data['Sex'] == 'male']))
print("% of women that survived:", 100 * np.mean(train_data['Survived'][train_data['Sex'] == 'female']))

% of men that survived: 18.890814558058924
% of women that survived: 74.20382165605095


In [562]:
print('% of first class that survived', 100 * np.mean(train_data['Survived'][train_data['Pclass'] == 1]))
print('% of third class that survived', 100 * np.mean(train_data['Survived'][train_data['Pclass'] == 3]))

% of first class that survived 62.96296296296296
% of third class that survived 24.236252545824847


In [563]:
print('% of children that survived', 100 * np.mean(train_data['Survived'][train_data['Age'] < 18]))
print('% of adults that survived', 100 * np.mean(train_data['Survived'][train_data['Age'] >= 18]))

% of children that survived 53.98230088495575
% of adults that survived 38.10316139767055


## Features / Cleaning

In [564]:
train_data['FamilyCount'] = train_data['SibSp'] + train_data['Parch']
test_data['FamilyCount'] = test_data['SibSp'] + test_data['Parch']

In [565]:
features = ['Pclass', 'Sex', 'FamilyCount']

## Split Data

In [566]:
X = pd.get_dummies(train_data[features])
y = train_data['Survived']

X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.8, random_state=random_seed)

## Train Model

In [567]:
def print_model_results(y_preds, y):
    print('Accuracy:', accuracy_score(y_preds, y) * 100)
    print('F1:', f1_score(y_preds, y) * 100)
    
    conf_mat = confusion_matrix(y_preds, y)
    print('Confusion Matrix:\n', conf_mat)

In [568]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=random_seed)
model.fit(X_train, y_train)
y_val_preds = model.predict(X_val)

print_model_results(y_val_preds, y_val)

Accuracy: 80.44692737430168
F1: 74.82014388489208
Confusion Matrix:
 [[92 22]
 [13 52]]


In [569]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(random_state=random_seed)
model.fit(X_train, y_train)
y_val_preds = model.predict(X_val)

print_model_results(y_val_preds, y_val)

Accuracy: 78.77094972067039
F1: 72.85714285714285
Confusion Matrix:
 [[90 23]
 [15 51]]


In [570]:
from sklearn.svm import SVC

model = SVC(random_state=random_seed)
model.fit(X_train, y_train)
y_val_preds = model.predict(X_val)

print_model_results(y_val_preds, y_val)

Accuracy: 79.88826815642457
F1: 73.91304347826087
Confusion Matrix:
 [[92 23]
 [13 51]]


In [571]:
from sklearn.tree import DecisionTreeClassifier

model = DecisionTreeClassifier(random_state=random_seed)
model.fit(X_train, y_train)
y_val_preds = model.predict(X_val)

print_model_results(y_val_preds, y_val)

Accuracy: 79.88826815642457
F1: 72.72727272727273
Confusion Matrix:
 [[95 26]
 [10 48]]


In [572]:
from sklearn.ensemble import GradientBoostingClassifier

model = GradientBoostingClassifier(n_estimators=100, random_state=random_seed)
model.fit(X_train, y_train)
y_val_preds = model.predict(X_val)

print_model_results(y_val_preds, y_val)

Accuracy: 80.44692737430168
F1: 74.82014388489208
Confusion Matrix:
 [[92 22]
 [13 52]]


## Submission

In [573]:
model = GradientBoostingClassifier()
model.fit(X, y)

In [574]:
X_test = pd.get_dummies(test_data[features])
y_test_preds = model.predict(X_test)

In [575]:
output = pd.DataFrame({'PassengerId': test_data['PassengerId'], 'Survived': y_test_preds})
output.to_csv('submission.csv', index=False)