# Titanic - Machine Learning from Disaster

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
import numpy as np

In [None]:
# Read the labelled training split from disk; the final expression previews
# its first five rows as the cell output.
train_csv_path = 'data/train.csv'
data_train = pd.read_csv(train_csv_path)
data_train.head(n=5)

In [None]:
# Split the target off the training frame: keep 'Survived' as the label
# series and remove that column from data_train in place.
y_train = data_train['Survived']
del data_train['Survived']

In [None]:
# Read the test split from disk; the final expression previews its first
# five rows as the cell output.
test_csv_path = 'data/test.csv'
data_test = pd.read_csv(test_csv_path)
data_test.head(n=5)

## Check data

In [None]:
# Tag every row with its origin so both splits can be preprocessed together
# and separated again later through the boolean 'test' column.
data_test['test'] = True
data_train['test'] = False
# ignore_index=True gives the combined frame a unique 0..n-1 index. Without
# it each split keeps its own 0-based labels, so every label occurs twice and
# label-based lookups or index alignment on `data` become ambiguous.
data = pd.concat([data_train, data_test], axis=0, ignore_index=True)
# Number of missing values per column (displayed as the cell output).
data.isna().sum(axis=0)

In [None]:
# Print a summary of the combined frame (columns, dtypes, non-null counts).
data.info()

In [None]:
# Age distribution: histogram (20 bins) with a KDE overlay, computed on the
# non-missing ages only.
age = data['Age']
sns.displot(age.dropna(), kde=True, bins=20)
# Vertical reference lines: median in red, mean in green.
for reference_value, line_colour in ((age.median(), 'red'), (age.mean(), 'green')):
    plt.axvline(reference_value, color=line_colour)
plt.show()

In [None]:
# Fare distribution: histogram (20 bins) with a KDE overlay, computed on the
# non-missing fares only.
fare = data['Fare']
sns.displot(fare.dropna(), kde=True, bins=20)
# Vertical reference lines: median in red, mean in green.
for reference_value, line_colour in ((fare.median(), 'red'), (fare.mean(), 'green')):
    plt.axvline(reference_value, color=line_colour)
plt.show()

In [None]:
# Impute missing ages with the mean age of the combined train+test frame.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on data['Age']: that chained in-place call operates on
# a temporary Series, which emits a FutureWarning in pandas 2.x and leaves
# `data` unchanged once Copy-on-Write is active.
data['Age'] = data['Age'].fillna(data['Age'].mean())

In [None]:
# Embarked C = Cherbourg, Q = Queenstown, S = Southampton
# Encode the port as a number: C = 1, Q = 2, S = 3; anything that does not
# match (including missing values) becomes 0.
embarked_codes = {'C': 1, 'Q': 2, 'S': 3}
# map() yields NaN for unmapped entries; fill them in the same expression and
# assign back, instead of the chained fillna(inplace=True) which acts on a
# temporary Series (FutureWarning in pandas 2.x, no effect under
# Copy-on-Write).
data['Embarked'] = data['Embarked'].map(embarked_codes).fillna(0)
# Sanity check shown as the cell output: remaining missing values in the column.
data['Embarked'].isna().sum(axis=0)

In [None]:
# Ticket number: numeric part of the ticket string, stored as float64.
# The number is the LAST whitespace-separated token. Taking the second token
# (as before) mis-parses three-part tickets such as 'STON/O 2. 3101294'
# (-> 2.0) and needed special-casing for 'SC/AH Basle 541'.
last_token = data['Ticket'].str.split().str[-1]
# Tickets whose last token is not numeric (e.g. 'LINE') are coerced to NaN and
# then set to 0, matching the previous placeholder value.
# NOTE(review): a missing Ticket value would also end up as 0 here.
ticket_number = pd.to_numeric(last_token, errors='coerce').fillna(0)
ticket_number = ticket_number.astype('float64')
data['Ticket number'] = ticket_number

In [None]:
# Sex: binary encoding, 1 where the value is exactly 'male' and 0 otherwise.
is_male = data['Sex'].eq('male')
data['Sex'] = is_male.astype(int)

In [None]:
# Columns fed to the model.
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
# 'Fare' is used as a feature but is never imputed elsewhere in this notebook
# (the fare plot has to drop NaNs). A missing fare would reach the network as
# NaN and produce a meaningless prediction, so fill it with the median fare of
# the combined frame. This is a no-op when no fare is missing.
model_inputs = data[features].fillna({'Fare': data['Fare'].median()})
# Split back into the original train / test rows using the 'test' flag.
is_test = data['test']
X_train = model_inputs[~is_test]
X_test = model_inputs[is_test]
X_train.info()

In [None]:
# Fully connected binary classifier: two 64-unit ReLU hidden layers followed
# by a single sigmoid output unit.
# NOTE(review): no regularization (dropout, weight penalties) is configured;
# add it explicitly if it was intended.
model = tf.keras.Sequential([
    # Input width follows the feature list instead of a hard-coded 7, so the
    # model stays consistent when `features` changes. An explicit Input is
    # used rather than passing input_shape to the first Dense layer.
    tf.keras.Input(shape=(len(features),)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])

# Adam optimizer, binary cross-entropy loss, accuracy reported during training.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)


In [None]:
# Fit the network on the training features and labels for 3000 epochs.
model.fit(x=X_train, y=y_train, epochs=3000)

In [None]:
# Plot the training loss recorded by the last fit() call, skipping the first
# 2000 epochs so only the tail of the curve is drawn.
loss_per_epoch = model.history.history['loss']
sns.lineplot(data=loss_per_epoch[2000:])
plt.show()

In [None]:
# Score the test rows, then threshold the sigmoid outputs at 0.5 to obtain
# integer class labels (1 when the output is strictly greater than 0.5).
survival_prob = model.predict(X_test)
y_pred = np.greater(survival_prob, 0.5).astype(int)
# Flattened view of the labels, displayed as the cell output.
y_pred.ravel()

In [None]:
# Build the submission table with two columns, PassengerId and Survived:
# start from the test passengers' ids and attach the flattened predictions.
submission = data_test[['PassengerId']].copy()
submission['Survived'] = y_pred.reshape(-1)
# Write the table to CSV without the index column, then preview it.
submission.to_csv('data/submission.csv', index=False)
submission.head(n=5)