<a href="https://www.kaggle.com/code/yeemeitsang/titanic-keras?scriptVersionId=128745166" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

**Introduction**

This notebook walks through building a simple Keras `Sequential` model on a handful of features (sex, age, passenger class and fare) to predict Titanic survivors.

It outputs a CSV file that can be submitted straight to the competition.

In [1]:
import numpy as np 
import pandas as pd 
import tensorflow as tf
from tensorflow import keras 
from tensorflow.keras import layers
import sklearn
from sklearn.preprocessing import MinMaxScaler

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv
/kaggle/input/titanic-train/train.csv
/kaggle/input/titanic-test/test.csv


**Data preprocessing**

In [2]:
# Load the labelled training split and preview its first rows.
TRAIN_CSV = '/kaggle/input/titanic/train.csv'
df = pd.read_csv(TRAIN_CSV)
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Summarise column dtypes and non-null counts to see which columns contain missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [4]:
# Encode Sex numerically (male -> 0, female -> 1), then keep only the rows
# in which every column used as a model input is present.
sex_codes = {'male': 0, 'female': 1}
required_cols = ['Sex', 'Age', 'Pclass', 'Fare']
df['Sex'] = df['Sex'].replace(sex_codes)
df = df.dropna(subset=required_cols)
df.shape

(714, 12)

In [5]:
# Min-max scale Fare. The scaler works on a 2-D (n_samples, 1) array, so the
# column is pulled out as a one-column frame before converting to NumPy.
fare = df[['Fare']].to_numpy()
fare_minmax = MinMaxScaler()
fare_scaled = fare_minmax.fit_transform(fare)
print(fare_scaled.shape)

(714, 1)


In [6]:
# Write the scaled fares back into the frame *positionally*.
# After the earlier dropna, `df` keeps its original row labels (0..890 with gaps),
# whereas a DataFrame built from `fare_scaled` is labelled 0..713. Assigning
# such a DataFrame aligns on labels, which attaches fares to the wrong
# passengers and leaves NaN for every label above 713. A plain 1-D array is
# assigned row by row in order, so each passenger gets their own scaled fare
# and no NaN back-fill is needed afterwards.
df['Fare'] = fare_scaled.ravel()
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 714 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  714 non-null    int64  
 1   Survived     714 non-null    int64  
 2   Pclass       714 non-null    int64  
 3   Name         714 non-null    object 
 4   Sex          714 non-null    int64  
 5   Age          714 non-null    float64
 6   SibSp        714 non-null    int64  
 7   Parch        714 non-null    int64  
 8   Ticket       714 non-null    object 
 9   Fare         714 non-null    float64
 10  Cabin        185 non-null    object 
 11  Embarked     712 non-null    object 
dtypes: float64(2), int64(6), object(4)
memory usage: 72.5+ KB


In [7]:
# Keep only the four columns the model is trained on.
feature_cols = ['Sex', 'Age', 'Pclass', 'Fare']
X_train = df.loc[:, feature_cols]
X_train.shape

(714, 4)

In [8]:
# Give every feature the same floating-point dtype before tensor conversion.
X_train = X_train.astype(np.float64)
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 714 entries, 0 to 890
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Sex     714 non-null    float64
 1   Age     714 non-null    float64
 2   Pclass  714 non-null    float64
 3   Fare    714 non-null    float64
dtypes: float64(4)
memory usage: 27.9 KB


In [9]:
# Target vector: 1 if the passenger survived, 0 otherwise.
y_train = df.loc[:, 'Survived']
y_train.describe()

count    714.000000
mean       0.406162
std        0.491460
min        0.000000
25%        0.000000
50%        0.000000
75%        1.000000
max        1.000000
Name: Survived, dtype: float64

In [10]:
# Load the unlabelled competition test split and preview its first rows.
TEST_CSV = '/kaggle/input/titanic/test.csv'
df_test = pd.read_csv(TEST_CSV)
df_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [11]:
# Summarise the test split's dtypes and non-null counts to see which feature columns need imputing.
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [12]:
# Apply the same numeric Sex encoding used for training (male -> 0, female -> 1).
df_test['Sex'] = df_test['Sex'].replace({'male': 0, 'female': 1})

In [13]:
# Scale the test fares with a scaler fitted on the *training* fares so that
# both splits share one scale. Fitting a fresh scaler on the test split would
# tie the feature's meaning to whatever data happens to be scored.
# The training frame `df` already holds scaled fares at this point, so the raw
# training fares are re-read from disk, keeping the same rows the model is
# trained on (those with no missing model inputs).
train_fare = (
    pd.read_csv('/kaggle/input/titanic/train.csv')
    .dropna(subset=['Sex', 'Age', 'Pclass', 'Fare'])[['Fare']]
    .to_numpy()
)
fare_scaler = MinMaxScaler().fit(train_fare)

# transform() leaves a missing fare as NaN; it is imputed in a later cell.
fare = df_test[['Fare']].to_numpy()
fare_scaled = fare_scaler.transform(fare)
# Assign as a plain 1-D array (positional) instead of an index-aligned DataFrame.
df_test['Fare'] = fare_scaled.ravel()
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    int64  
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(5), object(4)
memory usage: 36.0+ KB


In [14]:
# Impute the remaining gaps so the feature matrix contains no NaNs.
# A missing fare becomes 0, the bottom of the scaled range.
df_test['Fare'] = df_test['Fare'].fillna(0)
# Forward-fill missing ages from the previous row. `ffill()` replaces the
# deprecated `fillna(method='pad')`; the trailing `bfill()` covers a missing
# age in the very first row, which a forward fill cannot reach.
df_test['Age'] = df_test['Age'].ffill().bfill()

In [15]:
# Build the test feature matrix with the same columns and dtype as X_train.
test_features = df_test.loc[:, ['Sex', 'Age', 'Pclass', 'Fare']]
X_test = test_features.astype(np.float64)
X_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Sex     418 non-null    float64
 1   Age     418 non-null    float64
 2   Pclass  418 non-null    float64
 3   Fare    418 non-null    float64
dtypes: float64(4)
memory usage: 13.2 KB


In [16]:
# Preview the prepared test features as a sanity check before converting them to tensors.
X_test.head()

Unnamed: 0,Sex,Age,Pclass,Fare
0,0.0,34.5,3.0,0.015282
1,1.0,47.0,3.0,0.013663
2,0.0,62.0,2.0,0.018909
3,0.0,27.0,3.0,0.016908
4,1.0,22.0,3.0,0.023984


In [17]:
# Convert the pandas objects to TensorFlow tensors in one pass.
X_train, X_test, y_train = map(tf.convert_to_tensor, (X_train, X_test, y_train))
print(type(X_test))

<class 'tensorflow.python.framework.ops.EagerTensor'>


**Build and train model**

In [18]:
# Seed NumPy and TensorFlow before the layers are created so that weight
# initialisation, and therefore training, is repeatable from run to run
# (bit-exact results can still depend on the hardware/software setup).
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Fully connected classifier over the 4 input features: the hidden layers
# narrow 128 -> 64 -> 32 -> 16, with batch normalisation after the 64- and
# 16-unit layers, and a single sigmoid unit outputs the survival probability.
model = keras.Sequential([
    layers.Dense(128, activation = 'relu', input_shape = (4, )),
    layers.Dense(64, activation = 'relu'),
    layers.BatchNormalization(),
    layers.Dense(32, activation = 'relu'),
    layers.Dense(16, activation = 'relu'),
    layers.BatchNormalization(),
    layers.Dense(1, activation = 'sigmoid')
])

In [19]:
# Binary classification set-up: binary cross-entropy on the sigmoid output,
# tracking accuracy. `metrics` is passed as a list, which is the documented
# form; newer Keras releases reject a bare string.
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['acc'])

# Stop once validation accuracy has not improved for 100 epochs and restore
# the best weights seen. The mode is stated explicitly because accuracy is
# a quantity to maximise.
es = keras.callbacks.EarlyStopping(
    monitor = 'val_acc',
    mode = 'max',
    patience = 100,
    restore_best_weights = True,
)

# validation_split = 0.4 holds out 40% of the rows for validation.
# The History object is kept so the training curves can be inspected later.
history = model.fit(
    X_train,
    y_train,
    batch_size = 100,
    epochs = 400,
    validation_split = 0.4,
    callbacks = [es],
)

Epoch 1/400
Epoch 2/400
Epoch 3/400
Epoch 4/400
Epoch 5/400
Epoch 6/400
Epoch 7/400
Epoch 8/400
Epoch 9/400
Epoch 10/400
Epoch 11/400
Epoch 12/400
Epoch 13/400
Epoch 14/400
Epoch 15/400
Epoch 16/400
Epoch 17/400
Epoch 18/400
Epoch 19/400
Epoch 20/400
Epoch 21/400
Epoch 22/400
Epoch 23/400
Epoch 24/400
Epoch 25/400
Epoch 26/400
Epoch 27/400
Epoch 28/400
Epoch 29/400
Epoch 30/400
Epoch 31/400
Epoch 32/400
Epoch 33/400
Epoch 34/400
Epoch 35/400
Epoch 36/400
Epoch 37/400
Epoch 38/400
Epoch 39/400
Epoch 40/400
Epoch 41/400
Epoch 42/400
Epoch 43/400
Epoch 44/400
Epoch 45/400
Epoch 46/400
Epoch 47/400
Epoch 48/400
Epoch 49/400
Epoch 50/400
Epoch 51/400
Epoch 52/400
Epoch 53/400
Epoch 54/400
Epoch 55/400
Epoch 56/400
Epoch 57/400
Epoch 58/400
Epoch 59/400
Epoch 60/400
Epoch 61/400
Epoch 62/400
Epoch 63/400
Epoch 64/400
Epoch 65/400
Epoch 66/400
Epoch 67/400
Epoch 68/400
Epoch 69/400
Epoch 70/400
Epoch 71/400
Epoch 72/400
Epoch 73/400
Epoch 74/400
Epoch 75/400
Epoch 76/400
Epoch 77/400
Epoch 78

<keras.callbacks.History at 0x79fd7e4c70d0>

In [20]:
# Optional: persist the trained model to 'titanic.h5' so it can be reloaded later without retraining.
model.save('titanic.h5')

In [21]:
# To reuse the saved model in a later session instead of retraining, uncomment these two lines:
# from keras.models import load_model
# model = load_model('titanic.h5')

**Predict and prepare submission file**

In [22]:
# Predict on the test features (one sigmoid output per passenger) and show the first five values.
prob = model.predict(X_test)
prob[:5]

array([[0.0218583 ],
       [0.15801018],
       [0.00302312],
       [0.03619918],
       [0.45575178]], dtype=float32)

In [23]:
# Apply a 0.5 decision threshold: probabilities below it map to 0 (did not
# survive), everything else to 1. The (n, 1) prediction array is flattened
# first so the result is a flat list of ints.
THRESHOLD = 0.5
pred = np.where(prob.ravel() < THRESHOLD, 0, 1).tolist()
pred[:5]

[0, 0, 0, 0, 0]

In [24]:
# Pair each PassengerId with its predicted label and write the two-column
# CSV (no index column) for submission to the competition.
pred = pd.DataFrame({'Survived': pred})
sub = pd.concat([df_test['PassengerId'], pred], axis='columns')
sub.to_csv('submission.csv', index=False)
sub.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
