In [1]:
import tensorflow as tf
import os
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import gc

from sklearn.preprocessing import LabelEncoder,MinMaxScaler
from sklearn.impute import SimpleImputer
from tensorflow.keras import optimizers
from tensorflow.keras import layers,Model

In [2]:
# import data
trainData = pd.read_csv('train.csv',header = 0)
testData = pd.read_csv('test.csv',header = 0)




In [3]:
# Deal with missing values
def nan_padding(data, columns):
    for column in columns:
        imputer=SimpleImputer()
        data[column]=imputer.fit_transform(data[column].values.reshape(-1,1))
    return data

columns = ["Age", "SibSp", "Parch","Fare"]

trainData = nan_padding(trainData, columns)
testData = nan_padding(testData, columns)

trainData.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1.0,0.0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1.0,0.0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0.0,0.0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1.0,0.0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0.0,0.0,373450,8.05,,S


In [4]:
# Drop useless data
def DropColumn(data, columns):    
    return data.drop(columns,axis = 1)


columns = ['Name',"Ticket", "Cabin","Embarked"]
trainData = DropColumn(trainData,columns)
testData = DropColumn(testData,columns)

trainData.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
0,1,0,3,male,22.0,1.0,0.0,7.25
1,2,1,1,female,38.0,1.0,0.0,71.2833
2,3,1,3,female,26.0,0.0,0.0,7.925
3,4,1,1,female,35.0,1.0,0.0,53.1
4,5,0,3,male,35.0,0.0,0.0,8.05


In [5]:
def DummyData(data, columns):
    for c in columns:
        data = pd.concat([data,pd.get_dummies(data[c],prefix = c)],axis = 1)
        data = data.drop(c,axis = 1)
    return data

columns = ['Pclass']
trainData = DummyData(trainData,columns)
testData = DummyData(testData,columns)

trainData.head()

Unnamed: 0,PassengerId,Survived,Sex,Age,SibSp,Parch,Fare,Pclass_1,Pclass_2,Pclass_3
0,1,0,male,22.0,1.0,0.0,7.25,0,0,1
1,2,1,female,38.0,1.0,0.0,71.2833,1,0,0
2,3,1,female,26.0,0.0,0.0,7.925,0,0,1
3,4,1,female,35.0,1.0,0.0,53.1,1,0,0
4,5,0,male,35.0,0.0,0.0,8.05,0,0,1


In [6]:
def ConvertSex(data):
    le = LabelEncoder()
    le.fit(['male','female'])
    data['Sex'] = le.transform(data['Sex'])
    return
ConvertSex(trainData)
ConvertSex(testData)

trainData.head()

Unnamed: 0,PassengerId,Survived,Sex,Age,SibSp,Parch,Fare,Pclass_1,Pclass_2,Pclass_3
0,1,0,1,22.0,1.0,0.0,7.25,0,0,1
1,2,1,0,38.0,1.0,0.0,71.2833,1,0,0
2,3,1,0,26.0,0.0,0.0,7.925,0,0,1
3,4,1,0,35.0,1.0,0.0,53.1,1,0,0
4,5,0,1,35.0,0.0,0.0,8.05,0,0,1


In [7]:
def NormalizeData(data,columns):
    scaler = MinMaxScaler()
    for c in columns:
        data[c] = scaler.fit_transform(data[c].values.reshape(-1,1))
    return data


columns = ['Age','Fare']
trainData = NormalizeData(trainData,columns)
testData = NormalizeData(testData,columns)

trainData.head()

Unnamed: 0,PassengerId,Survived,Sex,Age,SibSp,Parch,Fare,Pclass_1,Pclass_2,Pclass_3
0,1,0,1,0.271174,1.0,0.0,0.014151,0,0,1
1,2,1,0,0.472229,1.0,0.0,0.139136,1,0,0
2,3,1,0,0.321438,0.0,0.0,0.015469,0,0,1
3,4,1,0,0.434531,1.0,0.0,0.103644,1,0,0
4,5,0,1,0.434531,0.0,0.0,0.015713,0,0,1


In [8]:
# train data after processing
trainData.head()


Unnamed: 0,PassengerId,Survived,Sex,Age,SibSp,Parch,Fare,Pclass_1,Pclass_2,Pclass_3
0,1,0,1,0.271174,1.0,0.0,0.014151,0,0,1
1,2,1,0,0.472229,1.0,0.0,0.139136,1,0,0
2,3,1,0,0.321438,0.0,0.0,0.015469,0,0,1
3,4,1,0,0.434531,1.0,0.0,0.103644,1,0,0
4,5,0,1,0.434531,0.0,0.0,0.015713,0,0,1


In [9]:
# test data after processing
testData.head()

Unnamed: 0,PassengerId,Sex,Age,SibSp,Parch,Fare,Pclass_1,Pclass_2,Pclass_3
0,892,1,0.452723,0.0,0.0,0.015282,0,0,1
1,893,0,0.617566,1.0,0.0,0.013663,0,0,1
2,894,1,0.815377,0.0,0.0,0.018909,0,1,0
3,895,1,0.353818,0.0,0.0,0.016908,0,0,1
4,896,0,0.287881,1.0,1.0,0.023984,0,0,1


In [10]:
trainX = trainData.iloc[:,2:].to_numpy()
trainY = trainData.loc[:,'Survived'].to_numpy()
testX = testData.iloc[:,1:].to_numpy()
testY = pd.read_csv('gender_submission.csv',header = 0).iloc[:,1].to_numpy()

In [11]:
# Check point
cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath = "./checkpoint.ckpt" , 
                                                 save_weights_only = True,
                                                 save_best_only = True,
                                                 peroid=5, 
                                                 verbose = 1)

In [12]:
def CreateModel():
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(units = 1024, 
                              activation = 'relu', input_shape=(8,),                               
                              kernel_regularizer = tf.keras.regularizers.l1(0.0001)
                             ),
        tf.keras.layers.Dense(units = 4096, activation = 'relu', kernel_regularizer = tf.keras.regularizers.l1(0.00001)),        
        tf.keras.layers.Dense(units = 4096, activation = 'relu', kernel_regularizer = tf.keras.regularizers.l1(0.00001)),
        tf.keras.layers.Dropout(0.2), 
        tf.keras.layers.Dense(units = 4096, activation = 'relu', kernel_regularizer = tf.keras.regularizers.l1(0.00001)),        
        tf.keras.layers.Dense(units = 4096, activation = 'relu', kernel_regularizer = tf.keras.regularizers.l1(0.00001)),
        tf.keras.layers.Dropout(0.2), 
        tf.keras.layers.Dense(units = 4096, activation = 'relu', kernel_regularizer = tf.keras.regularizers.l1(0.00001)),        
        tf.keras.layers.Dense(units = 4096, activation = 'relu', kernel_regularizer = tf.keras.regularizers.l1(0.00001)),
        tf.keras.layers.Dropout(0.2), 
        tf.keras.layers.Dense(units = 1024, activation = 'relu', kernel_regularizer = tf.keras.regularizers.l1(0.00001)),                
        tf.keras.layers.Dropout(0.2), 
        tf.keras.layers.Dense(1,activation='sigmoid')
    ])
    return model

In [13]:
model = CreateModel()
model.compile(loss = 'binary_crossentropy',
             optimizer=tf.keras.optimizers.Adam(lr = 0.001),
             metrics = ['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 1024)              9216      
_________________________________________________________________
dense_1 (Dense)              (None, 4096)              4198400   
_________________________________________________________________
dense_2 (Dense)              (None, 4096)              16781312  
_________________________________________________________________
dropout (Dropout)            (None, 4096)              0         
_________________________________________________________________
dense_3 (Dense)              (None, 4096)              16781312  
_________________________________________________________________
dense_4 (Dense)              (None, 4096)              16781312  
_________________________________________________________________
dropout_1 (Dropout)          (None, 4096)              0

In [None]:
#model.load_weights('./checkpoint.ckpt')
model.fit(x = trainX,
          y = trainY,
          epochs = 2000,
          verbose = 2,
          validation_data = (testX,testY),
          callbacks=[cp_callback],
         )
          

Train on 891 samples, validate on 418 samples
Epoch 1/2000

Epoch 00001: val_loss improved from inf to 4.13107, saving model to ./checkpoint.ckpt
891/891 - 41s - loss: 8.1840 - accuracy: 0.5960 - val_loss: 4.1311 - val_accuracy: 0.9211
Epoch 2/2000

Epoch 00002: val_loss improved from 4.13107 to 2.34794, saving model to ./checkpoint.ckpt
891/891 - 46s - loss: 3.2598 - accuracy: 0.7497 - val_loss: 2.3479 - val_accuracy: 0.8780
Epoch 3/2000

Epoch 00003: val_loss improved from 2.34794 to 1.51666, saving model to ./checkpoint.ckpt
891/891 - 51s - loss: 2.0819 - accuracy: 0.7980 - val_loss: 1.5167 - val_accuracy: 0.9139
Epoch 4/2000

Epoch 00004: val_loss improved from 1.51666 to 1.20123, saving model to ./checkpoint.ckpt
891/891 - 58s - loss: 1.5826 - accuracy: 0.7868 - val_loss: 1.2012 - val_accuracy: 0.9306
Epoch 5/2000

Epoch 00005: val_loss improved from 1.20123 to 1.04717, saving model to ./checkpoint.ckpt
891/891 - 37s - loss: 1.3225 - accuracy: 0.7912 - val_loss: 1.0472 - val_accur

In [None]:
# evaluate
test_loss, test_acc = model.evaluate(testX,testY,verbose = 0)

print('test accuracy: {}, test loss: {}'.format(test_acc,test_loss))

In [None]:
# predictions
predictions = model.predict_classes(testX).reshape(len(testX))
# write csv file
submission =pd.DataFrame({'PassengerId':testData['PassengerId'],'Survived':predictions})
submission.to_csv('Titanic Predictions.csv',index = False)