In [1]:
import keras
import pandas as pd
import numpy as np
from keras.layers import *
from keras.models import Model
from sklearn import preprocessing

Using TensorFlow backend.


In [2]:
#Load Data
train_data = pd.read_csv('data/train.csv')
eval_data = pd.read_csv('data/test.csv')
print(train_data.info())
print("-"*10)
print(eval_data.info())
print(train_data.head())
#save pId's for eval data
pid = eval_data['PassengerId']
#train_data = train_data.dropna(1)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
None
----------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           41

In [3]:
# Check number of null and non null in data
print('Train columns with null values:\n', train_data.isnull().sum())
print("-"*10)

print('Test/Validation columns with null values:\n', train_data.isnull().sum())
print("-"*10)


Train columns with null values:
 PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
----------
Test/Validation columns with null values:
 PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
----------


In [4]:
#function to preprocess Data
def preprocess_data(dataset):
    #replace null value of age by median of all non-null age in dataset
    dataset['Age'].fillna(dataset['Age'].median(), inplace = True)
    
    #replace null value of stations by mode of the data (since it is categorical data)
    dataset['Embarked'].fillna(dataset['Embarked'].mode()[0], inplace = True)
    
    dataset['Fare'].fillna(dataset['Fare'].median(), inplace = True)
    
    
    #Convert categorical data into int
    dataset['Sex'] = pd.Categorical(dataset.Sex).codes
    dataset['Embarked'] = pd.Categorical(dataset.Embarked).codes
    
    
    #Add additional features to data
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1
    dataset['isAlone'] = 1
    dataset['isAlone'].loc[dataset['FamilySize'] > 1] = 0
    
    dataset['FareBin'] = pd.qcut(dataset['Fare'].astype(int), 4)
    dataset['AgeBin'] = pd.cut(dataset['Age'].astype(int), 5)
    
    #Convert range into numerical
    label = preprocessing.LabelEncoder()
    
    dataset['AgeBin_Code'] = label.fit_transform(dataset['AgeBin'])
    dataset['FareBin_Code'] = label.fit_transform(dataset['FareBin'])
    
    #Drop cols Name, PassengerID, and Cabin since they are most likely not responsible in survival
    drop_cols = ['Name', 'PassengerId', 'Cabin', 'Ticket', 'FareBin', 'AgeBin']
    dataset.drop(drop_cols, axis=1, inplace=True)
    
    return dataset

In [5]:
train_data = preprocess_data(train_data)
eval_data = preprocess_data(eval_data)

# Again check number of null values in data
print('Train columns with null values:\n', train_data.isnull().sum())
print("-"*10)

print('Test/Validation columns with null values:\n', train_data.isnull().sum())
print("-"*10)

Train columns with null values:
 Survived        0
Pclass          0
Sex             0
Age             0
SibSp           0
Parch           0
Fare            0
Embarked        0
FamilySize      0
isAlone         0
AgeBin_Code     0
FareBin_Code    0
dtype: int64
----------
Test/Validation columns with null values:
 Survived        0
Pclass          0
Sex             0
Age             0
SibSp           0
Parch           0
Fare            0
Embarked        0
FamilySize      0
isAlone         0
AgeBin_Code     0
FareBin_Code    0
dtype: int64
----------


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [6]:
train_data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,FamilySize,isAlone,AgeBin_Code,FareBin_Code
0,0,3,1,22.0,1,0,7.25,2,2,0,1,0
1,1,1,0,38.0,1,0,71.2833,0,2,0,2,3
2,1,3,0,26.0,0,0,7.925,2,1,1,1,0
3,1,1,0,35.0,1,0,53.1,2,2,0,2,3
4,0,3,1,35.0,0,0,8.05,2,1,1,2,1


In [7]:
train_data.shape

(891, 12)

In [8]:
X_train, Y_train= train_data.loc[:, 'Pclass':], train_data['Survived']

In [9]:
X_train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,FamilySize,isAlone,AgeBin_Code,FareBin_Code
0,3,1,22.0,1,0,7.25,2,2,0,1,0
1,1,0,38.0,1,0,71.2833,0,2,0,2,3
2,3,0,26.0,0,0,7.925,2,1,1,1,0
3,1,0,35.0,1,0,53.1,2,2,0,2,3
4,3,1,35.0,0,0,8.05,2,1,1,2,1


In [10]:
Y_train.head()

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

In [11]:
X_train, Y_train = X_train.values, Y_train.values
#data_Normalization
#X_train=(X_train-X_train.mean())/X_train.std()

In [12]:
def createModel():
    inp = Input(X_train.shape[1:])
    x = Dense(128, kernel_initializer='normal', activation='relu')(inp)
    #x = Dropout(0.5)(x)
    x = Dense(256, kernel_initializer='normal', activation='relu')(x)
    x = Dropout(0.5)(x)
    x = Dense(256, kernel_initializer='normal', activation='relu')(x)
    x = Dropout(0.5)(x)
    x = Dense(256, kernel_initializer='normal', activation='relu')(x)
    #x = Dropout(0.5)(x)
    #     x = Dense(8, kernel_initializer='normal', activation='relu')(x)
    #     x = Dropout(0.5)(x)
    #     x = Dense(4, kernel_initializer='normal', activation='relu')(x)
    #     x = Dropout(0.5)(x)
    x = Dense(1, activation = 'sigmoid')(x)
    model = Model(inputs=inp, outputs=x)
    return model

In [13]:
model = createModel()

In [14]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [15]:
model.fit(x=X_train, y=Y_train, validation_split=0.2, epochs=150, batch_size=64)

Train on 712 samples, validate on 179 samples
Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150


Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/150
Epoch 72/150
Epoch 73/150
Epoch 74/150
Epoch 75/150
Epoch 76/150
Epoch 77/150
Epoch 78/150
Epoch 79/150
Epoch 80/150
Epoch 81/150
Epoch 82/150
Epoch 83/150
Epoch 84/150
Epoch 85/150
Epoch 86/150
Epoch 87/150
Epoch 88/150
Epoch 89/150
Epoch 90/150
Epoch 91/150
Epoch 92/150
Epoch 93/150
Epoch 94/150
Epoch 95/150
Epoch 96/150
Epoch 97/150
Epoch 98/150
Epoch 99/150
Epoch 100/150
Epoch 101/150
Epoch 102/150
Epoch 103/150
Epoch 104/150
Epoch 105/150
Epoch 106/150
Epoch 107/150
Epoch 108/150
Epoch 109/150
Epoch 110/150
Epoch 111/150
Epoch 112/150
Epoch 113/150
Epoch 114/150
Epoch 115/150
Epoch 116/150
Epoch 117/150
Epoch 118/150
Epoch 119/150
Epoch 120/150
Epoch 121/150


Epoch 122/150
Epoch 123/150
Epoch 124/150
Epoch 125/150
Epoch 126/150
Epoch 127/150
Epoch 128/150
Epoch 129/150
Epoch 130/150
Epoch 131/150
Epoch 132/150
Epoch 133/150
Epoch 134/150
Epoch 135/150
Epoch 136/150
Epoch 137/150
Epoch 138/150
Epoch 139/150
Epoch 140/150
Epoch 141/150
Epoch 142/150
Epoch 143/150
Epoch 144/150
Epoch 145/150
Epoch 146/150
Epoch 147/150
Epoch 148/150
Epoch 149/150
Epoch 150/150


<keras.callbacks.History at 0x7fd4a3554940>

In [17]:
model.save('./trained_models/titanic_NN_v3.h5')

In [18]:
# test_data = pd.read_csv('data/test.csv')
eval_data.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,FamilySize,isAlone,AgeBin_Code,FareBin_Code
0,3,1,34.5,0,0,7.8292,1,1,1,2,0
1,3,0,47.0,1,0,7.0,2,2,0,3,0
2,2,1,62.0,0,0,9.6875,1,1,1,4,1
3,3,1,27.0,0,0,8.6625,2,1,1,1,1
4,3,0,22.0,1,1,12.2875,2,3,0,1,1


In [19]:
#eval_data=(eval_data-eval_data.mean())/eval_data.std()
eval_data.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,FamilySize,isAlone,AgeBin_Code,FareBin_Code
0,3,1,34.5,0,0,7.8292,1,1,1,2,0
1,3,0,47.0,1,0,7.0,2,2,0,3,0
2,2,1,62.0,0,0,9.6875,1,1,1,4,1
3,3,1,27.0,0,0,8.6625,2,1,1,1,1
4,3,0,22.0,1,1,12.2875,2,3,0,1,1


In [20]:
labels = model.predict(eval_data)

In [21]:
labels

array([[0.11496806],
       [0.19668719],
       [0.10073446],
       [0.11894929],
       [0.78135973],
       [0.21293887],
       [0.8051061 ],
       [0.07454533],
       [0.6800922 ],
       [0.04022956],
       [0.10883601],
       [0.3255888 ],
       [0.99917054],
       [0.12080293],
       [0.99951005],
       [0.9756064 ],
       [0.170874  ],
       [0.19517438],
       [0.28184322],
       [0.6779627 ],
       [0.34070545],
       [0.26834813],
       [0.99822253],
       [0.35192648],
       [0.50358987],
       [0.06105202],
       [0.9974606 ],
       [0.18957663],
       [0.36086625],
       [0.25194088],
       [0.14635254],
       [0.05404917],
       [0.7642036 ],
       [0.5498986 ],
       [0.37266043],
       [0.20871536],
       [0.35409504],
       [0.41293994],
       [0.10342573],
       [0.3901578 ],
       [0.06139066],
       [0.37109077],
       [0.11671633],
       [0.906446  ],
       [0.99922097],
       [0.09566654],
       [0.32500288],
       [0.122

In [22]:
labels = labels > 0.50

In [23]:
labels = labels*1

In [24]:
labels

array([[0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [1],
       [0],
       [1],
       [0],
       [0],
       [0],
       [1],
       [0],
       [1],
       [1],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [1],
       [0],
       [1],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [1],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [1],
       [1],
       [1],
       [1],
       [0],
       [0],
       [0],
       [1],
       [0],
       [1],
       [0],
       [1],
       [0],
       [0],
       [1],
       [0],
       [1],
       [1],
       [0],
       [0],
    

In [25]:
df = pd.DataFrame(labels)

In [26]:
df.head()

Unnamed: 0,0
0,0
1,0
2,0
3,0
4,1


In [27]:
df['PassengerId'] = pid

In [28]:
df.columns = ['Survived', 'PassengerId']

In [29]:
df.head()

Unnamed: 0,Survived,PassengerId
0,0,892
1,0,893
2,0,894
3,0,895
4,1,896


In [30]:
df.to_csv('data/mysub_v3.csv', index=False)