In [1]:
import math
import numpy as np
import pandas as pd

In [2]:
# Training and test data files
train_file = '../input/train.csv'
test_file = '../input/test.csv'
model_file = '../output/titanic.model.json'
model_weights_file = '../output/titanic.model.best.hdf5'
pred_file = '../output/gender_submission.csv'

In [6]:
# Load training data
df_train_raw = pd.read_csv(train_file)
print(df_train_raw.shape)
df_train_raw.info()
df_train_raw.head()

(891, 12)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [7]:
# Prepare the data for training and testing
from sklearn.preprocessing import MinMaxScaler

max_name_len = df_train_raw.Name.map(len).max()
max_ticket_len = df_train_raw.Ticket.map(len).max()

title_list=['Mrs', 'Mr', 'Master', 'Miss', 'Major', 'Rev', 'Dr', 'Ms', 'Mlle',
            'Col', 'Capt', 'Mme', 'Countess', 'Don', 'Jonkheer']

import string
def substrings_in_string(big_string, substrings):
    for substring in substrings:
        if substring in big_string:
            return substring
    return np.nan

def prep_data(frame, mode='test'):
    # Fill missing Age data with median 
    frame['Age'] = frame['Age'].fillna(frame['Age'].mean())
    
    # Generate data about whether adult or minor
    frame['Adult_Or_Minor'] = frame.apply(lambda row: 0 if row['Age'] < 18 else 1, axis=1)

    # Generate data about whether senior citizen
    frame['Senior_Citizen'] = frame.apply(lambda row: 0 if row['Age'] > 65 else 1, axis=1)

    # Fill missing Fare data with median
    frame['Fare'] = frame['Fare'].fillna(frame['Fare'].median())
    
    # Creating new family_size and fare per person columns 
    frame['Family_Size'] = frame['SibSp'] + frame['Parch'] + 1
    frame['Alone'] = frame.apply(lambda row: 1 if row['Family_Size'] == 1 else 0, axis=1)
    frame['Fare_Per_Person'] = frame['Fare']/frame['Family_Size']

    # Convert Sex to number
    frame['Sex'] = pd.Categorical(frame['Sex']).codes

    # Generate data for missing Embarked and convert to number
    frame['Embarked'] = frame['Embarked'].fillna('X')
    frame['Embarked'] = pd.Categorical(frame['Embarked']).codes
    
    # Extract title from name
    frame['Title'] = frame['Name'].map(lambda x: substrings_in_string(x, title_list))
    frame['Title'] = pd.Categorical(frame['Title']).codes

    # Convert Name into characters
    frame['Name_Length'] = frame.apply(lambda row: len(row['Name']), axis=1)
    frame['Words_In_Name'] = frame.apply(lambda row: len(row['Name'].split()), axis=1)    
#    for i in range(0, max_name_len):
#        col_name = 'Name' + str(i)
#        frame[col_name] = frame['Name'].str[i]
#        frame[col_name] = frame.apply(lambda row: 0 if isinstance(row[col_name], float) and math.isnan(row[col_name]) else ord(row[col_name]), axis=1)
    frame.pop('Name')    
    
    # Convert Ticket into characters
    frame['Ticket_Length'] = frame.apply(lambda row: len(row['Ticket']), axis=1)
#    for i in range(0, max_ticket_len):
#        col_name = 'Ticket' + str(i)
#        frame[col_name] = frame['Ticket'].str[i]
#        frame[col_name] = frame.apply(lambda row: 0 if isinstance(row[col_name], float) and math.isnan(row[col_name]) else ord(row[col_name]), axis=1)
    frame.pop('Ticket')    
    
    # Convert Cabin column to whether in cabin
    frame['Cabin'] = frame['Cabin'].fillna('')
    frame['In_Cabin'] = frame.apply(lambda row: 1 if row['Cabin'] != '' else 0, axis=1)
    frame['Number_Of_Cabins'] = frame.apply(lambda row: len(row['Cabin'].split()), axis=1)    
    frame.pop('Cabin')
    
    frame.fillna(0, axis=1)
    
    # Introduce rows with some noise
    if (mode == 'augment'):
        print('Adding more rows to training data')
        row_count = frame.shape[0]
        print('Row count before: ', row_count)
        col_std = np.std(frame) 
        for i in range(0, row_count):
            row1 = pd.Series(frame.iloc[i])
            row2 = pd.Series(frame.iloc[i])
            col_list = frame.columns.tolist()
            col_list.remove('PassengerId')
            col_list.remove('Survived')
            for col in frame.columns.tolist():
                row1[col] = row1[col] + np.random.random_sample() * col_std[col]
                row2[col] = row2[col] - np.random.random_sample() * col_std[col]
            if np.random.random_sample() < 0.33:
                frame = frame.append(row1)
            if np.random.random_sample() > 0.66:
                frame = frame.append(row2)
        row_count = frame.shape[0]
        print('Row count after: ', row_count)
    
    
    print("Before scaling: ")
    print(frame.head())
    
    # Scale everything except PassengerId
    min_max_scaler = MinMaxScaler()
    col_list = frame.columns.tolist()
    col_list.remove('PassengerId')
    frame = frame[col_list]
    np_scaled = min_max_scaler.fit_transform(frame)
    frame = pd.DataFrame(np_scaled)
    
    print("After scaling: ")
    print(frame.head())

    return frame



In [8]:
# Prep training data
df_train = prep_data(df_train_raw, mode='test')

# Construct the X array
X_train = np.array(df_train)[:,1:]
X_train = X_train.astype('float32')
print(X_train.shape)
print(X_train[0])

Before scaling: 
   PassengerId  Survived  Pclass  Sex   Age  SibSp  Parch     Fare  Embarked  \
0            1         0       3    1  22.0      1      0   7.2500         2   
1            2         1       1    0  38.0      1      0  71.2833         0   
2            3         1       3    0  26.0      0      0   7.9250         2   
3            4         1       1    0  35.0      1      0  53.1000         2   
4            5         0       3    1  35.0      0      0   8.0500         2   

   Adult_Or_Minor  Senior_Citizen  Family_Size  Alone  Fare_Per_Person  Title  \
0               1               1            2      0          3.62500     11   
1               1               1            2      0         35.64165     12   
2               1               1            1      1          7.92500      8   
3               1               1            2      0         26.55000     12   
4               1               1            1      1          8.05000     11   

   Name_Length 

In [9]:
# Extract survived data as predictions
from keras.utils.np_utils import to_categorical
y_train = np.array(df_train)[:,0]
y_train = y_train.astype('int')
y_train = to_categorical(y_train, 2)
print(y_train.shape)
print(y_train[0:5])

Using TensorFlow backend.


(891, 2)
[[ 1.  0.]
 [ 0.  1.]
 [ 0.  1.]
 [ 0.  1.]
 [ 1.  0.]]


In [10]:
# Load test data
df_test_raw = pd.read_csv(test_file)
print(df_test_raw.shape)
df_test_raw.head()
df_test_raw.info()

(418, 11)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [11]:
# Prepare the data for testing
df_test = prep_data(df_test_raw)

# Construct the X array
X_test = np.array(df_test)[:,:]
X_test = X_test.astype('float32')
print(X_test.shape)
print(X_test[0])

Before scaling: 
   PassengerId  Pclass  Sex   Age  SibSp  Parch     Fare  Embarked  \
0          892       3    1  34.5      0      0   7.8292         1   
1          893       3    0  47.0      1      0   7.0000         2   
2          894       2    1  62.0      0      0   9.6875         1   
3          895       3    1  27.0      0      0   8.6625         2   
4          896       3    0  22.0      1      1  12.2875         2   

   Adult_Or_Minor  Senior_Citizen  Family_Size  Alone  Fare_Per_Person  Title  \
0               1               1            1      1         7.829200      5   
1               1               1            2      0         3.500000      6   
2               1               1            1      1         9.687500      5   
3               1               1            1      1         8.662500      5   
4               1               1            3      0         4.095833      6   

   Name_Length  Words_In_Name  Ticket_Length  In_Cabin  Number_Of_Cabins  


In [30]:
# Build a training network

from keras.models import Sequential
from keras.layers import Dense, Dropout, RepeatVector, Flatten, Activation
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.optimizers import SGD
from keras.layers.advanced_activations import LeakyReLU

model = Sequential()
model.add(Dense(891, activation='relu', input_shape=(X_train.shape[1],)))
model.add(Dropout(0.25))
model.add(Dense(445, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(222, activation='relu'))
model.add(Dropout(0.75))
model.add(Dense(2, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_10 (Dense)             (None, 891)               16929     
_________________________________________________________________
dropout_7 (Dropout)          (None, 891)               0         
_________________________________________________________________
dense_11 (Dense)             (None, 445)               396940    
_________________________________________________________________
dropout_8 (Dropout)          (None, 445)               0         
_________________________________________________________________
dense_12 (Dense)             (None, 222)               99012     
_________________________________________________________________
dropout_9 (Dropout)          (None, 222)               0         
_________________________________________________________________
dense_13 (Dense)             (None, 2)                 446       
Total para

In [31]:
# Save the model
model_json = model.to_json()
with open(model_file, 'w') as json_file:
    json_file.write(model_json)

In [32]:
# Train the model
checkpointer = ModelCheckpoint(filepath=model_weights_file, verbose=1, save_best_only=True)
stopper = EarlyStopping(monitor='val_loss', min_delta=1e-4, patience=10, verbose=1, mode='auto')
hist = model.fit(X_train, y_train, epochs=200, batch_size=20, validation_split=0.3,
                 callbacks=[checkpointer, stopper], 
                 verbose=1, shuffle=True)

Train on 623 samples, validate on 268 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 00030: early stopping


In [33]:
# Load the weights that yielded the best validation accuracy
model.load_weights(model_weights_file)

# Evaluate the model on the training set
score = model.evaluate(X_train, y_train)
print("\nTraining Accuracy:", score[1])

Training Accuracy: 0.844556678559


In [34]:
# Predict for test data
y_test = model.predict(X_test)
print(y_test[0])

[ 0.90940678  0.09485527]


In [35]:
# Save predictions
with open(pred_file, 'w') as f:
    f.write('PassengerId,Survived\n')
    for index, y_hat in enumerate(y_test):
        prediction = np.argmax(y_hat)
        f.write(str(int(df_test_raw.iloc[index]['PassengerId'])) + ',' + str(prediction)+'\n')
    f.close()

In [37]:
# Use random forest classification 

from sklearn.model_selection import cross_val_score
from sklearn.datasets import make_blobs
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier

clf = RandomForestClassifier(n_estimators=20000, warm_start=True, n_jobs=-1, random_state=0, verbose=1)
clf.fit(X_train, y_train)
scores = cross_val_score(clf, X_train, y_train)
scores.mean()     


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed:    2.4s
[Parallel(n_jobs=-1)]: Done 2434 tasks      | elapsed:    3.4s
[Parallel(n_jobs=-1)]: Done 3184 tasks      | elapsed:    4.4s
[Parallel(n_jobs=-1)]: Done 4034 tasks      | elapsed:    5.6s
[Parallel(n_jobs=-1)]: Done 4984 tasks      | elapsed:    6.8s
[Parallel(n_jobs=-1)]: Done 6034 tasks      | elapsed:    8.2s
[Parallel(n_jobs=-1)]: Done 7184 tasks      | elapsed:    9.7s
[Parallel(n_jobs=-1)]: Done 8434 tasks      | elapsed:   11.4s
[Parallel(n_jobs=-1)]: Done 9784 tasks      | elapsed:   13.2s
[Parallel(n_jobs=-1)]: Done 11234 tasks      | elapsed:   15.2s
[Parallel(n_jobs=-1)]: Done 12784 tasks      | elapsed:   

[Parallel(n_jobs=8)]: Done 1234 tasks      | elapsed:    0.5s
[Parallel(n_jobs=8)]: Done 1784 tasks      | elapsed:    0.7s
[Parallel(n_jobs=8)]: Done 2434 tasks      | elapsed:    1.0s
[Parallel(n_jobs=8)]: Done 3184 tasks      | elapsed:    1.3s
[Parallel(n_jobs=8)]: Done 4034 tasks      | elapsed:    1.7s
[Parallel(n_jobs=8)]: Done 4984 tasks      | elapsed:    2.0s
[Parallel(n_jobs=8)]: Done 6034 tasks      | elapsed:    2.5s
[Parallel(n_jobs=8)]: Done 7184 tasks      | elapsed:    2.9s
[Parallel(n_jobs=8)]: Done 8434 tasks      | elapsed:    3.4s
[Parallel(n_jobs=8)]: Done 9784 tasks      | elapsed:    4.0s
[Parallel(n_jobs=8)]: Done 11234 tasks      | elapsed:    4.6s
[Parallel(n_jobs=8)]: Done 12784 tasks      | elapsed:    5.2s
[Parallel(n_jobs=8)]: Done 14434 tasks      | elapsed:    5.9s
[Parallel(n_jobs=8)]: Done 16184 tasks      | elapsed:    6.6s
[Parallel(n_jobs=8)]: Done 18034 tasks      | elapsed:    7.4s
[Parallel(n_jobs=8)]: Done 19984 tasks      | elapsed:    8.1s
[P

0.81705948372615034

In [38]:
y_test = clf.predict(X_test)
print(y_test[0])

[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 784 tasks      | elapsed:    0.3s
[Parallel(n_jobs=8)]: Done 1234 tasks      | elapsed:    0.5s
[Parallel(n_jobs=8)]: Done 1784 tasks      | elapsed:    0.8s
[Parallel(n_jobs=8)]: Done 2434 tasks      | elapsed:    1.0s
[Parallel(n_jobs=8)]: Done 3184 tasks      | elapsed:    1.4s
[Parallel(n_jobs=8)]: Done 4034 tasks      | elapsed:    1.7s
[Parallel(n_jobs=8)]: Done 4984 tasks      | elapsed:    2.1s
[Parallel(n_jobs=8)]: Done 6034 tasks      | elapsed:    2.6s
[Parallel(n_jobs=8)]: Done 7184 tasks      | elapsed:    3.1s
[Parallel(n_jobs=8)]: Done 8434 tasks      | elapsed:    3.6s
[Parallel(n_jobs=8)]: Done 9784 tasks      | elapsed:    4.2s
[Parallel(n_jobs=8)]: Done 11234 tasks      | elapsed:    4.9s
[Parallel(n_jobs=8)]: Done 12784 tasks      | elapsed:    5.6s
[Parallel(

[ 1.  0.]
