In [2]:
from __future__ import print_function
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.optimizers import SGD, RMSprop, Adam
from keras.layers import Dense, Activation, Dropout

Using TensorFlow backend.


In [3]:
raw_train = pd.read_csv('../data/train.csv', index_col=0)
raw_train['is_test'] = 0
raw_test = pd.read_csv('../data/test.csv', index_col=0)
raw_test['is_test'] = 1

In [4]:
all_data = pd.concat((raw_train, raw_test), axis=0)
print(all_data)

              Age        Cabin Embarked      Fare  \
PassengerId                                         
1            22.0          NaN        S    7.2500   
2            38.0          C85        C   71.2833   
3            26.0          NaN        S    7.9250   
4            35.0         C123        S   53.1000   
5            35.0          NaN        S    8.0500   
6             NaN          NaN        Q    8.4583   
7            54.0          E46        S   51.8625   
8             2.0          NaN        S   21.0750   
9            27.0          NaN        S   11.1333   
10           14.0          NaN        C   30.0708   
11            4.0           G6        S   16.7000   
12           58.0         C103        S   26.5500   
13           20.0          NaN        S    8.0500   
14           39.0          NaN        S   31.2750   
15           14.0          NaN        S    7.8542   
16           55.0          NaN        S   16.0000   
17            2.0          NaN        Q   29.1

## Functions to preprocess the data (前処理関数)

In [5]:
def get_title_last_name(name):
    full_name = name.str.split(', ', n=0, expand=True)
    last_name = full_name[0]
    titles = full_name[1].str.split('.', n=0, expand=True)
    titles = titles[0]
    return(titles)

def get_titles_from_names(df):
    df['Title'] = get_title_last_name(df['Name'])
    df = df.drop(['Name'], axis=1)
    return(df)

def get_dummy_cats(df):
    return(pd.get_dummies(df, columns=['Title', 'Pclass', 'Sex', 'Embarked',
                                       'Cabin', 'Cabin_letter']))

def get_cabin_letter(df):    
    df['Cabin'].fillna('Z', inplace=True)
    df['Cabin_letter'] = df['Cabin'].str[0]    
    return(df)

def process_data(df):
    # preprocess titles, cabin, embarked
    df = get_titles_from_names(df)    
    df['Embarked'].fillna('S', inplace=True)
    df = get_cabin_letter(df)
    
    # drop remaining features
    df = df.drop(['Ticket', 'Fare'], axis=1)
    
    # create dummies for categorial features
    df = get_dummy_cats(df)
    
    return(df)

proc_data = process_data(all_data)
proc_train = proc_data[proc_data['is_test'] == 0]
proc_test = proc_data[proc_data['is_test'] == 1]

In [6]:
proc_data.head()

Unnamed: 0_level_0,Age,Parch,SibSp,Survived,is_test,Title_Capt,Title_Col,Title_Don,Title_Dona,Title_Dr,...,Cabin_Z,Cabin_letter_A,Cabin_letter_B,Cabin_letter_C,Cabin_letter_D,Cabin_letter_E,Cabin_letter_F,Cabin_letter_G,Cabin_letter_T,Cabin_letter_Z
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,22.0,0,1,0.0,0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,38.0,0,1,1.0,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,26.0,0,0,1.0,0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,35.0,0,1,1.0,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
5,35.0,0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


## Build Network to predict missing ages (ロストデータの予測)

In [8]:
for_age_train = proc_data.drop(['Survived', 'is_test'], axis=1).dropna(axis=0)
print(for_age_train)
X_train_age = for_age_train.drop('Age', axis=1)
y_train_age = for_age_train['Age']
print(y)

              Age  Parch  SibSp  Title_Capt  Title_Col  Title_Don  Title_Dona  \
PassengerId                                                                     
1            22.0      0      1         0.0        0.0        0.0         0.0   
2            38.0      0      1         0.0        0.0        0.0         0.0   
3            26.0      0      0         0.0        0.0        0.0         0.0   
4            35.0      0      1         0.0        0.0        0.0         0.0   
5            35.0      0      0         0.0        0.0        0.0         0.0   
7            54.0      0      0         0.0        0.0        0.0         0.0   
8             2.0      1      3         0.0        0.0        0.0         0.0   
9            27.0      2      0         0.0        0.0        0.0         0.0   
10           14.0      0      1         0.0        0.0        0.0         0.0   
11            4.0      1      1         0.0        0.0        0.0         0.0   
12           58.0      0    

NameError: name 'y' is not defined

In [115]:
# create model
tmodel = Sequential()
tmodel.add(Dense(input_dim=X_train_age.shape[1], units=128,
                 kernel_initializer='normal', bias_initializer='zeros'))
tmodel.add(Activation('relu'))

for i in range(0, 8):
    tmodel.add(Dense(units=64, kernel_initializer='normal',
                     bias_initializer='zeros'))
    tmodel.add(Activation('relu'))
    tmodel.add(Dropout(.25))

tmodel.add(Dense(units=1))
tmodel.add(Activation('linear'))

tmodel.compile(loss='mean_squared_error', optimizer='rmsprop')

In [120]:
tmodel.fit(X_train_age.values, y_train_age.values, epochs=1000, verbose=2)

Epoch 1/1000
 - 0s - loss: 76.7022
Epoch 2/1000
 - 0s - loss: 74.7966
Epoch 3/1000
 - 0s - loss: 74.4673
Epoch 4/1000
 - 0s - loss: 76.2056
Epoch 5/1000
 - 0s - loss: 77.2281
Epoch 6/1000
 - 0s - loss: 78.2255
Epoch 7/1000
 - 0s - loss: 78.1388
Epoch 8/1000
 - 0s - loss: 76.2785
Epoch 9/1000
 - 0s - loss: 78.4057
Epoch 10/1000
 - 0s - loss: 78.6172
Epoch 11/1000
 - 0s - loss: 79.8023
Epoch 12/1000
 - 0s - loss: 75.9993
Epoch 13/1000
 - 0s - loss: 74.0410
Epoch 14/1000
 - 0s - loss: 77.1173
Epoch 15/1000
 - 0s - loss: 75.9874
Epoch 16/1000
 - 0s - loss: 78.2534
Epoch 17/1000
 - 0s - loss: 78.0136
Epoch 18/1000
 - 0s - loss: 76.3918
Epoch 19/1000
 - 0s - loss: 75.6511
Epoch 20/1000
 - 0s - loss: 76.4611
Epoch 21/1000
 - 0s - loss: 76.2073
Epoch 22/1000
 - 0s - loss: 71.7707
Epoch 23/1000
 - 0s - loss: 77.3007
Epoch 24/1000
 - 0s - loss: 75.8693
Epoch 25/1000
 - 0s - loss: 79.4173
Epoch 26/1000
 - 0s - loss: 76.0775
Epoch 27/1000
 - 0s - loss: 77.7281
Epoch 28/1000
 - 0s - loss: 76.4548
E

<keras.callbacks.History at 0x7f0abe218668>

In [128]:
train_data = proc_train
train_data.loc[train_data['Age'].isnull()]

Unnamed: 0_level_0,Age,Parch,SibSp,Survived,is_test,Title_Capt,Title_Col,Title_Don,Title_Dona,Title_Dr,...,Cabin_Z,Cabin_letter_A,Cabin_letter_B,Cabin_letter_C,Cabin_letter_D,Cabin_letter_E,Cabin_letter_F,Cabin_letter_G,Cabin_letter_T,Cabin_letter_Z
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6,,0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
18,,0,0,1.0,0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
20,,0,0,1.0,0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
27,,0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
29,,0,0,1.0,0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
30,,0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
32,,0,1,1.0,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
33,,0,0,1.0,0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
37,,0,0,1.0,0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
43,,0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [9]:
to_pred = train_data.loc[train_data['Age'].isnull()].drop(
      ['Age', 'Survived', 'is_test'], axis=1)
p = tmodel.predict(to_pred.values)
p = p.reshape(177,)


NameError: name 'train_data' is not defined

In [10]:
to_pred

NameError: name 'to_pred' is not defined

In [None]:
train_data['Age'].loc[train_data['Age'].isnull()] = p
train_data['Age']=np.round(train_data['Age'])

In [130]:

train_data['Age']=np.round(train_data['Age'])
train_data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


Unnamed: 0_level_0,Age,Parch,SibSp,Survived,is_test,Title_Capt,Title_Col,Title_Don,Title_Dona,Title_Dr,...,Cabin_Z,Cabin_letter_A,Cabin_letter_B,Cabin_letter_C,Cabin_letter_D,Cabin_letter_E,Cabin_letter_F,Cabin_letter_G,Cabin_letter_T,Cabin_letter_Z
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,22.0,0,1,0.0,0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,38.0,0,1,1.0,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,26.0,0,0,1.0,0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,35.0,0,1,1.0,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
5,35.0,0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
6,32.0,0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
7,54.0,0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
8,2.0,1,3,0.0,0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
9,27.0,2,0,1.0,0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
10,14.0,0,1,1.0,0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [131]:
test_data = proc_test
to_pred = test_data.loc[test_data['Age'].isnull()].drop(
          ['Age', 'Survived', 'is_test'], axis=1)
p = tmodel.predict(to_pred.values).reshape(86,)
test_data['Age'].loc[test_data['Age'].isnull()] = np.round(p)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [132]:
y = pd.get_dummies(train_data['Survived'])
y.head()

Unnamed: 0_level_0,0.0,1.0
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1.0,0.0
2,0.0,1.0
3,0.0,1.0
4,0.0,1.0
5,1.0,0.0


In [133]:
X = train_data.drop(['Survived', 'is_test'], axis=1)

## create model

In [134]:
model = Sequential()
model.add(Dense(input_dim=X.shape[1], units=128,
                 kernel_initializer='normal', bias_initializer='zeros'))
model.add(Activation('relu'))

for i in range(0, 15):
    model.add(Dense(units=128, kernel_initializer='normal',
                     bias_initializer='zeros'))
    model.add(Activation('relu'))
    model.add(Dropout(.40))

model.add(Dense(units=2))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [136]:
model.fit(X.values, y.values, epochs=1000, verbose=2)

Epoch 1/1000
 - 0s - loss: 0.3096 - acc: 0.8878
Epoch 2/1000
 - 0s - loss: 0.2976 - acc: 0.8934
Epoch 3/1000
 - 0s - loss: 0.3113 - acc: 0.8878
Epoch 4/1000
 - 0s - loss: 0.2892 - acc: 0.8979
Epoch 5/1000
 - 0s - loss: 0.2895 - acc: 0.8956
Epoch 6/1000
 - 0s - loss: 0.2913 - acc: 0.8956
Epoch 7/1000
 - 0s - loss: 0.2901 - acc: 0.9012
Epoch 8/1000
 - 0s - loss: 0.3068 - acc: 0.8844
Epoch 9/1000
 - 0s - loss: 0.3002 - acc: 0.8990
Epoch 10/1000
 - 0s - loss: 0.3953 - acc: 0.8440
Epoch 11/1000
 - 0s - loss: 0.3335 - acc: 0.8765
Epoch 12/1000
 - 0s - loss: 0.3224 - acc: 0.8777
Epoch 13/1000
 - 0s - loss: 0.3207 - acc: 0.8822
Epoch 14/1000
 - 0s - loss: 0.3309 - acc: 0.8923
Epoch 15/1000
 - 0s - loss: 0.3318 - acc: 0.8844
Epoch 16/1000
 - 0s - loss: 0.3167 - acc: 0.8855
Epoch 17/1000
 - 0s - loss: 0.3076 - acc: 0.8923
Epoch 18/1000
 - 0s - loss: 0.3060 - acc: 0.8866
Epoch 19/1000
 - 0s - loss: 0.2992 - acc: 0.8878
Epoch 20/1000
 - 0s - loss: 0.2913 - acc: 0.8967
Epoch 21/1000
 - 0s - loss: 0

<keras.callbacks.History at 0x7f0abe26f2e8>

In [137]:
test_data.columns

Index(['Age', 'Parch', 'SibSp', 'Survived', 'is_test', 'Title_Capt',
       'Title_Col', 'Title_Don', 'Title_Dona', 'Title_Dr',
       ...
       'Cabin_Z', 'Cabin_letter_A', 'Cabin_letter_B', 'Cabin_letter_C',
       'Cabin_letter_D', 'Cabin_letter_E', 'Cabin_letter_F', 'Cabin_letter_G',
       'Cabin_letter_T', 'Cabin_letter_Z'],
      dtype='object', length=227)

In [138]:
p_survived = model.predict_classes(test_data.drop(['Survived', 'is_test'], axis=1).values)

In [139]:
submission = pd.DataFrame()
submission['PassengerId'] = test_data.index
submission['Survived'] = p_survived

In [140]:
submission.shape

(418, 2)

In [141]:
submission.to_csv('titanic_keras_cs.csv', index=False)

In [142]:
pd.read_csv("./titanic_keras_cs.csv")

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
5,897,0
6,898,1
7,899,0
8,900,1
9,901,0
