In [1]:
from __future__ import print_function
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.optimizers import SGD, RMSprop, Adam
from keras.layers import Dense, Activation, Dropout

Using TensorFlow backend.


In [3]:
raw_train = pd.read_csv('train.csv', index_col=0)
raw_train['is_test'] = 0
raw_test = pd.read_csv('test.csv', index_col=0)
raw_test['is_test'] = 1

In [4]:
all_data = pd.concat((raw_train, raw_test), axis=0)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [5]:
def get_title_last_name(name):
    full_name = name.str.split(', ', n=0, expand=True)
    last_name = full_name[0]
    titles = full_name[1].str.split('.', n=0, expand=True)
    titles = titles[0]
    return(titles)

def get_titles_from_names(df):
    df['Title'] = get_title_last_name(df['Name'])
    df = df.drop(['Name'], axis=1)
    return(df)

def get_dummy_cats(df):
    return(pd.get_dummies(df, columns=['Title', 'Pclass', 'Sex', 'Embarked',
                                       'Cabin', 'Cabin_letter']))

def get_cabin_letter(df):    
    df['Cabin'].fillna('Z', inplace=True)
    df['Cabin_letter'] = df['Cabin'].str[0]    
    return(df)

def process_data(df):
    # preprocess titles, cabin, embarked
    df = get_titles_from_names(df)    
    df['Embarked'].fillna('S', inplace=True)
    df = get_cabin_letter(df)
    
    # drop remaining features
    df = df.drop(['Ticket', 'Fare'], axis=1)
    
    # create dummies for categorial features
    df = get_dummy_cats(df)
    
    return(df)

proc_data = process_data(all_data)
proc_train = proc_data[proc_data['is_test'] == 0]
proc_test = proc_data[proc_data['is_test'] == 1]

In [6]:
proc_data.head()

Unnamed: 0_level_0,Age,Parch,SibSp,Survived,is_test,Title_Capt,Title_Col,Title_Don,Title_Dona,Title_Dr,...,Cabin_Z,Cabin_letter_A,Cabin_letter_B,Cabin_letter_C,Cabin_letter_D,Cabin_letter_E,Cabin_letter_F,Cabin_letter_G,Cabin_letter_T,Cabin_letter_Z
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,22.0,0,1,0.0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
2,38.0,0,1,1.0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,26.0,0,0,1.0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
4,35.0,0,1,1.0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
5,35.0,0,0,0.0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1


In [7]:
for_age_train = proc_data.drop(['Survived', 'is_test'], axis=1).dropna(axis=0)
X_train_age = for_age_train.drop('Age', axis=1)
y_train_age = for_age_train['Age']

In [8]:
# create model
tmodel = Sequential()
tmodel.add(Dense(input_dim=X_train_age.shape[1], units=128,
                 kernel_initializer='normal', bias_initializer='zeros'))
tmodel.add(Activation('relu'))

for i in range(0, 8):
    tmodel.add(Dense(units=64, kernel_initializer='normal',
                     bias_initializer='zeros'))
    tmodel.add(Activation('relu'))
    tmodel.add(Dropout(.25))

tmodel.add(Dense(units=1))
tmodel.add(Activation('linear'))

tmodel.compile(loss='mean_squared_error', optimizer='rmsprop')

In [9]:
tmodel.fit(X_train_age.values, y_train_age.values, epochs=600, verbose=2)


Epoch 1/600
 - 2s - loss: 568.2781
Epoch 2/600
 - 0s - loss: 235.8107
Epoch 3/600
 - 0s - loss: 213.5132
Epoch 4/600
 - 0s - loss: 204.1251
Epoch 5/600
 - 1s - loss: 185.6258
Epoch 6/600
 - 1s - loss: 194.1145
Epoch 7/600
 - 2s - loss: 186.5223
Epoch 8/600
 - 2s - loss: 168.0812
Epoch 9/600
 - 1s - loss: 171.7361
Epoch 10/600
 - 1s - loss: 155.6820
Epoch 11/600
 - 1s - loss: 165.6599
Epoch 12/600
 - 1s - loss: 152.8507
Epoch 13/600
 - 1s - loss: 156.3790
Epoch 14/600
 - 1s - loss: 154.0864
Epoch 15/600
 - 1s - loss: 140.9160
Epoch 16/600
 - 1s - loss: 133.3453
Epoch 17/600
 - 0s - loss: 142.7430
Epoch 18/600
 - 1s - loss: 135.9816
Epoch 19/600
 - 1s - loss: 144.4927
Epoch 20/600
 - 1s - loss: 137.3486
Epoch 21/600
 - 1s - loss: 126.4894
Epoch 22/600
 - 1s - loss: 136.3465
Epoch 23/600
 - 1s - loss: 128.1002
Epoch 24/600
 - 1s - loss: 132.5195
Epoch 25/600
 - 1s - loss: 124.1691
Epoch 26/600
 - 1s - loss: 126.4720
Epoch 27/600
 - 1s - loss: 115.4360
Epoch 28/600
 - 1s - loss: 111.1704


Epoch 224/600
 - 1s - loss: 83.2689
Epoch 225/600
 - 0s - loss: 88.2336
Epoch 226/600
 - 1s - loss: 82.6407
Epoch 227/600
 - 0s - loss: 85.3312
Epoch 228/600
 - 1s - loss: 86.5864
Epoch 229/600
 - 1s - loss: 83.4139
Epoch 230/600
 - 1s - loss: 86.0643
Epoch 231/600
 - 3s - loss: 85.1531
Epoch 232/600
 - 1s - loss: 83.3016
Epoch 233/600
 - 0s - loss: 86.1528
Epoch 234/600
 - 1s - loss: 88.3547
Epoch 235/600
 - 0s - loss: 80.2471
Epoch 236/600
 - 1s - loss: 81.5757
Epoch 237/600
 - 1s - loss: 83.3612
Epoch 238/600
 - 1s - loss: 81.5050
Epoch 239/600
 - 0s - loss: 86.5939
Epoch 240/600
 - 1s - loss: 83.2669
Epoch 241/600
 - 1s - loss: 83.0204
Epoch 242/600
 - 1s - loss: 79.7536
Epoch 243/600
 - 1s - loss: 80.3263
Epoch 244/600
 - 1s - loss: 81.6802
Epoch 245/600
 - 0s - loss: 81.5568
Epoch 246/600
 - 1s - loss: 84.9860
Epoch 247/600
 - 1s - loss: 84.6270
Epoch 248/600
 - 0s - loss: 82.6294
Epoch 249/600
 - 1s - loss: 79.9602
Epoch 250/600
 - 1s - loss: 82.2223
Epoch 251/600
 - 1s - loss: 

Epoch 452/600
 - 1s - loss: 77.0222
Epoch 453/600
 - 1s - loss: 81.4792
Epoch 454/600
 - 1s - loss: 78.1525
Epoch 455/600
 - 1s - loss: 77.4527
Epoch 456/600
 - 1s - loss: 74.3392
Epoch 457/600
 - 1s - loss: 78.1357
Epoch 458/600
 - 1s - loss: 81.2961
Epoch 459/600
 - 1s - loss: 81.7705
Epoch 460/600
 - 1s - loss: 77.2464
Epoch 461/600
 - 1s - loss: 78.9675
Epoch 462/600
 - 1s - loss: 77.7214
Epoch 463/600
 - 1s - loss: 76.9936
Epoch 464/600
 - 0s - loss: 77.7491
Epoch 465/600
 - 1s - loss: 78.2809
Epoch 466/600
 - 1s - loss: 78.8008
Epoch 467/600
 - 3s - loss: 76.3582
Epoch 468/600
 - 1s - loss: 83.5625
Epoch 469/600
 - 1s - loss: 77.9144
Epoch 470/600
 - 1s - loss: 77.8452
Epoch 471/600
 - 1s - loss: 79.6806
Epoch 472/600
 - 0s - loss: 78.8716
Epoch 473/600
 - 1s - loss: 76.0918
Epoch 474/600
 - 1s - loss: 77.6881
Epoch 475/600
 - 1s - loss: 75.3245
Epoch 476/600
 - 1s - loss: 77.7703
Epoch 477/600
 - 1s - loss: 76.3042
Epoch 478/600
 - 1s - loss: 79.4046
Epoch 479/600
 - 1s - loss: 

<keras.callbacks.callbacks.History at 0x18c6a8b1648>

In [10]:
train_data = proc_train
train_data.loc[train_data['Age'].isnull()]

Unnamed: 0_level_0,Age,Parch,SibSp,Survived,is_test,Title_Capt,Title_Col,Title_Don,Title_Dona,Title_Dr,...,Cabin_Z,Cabin_letter_A,Cabin_letter_B,Cabin_letter_C,Cabin_letter_D,Cabin_letter_E,Cabin_letter_F,Cabin_letter_G,Cabin_letter_T,Cabin_letter_Z
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6,,0,0,0.0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
18,,0,0,1.0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
20,,0,0,1.0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
27,,0,0,0.0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
29,,0,0,1.0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
860,,0,0,0.0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
864,,2,8,0.0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
869,,0,0,0.0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
879,,0,0,0.0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1


In [11]:
to_pred = train_data.loc[train_data['Age'].isnull()].drop(
          ['Age', 'Survived', 'is_test'], axis=1)
p = tmodel.predict(to_pred.values)
train_data['Age'].loc[train_data['Age'].isnull()] = p

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


ValueError: shape mismatch: value array of shape (177,1) could not be broadcast to indexing result of shape (177,)