In [1]:
import tensorflow as tf
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('train.csv')

In [3]:
feature_column_names = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']

In [4]:
features = df[feature_column_names]

In [5]:
features['Age'].fillna(features['Age'].mean(), inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [6]:
features['Embarked'].fillna('S', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [7]:
labels = df[['Survived']]

In [8]:
train_features, eval_features, train_labels, eval_labels = train_test_split(
    features, labels, test_size=0.33)

In [9]:
batch_size = 10

In [10]:
def train_input_fn(features, labels):
    dataset = tf.data.Dataset.from_tensor_slices((features, labels))
    dataset = dataset.apply(tf.contrib.data.shuffle_and_repeat(1000))
    return dataset.batch(batch_size)

In [11]:
def eval_input_fn(features, labels):
    dataset = tf.data.Dataset.from_tensor_slices((features, labels))
    return dataset.batch(batch_size)

In [12]:
def predict_input_fn(features):
    dataset = tf.data.Dataset.from_tensor_slices(features)
    return dataset.batch(batch_size)

In [13]:
def train_fn(logits, loss, labels, predicted_classes):
    accuracy = tf.metrics.accuracy(labels, predicted_classes, name='acc_op')
    tf.summary.scalar('accuracy', accuracy[1])
    optimizer = tf.train.AdamOptimizer()
    train_op = optimizer.minimize(loss, tf.train.get_or_create_global_step())
    return tf.estimator.EstimatorSpec(
        tf.estimator.ModeKeys.TRAIN, loss=loss, train_op=train_op)

In [14]:
def eval_fn(logits, loss, labels, predicted_classes):
    accuracy = tf.metrics.accuracy(labels, predicted_classes, name='acc_op')
    metrics = {'accuracy': accuracy}
    tf.summary.scalar('accuracy', accuracy[1])
    return tf.estimator.EstimatorSpec(
        tf.estimator.ModeKeys.EVAL, loss=loss, eval_metric_ops=metrics)

In [15]:
def predict_fn(logits, predicted_classes):
    return tf.estimator.EstimatorSpec(
        tf.estimator.ModeKeys.PREDICT, predictions=predicted_classes)

In [16]:
def model_fn(features, labels, mode, params):
    training = mode == tf.estimator.ModeKeys.PREDICT
    dropout_rate = 0.7 if training else 0.0
    
    X = tf.feature_column.input_layer(features, params['feature_columns'])
    X = tf.keras.layers.Dense(30, activation=tf.nn.relu)(X)
    X = tf.keras.layers.BatchNormalization()(X)
    X = tf.keras.layers.Dropout(dropout_rate)(X)
    X = tf.keras.layers.Dense(20, activation=tf.nn.relu)(X)
    X = tf.keras.layers.BatchNormalization()(X)
    X = tf.keras.layers.Dropout(dropout_rate)(X)
    X = tf.keras.layers.Dense(10, activation=tf.nn.relu)(X)
    X = tf.keras.layers.BatchNormalization()(X)
    logits = tf.keras.layers.Dense(units=params['n_classes'])(X)
    predicted_classes = tf.argmax(logits, 1)
    
    if training:
        return predict_fn(logits, predicted_classes)
    
    loss = tf.losses.sparse_softmax_cross_entropy(labels, logits)
    
    if mode == tf.estimator.ModeKeys.TRAIN:
        return train_fn(logits, loss, labels, predicted_classes)
    
    if mode == tf.estimator.ModeKeys.EVAL:
        return eval_fn(logits, loss, labels, predicted_classes)

In [17]:
feature_columns = [
    tf.feature_column.indicator_column(
        tf.feature_column.categorical_column_with_vocabulary_list('Pclass', [1, 2, 3])
    ),
    tf.feature_column.indicator_column(
        tf.feature_column.categorical_column_with_vocabulary_list('Sex', ['male', 'female'])
    ),
    tf.feature_column.numeric_column('Age'),
    tf.feature_column.numeric_column('SibSp'),
    tf.feature_column.numeric_column('Parch'),
    tf.feature_column.numeric_column('Fare'),
    tf.feature_column.indicator_column(
        tf.feature_column.categorical_column_with_vocabulary_list('Embarked', ['S', 'C', 'Q'])
    ),
]

In [18]:
classifier = tf.estimator.Estimator(
    model_fn, 
    model_dir='model',
    params={
        'feature_columns': feature_columns, 
        'n_classes': 10
    })

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'model', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f209685c3c8>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [19]:
train_feature_dict = {k:train_features[k].as_matrix() for k in feature_column_names}

In [20]:
eval_feature_dict = {k:eval_features[k].as_matrix() for k in feature_column_names}

In [21]:
for _ in range(10):
    classifier.train(
        lambda: train_input_fn(train_feature_dict, train_labels), 
        steps=1000)
    
    classifier.evaluate(lambda: eval_input_fn(eval_feature_dict, eval_labels))

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 1 into model/model.ckpt.
INFO:tensorflow:loss = 6.5885835, step = 0
INFO:tensorflow:global_step/sec: 102.535
INFO:tensorflow:loss = 0.85103244, step = 100 (0.978 sec)
INFO:tensorflow:global_step/sec: 176.099
INFO:tensorflow:loss = 0.767949, step = 200 (0.566 sec)
INFO:tensorflow:global_step/sec: 176.211
INFO:tensorflow:loss = 0.6878705, step = 300 (0.567 sec)
INFO:tensorflow:global_step/sec: 177.181
INFO:tensorflow:loss = 0.8640439, step = 400 (0.565 sec)
INFO:tensorflow:global_step/sec: 175.728
INFO:tensorflow:loss = 0.4544819, step = 500 (0.569 sec)
INFO:tensorflow:global_step/sec: 175.322
INFO:tensorflow:loss = 0.43083864, step = 600 (0.571 sec)
INFO:tensorflow:global_step/sec: 172.446
INFO:tensorflow:loss = 0.3

In [22]:
predict_df = pd.read_csv('test.csv')

In [23]:
predict_features = predict_df[feature_column_names]

In [24]:
predict_features['Age'].fillna(features['Age'].mean(), inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [25]:
predict_features['Fare'].fillna(features['Fare'].mean(), inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [26]:
predict_feature_dict = {k:predict_features[k].as_matrix() for k in feature_column_names}

In [None]:
predictions = list(classifier.predict(lambda: predict_input_fn(predict_feature_dict)))

INFO:tensorflow:Calling model_fn.


In [None]:
offset = 892
submissions = pd.DataFrame({'PassengerId': range(offset, len(predictions) + offset), 'Survived': predictions})

In [None]:
submissions.to_csv('submissions.csv', index=False)