In [2]:
import tensorflow as tf
import numpy as np
import pandas as pd

In [6]:
# cleanup data
def cleanup_data(data):
    # remove Nan
    data = data[np.isfinite(data['Age'])]
    return data

train_data_raw = pd.read_csv("datasets/titanic/train.csv")
test_data_raw = pd.read_csv("datasets/titanic/test.csv")

# let's split the training set into a validation set
train_data = train_data_raw.sample(frac=0.8, replace=False)
validation_data = train_data_raw[len(train_data):]
test_data = test_data_raw

print("Null values Train:")
print(train_data.isnull().sum())
print("---------------------")
print("Null values Test:")
print(test_data.isnull().sum())
print("---------------------")
print("Null values Validation:")
print(validation_data.isnull().sum())


train_data = cleanup_data(train_data)
test_data = test_data
validation_data = cleanup_data(validation_data)

Null values Train:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            138
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          543
Embarked         2
dtype: int64
---------------------
Null values Test:
PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64
---------------------
Null values Validation:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age             30
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          137
Embarked         1
dtype: int64


In [7]:
# feature columns we'll use to train
feature_columns = [
    tf.feature_column.numeric_column("Age"),
    tf.feature_column.indicator_column(
        tf.feature_column.categorical_column_with_vocabulary_list("Sex", vocabulary_list=['Female', 'Male'])
    ),
    tf.feature_column.numeric_column("Fare")
]

# build linear classifier
estimator = tf.estimator.DNNClassifier(feature_columns=feature_columns, hidden_units=[128, 64])
# estimator = tf.estimator.LinearClassifier(feature_columns=feature_columns)
estimator

# build train input function
input_fn_train = tf.estimator.inputs.pandas_input_fn(
    x = pd.DataFrame({
        "Age" : train_data.Age.values,
        "Sex" : train_data.Sex.values,
        "Fare" : train_data.Fare.values
    }),
    y = pd.Series(train_data.Survived.values),
    shuffle=False
)

# train
estimator.train(input_fn=input_fn_train, steps=20000)

input_fn_validation = tf.estimator.inputs.pandas_input_fn(
    x = pd.DataFrame({
        "Age" : validation_data.Age.values,
        "Sex" : validation_data.Sex.values,
        "Fare" : validation_data.Fare.values
    }),
    y = pd.Series(validation_data.Survived.values),
    shuffle=False
)

# evaluate
estimator.evaluate(input_fn=input_fn_validation)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/var/folders/55/qkrngbr949x1jzs0c0dt70wc0000gn/T/tmp1v2j8cyd', '_tf_random_seed': None, '_global_id_in_cluster': 0, '_master': '', '_save_checkpoints_secs': 600, '_keep_checkpoint_every_n_hours': 10000, '_service': None, '_log_step_count_steps': 100, '_num_worker_replicas': 1, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x12340a080>, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_num_ps_replicas': 0, '_task_id': 0, '_is_chief': True, '_task_type': 'worker', '_keep_checkpoint_max': 5, '_evaluation_master': '', '_session_config': None}
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 1 into /var/folders/55/qkrngbr949x1jzs0c0dt70wc000

{'accuracy': 0.6756757,
 'accuracy_baseline': 0.6081081,
 'auc': 0.66101533,
 'auc_precision_recall': 0.6298954,
 'average_loss': 0.6596345,
 'global_step': 5,
 'label/mean': 0.3918919,
 'loss': 48.81295,
 'prediction/mean': 0.2709247}

In [8]:
# predict
# build test input function to predict survival outcomes
input_fn_test = tf.estimator.inputs.pandas_input_fn(
    x = pd.DataFrame({
        "Age" : test_data.Age.values,
        "Sex" : test_data.Sex.values,
        "Fare" : test_data.Fare.values
    }),
    y = None,
    shuffle=False
)
predictions = list(estimator.predict(input_fn=input_fn_test))

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /var/folders/55/qkrngbr949x1jzs0c0dt70wc0000gn/T/tmp1v2j8cyd/model.ckpt-5
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


In [9]:
idx = 0

suvivals = []
for prediction in predictions:
    suvivals.append([test_data.iloc[idx].PassengerId, prediction['class_ids'][0]])
    idx+=1

submission = pd.DataFrame(columns=["PassengerId", "Survived"], data=suvivals)

In [11]:
submission.head(20)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
5,897,0
6,898,0
7,899,0
8,900,0
9,901,0


In [12]:
submission.to_csv("titanic-submission.csv", index=False)