In [94]:
from __future__ import print_function
import tensorflow as tf
import numpy as np

In [105]:
csv_columns = ['Survived', 'Sex']
record_defaults = [[0], ['']]
num_examples = {
    'train': 700,
    'test': 891-700
}

run_config = tf.estimator.RunConfig().replace(
    session_config=tf.ConfigProto(device_count={'GPU': 0}))

def build_model_columns():
    sex = tf.feature_column.categorical_column_with_vocabulary_list(
        'Sex', ['male', 'female'])
    return [sex]

def build_estimator():
    feature_columns = build_model_columns()

    return tf.estimator.LinearClassifier(
        model_dir='/tmp/titanic/',
        feature_columns=feature_columns,
        config=run_config
    )

def input_fn(data_file, num_epochs, shuffle, batch_size):

    def parse_csv(value):
        print('Parsing', data_file)
        columns = tf.decode_csv(value, record_defaults=record_defaults)
        features = dict(zip(csv_columns, columns))
        labels = features.pop('Survived')
        return features, tf.equal(labels, 1)

    # Extract lines from input files using the Dataset API.
    dataset = tf.data.TextLineDataset(data_file)

    if shuffle:
        dataset = dataset.shuffle(buffer_size=num_examples['train'])

    dataset = dataset.map(parse_csv, num_parallel_calls=5)

    # We call repeat after shuffling, rather than before, to prevent separate
    # epochs from blending together.
    dataset = dataset.repeat(num_epochs)
    dataset = dataset.batch(batch_size)

    iterator = dataset.make_one_shot_iterator()
    features, labels = iterator.get_next()
    return features, labels

print("loading, " + str(file_length) + " line(s)\n")

# Main "model" training
train_epochs = 40
epochs_per_eval = 2
batch_size = 40
train_data = 'input/train-sex-training-set.csv'
test_data = 'input/train-sex-dev-set.csv'

model = build_estimator()

for n in range(train_epochs // epochs_per_eval):
    model.train(input_fn=lambda: input_fn(
        train_data, epochs_per_eval, True, batch_size))

    results = model.evaluate(input_fn=lambda: input_fn(
        test_data, 1, False, batch_size))

    # display results

    print('Results at epoch', (n + 1) * epochs_per_eval)
    print('-' * 60)

    for key in sorted(results):
        print('%s: %s' % (key, results[key]))

loading, 892 line(s)

INFO:tensorflow:Using config: {'_model_dir': '/tmp/titanic/', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': device_count {
  key: "GPU"
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x116d06390>, '_task_type': 'worker', '_task_id': 0, '_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
Parsing input/train-sex-training-set.csv
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Restoring parameters from /tmp/titanic/model.ckpt-701
INFO:tensorflow:Saving checkpoints for 702 into /tmp/titanic/model.ckpt.
INFO:tensorflow:loss = 22.0576, step = 702
INFO:tensorflow:Saving checkpoints for 736 into /tmp/titanic/model.ckpt.
INFO:tensorflow:Loss for final step: 17.3644.
Parsing input/train-sex-dev

INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Restoring parameters from /tmp/titanic/model.ckpt-911
INFO:tensorflow:Saving checkpoints for 912 into /tmp/titanic/model.ckpt.
INFO:tensorflow:loss = 17.875, step = 912
INFO:tensorflow:Saving checkpoints for 946 into /tmp/titanic/model.ckpt.
INFO:tensorflow:Loss for final step: 23.7287.
Parsing input/train-sex-dev-set.csv
INFO:tensorflow:Starting evaluation at 2018-01-20-14:12:58
INFO:tensorflow:Restoring parameters from /tmp/titanic/model.ckpt-946
INFO:tensorflow:Finished evaluation at 2018-01-20-14:12:59
INFO:tensorflow:Saving dict for global step 946: accuracy = 0.789474, accuracy_baseline = 0.631579, auc = 0.761905, auc_precision_recall = 0.762697, average_loss = 0.51145, global_step = 946, label/mean = 0.368421, loss = 19.4351, prediction/mean = 0.368329
Results at epoch 14
------------------------------------------------------------
accuracy: 0.789474
accuracy_baseline: 0.631579
auc: 0.761905
auc_precision_recall: 0.7626

INFO:tensorflow:Restoring parameters from /tmp/titanic/model.ckpt-1156
INFO:tensorflow:Finished evaluation at 2018-01-20-14:13:34
INFO:tensorflow:Saving dict for global step 1156: accuracy = 0.789474, accuracy_baseline = 0.631579, auc = 0.761905, auc_precision_recall = 0.762697, average_loss = 0.511461, global_step = 1156, label/mean = 0.368421, loss = 19.4355, prediction/mean = 0.37085
Results at epoch 26
------------------------------------------------------------
accuracy: 0.789474
accuracy_baseline: 0.631579
auc: 0.761905
auc_precision_recall: 0.762697
average_loss: 0.511461
global_step: 1156
label/mean: 0.368421
loss: 19.4355
prediction/mean: 0.37085
Parsing input/train-sex-training-set.csv
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Restoring parameters from /tmp/titanic/model.ckpt-1156
INFO:tensorflow:Saving checkpoints for 1157 into /tmp/titanic/model.ckpt.
INFO:tensorflow:loss = 20.2436, step = 1157
INFO:tensorflow:Saving checkpoints for 1191 into /tmp/titanic/

INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Restoring parameters from /tmp/titanic/model.ckpt-1366
INFO:tensorflow:Saving checkpoints for 1367 into /tmp/titanic/model.ckpt.
INFO:tensorflow:loss = 19.9462, step = 1367
INFO:tensorflow:Saving checkpoints for 1401 into /tmp/titanic/model.ckpt.
INFO:tensorflow:Loss for final step: 18.7616.
Parsing input/train-sex-dev-set.csv
INFO:tensorflow:Starting evaluation at 2018-01-20-14:14:16
INFO:tensorflow:Restoring parameters from /tmp/titanic/model.ckpt-1401
INFO:tensorflow:Finished evaluation at 2018-01-20-14:14:17
INFO:tensorflow:Saving dict for global step 1401: accuracy = 0.789474, accuracy_baseline = 0.631579, auc = 0.761905, auc_precision_recall = 0.762697, average_loss = 0.511446, global_step = 1401, label/mean = 0.368421, loss = 19.435, prediction/mean = 0.369268
Results at epoch 40
------------------------------------------------------------
accuracy: 0.789474
accuracy_baseline: 0.631579
auc: 0.761905
auc_precision_recall

In [108]:
# model prediction using Pandas and 'tf.estimator.inputs.numpy_input_fn'
import pandas as pd

test_df = pd.read_csv('input/test.csv')
pall_pdf = np.array([test_df['Sex']])

print(pall_pdf.T.shape)

predict_input_fn = tf.estimator.inputs.numpy_input_fn(
    x={"Sex": pall_pdf.T},
    num_epochs=1,
    shuffle=False)

predictions = model.predict(input_fn=predict_input_fn)
predicted_classes = [p["classes"][0].decode('utf8') for p in predictions]
sum((int(x) for x in predicted_classes))

submission = pd.DataFrame(data={'PassengerId': test_df['PassengerId'], 'Survived': predicted_classes})
submission.to_csv('input/submission.csv', index=False)
submission.tail()

(418, 1)
INFO:tensorflow:Restoring parameters from /tmp/titanic/model.ckpt-1401


Unnamed: 0,PassengerId,Survived
413,1305,0
414,1306,1
415,1307,0
416,1308,0
417,1309,0


In [109]:
# model.predict with TF
def tf_predict_input_fn(data_file):

    def parse_csv(value):
        print('Parsing', data_file)
        columns = tf.decode_csv(value, record_defaults=[['']])
        features = dict(zip(['Sex'], columns))
        return features

    # Extract lines from input files using the Dataset API.
    dataset = tf.data.TextLineDataset(data_file)

    dataset = dataset.map(parse_csv, num_parallel_calls=5)
    
    # We call repeat after shuffling, rather than before, to prevent separate
    # epochs from blending together.
    dataset = dataset.repeat(1) # times to repeat
    dataset = dataset.batch(1) # batch size - NOTE: probably ignored since "repeat=1"

    iterator = dataset.make_one_shot_iterator()
    features = iterator.get_next()
    return features

predictions = model.predict(input_fn=lambda: tf_predict_input_fn(
        data_file='input/test-sex.csv'))
predicted_classes = [p["classes"][0].decode('utf8') for p in predictions]
submission = pd.DataFrame(data={'PassengerId': test_df['PassengerId'], 'Survived': predicted_classes})
submission.to_csv('input/submission-tf.csv', index=False)
submission.tail()

Parsing input/test-sex.csv
INFO:tensorflow:Restoring parameters from /tmp/titanic/model.ckpt-1401


Unnamed: 0,PassengerId,Survived
413,1305,0
414,1306,1
415,1307,0
416,1308,0
417,1309,0


In [107]:
# trim down main file to 'Sex' column only
import csv

read_filename = 'input/train-orig.csv'
write_filename = 'input/train-sex.csv'

with open(read_filename, 'r', newline='') as csv_readfile:
    reader = csv.DictReader(csv_readfile)

    with open(write_filename, 'w', newline='') as csv_writefile:
        fieldnames = ['Survived', 'Sex']
        writer = csv.DictWriter(csv_writefile, fieldnames=fieldnames)

        writer.writeheader()
        for line in reader:
            writer.writerow({'Survived': line['Survived'], 'Sex': line['Sex']})

In [70]:
# split 'train-sex.csv' into 'train' and 'dev'
import csv

read_filename = 'input/train-sex.csv'
write_filename = 'input/train-sex-dev-set.csv'

split = 700

with open(read_filename, 'r', newline='') as csv_readfile:
    reader = csv.DictReader(csv_readfile)

    with open(write_filename, 'w', newline='') as csv_writefile:
        fieldnames = ['Survived', 'Sex']
        writer = csv.DictWriter(csv_writefile, fieldnames=fieldnames)

        writer.writeheader()
        for idx, line in enumerate(reader):
            if idx > split:
                writer.writerow({'Survived': line['Survived'], 'Sex': line['Sex']})

In [74]:
# test set: remove all columns except for 'Sex'
# No 'Survived' label like the training data
import csv

read_filename = 'input/test-orig.csv'
write_filename = 'input/test-sex.csv'

with open(read_filename, 'r', newline='') as csv_readfile:
    reader = csv.DictReader(csv_readfile)

    with open(write_filename, 'w', newline='') as csv_writefile:
        fieldnames = ['Sex']
        writer = csv.DictWriter(csv_writefile, fieldnames=fieldnames)

        writer.writeheader()
        for line in reader:
            writer.writerow({'Sex': line['Sex']})

In [32]:
features

{'Age': <tf.Tensor 'DecodeCSV_25:5' shape=() dtype=float32>,
 'Cabin': <tf.Tensor 'DecodeCSV_25:10' shape=() dtype=string>,
 'Embarked': <tf.Tensor 'DecodeCSV_25:11' shape=() dtype=string>,
 'Fare': <tf.Tensor 'DecodeCSV_25:9' shape=() dtype=float32>,
 'Name': <tf.Tensor 'DecodeCSV_25:3' shape=() dtype=string>,
 'Parch': <tf.Tensor 'DecodeCSV_25:7' shape=() dtype=int32>,
 'PassengerId': <tf.Tensor 'DecodeCSV_25:0' shape=() dtype=int32>,
 'Pclass': <tf.Tensor 'DecodeCSV_25:2' shape=() dtype=int32>,
 'Sex': <tf.Tensor 'DecodeCSV_25:4' shape=() dtype=string>,
 'SibSp': <tf.Tensor 'DecodeCSV_25:6' shape=() dtype=int32>,
 'Survived': <tf.Tensor 'DecodeCSV_25:1' shape=() dtype=int32>,
 'Ticket': <tf.Tensor 'DecodeCSV_25:8' shape=() dtype=string>}

In [34]:
try:
    age = features.pop('Age')
except KeyError:
    # already poped
    pass

age

KeyError: 'Age'

In [37]:
survived = features.pop('Survived')
survived

<tf.Tensor 'DecodeCSV_25:1' shape=() dtype=int32>

In [44]:
import shutil

feature_columns = [age]
hidden_units = [100, 75, 50, 25]
model_dir = '/tmp/titanic/'
train_epochs = 10
epochs_per_eval = 2
batch_size = 50
train_data = 'input/train.csv'
test_data = 'input/test.csv'
num_examples = {
    'train': 750,
    'test': 891-750
}

# CSV file and FeatureColumn setup

filename = "input/train.csv"
csv_columns = ['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex',
               'Age', 'SibSp', 'Parch', 'Ticket', 'Fare',
               'Cabin', 'Embarked']
record_defaults = [[0], [0], [0], [''], [''],
                   [0.], [0], [0], [''], [0.],
                   [''], ['']]

# Model Config start

run_config = tf.estimator.RunConfig().replace(
  session_config=tf.ConfigProto(device_count={'GPU': 0}))

shutil.rmtree(model_dir, ignore_errors=True)

model = tf.estimator.LinearClassifier(
    model_dir=model_dir,
    feature_columns=feature_columns,
    config=run_config
)

def input_fn(data_file, num_epochs, shuffle, batch_size):

    def parse_csv(value):
        # setup text reader
        file_length = file_len(filename)
        filename_queue = tf.train.string_input_producer([filename])
        reader = tf.TextLineReader(skip_header_lines=1)
        _, csv_row = reader.read(filename_queue)

        # setup CSV decoding
        passenger_id, survived, pclass, name, sex, age, sibsp, parch, ticket, fare, cabin, embarked = tf.decode_csv(csv_row, record_defaults=record_defaults)

        features = dict(zip(
            csv_columns,
            [passenger_id, survived, pclass, name, sex, age, sibsp, parch, ticket, fare, cabin, embarked]))
        age_dict = features.pop('Age')
        return [age_dict], tf.equal(survived, 1)

    # Extract lines from input files using the Dataset API.
    dataset = tf.data.TextLineDataset(data_file)

    if shuffle:
        dataset = dataset.shuffle(buffer_size=num_examples['train'])

    dataset = dataset.map(parse_csv, num_parallel_calls=5)

    # We call repeat after shuffling, rather than before, to prevent separate
    # epochs from blending together.
    dataset = dataset.repeat(num_epochs)
    dataset = dataset.batch(batch_size)

    iterator = dataset.make_one_shot_iterator()
    features, labels = iterator.get_next()
    return features, labels

# main training run

for n in range(train_epochs // epochs_per_eval):
    model.train(input_fn=lambda: input_fn(
        train_data, epochs_per_eval, True, batch_size))

    results = model.evaluate(input_fn=lambda: input_fn(
        test_data, 1, False, batch_size))

    # Display evaluation metrics
    print('Results at epoch', (n + 1) * epochs_per_eval)
    print('-' * 60)

for key in sorted(results):
    print('%s: %s' % (key, results[key]))

INFO:tensorflow:Using config: {'_model_dir': '/tmp/titanic/', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': device_count {
  key: "GPU"
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x114afe6a0>, '_task_type': 'worker', '_task_id': 0, '_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


ValueError: features should be a dictionary of `Tensor`s. Given type: <class 'tensorflow.python.framework.ops.Tensor'>

In [45]:
age_dict

NameError: name 'age_dict' is not defined