In [2]:
# Experiments with tf.estimator:
#   1. Age only
#   2. Age in buckets

%matplotlib inline
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt

In [37]:
_CSV_COLUMNS = ['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex',
                'Age', 'SibSp', 'Parch', 'Ticket', 'Fare',
                'Cabin', 'Embarked']

_CSV_COLUMN_DEFAULTS = [[''], [''], [''], [''], [''],
                        [0.], [0], [0], [''], [0.],
                        [''], ['']]

def build_model_columns():
    """Return the list of feature columns for the model: one numeric 'Age' column."""
    return [tf.feature_column.numeric_column('Age')]

def build_estimator(model_dir='/tmp/titanic-feature-engineering-3/'):
    """Build a LinearClassifier over the columns from build_model_columns().

    Args:
        model_dir: Directory where the estimator stores its checkpoints.
            Defaults to the path this notebook has always used. The recorded
            training logs show parameters being restored from an existing
            checkpoint in this directory, so pass a fresh directory to start
            training from scratch.

    Returns:
        A tf.estimator.LinearClassifier configured with the session config
        below.
    """
    wide_columns = build_model_columns()

    # device_count={'GPU': 0} asks the session to use no GPU devices.
    run_config = tf.estimator.RunConfig().replace(
        session_config=tf.ConfigProto(device_count={'GPU': 0}))

    return tf.estimator.LinearClassifier(
        model_dir=model_dir,
        feature_columns=wide_columns,
        config=run_config)

def input_fn(data_file, num_epochs, shuffle, batch_size,
             shuffle_buffer_size=None):
    """Generate an input function for the Estimator.

    Args:
        data_file: Path to a header-less CSV file whose columns follow
            _CSV_COLUMNS.
        num_epochs: Number of times the dataset is repeated.
        shuffle: Whether to shuffle the lines before parsing.
        batch_size: Number of examples per batch.
        shuffle_buffer_size: Buffer size used when `shuffle` is true. When
            None (the default) it falls back to `_NUM_EXAMPLES['train']`,
            which must then be defined at module level.

    Returns:
        A `(features, labels)` pair taken from a one-shot iterator:
        `features` is a dict keyed by the names in _CSV_COLUMNS except
        'Survived'; `labels` is the boolean result of comparing the
        'Survived' column with the string '1'.

    Raises:
        AssertionError: If `data_file` does not exist.
    """
    assert tf.gfile.Exists(data_file), (
        '%s not found. Please make sure the CSV file exists before building '
        'the input pipeline.' % data_file)

    def parse_csv(value):
        """Decode one CSV line into a (features dict, boolean label) pair."""
        print('Parsing', data_file)
        columns = tf.decode_csv(value, record_defaults=_CSV_COLUMN_DEFAULTS)
        features = dict(zip(_CSV_COLUMNS, columns))
        # 'Survived' is always present because the dict is built from
        # _CSV_COLUMNS, so no fallback value is needed here.
        labels = features.pop('Survived')

        return features, tf.equal(labels, '1')

    # Extract lines from input files using the Dataset API.
    dataset = tf.data.TextLineDataset(data_file)

    if shuffle:
        if shuffle_buffer_size is None:
            shuffle_buffer_size = _NUM_EXAMPLES['train']
        dataset = dataset.shuffle(buffer_size=shuffle_buffer_size)

    dataset = dataset.map(parse_csv, num_parallel_calls=5)

    # We call repeat after shuffling, rather than before, to prevent separate
    # epochs from blending together.
    dataset = dataset.repeat(num_epochs)
    dataset = dataset.batch(batch_size)

    iterator = dataset.make_one_shot_iterator()
    features, labels = iterator.get_next()
    return features, labels


# Main "model" training
# Train for `train_epochs` epochs in total, pausing every `epochs_per_eval`
# epochs to evaluate on the dev split and print the metrics.
train_epochs = 40
epochs_per_eval = 2
batch_size = 40
train_data = 'input/train.csv'
test_data = 'input/dev.csv'

model = build_estimator()


def _train_input_fn():
    """Shuffled training input covering `epochs_per_eval` epochs."""
    return input_fn(train_data, epochs_per_eval, True, batch_size)


def _eval_input_fn():
    """Single unshuffled pass over the dev split."""
    return input_fn(test_data, 1, False, batch_size)


for n in range(train_epochs // epochs_per_eval):
    model.train(input_fn=_train_input_fn)
    results = model.evaluate(input_fn=_eval_input_fn)

    # Report every evaluation metric, sorted by name.
    print('Results at epoch', (n + 1) * epochs_per_eval)
    print(60 * '-')

    for key in sorted(results):
        print('%s: %s' % (key, results[key]))

INFO:tensorflow:Using config: {'_model_dir': '/tmp/titanic-feature-engineering-3/', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': device_count {
  key: "GPU"
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x117704470>, '_task_type': 'worker', '_task_id': 0, '_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
Parsing input/train.csv
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Restoring parameters from /tmp/titanic-feature-engineering-3/model.ckpt-3814
INFO:tensorflow:Saving checkpoints for 3815 into /tmp/titanic-feature-engineering-3/model.ckpt.
INFO:tensorflow:loss = 28.3786, step = 3815
INFO:tensorflow:Saving checkpoints for 3850 into /tmp/titanic-feature-engineering-3/model.ckpt.
INFO:tensorflow:Loss

Results at epoch 12
------------------------------------------------------------
accuracy: 0.642458
accuracy_baseline: 0.642458
auc: 0.540149
auc_precision_recall: 0.350196
average_loss: 0.655211
global_step: 4030
label/mean: 0.357542
loss: 23.4565
prediction/mean: 0.399465
Parsing input/train.csv
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Restoring parameters from /tmp/titanic-feature-engineering-3/model.ckpt-4030
INFO:tensorflow:Saving checkpoints for 4031 into /tmp/titanic-feature-engineering-3/model.ckpt.
INFO:tensorflow:loss = 27.8048, step = 4031
INFO:tensorflow:Saving checkpoints for 4066 into /tmp/titanic-feature-engineering-3/model.ckpt.
INFO:tensorflow:Loss for final step: 14.5955.
Parsing input/dev.csv
INFO:tensorflow:Starting evaluation at 2018-01-18-14:54:26
INFO:tensorflow:Restoring parameters from /tmp/titanic-feature-engineering-3/model.ckpt-4066
INFO:tensorflow:Finished evaluation at 2018-01-18-14:54:27
INFO:tensorflow:Saving dict for global step 4066:

INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Restoring parameters from /tmp/titanic-feature-engineering-3/model.ckpt-4246
INFO:tensorflow:Saving checkpoints for 4247 into /tmp/titanic-feature-engineering-3/model.ckpt.
INFO:tensorflow:loss = 28.2616, step = 4247
INFO:tensorflow:Saving checkpoints for 4282 into /tmp/titanic-feature-engineering-3/model.ckpt.
INFO:tensorflow:Loss for final step: 15.8528.
Parsing input/dev.csv
INFO:tensorflow:Starting evaluation at 2018-01-18-14:54:58
INFO:tensorflow:Restoring parameters from /tmp/titanic-feature-engineering-3/model.ckpt-4282
INFO:tensorflow:Finished evaluation at 2018-01-18-14:54:59
INFO:tensorflow:Saving dict for global step 4282: accuracy = 0.642458, accuracy_baseline = 0.642458, auc = 0.537976, auc_precision_recall = 0.392178, average_loss = 0.655544, global_step = 4282, label/mean = 0.357542, loss = 23.4685, prediction/mean = 0.403293
Results at epoch 26
------------------------------------------------------------
accura

INFO:tensorflow:loss = 27.0719, step = 4463
INFO:tensorflow:Saving checkpoints for 4498 into /tmp/titanic-feature-engineering-3/model.ckpt.
INFO:tensorflow:Loss for final step: 16.4183.
Parsing input/dev.csv
INFO:tensorflow:Starting evaluation at 2018-01-18-14:55:26
INFO:tensorflow:Restoring parameters from /tmp/titanic-feature-engineering-3/model.ckpt-4498
INFO:tensorflow:Finished evaluation at 2018-01-18-14:55:27
INFO:tensorflow:Saving dict for global step 4498: accuracy = 0.642458, accuracy_baseline = 0.642458, auc = 0.448505, auc_precision_recall = 0.383222, average_loss = 0.657284, global_step = 4498, label/mean = 0.357542, loss = 23.5308, prediction/mean = 0.359216
Results at epoch 38
------------------------------------------------------------
accuracy: 0.642458
accuracy_baseline: 0.642458
auc: 0.448505
auc_precision_recall: 0.383222
average_loss: 0.657284
global_step: 4498
label/mean: 0.357542
loss: 23.5308
prediction/mean: 0.359216
Parsing input/train.csv
INFO:tensorflow:Creat

In [87]:
# Predict on the test set using only the Age feature.
test_df = pd.read_csv('input/test.csv')

# Missing ages become 0, the same value _CSV_COLUMN_DEFAULTS supplies for an
# empty Age field when the training CSV is parsed.
pall_pdf = np.array([test_df['Age'].fillna(0)])
age_column = pall_pdf.T  # one row per passenger, one column

print(age_column.shape)

predict_input_fn = tf.estimator.inputs.numpy_input_fn(
    x={"Age": age_column},
    num_epochs=1,
    shuffle=False)

predictions = model.predict(input_fn=predict_input_fn)
predicted_classes = [
    prediction["classes"][0].decode('utf8')
    for prediction in predictions
]

# Total of the predicted class ids.
sum(map(int, predicted_classes))

(418, 1)
INFO:tensorflow:Restoring parameters from /tmp/titanic-feature-engineering-3/model.ckpt-4534


0

In [83]:
# Decode the first "classes" entry of every prediction into a str.
predicted_classes = [
    prediction["classes"][0].decode('utf8')
    for prediction in predictions
]

INFO:tensorflow:Restoring parameters from /tmp/titanic-feature-engineering-3/model.ckpt-4534


In [86]:
# Inspect the raw Age column of the test set; the recorded output shows it
# contains NaN entries.
test_df['Age']

0      34.5
1      47.0
2      62.0
3      27.0
4      22.0
5      14.0
6      30.0
7      26.0
8      18.0
9      21.0
10      NaN
11     46.0
12     23.0
13     63.0
14     47.0
15     24.0
16     35.0
17     21.0
18     27.0
19     45.0
20     55.0
21      9.0
22      NaN
23     21.0
24     48.0
25     50.0
26     22.0
27     22.5
28     41.0
29      NaN
       ... 
388    21.0
389     6.0
390    23.0
391    51.0
392    13.0
393    47.0
394    29.0
395    18.0
396    24.0
397    48.0
398    22.0
399    31.0
400    30.0
401    38.0
402    22.0
403    17.0
404    43.0
405    20.0
406    23.0
407    50.0
408     NaN
409     3.0
410     NaN
411    37.0
412    28.0
413     NaN
414    39.0
415    38.5
416     NaN
417     NaN
Name: Age, Length: 418, dtype: float64

In [57]:
# Display the repr of `dataset`.
# NOTE(review): no top-level `dataset` is assigned in the visible cells (the
# only assignment is a local inside input_fn) — presumably left over from an
# earlier interactive session; confirm or remove this cell.
dataset

<TextLineDataset shapes: (), types: tf.string>

In [44]:
# List the attributes of the estimator object to explore what it offers.
dir(model)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_assert_members_are_not_overridden',
 '_call_input_fn',
 '_call_model_fn',
 '_config',
 '_convert_eval_steps_to_hooks',
 '_convert_train_steps_to_hooks',
 '_create_and_assert_global_step',
 '_create_global_step',
 '_device_fn',
 '_evaluate_model',
 '_extract_batch_length',
 '_extract_keys',
 '_get_features_and_labels_from_input_fn',
 '_get_features_from_input_fn',
 '_model_dir',
 '_model_fn',
 '_params',
 '_session_config',
 '_train_model',
 'config',
 'evaluate',
 'export_savedmodel',
 'get_variable_names',
 'get_variable_value',
 'latest_checkpoint',
 'model_dir',
 'model_fn',
 'params',
 'predict',
 'train']

In [39]:
# Decode the first "classes" entry of every prediction into a str.
# NOTE(review): the output recorded for this cell is an AttributeError
# ('DataFrame' object has no attribute 'dtype') — presumably `predictions`
# came from a different input_fn at that time; confirm or drop this cell.
predicted_classes = [
    prediction["classes"][0].decode('utf8')
    for prediction in predictions
]

AttributeError: 'DataFrame' object has no attribute 'dtype'

In [35]:
# Split the header-less training CSV: the first 712 lines go to
# input/train.csv, every remaining line goes to input/dev.csv.
with open('input/train-no-header.csv', 'r') as read_file, \
        open('input/train.csv', 'w') as train_file, \
        open('input/dev.csv', 'w') as dev_file:
    for idx, line in enumerate(read_file):
        target_file = train_file if idx < 712 else dev_file
        target_file.write(line)

In [23]:
# Show the first line of the header-less file.
with open('test-no-header.csv') as f:
    first_line = f.readline()
print(first_line)

1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S



In [24]:
# Show the first two lines (header row and first record) of the file.
with open('test.csv') as f:
    for line_index in range(2):
        print(f.readline())

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked

1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S



In [4]:
# Decode a single CSV row into named columns and split off the label.
# NOTE(review): `csv_row` is not defined in any visible cell — confirm where
# it comes from before relying on this cell.
columns = tf.decode_csv(records=csv_row, record_defaults=_CSV_COLUMN_DEFAULTS)

features = {name: column for name, column in zip(_CSV_COLUMNS, columns)}
labels = features.pop('Survived')

In [None]:
# Read `file_length` examples one at a time through queue runners and print
# each one.
# NOTE(review): `file_length` and `col5` are not defined in any visible cell —
# confirm where they come from before running this cell.
print("loading, " + str(file_length) + " line(s)\n")
with tf.Session() as sess:
    # tf.initialize_all_variables() is deprecated; this is its replacement.
    tf.global_variables_initializer().run()

    # start populating filename queue
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(coord=coord)

    try:
        for i in range(file_length):
            # retrieve a single instance
            example, label = sess.run([features, col5])
            print(example, label)
    finally:
        # Always stop and join the queue-runner threads, even when a
        # sess.run call fails, so they are not left running.
        coord.request_stop()
        coord.join(threads)
    print("\ndone loading")