# Load and process data

In [1]:
from __future__ import absolute_import, division, print_function, unicode_literals
import functools

import numpy as np
import tensorflow as tf

## CSV

In [2]:
# Remote locations of the Titanic training and evaluation CSV files.
TRAIN_DATA_URL = "https://storage.googleapis.com/tf-datasets/titanic/train.csv"
TEST_DATA_URL = "https://storage.googleapis.com/tf-datasets/titanic/eval.csv"

# Fetch each file and keep the value get_file returns (used below as the
# local file path handed to the CSV reader).
train_file_path = tf.keras.utils.get_file(fname="train.csv", origin=TRAIN_DATA_URL)
test_file_path = tf.keras.utils.get_file(fname="eval.csv", origin=TEST_DATA_URL)

Downloading data from https://storage.googleapis.com/tf-datasets/titanic/train.csv
Downloading data from https://storage.googleapis.com/tf-datasets/titanic/eval.csv


In [4]:
# Keep printed arrays compact: three digits of precision and fixed-point
# notation instead of scientific notation.
np.set_printoptions(suppress=True, precision=3)

In [10]:
# Name of the CSV column used as the label when the datasets are built.
LABEL_COLUMN = "survived"
# Presumably the values the label column can take -- TODO confirm against the data.
LABELS = [0, 1]

In [11]:
def get_dataset(file_path, **kwargs):
    """Build a batched dataset from a CSV file with notebook-friendly defaults.

    Thin wrapper around ``tf.data.experimental.make_csv_dataset``.

    Args:
        file_path: Passed unchanged as the first argument of
            ``make_csv_dataset``.
        **kwargs: Extra keyword arguments forwarded to ``make_csv_dataset``
            (for example ``column_names`` or ``select_columns``). A keyword
            that matches one of the defaults below replaces that default
            instead of raising a duplicate-keyword ``TypeError``.

    Returns:
        Whatever ``make_csv_dataset`` returns for the given arguments.
    """
    options = dict(
        batch_size=5,  # Artificially small to make examples easier to show.
        label_name=LABEL_COLUMN,
        na_value="?",
        num_epochs=1,
        ignore_errors=True,
    )
    # Caller-supplied keyword arguments take precedence over the defaults.
    options.update(kwargs)
    return tf.data.experimental.make_csv_dataset(file_path, **options)

In [12]:
# Build one dataset per downloaded file, using get_dataset's built-in settings.
raw_train_data = get_dataset(file_path=train_file_path)
raw_test_data = get_dataset(file_path=test_file_path)

In [21]:
def show_batch(dataset):
    """Print every feature of the first batch of ``dataset``, one per line.

    Takes a single ``(features, label)`` element via ``dataset.take(1)`` and,
    for each entry of the features mapping, prints the key left-aligned in a
    20-character field followed by the result of calling ``.numpy()`` on the
    value. The label half of the pair is not printed.
    """
    line_template = "{:20s}: {}"
    for features, _ in dataset.take(1):
        for name, column in features.items():
            print(line_template.format(name, column.numpy()))

In [22]:
# Inspect one batch of the training data as loaded with the default settings.
show_batch(dataset=raw_train_data)

sex                 : [b'male' b'male' b'female' b'male' b'male']
age                 : [28.5 21.  28.  28.  28. ]
n_siblings_spouses  : [0 0 0 0 0]
parch               : [0 0 0 0 0]
fare                : [ 7.229  7.796  8.05  26.    30.696]
class               : [b'Third' b'Third' b'Third' b'First' b'First']
deck                : [b'unknown' b'unknown' b'unknown' b'A' b'unknown']
embark_town         : [b'Cherbourg' b'Southampton' b'Southampton' b'Southampton' b'Cherbourg']
alone               : [b'y' b'y' b'y' b'y' b'y']


In [15]:
# Column names supplied explicitly; get_dataset forwards them to
# make_csv_dataset through its **kwargs.
CSV_COLUMNS = [
    'survived', 'sex', 'age', 'n_siblings_spouses', 'parch',
    'fare', 'class', 'deck', 'embark_town', 'alone',
]
temp_dataset = get_dataset(file_path=train_file_path, column_names=CSV_COLUMNS)

In [16]:
# Inspect one batch of the dataset built with explicit column names.
show_batch(dataset=temp_dataset)

sex                 : [b'male' b'female' b'male' b'male' b'male']
age                 : [28. 41. 24. 18. 19.]
n_siblings_spouses  : [0 0 0 0 0]
parch               : [0 1 0 0 0]
fare                : [56.496 19.5   79.2   13.     7.65 ]
class               : [b'Third' b'Second' b'First' b'Second' b'Third']
deck                : [b'unknown' b'unknown' b'B' b'unknown' b'F']
embark_town         : [b'Southampton' b'Southampton' b'Cherbourg' b'Southampton' b'Southampton']
alone               : [b'y' b'n' b'y' b'y' b'y']


In [17]:
# Load only a subset of the columns. 'survived' stays in the selection
# because get_dataset uses it as the label column.
SELECT_COLUMNS = [
    'survived', 'age', 'n_siblings_spouses',
    'class', 'deck', 'alone',
]
temp_dataset = get_dataset(file_path=train_file_path, select_columns=SELECT_COLUMNS)

In [18]:
# Inspect one batch of the dataset restricted to the selected columns.
show_batch(dataset=temp_dataset)

age                 : [30. 41. 18. 17. 49.]
n_siblings_spouses  : [0 2 0 1 0]
class               : [b'Third' b'Third' b'Second' b'First' b'Third']
deck                : [b'unknown' b'unknown' b'unknown' b'C' b'unknown']
alone               : [b'y' b'n' b'n' b'n' b'y']
