In [1]:
%bash
git clone https://github.com/GoogleCloudPlatform/training-data-analyst
rm -rf training-data-analyst/.git

Cloning into 'training-data-analyst'...


In [3]:
import datalab.bigquery as bq
import tensorflow as tf
import numpy as np
import shutil
print("tf version: ", tf.__version__)

tf version:  1.8.0


<h2> Load data datasets progressively with tf.data.Dataset API</h2>
    - read large dataset from disk progressively
    - refactor the feature creation so that it is not one-to-one with inputs. 
    - reading the whole data into memory won't be good for a large dataset 

<h3> Refactor the input</h3>
    - data will be delivered to the model, during training, in mini-bactches when needed from disc.

In [5]:
# features and label
CSV_COLUMNS = ['fare_amount', 'pickuplon','pickuplat','dropofflon','dropofflat','passengers', 'key']
LABEL_COLUMN = 'fare_amount'
DEFAULTS = [[0.0], [-74.0], [40.0], [-74.0], [40.7], [1.0], ['nokey']]

# decode csv file
def decode_csv(value_column):
  columns = tf.decode_csv(value_column, record_defaults=DEFAULTS)    # decode text
  features = dict(zip(CSV_COLUMNS, columns))    # assign col names
  label = features.pop(LABEL_COLUMN)   # label is the 'fare_amount'
  return features, label

# prepare data for progressive loading
# create input function for the model-- this will be called at model instantiated
# tf works in deffered execution, this function would serve as a node and return fresh bacth of data every time 
# it will help to avoid saturation of memeory

def input_fn():
  # create list of file names that macych pattern (i.e., dafa_file_*.csv)
  filenames_dataset = tf.data.Dataset.list_files(filename)
  # read lines from text files
  # apply one to many transformations (here: filename -> text lines)
  text_line_dataset = filenames_dataset.flat_map(tf.data.TextLineDataset)
  # parse text lines a csv
  # apply one to one  transformations (here: text line -> feature list)
  dataset = text_line_dataset.map(decode_csv())
  
  if mode == tf.estimator.ModeKeys.TRAIN:
    num_epochs = None # indefinetly
    dataset = dataset.shuffle(buffer_size = 10 * batch_size)
  else:
    num_epochs = 1 # this is for validation
  
  dataset = dataset.repeat(num_epochs).batch(batch_size)
  
  return dataset.make_one_shot_iterator().get_next()

# call input_fn
def read_dataset(filename, mode, batch_size = 512):
  return input_fn()


In [23]:
# get train and valid tf data
PATH = './training-data-analyst/courses/machine_learning/deepdive/03_tensorflow/04_features/'
def get_train():
  return read_dataset(PATH+'taxi-train.csv', mode = tf.estimator.ModeKeys.TRAIN)

def get_valid():
  return read_dataset(PATH+'taxi-valid.csv', mode = tf.estimator.ModeKeys.EVAL)

def get_test():
  return read_dataset(PATH+'taxi-test.csv', mode = tf.estimator.ModeKeys.EVAL)

In [27]:
print(PATH+'taxi-train.csv')

./training-data-analyst/courses/machine_learning/deepdive/03_tensorflow/04_features/taxi-train.csv


<h2> Refactor the way features are created. </h2>

refactoring this way will enable us to break the one-to-one relationship between inputs and features.

In [25]:
INPUT_COLUMNS = [
    tf.feature_column.numeric_column('pickuplon'),
    tf.feature_column.numeric_column('pickuplat'),
    tf.feature_column.numeric_column('dropofflat'),
    tf.feature_column.numeric_column('dropofflon'),
    tf.feature_column.numeric_column('passengers'),
]

def add_more_features(feats):
  # Nothing to add (yet!)
  return feats

feature_cols = add_more_features(INPUT_COLUMNS)

<h2> Create and train model </h2>

In [26]:
tf.logging.set_verbosity(tf.logging.INFO)
OUTDIR = 'taxi_trained'
shutil.rmtree(OUTDIR, ignore_errors = True) # start fresh each time
model = tf.estimator.LinearRegressor(
      feature_columns = feature_cols, model_dir = OUTDIR)
model.train(input_fn = get_train(), steps = 1000)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'taxi_trained', '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f76f53ea3c8>, '_service': None, '_global_id_in_cluster': 0, '_save_checkpoints_steps': None, '_session_config': None, '_train_distribute': None, '_log_step_count_steps': 100, '_save_checkpoints_secs': 600, '_tf_random_seed': None, '_keep_checkpoint_every_n_hours': 10000, '_keep_checkpoint_max': 5, '_evaluation_master': '', '_num_worker_replicas': 1, '_is_chief': True, '_num_ps_replicas': 0, '_master': '', '_save_summary_steps': 100, '_task_type': 'worker', '_task_id': 0}


NameError: name 'filename' is not defined

<h2> Evaluate model </h2>

In [None]:
def print_rmse(model, name, input_fn):
  metrics = model.evaluate(input_fn=input_fn, step = 1)
  print('RMSE on {} dataset = {}'.format(name, np.sqrt(metrics['average_loss'])))
print_rmse(model, 'validation', get_valid)

## Challenge Exercise

Create a neural network that is capable of finding the volume of a cylinder given the radius of its base (r) and its height (h). Assume that the radius and height of the cylinder are both in the range 0.5 to 2.0. Unlike in the challenge exercise for b_estimator.ipynb, assume that your measurements of r, h and V are all rounded off to the nearest 0.1. Simulate the necessary training dataset. This time, you will need a lot more data to get a good predictor.

Hint (highlight to see):
<p style='color:white'>
Create random values for r and h and compute V. Then, round off r, h and V (i.e., the volume is computed from the true value of r and h; it's only your measurement that is rounded off). Your dataset will consist of the round values of r, h and V. Do this for both the training and evaluation datasets.
</p>

Now modify the "noise" so that instead of just rounding off the value, there is up to a 10% error (uniformly distributed) in the measurement followed by rounding off.