# Using the Dataset API from Tensorflow to read the data

### Importing required libraries

In [1]:
import tensorflow as tf
import numpy as np

### Global variables

In [2]:
CSV_COLUMNS = ['fare_amount', 'pickuplon','pickuplat','dropofflon','dropofflat','passengers', 'key']
LABEL_COLUMN = 'fare_amount'
DEFAULTS = [[0.0], [-74.0], [40.0], [-74.0], [40.7], [1.0], ['nokey']]

### Defining input function

In [20]:
def read_dataset(filename, mode, batch_size = 512):
    def get_input_fn():
        def decode_line(column):
            columns = tf.decode_csv(column, record_defaults=DEFAULTS)
            features = dict(zip(CSV_COLUMNS, columns))
            label = features.pop(LABEL_COLUMN)
            return features, label
        
        files = tf.data.Dataset.list_files(file_pattern=filename)
        lines = files.flat_map(tf.data.TextLineDataset)
        dataset = lines.map(decode_line)
        
        if mode == tf.estimator.ModeKeys.TRAIN:
            num_epochs = None
            dataset = dataset.shuffle(buffer_size = 10 * batch_size)
        else:
            num_epochs = 1
        
        dataset = dataset.repeat(num_epochs).batch(batch_size)
        
        return dataset.make_one_shot_iterator().get_next()
    return get_input_fn
        

In [21]:
def get_train():
    return read_dataset('../Data/taxi-train.csv', mode = tf.estimator.ModeKeys.TRAIN)

def get_valid():
    return read_dataset('../Data/taxi-valid.csv', mode = tf.estimator.ModeKeys.EVAL)

def get_test():
    return read_dataset('../Data/taxi-test.csv', mode = tf.estimator.ModeKeys.EVAL)


In [22]:
INPUT_COLUMNS = [
    tf.feature_column.numeric_column('pickuplon'),
    tf.feature_column.numeric_column('pickuplat'),
    tf.feature_column.numeric_column('dropofflat'),
    tf.feature_column.numeric_column('dropofflon'),
    tf.feature_column.numeric_column('passengers'),
]

def add_more_features(feats):
  # Nothing to add (yet!)
  return feats

feature_cols = add_more_features(INPUT_COLUMNS)


In [23]:
tf.logging.set_verbosity(tf.logging.INFO)
model = tf.estimator.LinearRegressor(
      feature_columns = feature_cols)
model.train(input_fn = get_train(), steps = 1000)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_task_type': 'worker', '_is_chief': True, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f0ab5935cd0>, '_save_checkpoints_steps': None, '_keep_checkpoint_every_n_hours': 10000, '_service': None, '_num_ps_replicas': 0, '_tf_random_seed': None, '_master': '', '_num_worker_replicas': 1, '_task_id': 0, '_log_step_count_steps': 100, '_model_dir': '/tmp/tmp5PHa0g', '_save_summary_steps': 100}
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 1 into /tmp/tmp5PHa0g/model.ckpt.
INFO:tensorflow:loss = 102167.0, step = 1
INFO:tensorflow:global_step/sec: 43.2656
INFO:tensorflow:loss = 40883.7, step = 101 (2.313 sec)
INFO:tensorflow:global_step/sec: 39.4332
INFO:tensorflow:loss = 42026.3, step = 201 (2.536 sec)
INFO:tensorflow:global_step/sec: 43.4176
INFO:tensorflow:loss = 38638.1,

<tensorflow.python.estimator.canned.linear.LinearRegressor at 0x7f0ab58a4410>

In [24]:
def print_rmse(model, name, input_fn):
  metrics = model.evaluate(input_fn = input_fn, steps = 1)
  print('RMSE on {} dataset = {}'.format(name, np.sqrt(metrics['average_loss'])))
print_rmse(model, 'validation', get_valid())

INFO:tensorflow:Starting evaluation at 2018-06-18-16:57:53
INFO:tensorflow:Restoring parameters from /tmp/tmp5PHa0g/model.ckpt-1000
INFO:tensorflow:Evaluation [1/1]
INFO:tensorflow:Finished evaluation at 2018-06-18-16:57:53
INFO:tensorflow:Saving dict for global step 1000: average_loss = 127.043, global_step = 1000, loss = 65046.1
RMSE on validation dataset = 11.2713451385
