In [1]:
%%bash
python --version
pip --version

Python 3.7.3
pip 19.0.3 from /Users/bryanwu/anaconda/lib/python3.7/site-packages/pip (python 3.7)


In [2]:
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from math import sin, cos, atan2, sqrt
from datetime import datetime
import matplotlib.pyplot as plt
import tensorflow_transform as tft
import tensorflow_transform.beam.impl as tft_beam
from tensorflow_transform.tf_metadata import dataset_metadata
from tensorflow_transform.tf_metadata import dataset_schema
import tensorflow as tf
import tempfile
import os
from tensorflow_transform.coders import example_proto_coder
from apache_beam.io import tfrecordio
from tensorflow_transform.beam.tft_beam_io import transform_fn_io
import random

  'Some syntactic constructs of Python 3 are not yet fully supported by '


In [3]:
class ProcessCSV(beam.DoFn):
    def process(self, element):
        key, fare_amount, pickup_datetime, pickup_longitude, pickup_latitude, \
        dropoff_longitude, dropoff_latitude, passenger_count = element.split(',')
        return [{
            'fare_amount': fare_amount,
            'pickup_datetime': pickup_datetime,
            'pickup_longitude': pickup_longitude,
            'pickup_latitude': pickup_latitude,
            'dropoff_longitude': dropoff_longitude,
            'dropoff_latitude': dropoff_latitude
        }]

In [4]:
class CalculateDistance(beam.DoFn):
    def process(self, element):
        pickup_longitude = float(element['pickup_longitude'])
        pickup_latitude = float(element['pickup_latitude'])
        dropoff_longitude = float(element['dropoff_longitude'])
        dropoff_latitude = float(element['dropoff_latitude'])
        del_longitude = pickup_longitude - dropoff_longitude
        del_latitude = pickup_latitude - dropoff_latitude
        a = sin(del_latitude/2)**2 + cos(pickup_latitude)*cos(dropoff_latitude)*sin(del_longitude/2)**2
        c = 2*atan2(sqrt(a), sqrt(1-a))
        R = 6371.0
        d = R * c
        element['distance'] = d
        return [element]

In [5]:
class FilterNoisyDataPoint(beam.DoFn):
    def process(self, element):
        def is_within_boundingbox(element, BB=(-74.5, -72.8, 40.5, 41.8)):
            pickup_longitude = float(element['pickup_longitude'])
            pickup_latitude = float(element['pickup_latitude'])
            dropoff_longitude = float(element['dropoff_longitude'])
            dropoff_latitude = float(element['dropoff_latitude'])
            return (pickup_longitude >= BB[0]) & (pickup_longitude <= BB[1]) & \
                   (pickup_latitude >= BB[2]) & (pickup_latitude <= BB[3]) & \
                   (dropoff_longitude >= BB[0]) & (dropoff_longitude <= BB[1]) & \
                   (dropoff_latitude >= BB[2]) & (dropoff_latitude <= BB[3])
        
        if is_within_boundingbox(element):
            return [element]

In [6]:
class ExtractDateTime(beam.DoFn):
    def process(self, element):
        dt = datetime.strptime(element['pickup_datetime'], '%Y-%m-%d %H:%M:%S UTC')
        element['hour'] = dt.hour
        element['month'] = dt.month
        element['week_number'] = dt.isocalendar()[1]
        element['weekday'] = dt.weekday()
        return [element]

In [7]:
class MergeToString(beam.DoFn):
    def process(self, element):
        column_names = ['fare_amount', 
                        'week_number', 
                        'weekday', 'hour', 
                        'pickup_longitude', 
                        'pickup_latitude', 
                        'dropoff_longitude',
                        'dropoff_latitude',
                        'distance']
        
        return [','.join(['{}']*len(column_names)).format(*[element[column] for column in column_names])]
        

In [8]:
def normalize(element):
    fare_amount = element['fare_amount']
    pickup_longitude = tft.scale_to_0_1(element['pickup_longitude'])
    pickup_latitude = tft.scale_to_0_1(element['pickup_latitude'])
    dropoff_longitude = tft.scale_to_0_1(element['dropoff_longitude'])
    dropoff_latitude = tft.scale_to_0_1(element['dropoff_latitude'])
    distance = tft.scale_to_0_1(element['distance'])
    hour = element['hour']
    month = element['month']
    week_number = element['week_number']
    weekday = element['weekday']
    return {
            'fare_amount': fare_amount,
            'pickup_longitude': pickup_longitude,
            'pickup_latitude': pickup_latitude,
            'dropoff_longitude': dropoff_longitude,
            'dropoff_latitude': dropoff_latitude,
            'distance': distance,
            'hour': hour,
            'month' : month,
            'week_number': week_number,
            'weekday': weekday
        }
    
    
    
    

In [9]:
opts = PipelineOptions(
    runner='direct'
)
pipe = beam.Pipeline(options=opts)

In [10]:
file_location = './new-york-city-taxi-fare-prediction/train_subset.csv'
column_names = ['fare_amount', 
                'week_number', 
                'weekday', 
                'hour', 
                'month',
                'pickup_longitude', 
                'pickup_latitude', 
                'dropoff_longitude',
                'dropoff_latitude',
                'distance']

raw_data = (
        pipe 
         | 'ReadFile' >> beam.io.ReadFromText(file_location, skip_header_lines=1)
         | 'SplitCSV' >> beam.ParDo(ProcessCSV())
         | 'CalculateDistance' >> beam.ParDo(CalculateDistance())
         | 'FilterNoisyDataPoints' >> beam.ParDo(FilterNoisyDataPoint())
         | 'ExtractDateTime' >> beam.ParDo(ExtractDateTime())
#          | 'MergeToString' >> beam.ParDo(MergeToString())
#          | 'WriteToGCS' >> beam.io.WriteToText('./tmp/transformed_train.csv', header=', '.join(column_names))
        )

raw_data_metadata = dataset_metadata.DatasetMetadata(dataset_schema.from_feature_spec({
            'fare_amount': tf.FixedLenFeature([], tf.float32),
            'pickup_longitude': tf.FixedLenFeature([], tf.float32),
            'pickup_latitude': tf.FixedLenFeature([], tf.float32),
            'dropoff_longitude': tf.FixedLenFeature([], tf.float32),
            'dropoff_latitude': tf.FixedLenFeature([], tf.float32),
            'distance': tf.FixedLenFeature([], tf.float32),
            'hour': tf.FixedLenFeature([], tf.int64),
            'month': tf.FixedLenFeature([], tf.int64),
            'week_number': tf.FixedLenFeature([], tf.int64),
            'weekday': tf.FixedLenFeature([], tf.int64)
}))


with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
    dataset_and_metadata, transform_fn = (raw_data, raw_data_metadata) | tft_beam.AnalyzeAndTransformDataset(
                normalize)
    
    dataset, metadata = dataset_and_metadata
    eval_percent = 10
    train_dataset, eval_dataset = (
        dataset
        | 'Split dataset' >> beam.Partition(
            lambda elem, _: int(random.uniform(0, 100) < eval_percent), 2))
    
    coder = example_proto_coder.ExampleProtoCoder(metadata.schema)
    
    
    train_dataset_dir = os.path.join('dataset', 'train')
    eval_dataset_dir = os.path.join('dataset', 'eval')
    work_dir = os.path.join('dataset', 'transform_fn')
    
    train_dataset_prefix = os.path.join(train_dataset_dir, 'train')
    _ = (
        train_dataset
        | 'Write train dataset' >> tfrecordio.WriteToTFRecord(
            train_dataset_prefix, coder))

    eval_dataset_prefix = os.path.join(eval_dataset_dir, 'eval')
    _ = (
        eval_dataset
        | 'Write eval dataset' >> tfrecordio.WriteToTFRecord(
            eval_dataset_prefix, coder))

    # Write the transform_fn
    _ = (
        transform_fn
        | 'Write transformFn' >> transform_fn_io.WriteTransformFn(work_dir))
    # [END dataflow_molecules_write_tfrecords]

Instructions for updating:
Use tf.cast instead.
Instructions for updating:
This function will only be available through the v1 compatibility library as tf.compat.v1.saved_model.utils.build_tensor_info or tf.compat.v1.saved_model.build_tensor_info.
INFO:tensorflow:Assets added to graph.
INFO:tensorflow:No assets to write.
INFO:tensorflow:SavedModel written to: /var/folders/wt/dv0qlxcn5j9b0h9vd8tr23bh0000gn/T/tmplwh7_83u/tftransform_tmp/802458ea30774a659a41d9313a9bd37e/saved_model.pb
INFO:tensorflow:Assets added to graph.
INFO:tensorflow:No assets to write.
INFO:tensorflow:SavedModel written to: /var/folders/wt/dv0qlxcn5j9b0h9vd8tr23bh0000gn/T/tmplwh7_83u/tftransform_tmp/1bb4da4af7dd4123b850ec9c1d49e5eb/saved_model.pb


In [11]:
pipe.run()

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Assets added to graph.


INFO:tensorflow:Assets added to graph.


INFO:tensorflow:No assets to write.


INFO:tensorflow:No assets to write.


INFO:tensorflow:SavedModel written to: /var/folders/wt/dv0qlxcn5j9b0h9vd8tr23bh0000gn/T/tmplwh7_83u/tftransform_tmp/b6789422d4de40fca7068005985dfc1a/saved_model.pb


INFO:tensorflow:SavedModel written to: /var/folders/wt/dv0qlxcn5j9b0h9vd8tr23bh0000gn/T/tmplwh7_83u/tftransform_tmp/b6789422d4de40fca7068005985dfc1a/saved_model.pb


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


<apache_beam.runners.portability.fn_api_runner.RunnerResult at 0xb3bc3a1d0>