In [79]:
import tensorflow as tf
from tensorflow.feature_column import embedding_column, numeric_column, categorical_column_with_identity
import pandas as pd
import matplotlib.pyplot as plt
print(tf.__version__)

1.13.1


In [80]:
TRAIN_PATH = "/Users/bryanwu/Desktop/taxi_fare_prediction/new-york-city-taxi-fare-prediction/train.csv"
df = pd.read_csv(TRAIN_PATH, nrows=1_000_000, parse_dates=['pickup_datetime'])

In [81]:
def preprocess(df):
    #remove negative fare_amount
    df = df[df['fare_amount'] > 0]
    
    #remove missing values
    df.dropna(how='any', axis='rows')
    
    #remove location outliers, ensuring locations are within the range seen in the train set
    def select_within_boundingbox(df, BB=(-74.5, -72.8, 40.5, 41.8)):
        return (df.pickup_longitude >= BB[0]) & (df.pickup_longitude <= BB[1]) & \
               (df.pickup_latitude >= BB[2]) & (df.pickup_latitude <= BB[3]) & \
               (df.dropoff_longitude >= BB[0]) & (df.dropoff_longitude <= BB[1]) & \
               (df.dropoff_latitude >= BB[2]) & (df.dropoff_latitude <= BB[3])
    
    df = df[select_within_boundingbox(df)]
    
    #remove trips in the water
    def remove_datapoints_from_water(df):
        def lonlat_to_xy(longitude, latitude, dx, dy, BB):
            return (dx*(longitude - BB[0])/(BB[1]-BB[0])).astype('int'), \
                   (dy - dy*(latitude - BB[2])/(BB[3]-BB[2])).astype('int')

        # define bounding box
        BB = (-74.5, -72.8, 40.5, 41.8)

        # read nyc mask and turn into boolean map with
        # land = True, water = False
        nyc_mask = plt.imread('https://aiblog.nl/download/nyc_mask-74.5_-72.8_40.5_41.8.png')[:,:,0] > 0.9

        # calculate for each lon,lat coordinate the xy coordinate in the mask map
        pickup_x, pickup_y = lonlat_to_xy(df.pickup_longitude, df.pickup_latitude, 
                                          nyc_mask.shape[1], nyc_mask.shape[0], BB)
        dropoff_x, dropoff_y = lonlat_to_xy(df.dropoff_longitude, df.dropoff_latitude, 
                                          nyc_mask.shape[1], nyc_mask.shape[0], BB)    
        # calculate boolean index
        idx = nyc_mask[pickup_y, pickup_x] & nyc_mask[dropoff_y, dropoff_x]

        # return only datapoints on land
        return df[idx]
    
    df = remove_datapoints_from_water(df)
    
    #Step 2: preprocess features
    weekday = df.pickup_datetime.apply(lambda t: t.weekday) 
    month = df.pickup_datetime.apply(lambda t: t.month) - 1
    hour = df.pickup_datetime.apply(lambda t: t.hour)
    
    #normalize continuous features
    df.pop('pickup_datetime')
    df.pop('key')
    mean = df.mean()
    std = df.std()
    df = (df-mean)/std
    #add new categorical columns
    df['weekday'] = weekday
    df['month'] = month
    df['hour'] = hour
    #drop pickup_datetime
    #drop key
    
    return df, (mean, std)
    

In [82]:
df.describe()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
count,1000000.0,1000000.0,1000000.0,999990.0,999990.0,1000000.0
mean,11.348079,-72.52664,39.929008,-72.52786,39.919954,1.684924
std,9.82209,12.057937,7.626154,11.324494,8.201418,1.323911
min,-44.9,-3377.680935,-3116.285383,-3383.296608,-3114.338567,0.0
25%,6.0,-73.99206,40.734965,-73.991385,40.734046,1.0
50%,8.5,-73.981792,40.752695,-73.980135,40.753166,1.0
75%,12.5,-73.967094,40.767154,-73.963654,40.768129,2.0
max,500.0,2522.271325,2621.62843,45.581619,1651.553433,208.0


In [83]:
df.dtypes

key                               object
fare_amount                      float64
pickup_datetime      datetime64[ns, UTC]
pickup_longitude                 float64
pickup_latitude                  float64
dropoff_longitude                float64
dropoff_latitude                 float64
passenger_count                    int64
dtype: object

In [84]:
df, (mean, std) = preprocess(df)
target = df.pop('fare_amount')
df.head()

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,weekday,month,hour
0,3.404932,-1.01039,3.529462,-1.198895,-0.52382,0,5,17
1,-1.065424,-1.349859,-0.134046,0.934117,-0.52382,1,0,16
2,-0.198357,0.343654,-0.452711,-0.027736,0.240877,3,7,0
3,-0.312682,-0.609644,-0.46136,0.202616,-0.52382,5,3,4
4,0.182804,0.572023,0.467757,0.987896,-0.52382,1,2,7


In [74]:
df.describe()

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,weekday,month,hour
count,9786.0,9786.0,9786.0,9786.0,9786.0,9786.0,9786.0,9786.0
mean,-1.436718e-12,1.696763e-12,-1.681276e-12,1.316297e-12,-4.574309e-17,3.022583,5.196301,13.523299
std,1.0,1.0,1.0,1.0,1.0,1.974027,3.425802,6.497538
min,-10.81557,-7.68022,-11.57908,-7.453599,-1.294011,0.0,0.0,0.0
25%,-0.4097441,-0.4707136,-0.4415501,-0.4585489,-0.5080632,1.0,2.0,9.0
50%,-0.1726986,0.05665008,-0.1677201,0.06551144,-0.5080632,3.0,5.0,14.0
75%,0.1494505,0.5033506,0.2168226,0.4823632,0.2778847,5.0,8.0,19.0
max,19.53614,18.76252,21.26904,18.1397,3.421676,6.0,11.0,23.0


In [75]:
input_fn_train = tf.estimator.inputs.pandas_input_fn(x=df, y=target, batch_size=16, num_epochs=50, shuffle=True, target_column="fare_amount")

In [76]:
feature_columns = [
    tf.feature_column.numeric_column('pickup_longitude'),
    tf.feature_column.numeric_column('pickup_latitude'),
    tf.feature_column.numeric_column('dropoff_longitude'),
    tf.feature_column.numeric_column('dropoff_latitude'),
    tf.feature_column.numeric_column('passenger_count'),
    embedding_column(categorical_column_with_identity('weekday', num_buckets=7), dimension=3),
    embedding_column(categorical_column_with_identity('month', num_buckets=12), dimension=3),
    embedding_column(categorical_column_with_identity('hour', num_buckets=24), dimension=5)
]
feature_columns

[NumericColumn(key='pickup_longitude', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='pickup_latitude', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='dropoff_longitude', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='dropoff_latitude', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='passenger_count', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 EmbeddingColumn(categorical_column=IdentityCategoricalColumn(key='weekday', number_buckets=7, default_value=None), dimension=3, combiner='mean', initializer=<tensorflow.python.ops.init_ops.TruncatedNormal object at 0xb375d1320>, ckpt_to_load_from=None, tensor_name_in_ckpt=None, max_norm=None, trainable=True),
 EmbeddingColumn(categorical_column=IdentityCategoricalColumn(key='month', number_buckets=12, default_value=None), dimension=3, combiner='m

In [77]:
est = tf.estimator.DNNRegressor(
    feature_columns=feature_columns, 
    hidden_units=[1024, 512, 256],
    optimizer=tf.train.AdamOptimizer()
)



INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/var/folders/wt/dv0qlxcn5j9b0h9vd8tr23bh0000gn/T/tmpg2bqngsi', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0xb375d1b38>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [78]:
est.train(input_fn=input_fn_train)


INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 0 into /var/folders/wt/dv0qlxcn5j9b0h9vd8tr23bh0000gn/T/tmpg2bqngsi/model.ckpt.
INFO:tensorflow:loss = 37.052, step = 1
INFO:tensorflow:global_step/sec: 181.608
INFO:tensorflow:loss = 1.4527016, step = 101 (0.551 sec)
INFO:tensorflow:global_step/sec: 287.778
INFO:tensorflow:loss = 1.1286051, step = 201 (0.348 sec)
INFO:tensorflow:global_step/sec: 276.553
INFO:tensorflow:loss = 3.0729916, step = 301 (0.361 sec)
INFO:tensorflow:global_step/sec: 289.464
INFO:tensorflow:loss = 5.9781485, step = 401 (0.346 sec)
INFO:tensorflow:global_step/sec: 296.813
INFO:tensorflow:loss = 1.1472969, step = 501 (0.337 sec)
INFO:tensorflow:global_step/sec: 290.23
INFO:tensorflow:loss = 1.40319, step = 601 (0.344 sec)
INFO:tensorflow:glo

INFO:tensorflow:loss = 1.4147434, step = 8001 (0.530 sec)
INFO:tensorflow:global_step/sec: 86.3668
INFO:tensorflow:loss = 0.6971652, step = 8101 (1.153 sec)
INFO:tensorflow:global_step/sec: 144.368
INFO:tensorflow:loss = 0.9412252, step = 8201 (0.692 sec)
INFO:tensorflow:global_step/sec: 188.488
INFO:tensorflow:loss = 0.7092048, step = 8301 (0.530 sec)
INFO:tensorflow:global_step/sec: 263.489
INFO:tensorflow:loss = 0.90203065, step = 8401 (0.379 sec)
INFO:tensorflow:global_step/sec: 171.95
INFO:tensorflow:loss = 0.43261763, step = 8501 (0.581 sec)
INFO:tensorflow:global_step/sec: 213.674
INFO:tensorflow:loss = 0.6602522, step = 8601 (0.469 sec)
INFO:tensorflow:global_step/sec: 189.601
INFO:tensorflow:loss = 1.5433037, step = 8701 (0.528 sec)
INFO:tensorflow:global_step/sec: 188.801
INFO:tensorflow:loss = 2.2120528, step = 8801 (0.529 sec)
INFO:tensorflow:global_step/sec: 210.172
INFO:tensorflow:loss = 0.35104588, step = 8901 (0.476 sec)
INFO:tensorflow:global_step/sec: 173.795
INFO:ten

INFO:tensorflow:global_step/sec: 275.355
INFO:tensorflow:loss = 0.7824408, step = 16301 (0.363 sec)
INFO:tensorflow:global_step/sec: 281.473
INFO:tensorflow:loss = 0.986133, step = 16401 (0.355 sec)
INFO:tensorflow:global_step/sec: 296.184
INFO:tensorflow:loss = 0.89785457, step = 16501 (0.338 sec)
INFO:tensorflow:global_step/sec: 286.632
INFO:tensorflow:loss = 0.8776952, step = 16601 (0.349 sec)
INFO:tensorflow:global_step/sec: 293.778
INFO:tensorflow:loss = 0.5230497, step = 16701 (0.340 sec)
INFO:tensorflow:global_step/sec: 287.393
INFO:tensorflow:loss = 1.1283033, step = 16801 (0.348 sec)
INFO:tensorflow:global_step/sec: 291.938
INFO:tensorflow:loss = 36.71952, step = 16901 (0.343 sec)
INFO:tensorflow:global_step/sec: 300.491
INFO:tensorflow:loss = 2.4781132, step = 17001 (0.333 sec)
INFO:tensorflow:global_step/sec: 283.618
INFO:tensorflow:loss = 200.52203, step = 17101 (0.353 sec)
INFO:tensorflow:global_step/sec: 260.756
INFO:tensorflow:loss = 2.9667828, step = 17201 (0.384 sec)
I

INFO:tensorflow:global_step/sec: 292.505
INFO:tensorflow:loss = 0.3721305, step = 24501 (0.342 sec)
INFO:tensorflow:global_step/sec: 272.128
INFO:tensorflow:loss = 0.35736883, step = 24601 (0.367 sec)
INFO:tensorflow:global_step/sec: 253.247
INFO:tensorflow:loss = 1.7335577, step = 24701 (0.395 sec)
INFO:tensorflow:global_step/sec: 271.429
INFO:tensorflow:loss = 0.21963948, step = 24801 (0.369 sec)
INFO:tensorflow:global_step/sec: 266.299
INFO:tensorflow:loss = 1.0317876, step = 24901 (0.375 sec)
INFO:tensorflow:global_step/sec: 219.009
INFO:tensorflow:loss = 0.97348595, step = 25001 (0.457 sec)
INFO:tensorflow:global_step/sec: 284.375
INFO:tensorflow:loss = 0.54867554, step = 25101 (0.352 sec)
INFO:tensorflow:global_step/sec: 218.018
INFO:tensorflow:loss = 0.45679334, step = 25201 (0.459 sec)
INFO:tensorflow:global_step/sec: 263.662
INFO:tensorflow:loss = 0.5187077, step = 25301 (0.378 sec)
INFO:tensorflow:global_step/sec: 252.876
INFO:tensorflow:loss = 0.86812985, step = 25401 (0.398

<tensorflow_estimator.python.estimator.canned.dnn.DNNRegressor at 0xb2ed219b0>

In [244]:
TEST_PATH = "/Users/bryanwu/Desktop/taxi_fare_prediction/new-york-city-taxi-fare-prediction/train.csv"
df_test = pd.read_csv(TEST_PATH, skiprows=range(1,1_000_000), parse_dates=['pickup_datetime'], nrows=2000)
df_test.head()
df_test, _ = preprocess(df_test)
target_test = df_test.pop('fare_amount')

In [245]:
input_fn_test = tf.estimator.inputs.pandas_input_fn(x=df_test, 
                                                     y=target_test,  
                                                    shuffle=False,
                                                    batch_size=2000,
                                                     target_column="fare_amount"
                                                    )

In [246]:
predictions = est.predict(
    input_fn_test,
    predict_keys=None,
    hooks=None,
    checkpoint_path=None,
    yield_single_examples=False
)

In [247]:
predictions = list(predictions)[0]

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /var/folders/wt/dv0qlxcn5j9b0h9vd8tr23bh0000gn/T/tmpg2bqngsi/model.ckpt-30582
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


In [248]:
predictions

{'predictions': array([[-0.6139192 ],
        [ 0.16843894],
        [-0.6945644 ],
        ...,
        [-0.44352132],
        [ 0.10787749],
        [ 0.9651627 ]], dtype=float32)}

In [249]:
predictions = predictions['predictions']

In [250]:
predictions

array([[-0.6139192 ],
       [ 0.16843894],
       [-0.6945644 ],
       ...,
       [-0.44352132],
       [ 0.10787749],
       [ 0.9651627 ]], dtype=float32)

In [251]:
mean.astype('float32')
std.astype('float32')

fare_amount          9.714552
pickup_longitude     0.038417
pickup_latitude      0.029505
dropoff_longitude    0.037575
dropoff_latitude     0.032689
passenger_count      1.307707
dtype: float32

In [252]:
predictions = predictions*std['fare_amount']+mean['fare_amount']

In [253]:
predictions

array([[ 5.3654404],
       [12.965699 ],
       [ 4.5820084],
       ...,
       [ 7.0207796],
       [12.377372 ],
       [20.705513 ]], dtype=float32)

In [254]:
len(predictions)

1967

In [255]:
target_test_tensor = tf.dtypes.cast(tf.constant(target_test), tf.float32)*std['fare_amount']+mean['fare_amount']
target_test_tensor

<tf.Tensor 'add:0' shape=(1967,) dtype=float32>

In [256]:
import numpy as np
target_test_tensor.shape
np.squeeze(predictions).shape

(1967,)

In [257]:
rsme = tf.metrics.mean_squared_error(labels=target_test_tensor, predictions=predictions)

In [258]:
from tensorflow import keras
rsme = keras.losses.mean_squared_error(np.squeeze(predictions), target_test_tensor)

In [259]:
session = tf.Session()

In [264]:
np.sqrt(session.run(rsme))

6.5826764

In [261]:
sess.run(target_test_tensor[:10])

array([ 3.9167438, 10.309787 ,  3.490541 , 11.268743 ,  6.9001637,
       21.39106  , 17.129032 ,  5.6215553, 22.456568 ,  6.9001637],
      dtype=float32)

In [263]:
predictions[:10]

array([[ 5.3654404],
       [12.965699 ],
       [ 4.5820084],
       [10.853432 ],
       [ 8.89147  ],
       [22.39938  ],
       [14.651457 ],
       [ 6.934065 ],
       [12.235401 ],
       [ 4.8407574]], dtype=float32)