In [265]:
import tensorflow as tf
from tensorflow.feature_column import embedding_column, numeric_column, categorical_column_with_identity
import pandas as pd
import matplotlib.pyplot as plt
print(tf.__version__)

1.13.1


In [None]:
TRAIN_PATH = "/Users/bryanwu/Desktop/taxi_fare_prediction/new-york-city-taxi-fare-prediction/train.csv"
df = pd.read_csv(TRAIN_PATH, nrows=1_000_000, parse_dates=['pickup_datetime'])

In [None]:
def preprocess(df):
    #remove negative fare_amount
    df = df[df['fare_amount'] > 0]
    
    #remove missing values
    df.dropna(how='any', axis='rows')
    
    #remove location outliers, ensuring locations are within the range seen in the train set
    def select_within_boundingbox(df, BB=(-74.5, -72.8, 40.5, 41.8)):
        return (df.pickup_longitude >= BB[0]) & (df.pickup_longitude <= BB[1]) & \
               (df.pickup_latitude >= BB[2]) & (df.pickup_latitude <= BB[3]) & \
               (df.dropoff_longitude >= BB[0]) & (df.dropoff_longitude <= BB[1]) & \
               (df.dropoff_latitude >= BB[2]) & (df.dropoff_latitude <= BB[3])
    
    df = df[select_within_boundingbox(df)]
    
    #remove trips in the water
    def remove_datapoints_from_water(df):
        def lonlat_to_xy(longitude, latitude, dx, dy, BB):
            return (dx*(longitude - BB[0])/(BB[1]-BB[0])).astype('int'), \
                   (dy - dy*(latitude - BB[2])/(BB[3]-BB[2])).astype('int')

        # define bounding box
        BB = (-74.5, -72.8, 40.5, 41.8)

        # read nyc mask and turn into boolean map with
        # land = True, water = False
        nyc_mask = plt.imread('https://aiblog.nl/download/nyc_mask-74.5_-72.8_40.5_41.8.png')[:,:,0] > 0.9

        # calculate for each lon,lat coordinate the xy coordinate in the mask map
        pickup_x, pickup_y = lonlat_to_xy(df.pickup_longitude, df.pickup_latitude, 
                                          nyc_mask.shape[1], nyc_mask.shape[0], BB)
        dropoff_x, dropoff_y = lonlat_to_xy(df.dropoff_longitude, df.dropoff_latitude, 
                                          nyc_mask.shape[1], nyc_mask.shape[0], BB)    
        # calculate boolean index
        idx = nyc_mask[pickup_y, pickup_x] & nyc_mask[dropoff_y, dropoff_x]

        # return only datapoints on land
        return df[idx]
    
    df = remove_datapoints_from_water(df)
    
    #Step 2: preprocess features
    weekday = df.pickup_datetime.apply(lambda t: t.weekday) 
    month = df.pickup_datetime.apply(lambda t: t.month) - 1
    hour = df.pickup_datetime.apply(lambda t: t.hour)
    minute = df.pickup_datetime.apply(lambda t: t.minute)
    dayofyear= df.pickup_datetime.apply(lambda t: t.dayofyear)
    nyc_mask = plt.imread('https://aiblog.nl/download/nyc_mask-74.5_-72.8_40.5_41.8.png')[:,:,0] > 0.9
    #translate longitude/latitude coordinate into image xy coordinate
    def lonlat_to_xy(longitude, latitude, dx=126, dy=124, BB=(-74.5, -72.8, 40.5, 41.8)):
        return (dx*(longitude - BB[0])/(BB[1]-BB[0])).astype('int'), \
               (dy - dy*(latitude - BB[2])/(BB[3]-BB[2])).astype('int')
    
    pickup_x, pickup_y = lonlat_to_xy(df.pickup_longitude, df.pickup_latitude)
    dropoff_x, dropoff_y = lonlat_to_xy(df.dropoff_longitude, df.dropoff_latitude)
    
    #keep numeric feature only
    df.pop('pickup_datetime')
    df.pop('key')
    df.pop('passenger_count')
    df['time_in_day'] = hour*60.0 + minute
    df['pickup_x'] = pickup_x
    df['pickup_y'] = pickup_y
    df['dropoff_x'] = dropoff_x
    df['dropoff_y'] = dropoff_y
    
    #normalize df
    mean = df.mean()
    std = df.std()
    #df = (df-mean)/std
    #add new categorical columns
    df['weekday'] = weekday
    df['month'] = month
    df['hour'] = hour
    df['minute'] = minute
    df['dayofyear'] = dayofyear
    
    
    return df, (mean, std)
    

In [None]:
df.describe()

In [None]:
df.dtypes

In [None]:
df, (mean, std) = preprocess(df)
target = df.pop('fare_amount')
df.head()

In [None]:
df.describe()

In [None]:
input_fn_train = tf.estimator.inputs.pandas_input_fn(x=df, y=target, batch_size=16, num_epochs=100, shuffle=True, target_column="fare_amount")

In [None]:
def build_feature_columns():
    pickup_longitude = tf.feature_column.numeric_column('pickup_longitude')
    pickup_latitude = tf.feature_column.numeric_column('pickup_latitude')
    dropoff_longitude = tf.feature_column.numeric_column('dropoff_longitude')
    dropoff_latitude = tf.feature_column.numeric_column('dropoff_latitude')
    
    base_columns = [
        tf.feature_column.categorical_column_with_identity('month', 12),
        tf.feature_column.categorical_column_with_identity('hour', 24),
        tf.feature_column.categorical_column_with_identity('weekday', 7),
        tf.feature_column.bucketized_column(tf.feature_column.numeric_column('minute'), list(range(0,60,10)))
    ]
    
    pickup_x = tf.feature_column.categorical_column_with_identity('pickup_x', 126)
    pickup_y = tf.feature_column.categorical_column_with_identity('pickup_y', 124)
    dropoff_x = tf.feature_column.categorical_column_with_identity('dropoff_x', 126)
    dropoff_y = tf.feature_column.categorical_column_with_identity('dropoff_y', 124)
    
    cross_columns = [
        tf.feature_column.crossed_column([pickup_x, pickup_y], 126*124),
        tf.feature_column.crossed_column([dropoff_x, dropoff_y], 126*124)
    ]
    
    wide_columns = base_columns + cross_columns
    
    time_in_day = tf.feature_column.numeric_column('time_in_day')
    dayofyear = tf.feature_column.bucketized_column(tf.feature_column.numeric_column('dayofyear'), list(range(1, 365, 7)))
    deep_columns = [
        tf.feature_column.numeric_column('pickup_longitude'),
        tf.feature_column.numeric_column('pickup_latitude'),
        tf.feature_column.numeric_column('dropoff_longitude'),
        tf.feature_column.numeric_column('dropoff_latitude'),
        embedding_column(dayofyear, dimension=16),
        embedding_column(tf.feature_column.bucketized_column(time_in_day, boundaries=list(range(0,24*60, 15))), dimension=32)
    ]
    
    return wide_columns, deep_columns

In [None]:
# feature_columns = [
#     tf.feature_column.numeric_column('pickup_longitude'),
#     tf.feature_column.numeric_column('pickup_latitude'),
#     tf.feature_column.numeric_column('dropoff_longitude'),
#     tf.feature_column.numeric_column('dropoff_latitude'),
#     tf.feature_column.numeric_column('passenger_count'),
#     embedding_column(categorical_column_with_identity('weekday', num_buckets=7), dimension=3),
#     embedding_column(categorical_column_with_identity('month', num_buckets=12), dimension=3),
#     embedding_column(categorical_column_with_identity('hour', num_buckets=24), dimension=5)
# ]
# feature_columns

In [None]:
# wide_columns, deep_columns = build_feature_columns()
# print(wide_columns, deep_columns)

In [None]:
def build_estimator(fc_fn):
    wide_columns, deep_columns = fc_fn()
    hidden_units = [100, 75, 50, 25]
    return tf.estimator.DNNLinearCombinedRegressor(
        linear_feature_columns=wide_columns,
        dnn_feature_columns=deep_columns,
        dnn_hidden_units=hidden_units)
est = build_estimator(build_feature_columns)

In [None]:
# est = tf.estimator.DNNRegressor(
#     feature_columns=feature_columns, 
#     hidden_units=[1024, 512, 256],
#     optimizer=tf.train.AdamOptimizer()
# )



In [None]:
est.train(input_fn=input_fn_train)


In [376]:
TEST_PATH = "/Users/bryanwu/Desktop/taxi_fare_prediction/new-york-city-taxi-fare-prediction/train.csv"
df_test = pd.read_csv(TEST_PATH, skiprows=range(1,1_000_000), parse_dates=['pickup_datetime'], nrows=2000)
df_test.head()
df_test, _ = preprocess(df_test)
target_test = df_test.pop('fare_amount')

In [245]:
input_fn_test = tf.estimator.inputs.pandas_input_fn(x=df_test, 
                                                     y=target_test,  
                                                    shuffle=False,
                                                    batch_size=2000,
                                                     target_column="fare_amount"
                                                    )

In [246]:
predictions = est.predict(
    input_fn_test,
    predict_keys=None,
    hooks=None,
    checkpoint_path=None,
    yield_single_examples=False
)

In [247]:
predictions = list(predictions)[0]

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /var/folders/wt/dv0qlxcn5j9b0h9vd8tr23bh0000gn/T/tmpg2bqngsi/model.ckpt-30582
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


In [248]:
predictions

{'predictions': array([[-0.6139192 ],
        [ 0.16843894],
        [-0.6945644 ],
        ...,
        [-0.44352132],
        [ 0.10787749],
        [ 0.9651627 ]], dtype=float32)}

In [249]:
predictions = predictions['predictions']

In [250]:
predictions

array([[-0.6139192 ],
       [ 0.16843894],
       [-0.6945644 ],
       ...,
       [-0.44352132],
       [ 0.10787749],
       [ 0.9651627 ]], dtype=float32)

In [251]:
mean.astype('float32')
std.astype('float32')

fare_amount          9.714552
pickup_longitude     0.038417
pickup_latitude      0.029505
dropoff_longitude    0.037575
dropoff_latitude     0.032689
passenger_count      1.307707
dtype: float32

In [252]:
predictions = predictions*std['fare_amount']+mean['fare_amount']

In [253]:
predictions

array([[ 5.3654404],
       [12.965699 ],
       [ 4.5820084],
       ...,
       [ 7.0207796],
       [12.377372 ],
       [20.705513 ]], dtype=float32)

In [254]:
len(predictions)

1967

In [255]:
target_test_tensor = tf.dtypes.cast(tf.constant(target_test), tf.float32)*std['fare_amount']+mean['fare_amount']
target_test_tensor

<tf.Tensor 'add:0' shape=(1967,) dtype=float32>

In [256]:
import numpy as np
target_test_tensor.shape
np.squeeze(predictions).shape

(1967,)

In [257]:
rsme = tf.metrics.mean_squared_error(labels=target_test_tensor, predictions=predictions)

In [258]:
from tensorflow import keras
rsme = keras.losses.mean_squared_error(np.squeeze(predictions), target_test_tensor)

In [259]:
session = tf.Session()

In [264]:
np.sqrt(session.run(rsme))

6.5826764

In [261]:
sess.run(target_test_tensor[:10])

array([ 3.9167438, 10.309787 ,  3.490541 , 11.268743 ,  6.9001637,
       21.39106  , 17.129032 ,  5.6215553, 22.456568 ,  6.9001637],
      dtype=float32)

In [263]:
predictions[:10]

array([[ 5.3654404],
       [12.965699 ],
       [ 4.5820084],
       [10.853432 ],
       [ 8.89147  ],
       [22.39938  ],
       [14.651457 ],
       [ 6.934065 ],
       [12.235401 ],
       [ 4.8407574]], dtype=float32)