In [None]:
#reference: https://www.kaggle.com/breemen/nyc-taxi-fare-data-exploration#New-York-City-Taxi-Fare-Prediction-Playground-Competition

In [4]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('seaborn-whitegrid')

In [9]:
# Location of the Kaggle training CSV, relative to the notebook's working directory.
TRAIN_FILE = './new-york-city-taxi-fare-prediction/train.csv'
# Read only the first 100,000 rows and parse pickup_datetime into a datetime column.
df_train = pd.read_csv(
    TRAIN_FILE,
    parse_dates=['pickup_datetime'],
    nrows=100_000,
)

In [10]:
# Preview the first rows of the training sample to check that the columns were parsed as expected.
df_train.head()

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2009-06-15 17:26:21.0000001,4.5,2009-06-15 17:26:21+00:00,-73.844311,40.721319,-73.84161,40.712278,1
1,2010-01-05 16:52:16.0000002,16.9,2010-01-05 16:52:16+00:00,-74.016048,40.711303,-73.979268,40.782004,1
2,2011-08-18 00:35:00.00000049,5.7,2011-08-18 00:35:00+00:00,-73.982738,40.76127,-73.991242,40.750562,2
3,2012-04-21 04:30:42.0000001,7.7,2012-04-21 04:30:42+00:00,-73.98713,40.733143,-73.991567,40.758092,1
4,2010-03-09 07:51:00.000000135,5.3,2010-03-09 07:51:00+00:00,-73.968095,40.768008,-73.956655,40.783762,1


In [11]:
# Column dtypes of the training sample; pickup_datetime was passed to parse_dates when loading.
df_train.dtypes

key                               object
fare_amount                      float64
pickup_datetime      datetime64[ns, UTC]
pickup_longitude                 float64
pickup_latitude                  float64
dropoff_longitude                float64
dropoff_latitude                 float64
passenger_count                    int64
dtype: object

In [12]:
# Summary statistics of the raw training sample. The saved output shows values that cannot be
# real trips (a negative minimum fare, latitudes above 400), which the cleaning steps address.
df_train.describe()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
count,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,11.354652,-72.494682,39.914481,-72.490967,39.919053,1.67382
std,9.716777,10.693934,6.225686,10.471386,6.213427,1.300171
min,-44.9,-736.55,-74.00767,-84.654241,-74.006377,0.0
25%,6.0,-73.992041,40.734996,-73.991215,40.734182,1.0
50%,8.5,-73.981789,40.752765,-73.98,40.753243,1.0
75%,12.5,-73.966982,40.767258,-73.963433,40.768166,2.0
max,200.0,40.787575,401.083332,40.851027,404.616667,6.0


In [13]:
print(f"Old N Rows: {len(df_train)}")
# A valid fare is strictly positive: rows with a zero or negative fare_amount are discarded.
df_train = df_train.loc[df_train['fare_amount'].gt(0)]
print(f"New N Rows: {len(df_train)}")

Old N Rows: 100000


In [20]:
# Number of missing values in each column of the training sample.
print(df_train.isna().sum())

key                  0
fare_amount          0
pickup_datetime      0
pickup_longitude     0
pickup_latitude      0
dropoff_longitude    0
dropoff_latitude     0
passenger_count      0
dtype: int64


In [21]:
print(f"Old N Rows: {len(df_train)}")
# Discard every row that has a missing value in any column
# (dropna defaults to how='any' along the row axis).
df_train = df_train.dropna()
print(f"New N Rows: {len(df_train)}")

Old N Rows: 99988
New N Rows: 99988


In [22]:
# Load the full test set, parsing pickup_datetime into a datetime column.
TEST_FILE = './new-york-city-taxi-fare-prediction/test.csv'
df_test = pd.read_csv(
    TEST_FILE,
    parse_dates=['pickup_datetime'],
)

In [23]:
# Preview the first rows of the test set; the saved output has no fare_amount column.
df_test.head()

Unnamed: 0,key,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2015-01-27 13:08:24.0000002,2015-01-27 13:08:24+00:00,-73.97332,40.763805,-73.98143,40.743835,1
1,2015-01-27 13:08:24.0000003,2015-01-27 13:08:24+00:00,-73.986862,40.719383,-73.998886,40.739201,1
2,2011-10-08 11:53:44.0000002,2011-10-08 11:53:44+00:00,-73.982524,40.75126,-73.979654,40.746139,1
3,2012-12-01 21:12:12.0000002,2012-12-01 21:12:12+00:00,-73.98116,40.767807,-73.990448,40.751635,1
4,2012-12-01 21:12:12.0000003,2012-12-01 21:12:12+00:00,-73.966046,40.789775,-73.988565,40.744427,1


In [24]:
# Summary statistics of the test set. In the saved output the coordinates stay within roughly
# -74.3..-73.0 longitude and 40.6..41.7 latitude, far tighter than the raw training sample.
df_test.describe()

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
count,9914.0,9914.0,9914.0,9914.0,9914.0
mean,-73.974722,40.751041,-73.973657,40.751743,1.671273
std,0.042774,0.033541,0.039072,0.035435,1.278747
min,-74.252193,40.573143,-74.263242,40.568973,1.0
25%,-73.992501,40.736125,-73.991247,40.735254,1.0
50%,-73.982326,40.753051,-73.980015,40.754065,1.0
75%,-73.968013,40.767113,-73.964059,40.768757,2.0
max,-72.986532,41.709555,-72.990963,41.696683,6.0


In [38]:
# (lon_min, lon_max, lat_min, lat_max) of the area covered by the land/water mask image.
NYC_BOUNDING_BOX = (-74.5, -72.8, 40.5, 41.8)
# Mask image spanning NYC_BOUNDING_BOX; its file name encodes the same four bounds.
NYC_MASK_URL = 'https://aiblog.nl/download/nyc_mask-74.5_-72.8_40.5_41.8.png'


def _select_within_boundingbox(df, BB=NYC_BOUNDING_BOX):
    """Return a boolean Series that is True where pickup and dropoff both lie inside BB (inclusive)."""
    return (df.pickup_longitude.between(BB[0], BB[1])
            & df.pickup_latitude.between(BB[2], BB[3])
            & df.dropoff_longitude.between(BB[0], BB[1])
            & df.dropoff_latitude.between(BB[2], BB[3]))


def _lonlat_to_xy(longitude, latitude, dx, dy, BB):
    """Map lon/lat Series to integer (column, row) index arrays of a mask with dy rows and dx columns."""
    x = (dx * (longitude - BB[0]) / (BB[1] - BB[0])).astype('int').to_numpy()
    # Image rows grow downwards while latitude grows upwards, hence the flip.
    y = (dy - dy * (latitude - BB[2]) / (BB[3] - BB[2])).astype('int').to_numpy()
    # A point exactly on the east (lon == BB[1]) or south (lat == BB[2]) edge maps to
    # index dx / dy, one past the last pixel; clamp so the mask lookup cannot go out of range.
    return np.clip(x, 0, dx - 1), np.clip(y, 0, dy - 1)


def _load_nyc_mask():
    """Download the mask image and return a 2-D boolean array with land = True, water = False."""
    # NOTE(review): newer matplotlib releases may refuse URLs in imread -- confirm against the
    # installed version, or download the PNG once and read it from disk.
    return plt.imread(NYC_MASK_URL)[:, :, 0] > 0.9


def _remove_datapoints_from_water(df, nyc_mask, BB=NYC_BOUNDING_BOX):
    """Keep only rows whose pickup and dropoff pixels are both marked as land in nyc_mask."""
    n_rows, n_cols = nyc_mask.shape[0], nyc_mask.shape[1]
    pickup_x, pickup_y = _lonlat_to_xy(df.pickup_longitude, df.pickup_latitude, n_cols, n_rows, BB)
    dropoff_x, dropoff_y = _lonlat_to_xy(df.dropoff_longitude, df.dropoff_latitude, n_cols, n_rows, BB)
    on_land = nyc_mask[pickup_y, pickup_x] & nyc_mask[dropoff_y, dropoff_x]
    return df[on_land]


def preprocess_df(df, nyc_mask=None):
    """Clean a taxi-fare frame and derive time features from the pickup time.

    Steps, in order:
      1. keep rows with fare_amount > 0;
      2. drop rows containing any missing value;
      3. keep rows whose pickup and dropoff lie inside NYC_BOUNDING_BOX;
      4. keep rows whose pickup and dropoff both fall on land according to the mask;
      5. add weekday (0 = Monday), zero-based month (0 = January) and hour columns;
      6. drop the pickup_datetime and key columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain key, fare_amount, pickup_datetime (datetime dtype) and the four
        pickup/dropoff longitude/latitude columns. The caller's frame is not modified.
    nyc_mask : 2-D boolean array, optional
        Land (True) / water (False) mask covering NYC_BOUNDING_BOX. When omitted, the mask
        is downloaded from NYC_MASK_URL on every call; pass it in to avoid repeated downloads.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the surviving rows and the derived columns.
    """
    # Remove non-positive fares.
    df = df[df['fare_amount'] > 0]

    # Remove missing values. dropna returns a new frame, so the result must be assigned.
    df = df.dropna(how='any', axis='rows')

    # Remove location outliers outside the bounding box.
    df = df[_select_within_boundingbox(df)]

    # Remove trips that start or end in the water.
    if nyc_mask is None:
        nyc_mask = _load_nyc_mask()
    df = _remove_datapoints_from_water(df, np.asarray(nyc_mask, dtype=bool))

    # Derive time features with the vectorised .dt accessor. assign() builds a new frame,
    # so no column is ever written onto a filtered slice.
    pickup = df['pickup_datetime']
    df = df.assign(
        weekday=pickup.dt.weekday,
        month=pickup.dt.month - 1,
        hour=pickup.dt.hour,
    )

    # The raw timestamp and the row key are no longer needed as features.
    return df.drop(columns=['pickup_datetime', 'key'])
    

In [39]:
TRAIN_FILE = './new-york-city-taxi-fare-prediction/train.csv'
# Load a 10,000-row sample and print the row count before and after cleaning.
df = pd.read_csv(
    TRAIN_FILE,
    parse_dates=['pickup_datetime'],
    nrows=10_000,
)
print(df.shape[0])
df = preprocess_df(df)
print(df.shape[0])

10000
9786


In [40]:
# Summary statistics after preprocessing.
# NOTE(review): the saved output shows month ranging over 1..12, while preprocess_df stores
# month - 1; the output was probably produced by an earlier version of the function -- re-run
# to refresh.
df.describe()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,weekday,month,hour
count,9786.0,9786.0,9786.0,9786.0,9786.0,9786.0,9786.0,9786.0,9786.0
mean,11.230906,-73.97469,40.7516,-73.973919,40.75228,1.646434,3.022583,6.196301,13.523299
std,9.512217,0.042859,0.032753,0.039331,0.033841,1.272349,1.974027,3.425802,6.497538
min,0.01,-74.438233,40.500046,-74.429332,40.500046,0.0,0.0,1.0,0.0
25%,6.0,-73.992251,40.736182,-73.991285,40.736762,1.0,1.0,3.0,9.0
50%,8.5,-73.982091,40.753456,-73.980515,40.754497,1.0,3.0,6.0,14.0
75%,12.5,-73.968285,40.768086,-73.965391,40.768603,2.0,5.0,9.0,19.0
max,180.0,-73.137393,41.366138,-73.137393,41.366138,6.0,6.0,12.0,23.0


In [41]:
# Column dtypes after preprocessing; key and pickup_datetime no longer appear in the saved output.
df.dtypes

fare_amount          float64
pickup_longitude     float64
pickup_latitude      float64
dropoff_longitude    float64
dropoff_latitude     float64
passenger_count        int64
weekday                int64
month                  int64
hour                   int64
dtype: object