In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# plt.rcParams['figure.figsize'] = (14,5)
# plt.rcParams["font.family"] = "monospace"
# plt.rcParams['axes.edgecolor'] = 'black'

# plt.rcParams['figure.frameon'] = True
# plt.rcParams['axes.linewidth'] = 1.5

import warnings
warnings.filterwarnings("ignore")

In [3]:
cols = ['fare_amount', 
        'pickup_datetime', 
        'pickup_longitude', 
        'pickup_latitude', 
        'dropoff_longitude', 
        'dropoff_latitude', 
        'passenger_count']

# Set columns to most suitable type to optimize for memory usage
dtypes = {'fare_amount': np.float32,
              'pickup_datetime': 'str', 
              'pickup_longitude': np.float32,
              'pickup_latitude': np.float32,
              'dropoff_longitude': np.float32,
              'dropoff_latitude': np.float32,
              'passenger_count': np.uint8}


In [4]:
# %%time
# df = pd.read_csv('train.csv', usecols=cols, dtype=dtypes, nrows=20_000_000)

Wall time: 1min 1s


In [5]:
# %%time
# df.to_feather('nyc_taxi_data_raw.feather')

Wall time: 7.7 s


In [6]:
del df

In [7]:
#!pip install pyarrow

In [8]:
%%time
train = pd.read_feather('nyc_taxi_data_raw.feather')

Wall time: 13.7 s


In [9]:
print('Train Size:', train.shape)

Train Size: (20000000, 7)


In [10]:
test = pd.read_csv('test.csv', dtype=dtypes)

In [11]:
print('Test Size:', test.shape)

Test Size: (9914, 7)


In [12]:
train['pickup_datetime'] = train['pickup_datetime'].str.replace('UTC', '')
train['pickup_datetime'] = pd.to_datetime(train['pickup_datetime'])

In [13]:
test['pickup_datetime'] = test['pickup_datetime'].str.replace('UTC', '')
test['pickup_datetime'] = pd.to_datetime(test['pickup_datetime'])

In [14]:
train['day'] = train['pickup_datetime'].dt.day
train['hour'] = train['pickup_datetime'].dt.hour
train['year'] = train['pickup_datetime'].dt.year
train["day_of_week"] = train['pickup_datetime'].dt.dayofweek
train['is_weekend'] = train['day_of_week'].apply(lambda x:1 if x>4 else 0)

In [15]:
for i in ['day', 'hour','day_of_week']:
    train[i] = train[i].astype(np.uint8)

In [16]:
train['year'] = train.year.astype(np.uint16)

In [17]:
test['day'] = test['pickup_datetime'].dt.day
test['hour'] = test['pickup_datetime'].dt.hour
test['year'] = test['pickup_datetime'].dt.year
test["day_of_week"] = test['pickup_datetime'].dt.dayofweek
test['is_weekend'] = test['day_of_week'].apply(lambda x:1 if x>4 else 0)

In [18]:
for i in ['day', 'hour','day_of_week']:
    test[i] = test[i].astype(np.uint8)

In [19]:
def is_night(x):
    if x >= 6 and x <= 20:
        return 0
    else:
        return 1

In [20]:
train['is_night'] = train.hour.apply(lambda x:is_night(x))

In [21]:
train['is_night'] = train.is_night.astype('bool')

In [22]:
train['is_weekend'] = train.is_weekend.astype('bool')

In [23]:
test['is_weekend'] = test.is_weekend.astype('bool')

In [24]:
test['is_night'] = test.hour.apply(lambda x:is_night(x))

In [25]:
test['is_night'] = test.is_night.astype(np.bool)

In [26]:
train.drop('pickup_datetime', axis=1, inplace=True)

In [27]:
#train.describe()

In [28]:
print('Number of negative fare amount:',train[train['fare_amount']<0].shape[0])

# -----------------------------------------
idx = train['fare_amount']>=2.5
train = train[idx] # Selecting rows >= $2.5 on fare amount

Number of negative fare amount: 832


In [29]:
# # create a an fare amount category attribute with five bins to understand better this attribute
train["fare_amount_hist"]=pd.cut(train["fare_amount"],
                            bins=[0., 6.0,12.,48.,150., np.inf],
                                  labels = [1,2,3,4,5])
# plt.figure(figsize=(16,4))
# ax = train["fare_amount_hist"].hist()

# ax.set_xticks(range(1,6), labels = [1,2,3,4,5])
# sns.despine()

In [30]:
train['fare_amount_hist'].value_counts()

2    9438851
1    5125317
3    5046647
4     385568
5       1936
Name: fare_amount_hist, dtype: int64

In [31]:
train = train[train['fare_amount_hist']<=4] # Selecting rows <= $150

In [32]:
train = train.drop('fare_amount_hist', axis=1)

In [33]:
# plt.figure(figsize=(16,4))
# sns.histplot(x='fare_amount',data=train,stat='count',kde=True,color='red')
# plt.ylabel('fare amount')
# sns.despine()
# plt.xlabel('');

In [34]:
coordinates_columns = ['pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude']
for coord in coordinates_columns:
    maxi = train[coord].max()
    mini = train[coord].min()
    print ("Range of {} is : ({:.3f}, {:.3f})".format(coord, maxi, mini))

Range of pickup_latitude is : (3406.008, -3492.264)
Range of pickup_longitude is : (3457.626, -3439.245)
Range of dropoff_latitude is : (3400.392, -3493.652)
Range of dropoff_longitude is : (3457.622, -3442.025)


In [35]:
coordinates = {'min_long': min(test.pickup_longitude.min(), test.dropoff_longitude.min()),
              'max_long': max(test.pickup_longitude.max(), test.dropoff_longitude.max()),
              'min_lat': min(test.pickup_latitude.min(), test.dropoff_latitude.min()),
              'max_lat' : min(test.pickup_latitude.max(), test.dropoff_latitude.max()),}

In [36]:
train = train[(train.pickup_longitude >= coordinates['min_long']) & 
      (train.pickup_longitude <= coordinates['max_long']) & 
      (train.pickup_latitude >= coordinates['min_lat']) & 
      (train.pickup_latitude <= coordinates['max_lat']) &
      (train.dropoff_longitude >= coordinates['min_long']) &
      (train.dropoff_longitude <= coordinates['max_long']) &
      (train.dropoff_latitude >= coordinates['min_lat']) & 
      (train.dropoff_latitude<= coordinates['max_lat'])]

In [37]:
def remove_datapoints_from_water(df):
    def lonlat_to_xy(longitude, latitude, dx, dy, BB):
        return (dx*(longitude - BB[0])/(BB[1]-BB[0])).astype('int'), \
               (dy - dy*(latitude - BB[2])/(BB[3]-BB[2])).astype('int')

    # define bounding box
    BB = (-74.5, -72.8, 40.5, 41.8)
    
    # read nyc mask and turn into boolean map with
    # land = True, water = False
    nyc_mask = plt.imread('nyc_map.jpg')[:,:,0] > 0.9
    
    # calculate for each lon,lat coordinate the xy coordinate in the mask map
    pickup_x, pickup_y = lonlat_to_xy(df.pickup_longitude, df.pickup_latitude, 
                                      nyc_mask.shape[1], nyc_mask.shape[0], BB)
    dropoff_x, dropoff_y = lonlat_to_xy(df.dropoff_longitude, df.dropoff_latitude, 
                                      nyc_mask.shape[1], nyc_mask.shape[0], BB)    
    # calculate boolean index
    idx = nyc_mask[pickup_y, pickup_x] & nyc_mask[dropoff_y, dropoff_x]
    
    idx = (nyc_mask[pickup_y, pickup_x] & nyc_mask[dropoff_y, dropoff_x])
    print("Number of trips in water: {}".format(np.sum(~idx)))
    
    # return only datapoints on land
    return df[idx]

In [38]:
train = remove_datapoints_from_water(train)

Number of trips in water: 2015


In [39]:
# # we will use plt.xlim to limit the axes while plotting , to get a better observation of the data
# city_long_border = (-74.03, -73.75)
# city_lat_border = (40.63, 40.85)

# train.plot(kind ='scatter', x='pickup_longitude', y='pickup_latitude',s=.02, alpha =0.4)
# plt.ylim(city_lat_border)
# plt.xlim(city_long_border)
# sns.despine()

In [40]:
train['passenger_count'].value_counts()

1      13529615
2       2891521
5       1385286
3        860338
4        416450
6        414823
0         68935
9             5
208           4
7             4
8             3
129           1
Name: passenger_count, dtype: int64

In [41]:
# In test set there are no zero passengers.
idx = train['passenger_count'].between(1,6)
train = train[idx]

In [42]:
def dist(pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude):
    pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude = map(np.radians, [pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude])
    dlon = dropoff_longitude - pickup_longitude
    dlat = dropoff_latitude - pickup_latitude
    a = np.sin(dlat/2.0)**2 + np.cos(pickup_latitude) * np.cos(dropoff_latitude) * np.sin(dlon/2.0)**2
    c = 2 * np.arcsin(np.sqrt(a))
    distance = 6367 * c
    return distance

In [43]:
# There are extra charges if trip ends in 3 nearby aiports and 7 nearby counties from the NYC center,
# so these location points and there distances to pickup and dropoff points are key factors 

# Distances to nearby city center, airports, and other NY counties
def transform(data):
    # Distances to nearby airports, city center and other counties
    # By reporting distances to these points, the model can somewhat triangulate other locations of interest
    
    # city center
    nyc = (-74.0060, 40.7128)
    
#     # county
    Nassau = (-73.5594, 40.6546)
    Suffolk = (-72.6151, 40.9849)
    Westchester = (-73.7949, 41.1220)
    Rockland = (-73.9830, 41.1489)
    Dutchess = (-73.7478, 41.7784)
    Orange = (-74.3118, 41.3912)
    Putnam = (-73.7949, 41.4351) 

    # airport
    jfk = (-73.7781, 40.6413)
    ewr = (-74.1745, 40.6895)
    lgr = (-73.8740, 40.7769)
    
    
#     # county
    data['pickup_distance_to_center'] = dist(nyc[0], nyc[1],
                                      data['pickup_longitude'], data['pickup_latitude'])
    data['dropoff_distance_to_center'] = dist(nyc[0], nyc[1],
                                      data['dropoff_longitude'], data['dropoff_latitude'])
    
    data['pickup_distance_to_Nassau'] = dist(Nassau[0], Nassau[1],
                                      data['pickup_longitude'], data['pickup_latitude'])
    data['dropoff_distance_to_Nassau'] = dist(Nassau[0], Nassau[1],
                                      data['dropoff_longitude'], data['dropoff_latitude'])
    
    data['pickup_distance_to_Suffolk'] = dist(Suffolk[0], Suffolk[1],
                                      data['pickup_longitude'], data['pickup_latitude'])
    data['dropoff_distance_to_Suffolk'] = dist(Suffolk[0], Suffolk[1],
                                      data['dropoff_longitude'], data['dropoff_latitude'])
    
    data['pickup_distance_to_Westchester'] = dist(Westchester[0], Westchester[1],
                                      data['pickup_longitude'], data['pickup_latitude'])
    data['dropoff_distance_to_Westchester'] = dist(Westchester[0], Westchester[1],
                                      data['dropoff_longitude'], data['dropoff_latitude'])
    
    data['pickup_distance_to_Rockland'] = dist(Rockland[0], Rockland[1],
                                      data['pickup_longitude'], data['pickup_latitude'])
    data['dropoff_distance_to_Rockland'] = dist(Rockland[0], Rockland[1],
                                      data['dropoff_longitude'], data['dropoff_latitude'])
    
    data['pickup_distance_to_Dutchess'] = dist(Dutchess[0], Dutchess[1],
                                      data['pickup_longitude'], data['pickup_latitude'])
    data['dropoff_distance_to_Dutchess'] = dist(Dutchess[0], Dutchess[1],
                                      data['dropoff_longitude'], data['dropoff_latitude'])
    
    data['pickup_distance_to_Orange'] = dist(Orange[0], Orange[1],
                                      data['pickup_longitude'], data['pickup_latitude'])
    data['dropoff_distance_to_Orange'] = dist(Orange[0], Orange[1],
                                      data['dropoff_longitude'], data['dropoff_latitude'])


    # airports
    data['pickup_distance_to_jfk'] = dist(jfk[0], jfk[1],
                                         data['pickup_longitude'], data['pickup_latitude'])
    data['dropoff_distance_to_jfk'] = dist(jfk[0], jfk[1],
                                           data['dropoff_longitude'], data['dropoff_latitude'])
    
    data['pickup_distance_to_ewr'] = dist(ewr[0], ewr[1], 
                                          data['pickup_longitude'], data['pickup_latitude'])
    data['dropoff_distance_to_ewr'] = dist(ewr[0], ewr[1],
                                           data['dropoff_longitude'], data['dropoff_latitude'])
    
    data['pickup_distance_to_lgr'] = dist(lgr[0], lgr[1],
                                          data['pickup_longitude'], data['pickup_latitude'])
    data['dropoff_distance_to_lgr'] = dist(lgr[0], lgr[1],
                                           data['dropoff_longitude'], data['dropoff_latitude'])
    
    # point distance
    data['distance'] = dist(data['pickup_longitude'], data['pickup_latitude'],
                            data['dropoff_longitude'], data['dropoff_latitude'])
    
    return data

# Apply to both train and test data      
train = transform(train)
test = transform(test)

In [44]:
for i in train.dtypes.index[12:-1]:
    train[i] = train[i].astype(np.float32)

In [46]:
# train.distance.hist(bins=50)
# plt.title('Distance Distribution.')
# sns.despine()

In [47]:
# sns.scatterplot(x='distance', y='fare_amount', data=train, alpha=0.4)
# sns.despine()

From this plot we notice:

<li>There are trips with zero distance but with a non-zero fare. Could this be trips from and to the same location? Predicting these fares will be difficult as there is likely not sufficient information in the dataset.

<li>There are some trips with >50 miles travel distance but low fare. Perhaps these are discounted trips? 
    
<li>The horizontal lines in the right plot might indicate again the fixed fare trips to/from JFK airport.

In [48]:
idx = train['distance']>=.005
train = train[idx]

In [49]:
train['distance'] = train.distance.astype(np.float16)

In [50]:
test['distance'] = test.distance.astype(np.float16)

In [51]:
def calculate_direction(pickup_lat, pickup_lon, dropoff_lat, dropoff_lon):
    """
    Return distance along great radius between pickup and dropoff coordinates.
    """
    #Define earth radius (km)
    R_earth = 6371
    #Convert degrees to radians
    pickup_lat, pickup_lon, dropoff_lat, dropoff_lon = map(np.radians,
                                                             [pickup_lat, pickup_lon, 
                                                              dropoff_lat, dropoff_lon])
    #Compute distances along lat, lon dimensions
    dlat = dropoff_lat - pickup_lat
    dlon = pickup_lon - dropoff_lon
    
    #Compute bearing distance
    direction = np.arctan2(np.sin(dlon * np.cos(dropoff_lat)),np.cos(pickup_lat) * np.sin(dropoff_lat) - np.sin(pickup_lat) * np.cos(dropoff_lat) * np.cos(dlon))
    return direction

In [52]:
train['direction'] = calculate_direction(train['pickup_latitude'].values, train['pickup_longitude'].values, 
                                   train['dropoff_latitude'].values , train['dropoff_longitude'].values) 

In [53]:
train['direction'] = train['direction'].astype(np.float32)

In [54]:
test['direction'] = calculate_direction(test['pickup_latitude'].values, test['pickup_longitude'].values, 
                                   test['dropoff_latitude'].values , test['dropoff_longitude'].values)

In [55]:
# cols = train.corrwith(train['fare_amount']).index
# values = train.corrwith(train['fare_amount']).values

# cols_subset = []
# for col, val in zip(cols, values):
#     if np.abs(val)<=0.05:
#         cols_subset.append(col)

In [56]:
# cols_subset.append('fare_amount')
# cols_subset.append('fare_amount_hist')
# cols_subset # Columns to remove

In [57]:
print(f'Train have {len(train.isna().sum())} missing values.')
train.dropna(inplace=True)

Train have 34 missing values.


In [58]:
# # # Convert them to radian so their range will shink from range of degrees ( 1 -> 360) to range of radians.

train['pickup_latitude'] = train['pickup_latitude'].apply(lambda x: np.radians(x))
train['pickup_longitude'] = train['pickup_longitude'].apply(lambda x: np.radians(x))
train['dropoff_latitude'] = train['dropoff_latitude'].apply(lambda x: np.radians(x))
train['dropoff_longitude'] = train['dropoff_longitude'].apply(lambda x: np.radians(x))

test['pickup_latitude'] = test['pickup_latitude'].apply(lambda x: np.radians(x))
test['pickup_longitude'] = test['pickup_longitude'].apply(lambda x: np.radians(x))
test['dropoff_latitude'] = test['dropoff_latitude'].apply(lambda x: np.radians(x))
test['dropoff_longitude'] = test['dropoff_longitude'].apply(lambda x: np.radians(x))

In [59]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19256775 entries, 0 to 19999999
Data columns (total 34 columns):
 #   Column                           Dtype  
---  ------                           -----  
 0   fare_amount                      float32
 1   pickup_longitude                 float64
 2   pickup_latitude                  float64
 3   dropoff_longitude                float64
 4   dropoff_latitude                 float64
 5   passenger_count                  uint8  
 6   day                              uint8  
 7   hour                             uint8  
 8   year                             uint16 
 9   day_of_week                      uint8  
 10  is_weekend                       bool   
 11  is_night                         bool   
 12  pickup_distance_to_center        float32
 13  dropoff_distance_to_center       float32
 14  pickup_distance_to_Nassau        float32
 15  dropoff_distance_to_Nassau       float32
 16  pickup_distance_to_Suffolk       float32
 17  dropof

## Model Implementation.

In [60]:
from sklearn.model_selection import train_test_split, cross_val_score, RepeatedKFold, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PowerTransformer
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.compose import make_column_transformer

In [61]:
#import xgboost as xgb

In [62]:
X = train.drop(columns = 'fare_amount', axis=1).values
y = train['fare_amount'].values

In [63]:
del train

In [64]:
#X = np.round(X, 4)

In [65]:
X.shape

(19256775, 33)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [None]:
del X
del y

In [None]:
print('Training Set:', X_train.shape)
print('Test Set:', X_test.shape)

In [None]:
data_prep_pipe = make_pipeline(
    PowerTransformer())

In [None]:
# reg = HistGradientBoostingRegressor(max_iter=600, 
#                                     max_depth=20, 
#                                     scoring='neg_root_mean_squared_error' ,
#                                     verbose=1, 
#                                     random_state=0, 
#                                     min_samples_leaf=40)
# final_pipe = make_pipeline(
#                 data_prep_pipe, 
#                 reg )

In [None]:
# cv = RepeatedKFold(n_splits=5, 
#                       n_repeats=3, 
#                       random_state=0)

# results = cross_val_score(final_pipe, 
#                           X_train, 
#                           y_train, 
#                           n_jobs=-1, 
#                           scoring='neg_root_mean_squared_error')

In [None]:
# print(np.abs(np.mean(results).round(3)))

# # 3.151 on 2M Samples.

In [None]:
#final_pipe.fit(X_train, y_train)

In [None]:
# y_pred_train = final_pipe.predict(X_test)
# print(np.sqrt(mean_squared_error(y_test, y_pred_train)))

#3.0698676128442157

In [None]:
# key = test['key']

# test = test.drop(columns = ['fare_amount', 'fare_amount_hist'], axis=1, errors='ignore')
# test.drop(['key', 'pickup_datetime'], axis=1,  errors='ignore', inplace=True)

In [None]:
# y_pred_test = final_pipe.predict(test)
# holdout = pd.DataFrame({'key':key,'fare_amount': y_pred_test})

In [None]:
# holdout.to_csv('submission.csv', index=False)