In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [3]:
# Load only the trip id plus the pickup / drop-off centroid coordinates.
df_train = pd.read_csv(
    '../Data/taxi_chicago_filter/part-00000-44c385a4-f36a-431e-80a9-ec676819f97e-c000.csv',
    usecols=[
        'trip_id',
        'pickup_centroid_latitude',
        'pickup_centroid_longitude',
        'dropoff_centroid_latitude',
        'dropoff_centroid_longitude',
    ],
)

In [4]:
# Number of trips loaded.
df_train.shape[0]

22792581

In [5]:
# Load the Chicago basemap and the bounding box it covers.
# BB must be ordered (lon_min, lon_max, lat_min, lat_max): plot_on_map hands it
# to imshow as `extent` (left, right, bottom, top) and uses BB[2], BB[3] as the
# y-limits, while lonlat_to_xy maps BB[3] to pixel row 0 (the top of the image).
# Listing the northern latitude first would invert the y-axis and mirror every
# latitude -> pixel-row lookup.
BB = (-87.95, -87.5306, 41.66, 42.03)
chicago_map = plt.imread('../Data/maps/chicago_map.png')

In [None]:
def plot_on_map(df, BB, nyc_map, s=10, alpha=0.2):
    """Scatter pickup and drop-off centroids side by side on top of a basemap.

    Parameters
    ----------
    df : object exposing ``pickup_centroid_longitude``,
        ``pickup_centroid_latitude``, ``dropoff_centroid_longitude`` and
        ``dropoff_centroid_latitude`` attributes (e.g. a DataFrame).
    BB : 4-item sequence. ``BB[0], BB[1]`` become the x-limits,
        ``BB[2], BB[3]`` the y-limits, and the whole sequence is passed to
        ``imshow`` as ``extent``.
    nyc_map : image array drawn beneath the points.
    s, alpha : marker size and opacity forwarded to ``scatter``.
    """
    panels = (
        ('Pickup locations', 'pickup_centroid_longitude', 'pickup_centroid_latitude'),
        ('Dropoff locations', 'dropoff_centroid_longitude', 'dropoff_centroid_latitude'),
    )
    _, axes = plt.subplots(1, 2, figsize=(16, 10))

    # One panel per trip endpoint: points first (zorder=1), basemap underneath (zorder=0).
    for ax, (title, lon_name, lat_name) in zip(axes, panels):
        ax.scatter(getattr(df, lon_name), getattr(df, lat_name),
                   zorder=1, alpha=alpha, c='r', s=s)
        ax.set_xlim((BB[0], BB[1]))
        ax.set_ylim((BB[2], BB[3]))
        ax.set_title(title)
        ax.imshow(nyc_map, zorder=0, extent=BB)

In [None]:
# Draw every pickup / drop-off centroid of the unfiltered data over the basemap.
plot_on_map(df_train, BB, chicago_map, alpha=0.3, s=1)

In [None]:
# Read the edited Chicago map and threshold its first (red) channel at 0.8 to
# get a boolean mask. Documented convention: land = True, water = False
# (assumes land is the bright part of that channel — TODO confirm against the image).
chicago_mask = plt.imread('../Data/maps/chicago_edited_map.png')[:, :, 0] > 0.8

# Draw the basemap, then the mask above it (zorder=1); with alpha=1 the mask is
# opaque. The two mask values are coloured by whatever colormap is active.
plt.figure(figsize=(8, 8))
plt.imshow(chicago_map, zorder=0)
plt.imshow(chicago_mask, alpha=1, zorder=1);

In [None]:
# translate longitude/latitude coordinate into image xy coordinate
def lonlat_to_xy(longitude, latitude, dx, dy, BB):
    """Convert longitude/latitude arrays into integer image pixel indices.

    Parameters
    ----------
    longitude, latitude : array-like objects supporting arithmetic,
        ``.astype`` and ``.clip`` (numpy arrays or pandas Series).
    dx, dy : image width and height in pixels.
    BB : bounding box of the image. ``BB[0]``/``BB[1]`` are the longitudes of
        the left/right edges; ``BB[2]``/``BB[3]`` are the latitudes of the
        bottom/top edges, so ``latitude == BB[3]`` lands on row 0.

    Returns
    -------
    (x, y) : column and row indices, same container type as the inputs,
        each limited to ``[0, dx - 1]`` / ``[0, dy - 1]``.
    """
    x = (dx * (longitude - BB[0]) / (BB[1] - BB[0])).astype('int')
    y = (dy - dy * (latitude - BB[2]) / (BB[3] - BB[2])).astype('int')
    # A point exactly on the right/bottom edge evaluates to dx/dy, one past the
    # last valid index, and points outside the box go negative (which numpy
    # would silently wrap around). Clamp both to the nearest edge pixel.
    return x.clip(0, dx - 1), y.clip(0, dy - 1)

In [None]:
# Project every pickup and drop-off coordinate onto the mask's pixel grid.
mask_height, mask_width = chicago_mask.shape
pickup_x, pickup_y = lonlat_to_xy(
    df_train.pickup_centroid_longitude,
    df_train.pickup_centroid_latitude,
    mask_width, mask_height, BB,
)
dropoff_x, dropoff_y = lonlat_to_xy(
    df_train.dropoff_centroid_longitude,
    df_train.dropoff_centroid_latitude,
    mask_width, mask_height, BB,
)

In [None]:
# idx is True where the mask is True under BOTH the pickup and the drop-off pixel.
# NOTE(review): the mask is documented as land = True / water = False; if that
# holds, the number printed below counts trips fully on land, not trips in
# water (that would be np.sum(~idx)). Confirm the mask polarity before relying on it.
pickup_flag = chicago_mask[pickup_y, pickup_x]
dropoff_flag = chicago_mask[dropoff_y, dropoff_x]
idx = (pickup_flag & dropoff_flag)
print("Number of trips in water: {}".format(np.sum(idx)))

In [None]:
def remove_datapoints_from_water(df, BB=(-87.95, -87.5306, 41.66, 42.03), mask=None):
    """Filter trips using a boolean map mask sampled at both trip endpoints.

    Parameters
    ----------
    df : DataFrame with ``pickup_centroid_longitude``,
        ``pickup_centroid_latitude``, ``dropoff_centroid_longitude`` and
        ``dropoff_centroid_latitude`` columns.
    BB : bounding box covered by the mask, ordered
        ``(lon_min, lon_max, lat_min, lat_max)``. The latitudes must be
        (south, north): the row formula below puts ``BB[3]`` on row 0, the top
        of the image.
    mask : optional 2-D boolean array. When omitted, the mask is read from
        ``../Data/maps/chicago_edited_map.png`` by thresholding its first
        channel at 0.8.

    Returns
    -------
    The rows of ``df`` for which the mask is NOT True at both the pickup and
    the drop-off pixel.

    NOTE(review): the mask is documented as land = True / water = False. If
    that holds, ``df[~idx]`` keeps the trips that touch water and drops the
    trips on land, the opposite of what the function name suggests (it would
    need ``df[idx]``). The selection is left unchanged here; confirm the mask
    polarity against the image.
    """
    def lonlat_to_xy(longitude, latitude, dx, dy, BB):
        x = (dx * (longitude - BB[0]) / (BB[1] - BB[0])).astype('int')
        y = (dy - dy * (latitude - BB[2]) / (BB[3] - BB[2])).astype('int')
        # Edge points evaluate to dx/dy (out of range) and points outside the
        # box go negative (silently wrapped by numpy): clamp to valid pixels.
        return x.clip(0, dx - 1), y.clip(0, dy - 1)

    if mask is None:
        # Boolean map derived from the first channel of the edited Chicago map.
        mask = plt.imread('../Data/maps/chicago_edited_map.png')[:, :, 0] > 0.8

    height, width = mask.shape[0], mask.shape[1]

    # Pixel coordinates of every pickup and drop-off point in the mask image.
    pickup_x, pickup_y = lonlat_to_xy(df.pickup_centroid_longitude,
                                      df.pickup_centroid_latitude,
                                      width, height, BB)
    dropoff_x, dropoff_y = lonlat_to_xy(df.dropoff_centroid_longitude,
                                        df.dropoff_centroid_latitude,
                                        width, height, BB)

    # True where the mask is set under both endpoints of the trip.
    idx = (mask[np.asarray(pickup_y), np.asarray(pickup_x)]
           & mask[np.asarray(dropoff_y), np.asarray(dropoff_x)])

    return df[~idx]

In [None]:
# Row count before filtering, for comparison with the filtered frame.
df_train.shape[0]

In [None]:
# Apply the mask-based filter to the full training frame.
df_train1 = remove_datapoints_from_water(df=df_train)

In [None]:
# Row count after filtering.
df_train1.shape[0]

In [None]:
# Re-plot the filtered trips over the basemap to inspect the effect of the filter.
plot_on_map(df_train1, BB, chicago_map, alpha=0.3, s=1)