In [25]:
## This file reads the source data and determines whether pickup or drop-off locations are near NYC airports.
## It creates three additional fields with airport-related tags and saves these 3 fields to a file.
## The distance calculation function is defined in the helper.ipynb/py file.

In [1]:
import pandas as pd
import time
import datetime

import helper

In [3]:
## Read the source trip records from CSV into a data frame
file_name = '../data/green_tripdata_2015-09.csv'
df = pd.read_csv(filepath_or_buffer=file_name)
# report (row count, column count) of the loaded frame
print(df.shape)

(1494926, 21)


In [5]:
## Define airport parameters
# For each airport: a reference latitude/longitude (looked up on Google Maps)
# and a radius. Any point that falls within the radius of the reference
# location is tagged with that airport.

airports = ["JFK", "LGA", "EWR"]

# Reference coordinates, keyed by airport code
lat = {
    "JFK": 40.647814,
    "LGA": 40.777096,
    "EWR": 40.692335,
}
lon = {
    "JFK": -73.786415,
    "LGA": -73.863381,
    "EWR": -74.182119,
}

# Tagging radius per airport, in miles
# NOTE(review): 1.1/1.6 looks like a rough conversion of 1.1 km to miles -- confirm
radius = {
    "JFK": 1,
    "LGA": 1.1/1.6,
    "EWR": 0.5,
}

# Trip endpoints; used as prefixes when building column names
pickdrop = ["Pickup", "Dropoff"]

# Empty frame that will receive one distance column per (endpoint, airport) pair
tdf = pd.DataFrame()

In [7]:
## Compute, for every trip, the distance from each endpoint to each airport
# One column is written to tdf per (endpoint, airport) pair, named "<endpoint>_to_<airport>".
# NOTE(review): the row-wise apply is slow (see the timings printed below); if
# helper.find_distance accepts whole columns, passing them directly would be faster -- confirm
for a in airports:
    for p in pickdrop:
        tstart = time.time()

        df_field = p + "_to_" + a
        lat_field = p + "_latitude"
        lon_field = p + "_longitude"

        def distance_to_airport(r):
            # r is one row of df; measure from its endpoint to the current airport
            return helper.find_distance(r[lat_field], r[lon_field], lat[a], lon[a])

        tdf[df_field] = df.apply(distance_to_airport, axis=1)

        elapsed = datetime.timedelta(seconds=time.time() - tstart)
        print("Time for " + a + " " + p, str(elapsed))

Time for JFK Pickup 0:02:48.722032
Time for JFK Dropoff 0:02:48.277461
Time for LGA Pickup 0:02:48.923894
Time for LGA Dropoff 0:02:47.610286
Time for EWR Pickup 0:02:48.122049
Time for EWR Dropoff 0:02:48.795221


In [11]:
# A point is "near" an airport when its distance is strictly less than that airport's radius.
# near_airport maps "<endpoint>_to_<airport>" to a list of booleans, one per trip.
near_airport = {
    p + "_to_" + a: [d < radius[a] for d in tdf[p + "_to_" + a]]
    for a in airports
    for p in pickdrop
}

In [12]:
## airport_tag holds one combined tag per trip; None means "no tag yet"
airport_tag = [None] * len(df)

## a small function to create airport tags
def create_tags(x, a, p, tf):
    """Extend an accumulated tag with ``a + p`` when ``tf`` is truthy.

    Parameters
    ----------
    x : str or None
        The tag built so far; ``None`` means nothing has been tagged yet.
    a : str
        First part of the new tag (the airport code in this notebook).
    p : str
        Second part of the new tag, concatenated directly after ``a``
        (may be an empty string).
    tf : bool
        Whether the new tag applies.

    Returns
    -------
    str or None
        ``x`` unchanged when ``tf`` is falsy; ``a + p`` when ``x`` is None;
        otherwise ``x``, a single space, then ``a + p``.
    """
    if not tf:
        return x

    tag = a + p
    # Identity check rather than ``== None``: only a genuine None means
    # "no tag yet", and ``is`` cannot be overridden by a custom __eq__.
    if x is None:
        return tag
    return x + " " + tag

In [13]:
## Build the detailed tag for each trip, e.g. "JFK_Pickup LGA_Dropoff"
# Each pass rebinds airport_tag from its previous value, so re-running this cell
# without first resetting airport_tag to all-None appends the tags a second time.
for a in airports:
    for p in pickdrop:
        near_flags = near_airport[p + "_to_" + a]
        suffix = "_" + p
        airport_tag = [
            create_tags(x, a, suffix, y)
            for x, y in zip(airport_tag, near_flags)
        ]

In [14]:
## Sanity check: how often each combined tag occurs
# (value_counts skips missing values by default, so untagged trips are not listed)
tag_frame = pd.DataFrame(airport_tag)
tag_frame[0].value_counts()

LGA_Dropoff               20736
JFK_Dropoff               12738
EWR_Dropoff                 669
LGA_Pickup LGA_Dropoff      209
JFK_Pickup JFK_Dropoff      200
LGA_Pickup                  145
JFK_Pickup                   87
EWR_Pickup EWR_Dropoff       39
JFK_Dropoff LGA_Pickup       20
JFK_Pickup LGA_Dropoff       18
EWR_Pickup                    2
LGA_Pickup EWR_Dropoff        1
Name: 0, dtype: int64

In [15]:
## Build separate tags for the pickup and the drop-off location of each trip
# Both lists start as all-None and receive the bare airport code (empty suffix)
# whenever the corresponding endpoint is near that airport.
pickup_tag = [None] * len(df)
dropoff_tag = [None] * len(df)

for a in airports:
    near_pickup = near_airport["Pickup_to_" + a]
    near_dropoff = near_airport["Dropoff_to_" + a]
    pickup_tag = [create_tags(x, a, "", y) for x, y in zip(pickup_tag, near_pickup)]
    dropoff_tag = [create_tags(x, a, "", y) for x, y in zip(dropoff_tag, near_dropoff)]

In [22]:
## Collect the three tag lists as columns of one data frame
# Columns are added one at a time so their order is fixed explicitly.
tag_columns = [
    ("Airport_tag", airport_tag),
    ("Pickup_tag", pickup_tag),
    ("Dropoff_tag", dropoff_tag),
]
df_airport = pd.DataFrame()
for col, tags in tag_columns:
    df_airport[col] = tags

## Rows that never received a tag are still missing; label them 'Non Airport'
for col in df_airport.columns.tolist():
    df_airport[col] = df_airport[col].fillna("Non Airport")

In [23]:
# Summarise the result frame (row count, columns, non-null counts, dtypes, memory use)
df_airport.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1494926 entries, 0 to 1494925
Data columns (total 3 columns):
Airport_tag    1494926 non-null object
Pickup_tag     1494926 non-null object
Dropoff_tag    1494926 non-null object
dtypes: object(3)
memory usage: 34.2+ MB


In [24]:
## Write the three tag columns to CSV, without the row index
file_name = '../data/green_tripdata_2015-09_airport.csv'
df_airport.to_csv(path_or_buf=file_name, index=False)