In [None]:
## This file adds geohash tags to the pickup and dropoff locations 
## Three different precision levels (5, 7, 9) are used to get three different grid sizes. 
## The ones corresponding to precision 5 or 7 could be further grouped into bins and
## used in tip percent prediction model for Q4.
## The ones with precision 7 or 9 could be used in map visualizations in Q5.
## The hashed values are written to the green_tripdata_2015-09_geo.csv file in the data folder
## and the combine_data.ipynb file combines them with the rest.

In [1]:
import pandas as pd
import pygeohash as pgh

In [2]:
## Read the 2015-09 green trip data CSV into a DataFrame and print its
## (rows, columns) shape.
file_name_source = '../data/green_tripdata_2015-09.csv'
df = pd.read_csv(filepath_or_buffer=file_name_source)
print(df.shape)

(1494926, 21)


In [3]:
## Summarize the freshly loaded frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1494926 entries, 0 to 1494925
Data columns (total 21 columns):
VendorID                 1494926 non-null int64
lpep_pickup_datetime     1494926 non-null object
Lpep_dropoff_datetime    1494926 non-null object
Store_and_fwd_flag       1494926 non-null object
RateCodeID               1494926 non-null int64
Pickup_longitude         1494926 non-null float64
Pickup_latitude          1494926 non-null float64
Dropoff_longitude        1494926 non-null float64
Dropoff_latitude         1494926 non-null float64
Passenger_count          1494926 non-null int64
Trip_distance            1494926 non-null float64
Fare_amount              1494926 non-null float64
Extra                    1494926 non-null float64
MTA_tax                  1494926 non-null float64
Tip_amount               1494926 non-null float64
Tolls_amount             1494926 non-null float64
Ehail_fee                0 non-null float64
improvement_surcharge    1494926 non-null float64
To

In [4]:
## Encode pickup and dropoff locations with geohash at 3 precisions: 5, 7, 9.
## For every precision n and every endpoint p in pdstr, three columns are added
## (in this order):
##   <p>_geo_<n> : geohash string of (<p>_latitude, <p>_longitude)
##   <p>_lat_<n> : first element of pgh.decode(geohash)
##   <p>_lon_<n> : second element of pgh.decode(geohash)
pdstr = ["Pickup", "Dropoff"]
for n in [5,7,9]:
    for p in pdstr:
        geo_col = p + "_geo_" + str(n)
        df[geo_col] = [
            pgh.encode(lat, lon, precision=n)
            for lat, lon in zip(df[p + "_latitude"], df[p + "_longitude"])
        ]
        ## Many rows fall into the same cell, so decode each distinct geohash
        ## only once and look the result up per row instead of decoding every row.
        decoded = {gh: pgh.decode(gh) for gh in df[geo_col].unique()}
        df[p + "_lat_" + str(n)] = df[geo_col].map({gh: ll[0] for gh, ll in decoded.items()})
        df[p + "_lon_" + str(n)] = df[geo_col].map({gh: ll[1] for gh, ll in decoded.items()})

In [5]:
## Retain only the location columns: those whose name, up to the first
## underscore, is one of the prefixes listed in pdstr. Everything else is
## removed from df in place.
geo_prefixes = frozenset(pdstr)
non_geo_columns = [col for col in df.columns if col.split("_", 1)[0] not in geo_prefixes]
df.drop(columns=non_geo_columns, inplace=True)

In [6]:
## Check the filtered frame: only the original coordinates and the derived
## geohash / decoded lat-lon columns should remain.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1494926 entries, 0 to 1494925
Data columns (total 22 columns):
Pickup_longitude     1494926 non-null float64
Pickup_latitude      1494926 non-null float64
Dropoff_longitude    1494926 non-null float64
Dropoff_latitude     1494926 non-null float64
Pickup_geo_5         1494926 non-null object
Pickup_lat_5         1494926 non-null float64
Pickup_lon_5         1494926 non-null float64
Dropoff_geo_5        1494926 non-null object
Dropoff_lat_5        1494926 non-null float64
Dropoff_lon_5        1494926 non-null float64
Pickup_geo_7         1494926 non-null object
Pickup_lat_7         1494926 non-null float64
Pickup_lon_7         1494926 non-null float64
Dropoff_geo_7        1494926 non-null object
Dropoff_lat_7        1494926 non-null float64
Dropoff_lon_7        1494926 non-null float64
Pickup_geo_9         1494926 non-null object
Pickup_lat_9         1494926 non-null float64
Pickup_lon_9         1494926 non-null float64
Dropoff_geo_9     

In [10]:
## Write the geohash-tagged frame to CSV, leaving out the row index.
file_name = '../data/green_tripdata_2015-09_geo.csv'
df.to_csv(path_or_buf=file_name, index=False)