In [1]:
import pandas as pd
import numpy as np
import sys 
import datetime
import os
import matplotlib as plt
import seaborn as sns

%matplotlib inline

# Process the Weekday data

## Averages data for Jan and Feb 2019

* Average weekday non-pooled trip totals 
* Average weekday pooled trip totals
* Average weekday fares
* Average weekday travel time


In [41]:
# Time-of-day (TOD) period codes; each one selects a 'Weekday_<tod>' key in the trip store.
tods = list(range(1, 6))

In [60]:
# Aggregation applied per column when trips are grouped by
# (pickup tract, dropoff tract, month, day): trip attributes are averaged,
# trip counts are summed.
agg = {
    'Trip Seconds': 'mean',
    'Trip Miles': 'mean',
    'Fare': 'mean',
    'Tip': 'mean',
    'Additional Charges': 'mean',
    'Trip Total': 'mean',
    'PRIVATE_TRIPS': 'sum',
    'SHARED_TRIPS': 'sum',
    'Trips Pooled': 'sum',
}


In [4]:
# Build two tables from the 2019 weekday trip records, one time-of-day (TOD) period at a time:
#   df_raw - trip records with a pickup and dropoff tract, plus DAY, PRIVATE_TRIPS,
#            SHARED_TRIPS and TOD columns (not filtered by distance or duration)
#   df_all - one row per (pickup tract, dropoff tract, TOD) holding the daily
#            aggregates (see `agg`) averaged over the days present for that pair
trips_store = 'C:/Workspace/TNC-Demand-Model/Inputs/Chicago Ride-Hailing/Old Files/Chicago_TNC_Trips_20_Incomplete.H5'

# Collect the per-TOD frames and concatenate once after the loop.
# DataFrame.append was removed in pandas 2.0 and re-copies all rows on every call.
raw_frames = []
od_frames = []

for tod in tods:
    print('Working on tod ' + str(tod))
    # NOTE(review): the section heading says "Jan and Feb 2019", but the month filter
    # that used to follow this read is disabled, so every 2019 month in the store is
    # included - confirm which is intended.
    df = pd.read_hdf(trips_store, where = 'YEAR == 2019', key = 'Weekday_' + str(tod))

    # Maxima before filtering, to show how extreme the raw values are.
    print(df['Trip Seconds'].max())
    print(df['Trip Miles'].max())

    df = df.dropna(subset = ['Pickup Census Tract', 'Dropoff Census Tract'])
    df['DAY'] = df['Trip Start Timestamp'].dt.day
    # 0/1 indicators so that summing them yields trip counts.
    df['PRIVATE_TRIPS'] = np.where(df['Shared Trip Authorized'] == False, 1, 0)
    df['SHARED_TRIPS'] = np.where(df['Shared Trip Authorized'] == True, 1, 0)

    # The thresholds are higher than the longest direct trip because a trip could be pooled.
    # Keep trips of at most 50 miles: the longest possible trip is about 35 miles
    # (O'Hare airport to the south-east corner).
    # Keep trips of at most 2 hours (7200 s): that same trip takes about 1 hour.
    # .copy() gives an independent frame so the .loc assignments below do not write to a slice of df.
    df2 = df[(df['Trip Miles'] <= 50) & (df['Trip Seconds'] <= 7200)].copy()

    # Maxima after filtering, to confirm the thresholds were applied.
    print(df2['Trip Seconds'].max())
    print(df2['Trip Miles'].max())

    # Trips recorded in tract 17031770700 are reassigned to tract 17031980000.
    # Per the original author's note these are likely mis-assigned airport trips:
    # one of the two tracts contains O'Hare airport and the other is adjacent to it.
    df2.loc[df2['Pickup Census Tract'] == 17031770700, 'Pickup Census Tract'] = 17031980000
    df2.loc[df2['Dropoff Census Tract'] == 17031770700, 'Dropoff Census Tract'] = 17031980000

    # Trips in tract 17031810502 are likely mis-assigned, but it is not clear which tract
    # they belong to. Per the original author's note there is only one such trip, so it is dropped.
    df2 = df2[df2['Pickup Census Tract'] != 17031810502]
    df2 = df2[df2['Dropoff Census Tract'] != 17031810502]

    # Step 1: aggregate trips to one row per OD pair per calendar day.
    df2 = df2.groupby(by= ['Pickup Census Tract','Dropoff Census Tract','MONTH','DAY'], as_index =False).agg(agg)
    # Step 2: average the daily rows per OD pair. Only days on which the pair has at least
    # one trip contribute (days with no trips are not counted as zeros), and the MONTH and
    # DAY columns are averaged along with everything else.
    # NOTE(review): confirm this is the intended definition of an "average weekday".
    df2 = df2.groupby(by= ['Pickup Census Tract','Dropoff Census Tract'], as_index =False).mean()

    df['TOD'] = tod
    df2['TOD'] = tod

    raw_frames.append(df)
    od_frames.append(df2)

# pd.concat raises on an empty list, so fall back to an empty frame as the original code did.
df_raw = pd.concat(raw_frames) if raw_frames else pd.DataFrame()
df_all = pd.concat(od_frames) if od_frames else pd.DataFrame()

Working on tod 1
55117.0
296.9
7158.0
49.8
Working on tod 2
29520.0
205.2
7093.0
48.5
Working on tod 3
78780.0
266.1
7198.0
49.5
Working on tod 4
27660.0
335.5
7003.0
49.8
Working on tod 5
80520.0
304.7
7186.0
49.6


## Add in the Suppressed Trips

In [5]:
# Load the 2019 suppressed-trips table that is merged onto the aggregated trips.
sup_trips = pd.read_csv(
    'C:/Workspace/TNC-Demand-Model/Inputs/Chicago Ride-Hailing/2019 Suppressed Trips.csv'
)

In [8]:
# Attach the scaled suppressed-trip counts to each aggregated (pickup, dropoff, TOD) row.
# This is a left join: rows of df_all without a match get NaN in the SCALED_SUP_* columns,
# and rows that exist only in sup_trips are not carried over.
df_final = pd.merge(
    df_all,
    sup_trips[['GEOID_PICKUP', 'GEOID_DROPOFF', 'SCALED_SUP_PRIVATE_TRIPS', 'SCALED_SUP_SHARED_TRIPS', 'TOD']],
    how='left',
    left_on=['Pickup Census Tract', 'Dropoff Census Tract', 'TOD'],
    right_on=['GEOID_PICKUP', 'GEOID_DROPOFF', 'TOD'],
)

In [10]:
# Add the scaled suppressed trips to the observed trip counts.
# The suppressed columns come from a left merge, so they are NaN for OD pairs that have no
# suppressed trips. Treat those as 0; otherwise the sum becomes NaN and the observed trips
# for that pair are lost when NaNs are later replaced with 0.
# Not idempotent: re-running this cell adds the suppressed trips again, so re-run the
# merge that creates df_final first.
df_final['SHARED_TRIPS'] = df_final['SHARED_TRIPS'] + df_final['SCALED_SUP_SHARED_TRIPS'].fillna(0)
df_final['PRIVATE_TRIPS'] = df_final['PRIVATE_TRIPS'] + df_final['SCALED_SUP_PRIVATE_TRIPS'].fillna(0)
df_final['ALL_TRIPS'] = df_final['PRIVATE_TRIPS'] + df_final['SHARED_TRIPS']

# Merge RH Data to Empty Chicago OD Matrix

In [13]:
# Load the empty Chicago origin-destination matrix that the ride-hail data is merged onto.
# The path is relative to the notebook's working directory.
empty = pd.read_csv(
    'Inputs/Chicago Ride-Hailing/Empty_Chicago_Matrix.csv'
)

In [33]:
# One way to convert the Chicago data file's tract column from a float to an int; saved for later use.

#test = df_final['Pickup Census Tract'].astype(str)

#int(test[0][:-2])

In [19]:
# Cast the matrix's tract ID columns to float before the merge on tract IDs
# (presumably to match the dtype of the ride-hail tract columns - confirm).
empty['DESTINATION'] = empty['DESTINATION'].astype(float)
empty['ORIGIN'] = empty['ORIGIN'].astype(float)

In [21]:
# Left-join the ride-hail aggregates onto the empty OD matrix so every
# (ORIGIN, DESTINATION, TOD) row of the matrix is kept, with NaN where there is no data.
rh_final = pd.merge(
    empty,
    df_final,
    how='left',
    left_on=['ORIGIN', 'DESTINATION', 'TOD'],
    right_on=['Pickup Census Tract', 'Dropoff Census Tract', 'TOD'],
)

In [37]:
# Replace every NaN with 0. This applies to all columns, including the tract ID
# columns of matrix rows that had no ride-hail match.
rh_final = rh_final.fillna(value=0)

In [None]:
# Square root of the number of rows per TOD.
# NOTE(review): presumably a check that each TOD holds a full square OD matrix,
# in which case the value is the number of tracts - confirm.
np.sqrt(rh_final['TOD'].value_counts())

In [38]:
# Write the merged weekday ride-hail matrix to disk (the index is written as the first column).
rh_final.to_csv(path_or_buf='Outputs/2019_Weekday_Ridehail.csv')


In [None]:
# Write the trip-level records (without suppressed trips) to disk.
df_raw.to_csv(path_or_buf='Inputs/Chicago Ride-Hailing/2019_Trip_Records_No_Suppressed.csv')

# Data Accuracy Check

In [None]:
# Load the weekday trips stored under key 'Weekday_3' for the accuracy checks.
# This rebinds `df`, and reads from a different store file than the processing loop uses.
df = pd.read_hdf(
    'C:/Workspace/TNC-Demand-Model/Inputs/Chicago Ride-Hailing/Chicago_TNC_Trips_20.H5',
    key='Weekday_3',
)

In [14]:
# Keep only trips whose MONTH is 1 or 2.
jan_feb = df.loc[df['MONTH'].isin([1, 2])]

In [16]:
# Trips longer than 50 miles, the distance cutoff used when filtering trips.
miles_over_50 = jan_feb.loc[jan_feb['Trip Miles'].gt(50)]

In [17]:
# Number of trips longer than 50 miles.
miles_over_50.shape[0]

77

In [18]:
# Save the over-50-mile trips for manual inspection.
miles_over_50.to_csv(path_or_buf='Trips_Over_50_Miles.csv')

In [19]:
# Trips whose Fare is exactly 0.
fare_0 = jan_feb.loc[jan_feb['Fare'].eq(0)]

In [39]:
# Save the zero-fare trips for manual inspection.
fare_0.to_csv(path_or_buf='Fare_0_trips.csv')

In [41]:
# Look up trips by exact start timestamp, pickup tract and dropoff tract.
jan_feb[
    (jan_feb['Trip Start Timestamp'] == '2019-01-15 08:30:00')
    & (jan_feb['Pickup Census Tract'] == 17031839700)
    & (jan_feb['Dropoff Census Tract'] == 17031320100)
]

Unnamed: 0,Trip Start Timestamp,Trip End Timestamp,Trip Seconds,Trip Miles,Pickup Census Tract,Dropoff Census Tract,Fare,Tip,Additional Charges,Trip Total,Shared Trip Authorized,Trips Pooled,YEAR,MONTH,DOW,HOUR
9133667,2019-01-15 08:30:00,2019-01-15 09:00:00,1679.0,5.2,17031840000.0,17031320000.0,0.0,0.0,0.67,0.67,True,2.0,2019,1,1,8


In [42]:
# Preview the first five over-50-mile trips.
miles_over_50.head(n=5)

Unnamed: 0,Trip Start Timestamp,Trip End Timestamp,Trip Seconds,Trip Miles,Pickup Census Tract,Dropoff Census Tract,Fare,Tip,Additional Charges,Trip Total,Shared Trip Authorized,Trips Pooled,YEAR,MONTH,DOW,HOUR
9285742,2019-01-24 08:15:00,2019-01-24 10:30:00,8068.0,58.1,17031320000.0,17031840000.0,152.5,0.0,2.55,155.05,False,1.0,2019,1,3,8
9764265,2019-01-01 08:45:00,2019-01-01 10:45:00,7752.0,90.7,17031080000.0,17031080000.0,112.5,21.0,5.5,139.0,False,1.0,2019,1,1,8
10872560,2019-02-21 07:45:00,2019-02-21 09:45:00,7645.0,51.7,17031840000.0,17031840000.0,80.0,11.0,3.3,94.3,False,1.0,2019,2,3,7
11963121,2019-02-01 08:15:00,2019-02-01 10:00:00,6133.0,53.9,17031840000.0,17031280000.0,75.0,0.0,2.55,77.55,False,1.0,2019,2,4,8
12646541,2019-01-25 08:30:00,2019-01-25 10:30:00,7135.0,83.4,17031080000.0,17031830000.0,102.5,0.0,2.55,105.05,False,1.0,2019,1,4,8


In [45]:
# Trips whose Trip Total is exactly 0.
jan_feb.loc[jan_feb['Trip Total'].eq(0)]

Unnamed: 0,Trip Start Timestamp,Trip End Timestamp,Trip Seconds,Trip Miles,Pickup Census Tract,Dropoff Census Tract,Fare,Tip,Additional Charges,Trip Total,Shared Trip Authorized,Trips Pooled,YEAR,MONTH,DOW,HOUR
30217883,2019-01-28 07:30:00,2019-01-28 07:45:00,959.0,2.0,17031080000.0,17031070000.0,0.0,0.0,0.0,0.0,True,6.0,2019,1,0,7


In [44]:
# Number of trips whose Fare is exactly 0.
int(jan_feb['Fare'].eq(0).sum())

32806

In [47]:
# Trips lasting 5 hours (18000 seconds) or more.
trips_5hrs_moore = jan_feb.loc[jan_feb['Trip Seconds'].ge(18000)]

In [48]:
# Save the trips of 5 hours or more for manual inspection.
trips_5hrs_moore.to_csv(path_or_buf='trips_over_5hrs.csv')