In [None]:
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
import geopandas as gp
import numpy as np
# Show up to 500 columns when a wide DataFrame is displayed.
pd.options.display.max_columns = 500

In [None]:
# Load the trip sample.
# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
samp = pd.read_csv(
    filepath_or_buffer = 'C:/Workspace/TNC-Demand-Model/Outputs/Sample.csv',
)

In [None]:
# Keep only trips of at most 120 minutes and at most 50 miles.
samp = samp[
    (samp['TRAVEL_TIME_MINUTES'] <= 120)
    & (samp['TRIP_LENGTH_MILES'] <= 50)
]

In [None]:
# Drop trips flagged as private (SHARED_FLAGGER == 0) that nevertheless report more than one
# pooled trip; the author could not explain these records (~0.1% of all trips).
drop = samp.loc[
    (samp['NUM_TRIPS_POOLED'] > 1) & (samp['SHARED_FLAGGER'] == 0)
].index
samp = samp.drop(index = drop)

In [None]:
# 0/1 indicator columns derived from SHARED_FLAGGER (summed later by the `agg` spec).
samp['PRIVATE_TRIP'] = np.where(samp['SHARED_FLAGGER'].eq(0), 1, 0)
samp['SHARED_TRIP'] = np.where(samp['SHARED_FLAGGER'].eq(1), 1, 0)

In [None]:
# Day of month of the trip start timestamp.
samp['DAY'] = (
    pd.to_datetime(samp['TRIP_START_TIME'])
    .dt.day
)

In [None]:
# Minute-of-hour of the trip start timestamp.
samp['MINUTE'] = (
    pd.to_datetime(samp['TRIP_START_TIME'])
    .dt.minute
)

In [None]:
# Column -> reducer mapping intended for a groupby(...).agg(agg) call.
# Key order is kept as originally written because it drives the output column order.
# NOTE(review): 'INTERNAL_SUPPPRESSED_FLAGGER' (three P's) presumably mirrors the CSV header; verify.
agg = {
    # indicator columns: summed
    'INTERNAL_SUPPPRESSED_FLAGGER': 'sum',
    'MATCHED_TRIP': 'sum',
    'UNMATCHED_TRIP': 'sum',
    # fare, time, length and cost measures: averaged
    'SHARED_FARE': 'mean',
    'PRIVATE_FARE': 'mean',
    'TRAVEL_TIME_MINUTES': 'mean',
    'PRIVATE_TRIP_LENGTH': 'mean',
    'SHARED_TRIP_LENGTH': 'mean',
    'TRIP_LENGTH_MILES': 'mean',
    'FARE': 'mean',
    'TIP': 'mean',
    'ADDITIONAL_CHARGES': 'mean',
    'TOTAL_COST': 'mean',
    # largest pool size seen in the group
    'NUM_TRIPS_POOLED': 'max',
    'SPEED': 'mean',
    'PRIVATE_TRIP': 'sum',
    'SHARED_TRIP': 'sum',
    'PRIVATE_TRAVEL_TIME': 'mean',
    'SHARED_TRAVEL_TIME': 'mean',
    # TOD_* columns: first value in the group
    'TOD_1': 'first',
    'TOD_2': 'first',
    'TOD_3': 'first',
    'TOD_4': 'first',
    'TOD_5': 'first',
}

In [None]:
# Trip length split by service type: the value where the row matches the service, NaN otherwise.
samp['PRIVATE_TRIP_LENGTH'] = np.where(
    samp['SHARED_FLAGGER'].eq(0), samp['TRIP_LENGTH_MILES'], np.nan
)
samp['SHARED_TRIP_LENGTH'] = np.where(
    samp['SHARED_FLAGGER'].eq(1), samp['TRIP_LENGTH_MILES'], np.nan
)

In [None]:
# Fare split by service type: the value where the row matches the service, NaN otherwise.
samp['PRIVATE_FARE'] = np.where(
    samp['SHARED_FLAGGER'].eq(0), samp['FARE'], np.nan
)
samp['SHARED_FARE'] = np.where(
    samp['SHARED_FLAGGER'].eq(1), samp['FARE'], np.nan
)

In [None]:
# Travel time split by service type: the value where the row matches the service, NaN otherwise.
samp['PRIVATE_TRAVEL_TIME'] = np.where(
    samp['SHARED_FLAGGER'].eq(0), samp['TRAVEL_TIME_MINUTES'], np.nan
)
samp['SHARED_TRAVEL_TIME'] = np.where(
    samp['SHARED_FLAGGER'].eq(1), samp['TRAVEL_TIME_MINUTES'], np.nan
)

In [None]:
# NOTE(review): disabled experiment. If re-enabled, these lines would overwrite the
# *_TRIP_LENGTH columns with TRAVEL_TIME_MINUTES values (minutes, not lengths).
# Candidate for deletion.
#samp['PRIVATE_TRIP_LENGTH'] = np.where(samp['SHARED_FLAGGER'] == 0,samp['TRAVEL_TIME_MINUTES'],np.nan)
#samp['SHARED_TRIP_LENGTH'] = np.where(samp['SHARED_FLAGGER'] == 1,samp['TRAVEL_TIME_MINUTES'],np.nan)

In [None]:
# MATCHED_TRIP: more than one trip pooled.
# UNMATCHED_TRIP: flagged as shared but only one trip in the pool.
samp['MATCHED_TRIP'] = np.where(samp['NUM_TRIPS_POOLED'].gt(1), 1, 0)
samp['UNMATCHED_TRIP'] = np.where(
    samp['SHARED_FLAGGER'].eq(1) & samp['NUM_TRIPS_POOLED'].eq(1), 1, 0
)


In [None]:
#do not aggregate and use the estimated private travel time to estimate matched shared travel time
# NOTE(review): the only visible definition of df2 (below) is commented out, yet df2 is still
# read by the pooled-share cells near the end of the notebook. df3, df4 and df5 are also used
# by later cells without any visible definition. On a fresh kernel those cells raise NameError.
# TODO: restore or re-create the aggregation cells that build df2..df5 so the notebook
# survives Restart & Run All.
#df2 = samp[['MATCHED_TRIP','UNMATCHED_TRIP','PRIVATE_TRIP_LENGTH','SHARED_TRIP_LENGTH','ORIGIN','DESTINATION','MONTH','YEAR','DAY','HOUR','MINUTE','TRAVEL_TIME_MINUTES', 'TRIP_LENGTH_MILES','FARE', 'TIP', 'ADDITIONAL_CHARGES', 'TOTAL_COST','SPEED','NUM_TRIPS_POOLED','PRIVATE_TRIP','SHARED_TRIP','PRIVATE_TRAVEL_TIME','SHARED_TRAVEL_TIME','SHARED_FARE','PRIVATE_FARE','TOD_1','TOD_2',"TOD_3","TOD_4",'TOD_5','INTERNAL_SUPPPRESSED_FLAGGER']].groupby(by = ['ORIGIN','DESTINATION','MONTH','YEAR','DAY','HOUR','MINUTE'], as_index = False).agg(agg)

In [None]:
# No-intercept OLS of shared trip length on private trip length, fitted on df4.
# NOTE(review): df4 is not defined anywhere in the visible notebook.
results = smf.ols(
    formula = 'SHARED_TRIP_LENGTH ~ PRIVATE_TRIP_LENGTH - 1',
    data = df4,
).fit()
print(results.summary())

In [None]:
# Same no-intercept trip-length regression, fitted on df5.
# NOTE(review): df5 is not defined anywhere in the visible notebook.
results = smf.ols(
    formula = 'SHARED_TRIP_LENGTH ~ PRIVATE_TRIP_LENGTH - 1',
    data = df5,
).fit()
print(results.summary())

In [None]:
# Shared minus private travel time for each df3 row.
df3['TRAVEL_TIME_SH_PV_DIFF'] = df3['SHARED_TRAVEL_TIME'].sub(df3['PRIVATE_TRAVEL_TIME'])

In [None]:
# Mean shared-minus-private travel time difference.
df3.loc[:, 'TRAVEL_TIME_SH_PV_DIFF'].mean()

In [None]:
# Standard deviation of the shared-minus-private travel time difference.
df3.loc[:, 'TRAVEL_TIME_SH_PV_DIFF'].std()

In [None]:
# Distribution of the shared-minus-private travel time difference.
sns.displot(data = df3['TRAVEL_TIME_SH_PV_DIFF'])

In [None]:
# Rows of df3 where more than one trip was pooled.
# .copy() makes pooled_df an independent frame: later cells add columns to it, and assigning
# into a boolean-mask slice triggers SettingWithCopyWarning and is not guaranteed to write
# where intended.
pooled_df = df3[df3['NUM_TRIPS_POOLED'] > 1].copy()

In [None]:
# Shared minus private travel time, restricted to the pooled rows.
pooled_df['TRAVEL_TIME_MA_SH_DIFF'] = pooled_df['SHARED_TRAVEL_TIME'].sub(pooled_df['PRIVATE_TRAVEL_TIME'])

In [None]:
# Frequency of each difference value, with NaN counted as its own bucket.
pooled_df.loc[:, 'TRAVEL_TIME_MA_SH_DIFF'].value_counts(dropna = False)

In [None]:
# Mean travel time difference over the pooled rows.
pooled_df.loc[:, 'TRAVEL_TIME_MA_SH_DIFF'].mean()

In [None]:
# Standard deviation of the travel time difference over the pooled rows.
pooled_df.loc[:, 'TRAVEL_TIME_MA_SH_DIFF'].std()

In [None]:
# Distribution of the travel time difference over the pooled rows.
sns.displot(data = pooled_df['TRAVEL_TIME_MA_SH_DIFF'])

In [None]:
# No-intercept OLS of shared trip length on private trip length, fitted on df3.
results = smf.ols(
    formula = 'SHARED_TRIP_LENGTH ~ PRIVATE_TRIP_LENGTH - 1',
    data = df3,
).fit()
print(results.summary())

In [None]:
# OLS (with intercept) of shared travel time on private travel time, fitted on df3.
results = smf.ols(
    formula = 'SHARED_TRAVEL_TIME ~ PRIVATE_TRAVEL_TIME',
    data = df3,
).fit()
print(results.summary())

In [None]:
# Same travel-time regression, restricted to the pooled rows.
results = smf.ols(
    formula = 'SHARED_TRAVEL_TIME ~ PRIVATE_TRAVEL_TIME',
    data = pooled_df,
).fit()
print(results.summary())

In [None]:
# OLS of the shared-minus-private difference on private travel time, pooled rows only.
results = smf.ols(
    formula = 'TRAVEL_TIME_SH_PV_DIFF ~ PRIVATE_TRAVEL_TIME',
    data = pooled_df,
).fit()
print(results.summary())

In [None]:
# OLS of the shared-minus-private difference on trip length, fitted on df3.
# TODO (from the original note): add density at the origin and at the destination as two
# separate regressors.
results = smf.ols(
    formula = 'TRAVEL_TIME_SH_PV_DIFF ~ TRIP_LENGTH_MILES',
    data = df3,
).fit()
print(results.summary())

In [None]:
# Load the ACS table.
# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
acs = pd.read_csv(
    filepath_or_buffer = 'C:/Workspace/TNC-Demand-Model/Outputs/Chicago_ACS_Data.csv',
)

In [None]:
# Read the tract shapefile into a GeoDataFrame.
# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
area = gp.read_file(
    'C:/Workspace/TNC-Demand-Model/Inputs/Census Shapefiles/Illinois/Chicago Tracts/geo_export_558aad9f-98d8-4dd5-a6b1-c1730155d596.shp'
)

In [None]:
# Reproject the tract geometries to EPSG:26971 before areas are measured.
area = area.to_crs(crs = 'EPSG:26971')

In [None]:
# Convert each tract's planar area to square miles.
# NOTE(review): 2590000 looks like a rounded square-metres-per-square-mile factor
# (about 2,589,988); this assumes the current CRS measures in metres. Confirm.
SQ_METERS_PER_SQ_MILE = 2590000
area['AREA_SQ_MI'] = area.area / SQ_METERS_PER_SQ_MILE

In [None]:
# Cast geoid10 to float and store it as CENSUS_TRACT, the key used for the ACS merge.
area['CENSUS_TRACT'] = area['geoid10'].astype(float)

In [None]:
acs_area = acs_area.groupby(by = ['YEAR','MONTH','CENSUS_TRACT'], as_index = False).mean()

In [None]:
acs_area = acs.merge(area, how = 'left', on = 'CENSUS_TRACT')

In [None]:
# Population per square mile.
acs_area['POP_DEN'] = acs_area['TOTAL_POP'].div(acs_area['AREA_SQ_MI'])

In [None]:
# Households per square mile.
acs_area['HHLDS_DEN'] = acs_area['TOTAL_HHLDS'].div(acs_area['AREA_SQ_MI'])

In [None]:
# Left-join densities for the ORIGIN tract, matched on tract, month and year.
# The joined columns keep their plain names here; the destination merge renames them via suffixes.
# NOTE(review): not idempotent; re-running this cell adds the columns again.
df3 = df3.merge(
    right = acs_area[['CENSUS_TRACT','POP_DEN','HHLDS_DEN','YEAR','MONTH']],
    how = 'left',
    left_on = ['ORIGIN','MONTH','YEAR'],
    right_on = ['CENSUS_TRACT','MONTH','YEAR'],
)

In [None]:
# Left-join densities for the DESTINATION tract. Columns already present in df3
# (CENSUS_TRACT, POP_DEN, HHLDS_DEN) collide with the incoming ones, so the existing ones
# receive '_ORIGIN' and the new ones '_DESTINATION'.
df3 = df3.merge(
    right = acs_area[['CENSUS_TRACT','POP_DEN','HHLDS_DEN','YEAR','MONTH']],
    how = 'left',
    left_on = ['DESTINATION','MONTH','YEAR'],
    right_on = ['CENSUS_TRACT','MONTH','YEAR'],
    suffixes = ('_ORIGIN','_DESTINATION'),
)

In [None]:
# Try density variables at the origin and destination alongside trip length (fitted on df3).
results = smf.ols(
    formula = 'TRAVEL_TIME_SH_PV_DIFF ~ TRIP_LENGTH_MILES + POP_DEN_ORIGIN + POP_DEN_DESTINATION',
    data = df3,
).fit()
print(results.summary())

In [None]:
#try using the matched trips with difference between matched and unmatched travel time
# pooled_df was originally sliced from df3 *before* the density columns were merged in, so on a
# top-to-bottom run it has no POP_DEN_ORIGIN / POP_DEN_DESTINATION and the formula fails.
# Rebuild it from the current df3 (same filter, same derived column) so this cell stands on its own.
pooled_df = df3[df3['NUM_TRIPS_POOLED'] > 1].copy()
pooled_df['TRAVEL_TIME_MA_SH_DIFF'] = pooled_df['SHARED_TRAVEL_TIME'] - pooled_df['PRIVATE_TRAVEL_TIME']

results = smf.ols('TRAVEL_TIME_MA_SH_DIFF ~  TRIP_LENGTH_MILES + POP_DEN_ORIGIN + POP_DEN_DESTINATION', data=pooled_df).fit()
print(results.summary())

In [None]:
# Attach df3 rows to tract geometries by destination tract and write the result as a shapefile.
# NOTE(review): ESRI Shapefile field names are limited to 10 characters, so the long column
# names here will likely be truncated on write; confirm the output schema.
# NOTE(review): absolute, machine-specific output path.
area2 = area.merge(
    df3,
    how = 'left',
    left_on = 'CENSUS_TRACT',
    right_on = 'DESTINATION',
)
area2.to_file(
    'C:/Workspace/TNC-Demand-Model/Outputs/TT_Diff_Map_Dest.shp',
    driver = 'ESRI Shapefile',
)

In [None]:
# Private travel time (x) against the shared-minus-private difference (y).
test = sns.scatterplot(
    data = df3,
    x = 'PRIVATE_TRAVEL_TIME',
    y = 'TRAVEL_TIME_SH_PV_DIFF',
)

# Uncomment to clamp both axes to [0, 100]:
#test.set_ylim(0,100)
#test.set_xlim(0,100)

In [None]:
# Trip length (x) against the shared-minus-private difference (y).
test = sns.scatterplot(
    data = df3,
    x = 'TRIP_LENGTH_MILES',
    y = 'TRAVEL_TIME_SH_PV_DIFF',
)

# Uncomment to clamp both axes to [0, 100]:
#test.set_ylim(0,100)
#test.set_xlim(0,100)

In [2]:
# Origin population density (x) against the shared-minus-private difference (y).
# NOTE(review): the saved output of this cell is a NameError for df3, because df3 is not
# defined on a fresh kernel.
test = sns.scatterplot(
    data = df3,
    x = 'POP_DEN_ORIGIN',
    y = 'TRAVEL_TIME_SH_PV_DIFF',
)

# Uncomment to clamp both axes to [0, 100]:
#test.set_ylim(0,100)
#test.set_xlim(0,100)

NameError: name 'df3' is not defined

In [None]:
# Origin population density (x) against the travel time difference (y), pooled rows only.
# The original took x from df3 and y from pooled_df, two frames with different lengths, so the
# pairing of points depended on their indexes happening to line up. Both axes now come from the
# same pooled slice of df3, which pairs every point row-by-row.
pooled_rows = df3[df3['NUM_TRIPS_POOLED'] > 1]
test = sns.scatterplot(
    x = pooled_rows['POP_DEN_ORIGIN'],
    # Same definition as TRAVEL_TIME_MA_SH_DIFF; renamed so the y-axis label is unchanged.
    y = (pooled_rows['SHARED_TRAVEL_TIME'] - pooled_rows['PRIVATE_TRAVEL_TIME']).rename('TRAVEL_TIME_MA_SH_DIFF'),
)

#test.set_ylim(0,100)
#test.set_xlim(0,100)

In [None]:
# Rows where the shared trip took more than 20 minutes longer than the private one.
df3.loc[df3['TRAVEL_TIME_SH_PV_DIFF'].gt(20)]

In [None]:
df3[df3['TRAVEL_TIME_SH_PV_DIFF'] > 20][['PRIVATE_TRAVEL_TIME', 'SHARED_TRAVEL_TIME','PRIVATE_TRIP_LENGTH','SHARED_TRIP_LENGTH','NUM_TRIPS_POOLED','PRIVATE_SPEED','SHARED_SPEED']]

In [None]:
df3['PRIVATE_SPEED'] = df3['PRIVATE_TRIP_LENGTH']/(df3['PRIVATE_TRAVEL_TIME']/60)

In [None]:
df3['SHARED_SPEED'] = df3['SHARED_TRIP_LENGTH']/(df3['SHARED_TRAVEL_TIME']/60)

In [None]:
# Rows where the shared trip was more than 20 minutes *faster* than the private one.
df3.loc[
    df3['TRAVEL_TIME_SH_PV_DIFF'].lt(-20),
    ['PRIVATE_TRAVEL_TIME','SHARED_TRAVEL_TIME','SHARED_TRIP_LENGTH','PRIVATE_TRIP_LENGTH','PRIVATE_SPEED','SHARED_SPEED','INTERNAL_SUPPPRESSED_FLAGGER'],
]

In [None]:
# Raw sample rows for one origin/destination tract pair in November 2018.
samp.loc[
    samp['ORIGIN'].eq(17031150800)
    & samp['DESTINATION'].eq(17031151200)
    & samp['YEAR'].eq(2018)
    & samp['MONTH'].eq(11)
]

In [None]:
# Intercept-only model on the pooled rows: the formula has no regressors besides the constant.
# NOTE(review): the original comment here was copied from an earlier cell and mentioned the
# matched-vs-unmatched difference; the dependent variable is actually TRAVEL_TIME_SH_PV_DIFF.
results = smf.ols(
    formula = 'TRAVEL_TIME_SH_PV_DIFF ~  + 1',
    data = pooled_df,
).fit()
print(results.summary())

In [None]:
# Share of shared rows (SHARED_TRIP == 1) that had more than one trip pooled.
# NOTE(review): df2 has no active definition in the visible notebook.
len(df2.loc[df2['NUM_TRIPS_POOLED'].gt(1)]) / len(df2.loc[df2['SHARED_TRIP'].eq(1)])

In [None]:
# Share of all df2 rows that had more than one trip pooled.
# NOTE(review): df2 has no active definition in the visible notebook.
len(df2.loc[df2['NUM_TRIPS_POOLED'].gt(1)]) / len(df2)

In [None]:
# Trip length interacted with each TOD flag: the length where the flag equals 1, otherwise 0.
for tod_flag, miles_col in (
    ('TOD_1', 'TRIP_LENGTH_MILES_1'),
    ('TOD_2', 'TRIP_LENGTH_MILES_2'),
    ('TOD_3', 'TRIP_LENGTH_MILES_3'),
    ('TOD_4', 'TRIP_LENGTH_MILES_4'),
    ('TOD_5', 'TRIP_LENGTH_MILES_5'),
):
    pooled_df[miles_col] = np.where(pooled_df[tod_flag] == 1, pooled_df['TRIP_LENGTH_MILES'], 0)

In [None]:

# No-intercept model: overall trip-length slope plus TOD-specific length terms for TOD 2-5.
results = smf.ols(
    formula = 'TRAVEL_TIME_SH_PV_DIFF ~ TRIP_LENGTH_MILES + TRIP_LENGTH_MILES_2 + TRIP_LENGTH_MILES_3 + TRIP_LENGTH_MILES_4 + TRIP_LENGTH_MILES_5 - 1',
    data = pooled_df,
).fit()
print(results.summary())

In [None]:
#LEAVE OUT TOD ASPECT
# NOTE(review): despite the note above, TOD_2..TOD_5 still enter this formula as additive
# terms; only the TOD-specific trip-length terms are absent. Confirm which was intended.
results = smf.ols(
    formula = 'TRAVEL_TIME_SH_PV_DIFF ~ TRIP_LENGTH_MILES  + TOD_2 + TOD_3 + TOD_4 + TOD_5 ',
    data = pooled_df,
).fit()
print(results.summary())