In [1]:
import os
os.environ["CALITP_BQ_MAX_BYTES"] = str(800_000_000_000) ## 800GB?

import shared_utils

from siuba import *
import pandas as pd
import geopandas as gpd
import numpy as np

pd.set_option('display.max_columns', None) 

import datetime as dt
import time

from calitp import get_engine
from calitp.tables import tbls

engine = get_engine()
connection = engine.connect()

GCS_FILE_PATH = 'gs://calitp-analytics-data/data-analyses/ahsc_grant/'

import gcsfs
fs = gcsfs.GCSFileSystem()



# Collating Big Stop Table

In [2]:
# read in and concatenate all geoparquets
fs_list = fs.ls(f"{GCS_FILE_PATH}tool_data/")

#fs_list[1:]

In [3]:
filelist = []
for f in fs_list[1:]:
    try: 
        test_pqt = pd.read_parquet(f"gs://{f}")
        filelist.append(f)
    except:
        print(f"error on {f.split('tool_data/')}")
        
#filelist

In [4]:
df_wkd = (pd.concat(
        pd.read_parquet(f"gs://{f}")
        for f in filelist
        )
           >> filter(_.daytype=="Weekday")
          )

In [5]:
df_wkd.head()

Unnamed: 0,calitp_itp_id,stop_id,stop_name,n_trips,n_routes,daytype,analysis_date,route_list_string,sum_tracts,sum_total_pop,sum_households,sum_not_us_citizen_pop,sum_youth_pop,sum_seniors_pop,sum_pop_determined_poverty_status,sum_poverty,sum_no_car,sum_no_cars,sum_land_area,sum_jobs,land_area_sqkm,pop_density,job_density,pct_not_us_citizen_pop,pct_youth_pop,pct_seniors_pop,pct_poverty,pct_pop_workers_no_car,pct_hh_no_cars,itp_id,name,caltrans_district,ntd_agency_info_key
0,101,2382242,Cimmarron Rd at Cambridge Rd,12,1,Weekday,2022-11-02,"1964,1964,1964,1964,1964,1964,1964,1964,1964,1...",3,14920.0,5273.0,595.0,4645.0,2696.0,14914.0,1201.0,27.0,55.0,70110648.0,924.0,70.110648,212.806477,13.179168,3.987936,31.132708,18.069705,8.052836,0.180965,1.043049,101,El Dorado County Transit Authority,03 - Marysville,recdSdPVhRz0S5Swx
1,101,2456753,Pony Express Trail at Crystal Springs (East),11,1,Weekday,2022-11-02,"1961,1961,1961,1961,1961,1961,1961,1961,1961,1...",3,10744.0,4210.0,160.0,2678.0,2377.0,10718.0,1996.0,56.0,151.0,108093351.0,1326.0,108.093351,99.395568,12.267175,1.489203,24.92554,22.123976,18.622877,0.521221,3.586698,101,El Dorado County Transit Authority,03 - Marysville,recdSdPVhRz0S5Swx
2,101,2525339,Raley's (Placerville Dr),9,1,Weekday,2022-11-02,196819681968196819681968196819681968,1,5908.0,2366.0,321.0,1382.0,1279.0,5772.0,847.0,54.0,148.0,26949799.0,3876.0,26.949799,219.222414,143.822965,5.433311,23.392011,21.648612,14.67429,0.914015,6.255283,101,El Dorado County Transit Authority,03 - Marysville,recdSdPVhRz0S5Swx
3,101,2557119,Pony Express Trail at Blair Rd. (West),12,1,Weekday,2022-11-02,"1961,1961,1961,1961,1961,1961,1961,1961,1961,1...",2,7254.0,2886.0,61.0,1886.0,1495.0,7228.0,1629.0,35.0,120.0,61573208.0,734.0,61.573208,117.81098,11.920769,0.840915,25.999449,20.609319,22.537355,0.482492,4.158004,101,El Dorado County Transit Authority,03 - Marysville,recdSdPVhRz0S5Swx
4,101,2562503,Ray Lawyer Drive Park and Ride,4,1,Weekday,2022-11-02,1965196519651965,2,11384.0,4229.0,356.0,2781.0,2484.0,10742.0,1182.0,54.0,325.0,47818121.0,9822.0,47.818121,238.068744,205.403303,3.127196,24.429023,21.820098,11.003538,0.47435,7.685032,101,El Dorado County Transit Authority,03 - Marysville,recdSdPVhRz0S5Swx


Note: Running into memory issues adding spatial weights matrix. Proceeding without spatially-lagged factors for now.

In [6]:
# put coefficients into arrays - variable order from spatial_regression_exploration_kmk
# in future, save out coefficients somewhere 
import numpy as np

wkd_coeff = np.array([-0.1610594,0.0001214,-0.0000173,0.0224169,-0.0152673,-0.0505976,-0.0423512,0.0111763])
sat_coeff = np.array([-0.1424400,0.0001344,-0.0000186,0.0256008,-0.0169793,-0.0408743,-0.0419725,0.0126354])
sun_coeff = np.array([-0.1082477,0.0001477,-0.0000202,0.0209053,-0.0145447,-0.0449611,-0.0502937, 0.0132250])

In [7]:
# multiply by coefficients - everything except n_trips
df_wkd.loc[:, ['n_routes', 'pop_density', 'job_density','pct_not_us_citizen_pop',
                    'pct_youth_pop', 'pct_seniors_pop', 'pct_pop_workers_no_car', 'pct_poverty']] *= wkd_coeff

In [8]:
# create stop-specific ridership factor
df_wkd2 = (df_wkd
           >> mutate(control_vars_sum = _.n_routes+_.pop_density+_.job_density+_.pct_not_us_citizen_pop+_.pct_youth_pop+_.pct_seniors_pop+_.pct_pop_workers_no_car+_.pct_poverty,
                     control_vars_factor = _.control_vars_sum+(_.n_trips*0.0200865)+6.8721538
           )
          )

In [9]:
# baseline ridership model estimate: np.exp(control_vars_factor + (n_trips*n_trips_coeff) + correction factor)
df_wkd2['model_est_ridership'] = np.exp(df_wkd2['control_vars_factor']+(2.434/2))

In [10]:
pd.options.display.float_format = '{:.2f}'.format

In [11]:
df_wkd2.model_est_ridership.describe()

count                                             77333.00
mean    31505793298073577217747407646277259548647180306...
std     87613866865866581217074806316138136903290171825...
min                                                  43.79
25%                                                1693.76
50%                                                2958.34
75%                                                6320.72
max     24364375131199240854921409288285679099102503136...
Name: model_est_ridership, dtype: float64

In [12]:
cutoff = df_wkd2['model_est_ridership'].quantile(0.99)

In [13]:
# drop outlier stops
df_wkd2 = (df_wkd2
          >> filter(_.model_est_ridership< cutoff)
          )

## Megatable

Establish stop-level ridership estimate: 
- NTD-scaled estmated stop ridership = (model estimate stop ridership* NTD system ridership)/ model estimate system ridership

In [14]:
# pull off system level ridership estimate
sys_riders_wkd = (df_wkd2
                 >> group_by(_.calitp_itp_id,_.name,_.ntd_agency_info_key)
                  >> summarize(sys_model_est_ridership = _.model_est_ridership.sum())
                 )

sys_riders_wkd.head()

Unnamed: 0,calitp_itp_id,name,ntd_agency_info_key,sys_model_est_ridership
0,4,Alameda-Contra Costa Transit District,rec0zt7fBmP2s3F3g,73996708.32
1,6,City of Alhambra,recXx30olivHbdKZS,155258.4
2,11,Amador Regional Transit System,recyQ9Dp6JKnr3Lmr,52090.23
3,14,Anaheim Transportation Network,rec6g6iKaKu4n4r89,478129.01
4,16,Antelope Valley Transit Authority,recTmzDLUS5kOkdqI,2361772.43


In [15]:
#bring on NTD ID
NTD_agency_info = (tbls.mart_transit_database.dim_ntd_agency_info()
                   >> select(_.key,_.ntd_id,_.ntd_agency_name)
                   >> collect()
                  )

In [16]:
NTD_agency_info.sample(5)

Unnamed: 0,key,ntd_id,ntd_agency_name
147,rec8r7tzQX5E4o2Hz,99262,Yurok Tribe
190,rec2moIAHRrs7TrHT,90291,City of South Gate
179,reclJkunUSTdrVPN4,90283,City of Manhattan Beach
82,reclKl4AVv1C5DqaH,90013,Santa Clara Valley Transportation Authority
116,rechRRCbbDqGoU6WU,9R02-91070,Yosemite Area Regional Transportation System


In [17]:
# join ID to system level model estimate
sys_riders_wkd = (sys_riders_wkd
          >> left_join(_,NTD_agency_info, {"ntd_agency_info_key":"key"})
          )

sys_riders_wkd.head()

Unnamed: 0,calitp_itp_id,name,ntd_agency_info_key,sys_model_est_ridership,key,ntd_id,ntd_agency_name
0,4,Alameda-Contra Costa Transit District,rec0zt7fBmP2s3F3g,73996708.32,rec0zt7fBmP2s3F3g,90014,Alameda-Contra Costa Transit District
1,6,City of Alhambra,recXx30olivHbdKZS,155258.4,recXx30olivHbdKZS,90247,City of Alhambra
2,11,Amador Regional Transit System,recyQ9Dp6JKnr3Lmr,52090.23,recyQ9Dp6JKnr3Lmr,9R02-91000,Amador Regional Transit System
3,14,Anaheim Transportation Network,rec6g6iKaKu4n4r89,478129.01,rec6g6iKaKu4n4r89,90211,Anaheim Transportation Network
4,16,Antelope Valley Transit Authority,recTmzDLUS5kOkdqI,2361772.43,recTmzDLUS5kOkdqI,90121,Antelope Valley Transit Authority


In [18]:
# read in NTD ridership
NTD_ridership = pd.read_excel("gs://calitp-analytics-data/data-analyses/2021-Annual-Database-Files/September 2022 Adjusted Database.xlsx", sheet_name="Calendar_Year_UPT")

In [19]:
NTD_ridership.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2237 entries, 0 to 2236
Data columns (total 30 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   5 digit NTD ID  2230 non-null   float64
 1   4 digit NTD ID  2174 non-null   object 
 2   Agency          2230 non-null   object 
 3   Active          2230 non-null   object 
 4   Reporter Type   2230 non-null   object 
 5   UZA             2230 non-null   float64
 6   UZA Name        2235 non-null   object 
 7   Modes           2230 non-null   object 
 8   TOS             2230 non-null   object 
 9   2002            2235 non-null   float64
 10  2003            2235 non-null   float64
 11  2004            2235 non-null   float64
 12  2005            2235 non-null   float64
 13  2006            2235 non-null   float64
 14  2007            2235 non-null   float64
 15  2008            2235 non-null   float64
 16  2009            2235 non-null   float64
 17  2010            2235 non-null   f

In [20]:
# clean names - pip install pyjanitor
from janitor import clean_names

NTD_ridership_clean = (NTD_ridership
                        >> _.clean_names()
                        >> rename(ntd_id_num = "5_digit_ntd_id", ridership_2021="2021")
                       >> mutate(ntd_id_num=_.ntd_id_num.astype(str))
                        >> select(_.ntd_id_num, _.modes, _.tos, _.ridership_2021)
                       >> filter(_.modes.str.endswith("B"))
                       >> group_by(_.ntd_id_num)
                       >> summarize(agg_ridership_2021=_.ridership_2021.sum())
                      )

NTD_ridership_clean.head()

Unnamed: 0,ntd_id_num,agg_ridership_2021
0,1.0,50799232.0
1,10001.0,8496188.0
2,10002.0,0.0
3,10003.0,66369757.0
4,10004.0,1234665.0


In [21]:
# pull off decimal
NTD_ridership_clean['ntd_id'] = NTD_ridership_clean['ntd_id_num'].str.partition('.')[0]

In [22]:
# join NTD agg to system agg, adjust for weekday
sys_riders_wkd = (sys_riders_wkd
                  >> left_join(_,NTD_ridership_clean)
                  >> mutate(ntd_ridership_wkd = _.agg_ridership_2021*(260/365))
                 )

sys_riders_wkd.head()

Unnamed: 0,calitp_itp_id,name,ntd_agency_info_key,sys_model_est_ridership,key,ntd_id,ntd_agency_name,ntd_id_num,agg_ridership_2021,ntd_ridership_wkd
0,4,Alameda-Contra Costa Transit District,rec0zt7fBmP2s3F3g,73996708.32,rec0zt7fBmP2s3F3g,90014,Alameda-Contra Costa Transit District,90014.0,24156305.0,17207230.96
1,6,City of Alhambra,recXx30olivHbdKZS,155258.4,recXx30olivHbdKZS,90247,City of Alhambra,,,
2,11,Amador Regional Transit System,recyQ9Dp6JKnr3Lmr,52090.23,recyQ9Dp6JKnr3Lmr,9R02-91000,Amador Regional Transit System,,,
3,14,Anaheim Transportation Network,rec6g6iKaKu4n4r89,478129.01,rec6g6iKaKu4n4r89,90211,Anaheim Transportation Network,90211.0,3389328.0,2414315.84
4,16,Antelope Valley Transit Authority,recTmzDLUS5kOkdqI,2361772.43,recTmzDLUS5kOkdqI,90121,Antelope Valley Transit Authority,90121.0,991155.0,706028.22


In [24]:
# many missings, but we know that model generally over predicts ridership - what is the mean/median overprediction proportion?
sys_riders_wkd >> mutate(model_over_ntd = _.sys_model_est_ridership/_.ntd_ridership_wkd) >> summarize(med_model_over_ntd = _.model_over_ntd.median(),mean_model_over_ntd = _.model_over_ntd.mean())

Unnamed: 0,med_model_over_ntd,mean_model_over_ntd
0,2.83,inf


In [25]:
# Merge to stop-level ridership
df_wkd3 = (df_wkd2
          >> left_join(_,sys_riders_wkd)
          >> mutate(ntd_scaled_ridership = case_when({
              _.ntd_ridership_wkd>0 : _.model_est_ridership/(_.sys_model_est_ridership/_.ntd_ridership_wkd),
              True : _.model_est_ridership/2.83
          }
          ))
          )

df_wkd3.head()

Unnamed: 0,calitp_itp_id,stop_id,stop_name,n_trips,n_routes,daytype,analysis_date,route_list_string,sum_tracts,sum_total_pop,sum_households,sum_not_us_citizen_pop,sum_youth_pop,sum_seniors_pop,sum_pop_determined_poverty_status,sum_poverty,sum_no_car,sum_no_cars,sum_land_area,sum_jobs,land_area_sqkm,pop_density,job_density,pct_not_us_citizen_pop,pct_youth_pop,pct_seniors_pop,pct_poverty,pct_pop_workers_no_car,pct_hh_no_cars,itp_id,name,caltrans_district,ntd_agency_info_key,control_vars_sum,control_vars_factor,model_est_ridership,sys_model_est_ridership,key,ntd_id,ntd_agency_name,ntd_id_num,agg_ridership_2021,ntd_ridership_wkd,ntd_scaled_ridership
0,101,2382242,Cimmarron Rd at Cambridge Rd,12,-0.16,Weekday,2022-11-02,"1964,1964,1964,1964,1964,1964,1964,1964,1964,1...",3,14920.0,5273.0,595.0,4645.0,2696.0,14914.0,1201.0,27.0,55.0,70110648.0,924.0,70.11,0.03,-0.0,0.09,-0.48,-0.91,0.09,-0.01,1.04,101,El Dorado County Transit Authority,03 - Marysville,recdSdPVhRz0S5Swx,-1.35,5.76,1071.57,87395.61,recdSdPVhRz0S5Swx,90229,El Dorado County Transit Authority,90229.0,0.0,0.0,378.65
1,101,2456753,Pony Express Trail at Crystal Springs (East),11,-0.16,Weekday,2022-11-02,"1961,1961,1961,1961,1961,1961,1961,1961,1961,1...",3,10744.0,4210.0,160.0,2678.0,2377.0,10718.0,1996.0,56.0,151.0,108093351.0,1326.0,108.09,0.01,-0.0,0.03,-0.38,-1.12,0.21,-0.02,3.59,101,El Dorado County Transit Authority,03 - Marysville,recdSdPVhRz0S5Swx,-1.43,5.66,972.99,87395.61,recdSdPVhRz0S5Swx,90229,El Dorado County Transit Authority,90229.0,0.0,0.0,343.81
2,101,2525339,Raley's (Placerville Dr),9,-0.16,Weekday,2022-11-02,196819681968196819681968196819681968,1,5908.0,2366.0,321.0,1382.0,1279.0,5772.0,847.0,54.0,148.0,26949799.0,3876.0,26.95,0.03,-0.0,0.12,-0.36,-1.1,0.16,-0.04,6.26,101,El Dorado County Transit Authority,03 - Marysville,recdSdPVhRz0S5Swx,-1.34,5.71,1020.03,87395.61,recdSdPVhRz0S5Swx,90229,El Dorado County Transit Authority,90229.0,0.0,0.0,360.44
3,101,2557119,Pony Express Trail at Blair Rd. (West),12,-0.16,Weekday,2022-11-02,"1961,1961,1961,1961,1961,1961,1961,1961,1961,1...",2,7254.0,2886.0,61.0,1886.0,1495.0,7228.0,1629.0,35.0,120.0,61573208.0,734.0,61.57,0.01,-0.0,0.02,-0.4,-1.04,0.25,-0.02,4.16,101,El Dorado County Transit Authority,03 - Marysville,recdSdPVhRz0S5Swx,-1.34,5.78,1089.86,87395.61,recdSdPVhRz0S5Swx,90229,El Dorado County Transit Authority,90229.0,0.0,0.0,385.11
4,101,2562503,Ray Lawyer Drive Park and Ride,4,-0.16,Weekday,2022-11-02,1965196519651965,2,11384.0,4229.0,356.0,2781.0,2484.0,10742.0,1182.0,54.0,325.0,47818121.0,9822.0,47.82,0.03,-0.0,0.07,-0.37,-1.1,0.12,-0.02,7.69,101,El Dorado County Transit Authority,03 - Marysville,recdSdPVhRz0S5Swx,-1.44,5.51,836.95,87395.61,recdSdPVhRz0S5Swx,90229,El Dorado County Transit Authority,90229.0,0.0,0.0,295.74


Explode table to stop-route level

### columns are
Org variables (Calitp id, name, district)
Stop variables (stop id, stop name)
Route variables (route id, route name)
Day type (e.g. weekday/sat/sun)

Every row is daytype-route-stop 

Lookup table with number of new trips (cap at ~20)

Each row is daytype-route-stop-Ntrips

Quick function to multiply the n trips with associated 

In [None]:
# explode routes