In [1]:
import os
os.environ["CALITP_BQ_MAX_BYTES"] = str(800_000_000_000) ## 800GB?

import shared_utils

from siuba import *
import pandas as pd
import geopandas as gpd
import numpy as np

pd.set_option('display.max_columns', None) 

import datetime as dt
import time

from calitp import get_engine
from calitp.tables import tbls

engine = get_engine()
connection = engine.connect()

GCS_FILE_PATH = 'gs://calitp-analytics-data/data-analyses/ahsc_grant/'

import gcsfs
fs = gcsfs.GCSFileSystem()



# Collating Big Stop Table

In [2]:
# read in and concatenate all geoparquets
fs_list = fs.ls(f"{GCS_FILE_PATH}tool_data/")

#fs_list[1:]

In [3]:
filelist = []
for f in fs_list[1:]:
    try: 
        test_pqt = pd.read_parquet(f"gs://{f}")
        filelist.append(f)
    except:
        print(f"error on {f.split('tool_data/')}")

In [4]:
df = (pd.concat(
        pd.read_parquet(f"gs://{f}")
        for f in filelist
        )
    )

Note: Running into memory issues adding spatial weights matrix. Proceeding without spatially-lagged factors for now.

In [5]:
# put coefficients into arrays - variable order from spatial_regression_exploration_kmk
# in future, save out coefficients somewhere 
import numpy as np

wkd_coeff = np.array([-0.1610594,0.0001214,-0.0000173,0.0224169,-0.0152673,-0.0505976,-0.0423512,0.0111763])
sat_coeff = np.array([-0.1424400,0.0001344,-0.0000186,0.0256008,-0.0169793,-0.0408743,-0.0419725,0.0126354])
sun_coeff = np.array([-0.1082477,0.0001477,-0.0000202,0.0209053,-0.0145447,-0.0449611,-0.0502937, 0.0132250])

In [6]:
#bring on NTD ID
NTD_agency_info = (tbls.mart_transit_database.dim_ntd_agency_info()
                   >> select(_.key,_.ntd_id,_.ntd_agency_name)
                   >> collect()
                  )

In [7]:
# read in NTD ridership
NTD_ridership = pd.read_excel("gs://calitp-analytics-data/data-analyses/2021-Annual-Database-Files/September 2022 Adjusted Database.xlsx", sheet_name="UPT")

In [9]:
# clean names - pip install pyjanitor
from janitor import clean_names

NTD_ridership_clean = (NTD_ridership
                        >> _.clean_names()
                        >> rename(ntd_id_num = "5_digit_ntd_id")
                       >> mutate(ntd_id_num=_.ntd_id_num.astype(str))
                      )

rider_cols = ["oct21","nov21","dec21","jan22","feb22","mar22","apr22","may22","jun22","jul22","aug22","sep22"]
NTD_ridership_clean['ridership_to_sep22'] = NTD_ridership_clean[rider_cols].sum(axis=1)
                       
NTD_ridership_clean = (NTD_ridership_clean                       
                        >> select(_.ntd_id_num, _.modes, _.tos, _.ridership_to_sep22)
                        >> filter(_.modes.str.endswith("B"))
                        >> group_by(_.ntd_id_num)
                        >> summarize(ntd_ridership=_.ridership_to_sep22.sum())
                      )

NTD_ridership_clean.head()

Unnamed: 0,ntd_id_num,ntd_ridership
0,1.0,61493156.0
1,10001.0,9686309.0
2,10002.0,0.0
3,10003.0,82543984.0
4,10004.0,1361603.0


In [10]:
# pull off decimal
NTD_ridership_clean['ntd_id'] = NTD_ridership_clean['ntd_id_num'].str.partition('.')[0]

In [11]:
# create list of n additional trips, use for final explode
range_trips = list(range(0,21))
range_trips

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]

## Megatable

Establish stop-level ridership estimate: 
- NTD-scaled estmated stop ridership = (model estimate stop ridership* NTD system ridership)/ model estimate system ridership\

Expand by Route
Expand by N Additional Trips (0-20)

## Big Function (loop over daytype)

In [17]:
def ridership(daytype,coeff_list,coefficient,constant,variance,daytype_count):
    df1 = (df >> filter(_.daytype==daytype))
    # multiply by coefficients - everything except n_trips
    df1.loc[:, ['n_routes', 'pop_density', 'job_density','pct_not_us_citizen_pop',
                        'pct_youth_pop', 'pct_seniors_pop', 'pct_pop_workers_no_car', 'pct_poverty']] *= coeff_list

    # create stop-specific ridership factor
    df1 = (df1
               >> mutate(control_vars_sum = _.n_routes+_.pop_density+_.job_density+_.pct_not_us_citizen_pop+_.pct_youth_pop+_.pct_seniors_pop+_.pct_pop_workers_no_car+_.pct_poverty,
                         control_vars_factor = _.control_vars_sum+(_.n_trips*coefficient)+constant
               )
              )

    # baseline ridership model estimate: np.exp(control_vars_factor + (n_trips*n_trips_coeff) + correction factor)
    df1['model_est_ridership'] = np.exp(df1['control_vars_factor']+(variance/2))
    
    # drop outlier stops
    cutoff = df1['model_est_ridership'].quantile(0.99)
    df2 = (df1
              >> filter(_.model_est_ridership< cutoff)
              )

    # pull off aggregate system level ridership estimate
    sys_riders = (df2
                     >> group_by(_.calitp_itp_id,_.name,_.ntd_agency_info_key)
                      >> summarize(sys_model_est_ridership = _.model_est_ridership.sum())
                     )

    # join ID to system level model estimate
    sys_riders = (sys_riders
              >> left_join(_,NTD_agency_info, {"ntd_agency_info_key":"key"})
              )

    # join NTD agg to system agg ridership, adjust for weekday
    sys_riders = (sys_riders
                      >> left_join(_,NTD_ridership_clean)
                      >> mutate(ntd_ridership_daytype = _.ntd_ridership*(daytype_count/365))
                     )

    # many missings, but we know that model generally over predicts ridership
    # fill with the median overprediction proportion
    med_overpred_df = (sys_riders 
                       >> mutate(model_over_ntd = _.sys_model_est_ridership/_.ntd_ridership_daytype) 
                       >> summarize(med_model_over_ntd = _.model_over_ntd.median())
                      )
    
    med_overpred = med_overpred_df['med_model_over_ntd'].values[0]

    # Merge to stop-level ridership
    df3 = (df2
              >> left_join(_,sys_riders)
              >> mutate(ntd_scaled_ridership = case_when({
                  _.ntd_ridership>0 : _.model_est_ridership/(_.sys_model_est_ridership/_.ntd_ridership_daytype),
                  True : _.model_est_ridership/med_overpred
                  }
                  )
                )
              )

    # turn routelist string into actual list
    df3['route_list_dups'] = df3['route_list_string'].str.split(',')

    # deduplicate list - https://stackoverflow.com/questions/57107125/remove-duplicates-from-python-dataframe-list 
    df3['route_name'] = df3['route_list_dups'].map(np.unique)

    # explode routes
    df4 = df3.explode('route_name')

    df4['n_addtl_trips'] = [range_trips]*len(df4)

    #explode n addtional trips
    df5 = df4.explode('n_addtl_trips')

    coeff_pct = np.exp(coefficient)-1

    # keep only relevant columns
    df5 = (df5
               >> mutate(n_addtl_riders = _.ntd_scaled_ridership.mul(coeff_pct).mul(_.n_addtl_trips))
               >> select(_.calitp_itp_id,_.name,_.daytype,_.route_name,_.stop_id,_.stop_name,_.n_addtl_trips,_.n_addtl_riders)
              )
    
    return df5

In [18]:
df_weekday = ridership("Weekday",
                       wkd_coeff,
                       coefficient=0.0200865,
                       constant=6.8721538,
                       variance=2.434,
                       daytype_count=261
                      )

df_saturday = ridership("Saturday",
                       sat_coeff,
                       coefficient=0.0262958,
                       constant=4.2261843,
                       variance=3.007,
                       daytype_count=52
                      )

df_sunday = ridership("Sunday",
                       sun_coeff,
                       coefficient=0.0263988,
                       constant=3.9734396,
                       variance=3.119,
                       daytype_count=52
                      )

Users of the modes 'nearest', 'lower', 'higher', or 'midpoint' are encouraged to review the method they. (Deprecated NumPy 1.22)
Users of the modes 'nearest', 'lower', 'higher', or 'midpoint' are encouraged to review the method they. (Deprecated NumPy 1.22)
Users of the modes 'nearest', 'lower', 'higher', or 'midpoint' are encouraged to review the method they. (Deprecated NumPy 1.22)


In [19]:
df_all = pd.concat([df_weekday,df_saturday,df_sunday], ignore_index=True)
df_all.to_csv(f"{GCS_FILE_PATH}AHSC_analysis_table.csv", index=False)

In [20]:
df_all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6650007 entries, 0 to 6650006
Data columns (total 8 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   calitp_itp_id   int64 
 1   name            object
 2   daytype         object
 3   route_name      object
 4   stop_id         object
 5   stop_name       object
 6   n_addtl_trips   object
 7   n_addtl_riders  object
dtypes: int64(1), object(7)
memory usage: 405.9+ MB
