In [1]:
pip install shared_utils

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install pysal

Note: you may need to restart the kernel to use updated packages.


In [3]:
import os
os.environ["CALITP_BQ_MAX_BYTES"] = str(800_000_000_000)

import shared_utils
import pandas as pd
import geopandas as gpd


from calitp_data_analysis import geography_utils, utils
import numpy as np

import re
import google.auth
import os
import gcsfs
credentials, project = google.auth.default()
fs = gcsfs.GCSFileSystem()
GCS_FILE_PATH = 'gs://calitp-analytics-data/data-analyses/ahsc_grant/'


from pysal.model import spreg
from pysal.lib import weights
from libpysal.weights import spatial_lag

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [4]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [5]:
def read_parquet_from_gcs(filename):
    gcs_path = GCS_FILE_PATH.replace("gs://", "") + filename
    with fs.open(gcs_path, 'rb') as f:
        return gpd.read_parquet(f)

In [6]:
gdf = read_parquet_from_gcs("analytical_tbl_2025.parquet")
len(gdf)

14103

In [7]:
gdf = gdf.dropna()
len(gdf)

13232

In [8]:
# Multiply all percentage columns by 100 for easier OLS interpretation
for column in gdf.columns:
    if 'pct' in column:
        gdf[column] = gdf[column] * 100
gdf.head()

Unnamed: 0,feed_key,stop_id,geometry,stop_name,n_trips_weekday,n_trips_saturday,n_trips_sunday,n_routes_weekday,n_routes_saturday,n_routes_sunday,sat_ons,sun_ons,weekday_ons,sum_tracts,sum_total_pop,sum_households,sum_not_us_citizen_pop,sum_black_pop,sum_hispanic_pop,sum_youth_pop,sum_seniors_pop,sum_inc_extremelylow,sum_inc_verylow,sum_inc_low,sum_pop_determined_poverty_status,sum_poverty,sum_no_car,sum_no_cars,sum_land_area,sum_jobs,land_area_sqkm,pop_density,job_density,pct_not_us_citizen_pop,pct_black_pop,pct_hispanic_pop,pct_youth_pop,pct_seniors_pop,pct_inc_extremelylow,pct_inc_verylow,pct_inc_low,pct_poverty,pct_pop_workers_no_car,pct_hh_no_cars
0,118c3a62eab691ac449fe0c1c7505413,2,POINT (3437038.068 672539.232),Del Monte Center / Gate 1,38.0,30.0,26.0,7.0,5.0,5.0,2240.0,1740.0,9287.0,3.0,12519.0,1241.0,578.0,216.0,2087.0,2025.0,4303.0,2100.0,1593.0,1673.0,12198.0,881.0,101.0,389.0,55893539.0,12555.0,55.89,223.98,224.62,4.62,1.73,16.67,16.18,34.37,169.22,128.36,134.81,7.22,0.81,8.14
1,118c3a62eab691ac449fe0c1c7505413,3,POINT (3437051.391 672548.520),Del Monte Center / Gate 2,31.0,27.0,23.0,6.0,5.0,4.0,616.0,348.0,1757.0,3.0,12519.0,1241.0,578.0,216.0,2087.0,2025.0,4303.0,2100.0,1593.0,1673.0,12198.0,881.0,101.0,389.0,55893539.0,12555.0,55.89,223.98,224.62,4.62,1.73,16.67,16.18,34.37,169.22,128.36,134.81,7.22,0.81,8.14
2,118c3a62eab691ac449fe0c1c7505413,4,POINT (3436895.837 672590.854),Del Monte Center / Gate 3,25.0,12.0,11.0,5.0,3.0,2.0,56.0,58.0,502.0,3.0,12519.0,1241.0,578.0,216.0,2087.0,2025.0,4303.0,2100.0,1593.0,1673.0,12198.0,881.0,101.0,389.0,55893539.0,12555.0,55.89,223.98,224.62,4.62,1.73,16.67,16.18,34.37,169.22,128.36,134.81,7.22,0.81,8.14
3,118c3a62eab691ac449fe0c1c7505413,6,POINT (3433486.450 670499.928),6th / Mission Street,91.0,86.0,77.0,6.0,6.0,6.0,4312.0,4234.0,21837.0,2.0,3186.0,139.0,246.0,29.0,122.0,147.0,1870.0,461.0,628.0,535.0,3184.0,155.0,47.0,111.0,2747956.0,2572.0,2.75,1159.41,935.97,7.72,0.91,3.83,4.61,58.69,331.65,451.8,384.89,4.87,1.48,33.81
4,118c3a62eab691ac449fe0c1c7505413,11,POINT (3464418.138 676021.248),Northridge Mall,142.0,136.0,132.0,6.0,7.0,7.0,9128.0,13224.0,40662.0,3.0,15642.0,4793.0,4421.0,315.0,13109.0,6723.0,1774.0,3580.0,3006.0,1563.0,15615.0,1866.0,47.0,289.0,31903783.0,5849.0,31.9,490.29,183.33,28.26,2.01,83.81,42.98,11.34,74.69,62.72,32.61,11.95,0.3,0.98


In [9]:
# log of ons
gdf['log_weekday_ons'] = (np.log(gdf['weekday_ons'])).replace(np.NINF, 0)

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [10]:
# add saturday/sunday ons
gdf['log_sat_ons'] = (np.log(gdf['sat_ons'])).replace(np.NINF, 0) 
gdf['log_sun_ons'] = (np.log(gdf['sun_ons'])).replace(np.NINF, 0) 

  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


In [11]:
explanatory_vars = ['n_trips_weekday', 'n_routes_weekday', 'pop_density', 'job_density','pct_not_us_citizen_pop',
                    'pct_youth_pop', 'pct_seniors_pop', 'pct_pop_workers_no_car', 'pct_poverty']

In [12]:
explanatory_vars_sat = ['n_trips_saturday', 'n_routes_saturday', 'pop_density', 'job_density','pct_not_us_citizen_pop',
                    'pct_youth_pop', 'pct_seniors_pop', 'pct_pop_workers_no_car', 'pct_poverty']

explanatory_vars_sun = ['n_trips_sunday', 'n_routes_sunday', 'pop_density', 'job_density','pct_not_us_citizen_pop',
                    'pct_youth_pop', 'pct_seniors_pop', 'pct_pop_workers_no_car', 'pct_poverty']

OLS Untransformed Model 

In [13]:
# model with more explanatory variables
m = spreg.OLS(gdf[['weekday_ons']].values, gdf[explanatory_vars].values, 
                  name_y = 'weekday_ons', name_x = explanatory_vars)
print(m.summary)

REGRESSION RESULTS
------------------

SUMMARY OF OUTPUT: ORDINARY LEAST SQUARES
-----------------------------------------
Data set            :     unknown
Weights matrix      :        None
Dependent Variable  : weekday_ons                Number of Observations:       13232
Mean dependent var  :   9883.0938                Number of Variables   :          10
S.D. dependent var  :  22777.6611                Degrees of Freedom    :       13222
R-squared           :      0.3662
Adjusted R-squared  :      0.3657
Sum squared residual:   4.351e+12                F-statistic           :    848.6911
Sigma-square        :329072841.611                Prob(F-statistic)     :           0
S.E. of regression  :   18140.365                Log likelihood        : -148521.993
Sigma-square ML     :328824146.900                Akaike info criterion :  297063.987
S.E of regression ML:  18133.5090                Schwarz criterion     :  297138.891

----------------------------------------------------------

OLS Log Linear Model

In [14]:
# log model with more vars
m_log = spreg.OLS(gdf[['log_weekday_ons']].values, gdf[explanatory_vars].values,
                  name_y = 'log_weekday_ons', name_x = explanatory_vars)
print(m_log.summary)

REGRESSION RESULTS
------------------

SUMMARY OF OUTPUT: ORDINARY LEAST SQUARES
-----------------------------------------
Data set            :     unknown
Weights matrix      :        None
Dependent Variable  :log_weekday_ons                Number of Observations:       13232
Mean dependent var  :      7.7297                Number of Variables   :          10
S.D. dependent var  :      2.0553                Degrees of Freedom    :       13222
R-squared           :      0.4249
Adjusted R-squared  :      0.4245
Sum squared residual:     32141.1                F-statistic           :   1085.4544
Sigma-square        :       2.431                Prob(F-statistic)     :           0
S.E. of regression  :       1.559                Log likelihood        :  -24647.071
Sigma-square ML     :       2.429                Akaike info criterion :   49314.141
S.E of regression ML:      1.5585                Schwarz criterion     :   49389.045

---------------------------------------------------------

In [15]:
# saturday
m_log_sat = spreg.OLS(gdf[['log_sat_ons']].values, gdf[explanatory_vars_sat].values,
                  name_y = 'log_sat_ons', name_x = explanatory_vars_sat)
print(m_log_sat.summary)

REGRESSION RESULTS
------------------

SUMMARY OF OUTPUT: ORDINARY LEAST SQUARES
-----------------------------------------
Data set            :     unknown
Weights matrix      :        None
Dependent Variable  : log_sat_ons                Number of Observations:       13232
Mean dependent var  :      5.3666                Number of Variables   :          10
S.D. dependent var  :      2.2387                Degrees of Freedom    :       13222
R-squared           :      0.4070
Adjusted R-squared  :      0.4066
Sum squared residual:     39322.4                F-statistic           :   1008.2562
Sigma-square        :       2.974                Prob(F-statistic)     :           0
S.E. of regression  :       1.725                Log likelihood        :  -25981.245
Sigma-square ML     :       2.972                Akaike info criterion :   51982.490
S.E of regression ML:      1.7239                Schwarz criterion     :   52057.394

------------------------------------------------------------

In [16]:
# sunday
m_log_sun = spreg.OLS(gdf[['log_sun_ons']].values, gdf[explanatory_vars_sun].values,
                  name_y = 'log_sun_ons', name_x = explanatory_vars_sun)
print(m_log_sun.summary)

REGRESSION RESULTS
------------------

SUMMARY OF OUTPUT: ORDINARY LEAST SQUARES
-----------------------------------------
Data set            :     unknown
Weights matrix      :        None
Dependent Variable  : log_sun_ons                Number of Observations:       13232
Mean dependent var  :      5.1209                Number of Variables   :          10
S.D. dependent var  :      2.2646                Degrees of Freedom    :       13222
R-squared           :      0.3978
Adjusted R-squared  :      0.3974
Sum squared residual:       40861                F-statistic           :    970.5742
Sigma-square        :       3.090                Prob(F-statistic)     :           0
S.E. of regression  :       1.758                Log likelihood        :  -26235.191
Sigma-square ML     :       3.088                Akaike info criterion :   52490.383
S.E of regression ML:      1.7573                Schwarz criterion     :   52565.286

------------------------------------------------------------

In [17]:
#exponentiate coefficients to interpret
import math

m_log_df = pd.DataFrame() 
m_log_df['Variable']=m_log.name_x
m_log_df['Coefficient']=m_log.betas
m_log_df.dtypes

Variable        object
Coefficient    float64
dtype: object

In [18]:
m_log_df['Coeff_Exp']=(np.exp(m_log_df['Coefficient'])-1)*100
m_log_df

Unnamed: 0,Variable,Coefficient,Coeff_Exp
0,CONSTANT,6.64,76541.64
1,n_trips_weekday,0.02,2.05
2,n_routes_weekday,-0.16,-15.03
3,pop_density,0.0,0.01
4,job_density,-0.0,-0.0
5,pct_not_us_citizen_pop,0.03,2.75
6,pct_youth_pop,-0.01,-1.18
7,pct_seniors_pop,-0.04,-3.83
8,pct_pop_workers_no_car,-0.03,-2.53
9,pct_poverty,0.02,1.58


In [19]:
#Gravity distance weights
w_dist = weights.DistanceBand.from_dataframe(gdf,threshold=400,binary=False,alpha=-2.) #square root decay

  return self._with_data(data ** n)
 There are 1322 disconnected components.
 There are 93 islands with ids: 90, 107, 114, 120, 129, 582, 1171, 1287, 1288, 1289, 1290, 1335, 1391, 1804, 2032, 2840, 3549, 5334, 5407, 5903, 5911, 6150, 6597, 6628, 6675, 6885, 6927, 7041, 7236, 7262, 7272, 7402, 7785, 7890, 7907, 7974, 7984, 8130, 8455, 9263, 9326, 9439, 9449, 9450, 9451, 9452, 9532, 9565, 9566, 9567, 9568, 9982, 10047, 10048, 10049, 10050, 10273, 10274, 10275, 10276, 10298, 10299, 10300, 10301, 10973, 10974, 10975, 10976, 11029, 11030, 11031, 11032, 11050, 11051, 11052, 11053, 11063, 11064, 11065, 11066, 11191, 11192, 11193, 11194, 11521, 12016, 12535, 12971, 12980, 12982, 12989, 13512, 13555.
  w = W(neighbors, weights, ids, **kwargs)
 There are 1322 disconnected components.
 There are 93 islands with ids: 90, 107, 114, 120, 129, 582, 1171, 1287, 1288, 1289, 1290, 1335, 1391, 1804, 2032, 2840, 3549, 5334, 5407, 5903, 5911, 6150, 6597, 6628, 6675, 6885, 6927, 7041, 7236, 7262, 7272, 7402

In [20]:
# Select columns directly using pandas
selected_cols = gdf[['pop_density', 'pct_not_us_citizen_pop', 'pct_poverty']]

# Apply spatial lag function column-wise
wx = selected_cols.apply(lambda y: weights.spatial_lag.lag_spatial(w_dist, y))

# Rename columns by adding 'w_' prefix
wx = wx.rename(columns=lambda c: 'w_' + c)

# Show first few rows
wx.head()


Unnamed: 0,w_pop_density,w_pct_not_us_citizen_pop,w_pct_poverty
0,0.88,0.02,0.03
1,0.88,0.02,0.03
2,0.06,0.0,0.0
3,0.1,0.0,0.0
4,0.01,0.0,0.0


In [21]:
gdf_exog = gdf[explanatory_vars].join(wx)

In [22]:
# log model with more vars
m_log_slx = spreg.OLS(gdf[['log_weekday_ons']].values, gdf_exog.values, w=w_dist,
                  name_y = 'log_weekday_ons', name_x = gdf_exog.columns.tolist(), name_w="Distance Decay Weights") 
print(m_log_slx.summary)

REGRESSION RESULTS
------------------

SUMMARY OF OUTPUT: ORDINARY LEAST SQUARES
-----------------------------------------
Data set            :     unknown
Weights matrix      :Distance Decay Weights
Dependent Variable  :log_weekday_ons                Number of Observations:       13232
Mean dependent var  :      7.7297                Number of Variables   :          13
S.D. dependent var  :      2.0553                Degrees of Freedom    :       13219
R-squared           :      0.4279
Adjusted R-squared  :      0.4274
Sum squared residual:     31975.1                F-statistic           :    823.8468
Sigma-square        :       2.419                Prob(F-statistic)     :           0
S.E. of regression  :       1.555                Log likelihood        :  -24612.825
Sigma-square ML     :       2.416                Akaike info criterion :   49251.649
S.E of regression ML:      1.5545                Schwarz criterion     :   49349.024

-----------------------------------------------

In [23]:
gdf_exog_sat = gdf[explanatory_vars_sat].join(wx)

In [24]:
# saturday
m_log_slx_sat = spreg.OLS(gdf[['log_sat_ons']].values, gdf_exog_sat.values, w=w_dist,
                  name_y = 'log_sat_ons', name_x = gdf_exog_sat.columns.tolist(), name_w="Distance Decay Weights") 
print(m_log_slx_sat.summary)

REGRESSION RESULTS
------------------

SUMMARY OF OUTPUT: ORDINARY LEAST SQUARES
-----------------------------------------
Data set            :     unknown
Weights matrix      :Distance Decay Weights
Dependent Variable  : log_sat_ons                Number of Observations:       13232
Mean dependent var  :      5.3666                Number of Variables   :          13
S.D. dependent var  :      2.2387                Degrees of Freedom    :       13219
R-squared           :      0.4100
Adjusted R-squared  :      0.4095
Sum squared residual:     39119.3                F-statistic           :    765.6626
Sigma-square        :       2.959                Prob(F-statistic)     :           0
S.E. of regression  :       1.720                Log likelihood        :  -25946.993
Sigma-square ML     :       2.956                Akaike info criterion :   51919.986
S.E of regression ML:      1.7194                Schwarz criterion     :   52017.361

--------------------------------------------------

In [25]:
gdf_exog_sun = gdf[explanatory_vars_sun].join(wx)

In [26]:
# sunday
m_log_slx_sun = spreg.OLS(gdf[['log_sun_ons']].values, gdf_exog_sun.values, w=w_dist,
                  name_y = 'log_sun_ons', name_x = gdf_exog_sun.columns.tolist(), name_w="Distance Decay Weights") 
print(m_log_slx_sun.summary)

REGRESSION RESULTS
------------------

SUMMARY OF OUTPUT: ORDINARY LEAST SQUARES
-----------------------------------------
Data set            :     unknown
Weights matrix      :Distance Decay Weights
Dependent Variable  : log_sun_ons                Number of Observations:       13232
Mean dependent var  :      5.1209                Number of Variables   :          13
S.D. dependent var  :      2.2646                Degrees of Freedom    :       13219
R-squared           :      0.4008
Adjusted R-squared  :      0.4002
Sum squared residual:     40661.6                F-statistic           :    736.7387
Sigma-square        :       3.076                Prob(F-statistic)     :           0
S.E. of regression  :       1.754                Log likelihood        :  -26202.818
Sigma-square ML     :       3.073                Akaike info criterion :   52431.637
S.E of regression ML:      1.7530                Schwarz criterion     :   52529.012

--------------------------------------------------

In [27]:
m_log_slx_df = pd.DataFrame() 
m_log_slx_df['Variable']=m_log_slx.name_x
m_log_slx_df['Coefficient']=m_log_slx.betas
m_log_slx_df['Pval']=m_log_slx.t_stat

m_log_slx_df['Coeff_Exp']=(np.exp(m_log_slx_df['Coefficient'])-1)*100
m_log_slx_df

Unnamed: 0,Variable,Coefficient,Pval,Coeff_Exp
0,CONSTANT,6.7,"(67.15357077639696, 0.0)",80957.33
1,n_trips_weekday,0.02,"(57.46631318585172, 0.0)",2.06
2,n_routes_weekday,-0.16,"(-7.114300983511134, 1.1827433446707806e-12)",-15.12
3,pop_density,0.0,"(11.584936652119769, 6.9264392387140995e-31)",0.01
4,job_density,-0.0,"(-10.254390642254931, 1.3978591662633129e-24)",-0.0
5,pct_not_us_citizen_pop,0.03,"(10.291286057117443, 9.564039410021875e-25)",2.72
6,pct_youth_pop,-0.01,"(-5.867519146402857, 4.5295616612553e-09)",-1.24
7,pct_seniors_pop,-0.04,"(-13.65088898206265, 3.849325278083372e-42)",-3.94
8,pct_pop_workers_no_car,-0.03,"(-3.5843924531832676, 0.00033908023396666024)",-2.9
9,pct_poverty,0.02,"(6.573678597917971, 5.093000914904623e-11)",1.84


K Nearest Neighbours

In [28]:
# How many neighbors in 0.25 miles?
w_dist.mean_neighbors

5.268591293833132

In [29]:
# 6 nearest neighbors
knn = weights.KNN.from_dataframe(gdf, k=6)

 There are 30 disconnected components.
  W.__init__(self, neighbors, id_order=ids, **kwargs)


In [30]:
# Re-weight W to 6 nearest neighbors
knn.reweight(k=6, inplace=True)
# Row standardise weights
knn.transform = "R"

In [31]:
# List of variables for which to compute spatial lags
columns_to_lag = [
    'n_trips_weekday', 'n_routes_weekday', 'pop_density', 'job_density',
    'pct_not_us_citizen_pop', 'pct_youth_pop', 'pct_seniors_pop',
    'pct_pop_workers_no_car', 'pct_poverty'
]

# Compute spatial lags and rename with 'w_' prefix
wx = pd.DataFrame()

for col in columns_to_lag:
    wx[f"w_{col}"] = spatial_lag.lag_spatial(knn, gdf[col])

# Drop one of the lagged columns, e.g., 'w_n_trips_weekday'
wx.drop(columns=['w_n_trips_weekday'], inplace=True)

# Show head of the resulting DataFrame
wx.head()

Unnamed: 0,w_n_routes_weekday,w_pop_density,w_job_density,w_pct_not_us_citizen_pop,w_pct_youth_pop,w_pct_seniors_pop,w_pct_pop_workers_no_car,w_pct_poverty
0,3.83,542.09,447.83,5.21,15.71,33.52,0.92,7.46
1,4.0,542.09,447.83,5.21,15.71,33.52,0.92,7.46
2,4.17,542.09,447.83,5.21,15.71,33.52,0.92,7.46
3,1.67,1055.46,752.24,6.68,8.06,54.82,1.27,5.08
4,4.17,461.13,179.97,27.37,42.67,11.52,0.29,11.7


In [32]:
gdf_exog = gdf[explanatory_vars].join(wx)

In [33]:
# log model with more vars
m_log_slx = spreg.OLS(gdf[['log_weekday_ons']].values, gdf_exog.values, w=knn,
                  name_y = 'log_weekday_ons', name_x = gdf_exog.columns.tolist(), name_w="KNN")
print(m_log_slx.summary)

Exception: one or more input arrays have missing/NaN values