In [2]:
#https://pysal.org/libpysal/notebooks/weights.html
import os
os.environ["CALITP_BQ_MAX_BYTES"] = str(800_000_000_000) ## 800GB?

from libpysal.weights import Queen, Rook, KNN

from pysal.lib import weights
from pysal.explore import esda
from pysal.model import spreg

from shapely import wkb
from sklearn.cluster import KMeans

import matplotlib.pyplot as plt
import statsmodels.formula.api as sm

# import contextily
import geopandas as gpd
import numpy as np
import pandas as pd
import seaborn as sns

import branca
import folium
import shared_utils

from siuba import *

pd.set_option('display.max_columns', None) 

OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


In [9]:
gdf = gpd.read_parquet('gs://calitp-analytics-data/data-analyses/ahsc_grant/analytical_tbl.parquet')
len(gdf)

13746

In [10]:
gdf = gdf.dropna()
len(gdf)

12988

In [11]:
# Multiply all percentage columns by 100 for easier OLS interpretation
for column in gdf.columns:
    if 'pct' in column:
        gdf[column] = gdf[column] * 100
gdf.head()

Unnamed: 0,calitp_itp_id,stop_id,geometry,stop_name,n_trips_weekday,n_trips_sat,n_trips_sun,n_routes_weekday,n_routes_sat,n_routes_sun,sat_ons,sun_ons,weekday_ons,sum_tracts,sum_total_pop,sum_households,sum_not_us_citizen_pop,sum_black_pop,sum_hispanic_pop,sum_youth_pop,sum_seniors_pop,sum_inc_extremelylow,sum_inc_verylow,sum_inc_low,sum_pop_determined_poverty_status,sum_poverty,sum_no_car,sum_no_cars,sum_land_area,sum_jobs,land_area_sqkm,pop_density,job_density,pct_not_us_citizen_pop,pct_black_pop,pct_hispanic_pop,pct_youth_pop,pct_seniors_pop,pct_inc_extremelylow,pct_inc_verylow,pct_inc_low,pct_poverty,pct_pop_workers_no_car,pct_hh_no_cars
0,293,423,POINT (25245.404 -397427.355),State & Alamar,60.0,56.0,55.0,3.0,2.0,2.0,567.0,566.0,4556.0,4.0,20153.0,9058.0,1370.0,180.0,3890.0,4776.0,4677.0,1124.0,1132.0,1703.0,19849.0,1676.0,115.0,448.0,8970055.0,12021.0,8.970055,2246.697484,1340.125562,6.797995,0.893167,19.302337,23.698705,23.207463,12.40892,12.49724,18.80106,8.44375,0.570635,4.945904
1,293,55,POINT (15628.312 -397010.130),Encina & Fairview,25.0,11.0,11.0,1.0,1.0,1.0,840.0,806.0,10303.0,2.0,7287.0,2572.0,644.0,133.0,1653.0,2109.0,1670.0,165.0,266.0,517.0,7287.0,336.0,34.0,61.0,6030586.0,2686.0,6.030586,1208.340284,445.396185,8.837656,1.825168,22.684232,28.941951,22.917524,6.415241,10.342146,20.101089,4.610951,0.466584,2.371695
2,293,239,POINT (23548.788 -398980.627),Veronica Springs & Torino,12.0,12.0,8.0,1.0,1.0,1.0,28.0,10.0,175.0,1.0,3455.0,1502.0,310.0,0.0,649.0,765.0,1064.0,141.0,186.0,262.0,3371.0,206.0,0.0,161.0,4129069.0,383.0,4.129069,836.750367,92.756987,8.972504,0.0,18.78437,22.141823,30.795948,9.387483,12.383489,17.443409,6.110946,0.0,10.719041
3,293,63,POINT (28266.459 -399563.850),Haley & Garden,25.0,21.0,14.0,1.0,1.0,1.0,148.0,85.0,978.0,2.0,10410.0,3823.0,1792.0,313.0,6028.0,3492.0,1260.0,920.0,611.0,741.0,10228.0,1877.0,189.0,499.0,4544564.0,23265.0,4.544564,2290.648784,5119.302974,17.214217,3.006724,57.90586,33.544669,12.103746,24.064871,15.982213,19.382684,18.351584,1.815562,13.052577
5,293,320,POINT (46477.678 -403032.814),Via Real & Vista de Santa Barbara,25.0,21.0,15.0,1.0,1.0,1.0,47.0,22.0,262.0,1.0,3309.0,1132.0,758.0,55.0,1985.0,1059.0,585.0,141.0,183.0,257.0,3309.0,306.0,7.0,31.0,6231353.0,3022.0,6.231353,531.024322,484.966908,22.907223,1.662134,59.987912,32.003626,17.679057,12.45583,16.166078,22.70318,9.247507,0.211544,2.738516


In [12]:
# log of ons
gdf['log_weekday_ons'] = (np.log(gdf['weekday_ons'])).replace(np.NINF, 0) #replace undefined values produced from np.log(0) w/ 0

In [13]:
# add saturday/sunday ons
gdf['log_sat_ons'] = (np.log(gdf['sat_ons'])).replace(np.NINF, 0) 
gdf['log_sun_ons'] = (np.log(gdf['sun_ons'])).replace(np.NINF, 0) 

In [14]:
# Weekday_ons ~ n_trips_weekday + pop_density + job_density + pct: 
# Intuition is to only use use pct variables because we don’t want to double count people in areas around multiple stops. Whereas, trips are characteristic of the stop itself

explanatory_vars = ['n_trips_weekday', 'n_routes_weekday', 'pop_density', 'job_density','pct_not_us_citizen_pop',
                    'pct_youth_pop', 'pct_seniors_pop', 'pct_pop_workers_no_car', 'pct_poverty']

#explanatory_vars_short = ['n_trips_weekday', 'n_routes_weekday','pop_density', 'pct_not_us_citizen_pop','pct_youth_pop', 'pct_seniors_pop', 'pct_pop_workers_no_car']


In [15]:
# add saturday/sunday version
explanatory_vars_sat = ['n_trips_sat', 'n_routes_sat', 'pop_density', 'job_density','pct_not_us_citizen_pop',
                    'pct_youth_pop', 'pct_seniors_pop', 'pct_pop_workers_no_car', 'pct_poverty']

explanatory_vars_sun = ['n_trips_sun', 'n_routes_sun', 'pop_density', 'job_density','pct_not_us_citizen_pop',
                    'pct_youth_pop', 'pct_seniors_pop', 'pct_pop_workers_no_car', 'pct_poverty']


Comparing version with job_density and poverty to one without, to evaluate multicollinearity. Had race variables, but dropped them due to lack of correlation with ridership.

### OLS Untransformed model

In [None]:
# model with more explanatory variables
m = spreg.OLS(gdf[['weekday_ons']].values, gdf[explanatory_vars].values, 
                  name_y = 'weekday_ons', name_x = explanatory_vars)
print(m.summary)

### OLS Log-Linear model

In [9]:
# log model with more vars
m_log = spreg.OLS(gdf[['log_weekday_ons']].values, gdf[explanatory_vars].values,
                  name_y = 'log_weekday_ons', name_x = explanatory_vars)
print(m_log.summary)

REGRESSION
----------
SUMMARY OF OUTPUT: ORDINARY LEAST SQUARES
-----------------------------------------
Data set            :     unknown
Weights matrix      :        None
Dependent Variable  :log_weekday_ons                Number of Observations:       12988
Mean dependent var  :      7.7242                Number of Variables   :          10
S.D. dependent var  :      2.0521                Degrees of Freedom    :       12978
R-squared           :      0.4224
Adjusted R-squared  :      0.4220
Sum squared residual:   31591.278                F-statistic           :   1054.4540
Sigma-square        :       2.434                Prob(F-statistic)     :           0
S.E. of regression  :       1.560                Log likelihood        :  -24201.399
Sigma-square ML     :       2.432                Akaike info criterion :   48422.799
S.E of regression ML:      1.5596                Schwarz criterion     :   48497.517

--------------------------------------------------------------------------

In [16]:
# saturday
m_log_sat = spreg.OLS(gdf[['log_sat_ons']].values, gdf[explanatory_vars_sat].values,
                  name_y = 'log_sat_ons', name_x = explanatory_vars_sat)
print(m_log_sat.summary)

REGRESSION
----------
SUMMARY OF OUTPUT: ORDINARY LEAST SQUARES
-----------------------------------------
Data set            :     unknown
Weights matrix      :        None
Dependent Variable  : log_sat_ons                Number of Observations:       12988
Mean dependent var  :      5.3570                Number of Variables   :          10
S.D. dependent var  :      2.2400                Degrees of Freedom    :       12978
R-squared           :      0.4012
Adjusted R-squared  :      0.4008
Sum squared residual:   39019.108                F-statistic           :    966.1157
Sigma-square        :       3.007                Prob(F-statistic)     :           0
S.E. of regression  :       1.734                Log likelihood        :  -25572.740
Sigma-square ML     :       3.004                Akaike info criterion :   51165.480
S.E of regression ML:      1.7333                Schwarz criterion     :   51240.198

-----------------------------------------------------------------------------

In [17]:
# sunday
m_log_sun = spreg.OLS(gdf[['log_sun_ons']].values, gdf[explanatory_vars_sun].values,
                  name_y = 'log_sun_ons', name_x = explanatory_vars_sun)
print(m_log_sun.summary)

REGRESSION
----------
SUMMARY OF OUTPUT: ORDINARY LEAST SQUARES
-----------------------------------------
Data set            :     unknown
Weights matrix      :        None
Dependent Variable  : log_sun_ons                Number of Observations:       12988
Mean dependent var  :      5.1115                Number of Variables   :          10
S.D. dependent var  :      2.2650                Degrees of Freedom    :       12978
R-squared           :      0.3925
Adjusted R-squared  :      0.3921
Sum squared residual:   40477.196                F-statistic           :    931.5544
Sigma-square        :       3.119                Prob(F-statistic)     :           0
S.E. of regression  :       1.766                Log likelihood        :  -25810.987
Sigma-square ML     :       3.117                Akaike info criterion :   51641.975
S.E of regression ML:      1.7654                Schwarz criterion     :   51716.692

-----------------------------------------------------------------------------

In [10]:
#exponentiate coefficients to interpret
import math

m_log_df = pd.DataFrame() 
m_log_df['Variable']=m_log.name_x
m_log_df['Coefficient']=m_log.betas
m_log_df.dtypes

Variable        object
Coefficient    float64
dtype: object

In [11]:
pd.set_option('display.float_format', lambda x: '%.5f' % x)

m_log_df['Coeff_Exp']=(np.exp(m_log_df['Coefficient'])-1)*100
m_log_df

Unnamed: 0,Variable,Coefficient,Coeff_Exp
0,CONSTANT,6.87215,96402.47526
1,n_trips_weekday,0.02009,2.02896
2,n_routes_weekday,-0.16106,-14.87585
3,pop_density,0.00012,0.01214
4,job_density,-2e-05,-0.00173
5,pct_not_us_citizen_pop,0.02242,2.26701
6,pct_youth_pop,-0.01527,-1.51513
7,pct_seniors_pop,-0.0506,-4.93388
8,pct_pop_workers_no_car,-0.04235,-4.14669
9,pct_poverty,0.01118,1.12389


## Spatial Weights

### distance (400m ~ 0.25 mi)

In [18]:
#Gravity distance weights
w_dist = weights.DistanceBand.from_dataframe(gdf,threshold=400,binary=False,alpha=-2.) #square root decay

In [19]:
wx = ((gdf >> select(_['pop_density','pct_not_us_citizen_pop','pct_poverty']
        # Compute the spatial lag of each of those variables
    ))
    .apply(
        lambda y: weights.spatial_lag.lag_spatial(w_dist, y)
        # Rename the spatial lag, adding w_ to the original name
    )
    .rename(
        columns=lambda c: "w_"
        + c
    )
) 

wx.head()

Unnamed: 0,w_pop_density,w_pct_not_us_citizen_pop,w_pct_poverty
0,0.235616,0.000685,0.0009
1,1.370388,0.011024,0.006134
2,0.151971,0.001656,0.001148
3,0.334783,0.002427,0.002501
5,0.062159,0.002681,0.001082


In [20]:
gdf_exog = gdf[explanatory_vars].join(wx)

In [38]:
# log model with more vars
m_log_slx = spreg.OLS(gdf[['log_weekday_ons']].values, gdf_exog.values, w=w_dist,
                  name_y = 'log_weekday_ons', name_x = gdf_exog.columns.tolist(), name_w="Distance Decay Weights") 
print(m_log_slx.summary)

REGRESSION
----------
SUMMARY OF OUTPUT: ORDINARY LEAST SQUARES
-----------------------------------------
Data set            :     unknown
Weights matrix      :Distance Decay Weights
Dependent Variable  :log_weekday_ons                Number of Observations:       12988
Mean dependent var  :      7.7242                Number of Variables   :          13
S.D. dependent var  :      2.0521                Degrees of Freedom    :       12975
R-squared           :      0.4273
Adjusted R-squared  :      0.4268
Sum squared residual:   31321.495                F-statistic           :    806.7811
Sigma-square        :       2.414                Prob(F-statistic)     :           0
S.E. of regression  :       1.554                Log likelihood        :  -24145.704
Sigma-square ML     :       2.412                Akaike info criterion :   48317.408
S.E of regression ML:      1.5529                Schwarz criterion     :   48414.541

----------------------------------------------------------------

In [21]:
gdf_exog_sat = gdf[explanatory_vars_sat].join(wx)

In [22]:
# saturday
m_log_slx_sat = spreg.OLS(gdf[['log_sat_ons']].values, gdf_exog_sat.values, w=w_dist,
                  name_y = 'log_sat_ons', name_x = gdf_exog_sat.columns.tolist(), name_w="Distance Decay Weights") 
print(m_log_slx_sat.summary)

REGRESSION
----------
SUMMARY OF OUTPUT: ORDINARY LEAST SQUARES
-----------------------------------------
Data set            :     unknown
Weights matrix      :Distance Decay Weights
Dependent Variable  : log_sat_ons                Number of Observations:       12988
Mean dependent var  :      5.3570                Number of Variables   :          13
S.D. dependent var  :      2.2400                Degrees of Freedom    :       12975
R-squared           :      0.4056
Adjusted R-squared  :      0.4051
Sum squared residual:   38730.716                F-statistic           :    737.8644
Sigma-square        :       2.985                Prob(F-statistic)     :           0
S.E. of regression  :       1.728                Log likelihood        :  -25524.564
Sigma-square ML     :       2.982                Akaike info criterion :   51075.129
S.E of regression ML:      1.7269                Schwarz criterion     :   51172.262

-------------------------------------------------------------------

In [23]:
gdf_exog_sun = gdf[explanatory_vars_sun].join(wx)

In [24]:
# sunday
m_log_slx_sun = spreg.OLS(gdf[['log_sun_ons']].values, gdf_exog_sun.values, w=w_dist,
                  name_y = 'log_sun_ons', name_x = gdf_exog_sun.columns.tolist(), name_w="Distance Decay Weights") 
print(m_log_slx_sun.summary)

REGRESSION
----------
SUMMARY OF OUTPUT: ORDINARY LEAST SQUARES
-----------------------------------------
Data set            :     unknown
Weights matrix      :Distance Decay Weights
Dependent Variable  : log_sun_ons                Number of Observations:       12988
Mean dependent var  :      5.1115                Number of Variables   :          13
S.D. dependent var  :      2.2650                Degrees of Freedom    :       12975
R-squared           :      0.3970
Adjusted R-squared  :      0.3964
Sum squared residual:   40176.816                F-statistic           :    711.8106
Sigma-square        :       3.096                Prob(F-statistic)     :           0
S.E. of regression  :       1.760                Log likelihood        :  -25762.616
Sigma-square ML     :       3.093                Akaike info criterion :   51551.231
S.E of regression ML:      1.7588                Schwarz criterion     :   51648.365

-------------------------------------------------------------------

In [39]:
m_log_slx_df = pd.DataFrame() 
m_log_slx_df['Variable']=m_log_slx.name_x
m_log_slx_df['Coefficient']=m_log_slx.betas
m_log_slx_df['Pval']=m_log_slx.t_stat

m_log_slx_df['Coeff_Exp']=(np.exp(m_log_slx_df['Coefficient'])-1)*100
m_log_slx_df

Unnamed: 0,Variable,Coefficient,Pval,Coeff_Exp
0,CONSTANT,6.93609,"(64.66940266488844, 0.0)",102774.43279
1,n_trips_weekday,0.02012,"(54.88504336122609, 0.0)",2.03237
2,n_routes_weekday,-0.16068,"(-6.630550150175914, 3.476806966768003e-11)",-14.84337
3,pop_density,0.00011,"(14.43673594257098, 7.011687766720369e-47)",0.01062
4,job_density,-2e-05,"(-7.532820319999475, 5.293904162975447e-14)",-0.00168
5,pct_not_us_citizen_pop,0.01723,"(6.090748749126994, 1.1556089523363187e-09)",1.73815
6,pct_youth_pop,-0.01574,"(-6.611021749994605, 3.966129306564722e-11)",-1.56149
7,pct_seniors_pop,-0.05199,"(-14.888064024544025, 1.0131649041856516e-49)",-5.06573
8,pct_pop_workers_no_car,-0.04791,"(-6.356503248015798, 2.1330782164781204e-10)",-4.67844
9,pct_poverty,0.01568,"(5.857381804416662, 4.816166131917372e-09)",1.58056


### K Nearest Neighbors

In [None]:
# How many neighbors in 0.25 miles?
w_dist.mean_neighbors

In [None]:
# 6 nearest neighbors
knn = weights.KNN.from_dataframe(gdf, k=6)

From context of https://geographicdata.science/book/notebooks/11_regression.html

In [None]:
# Re-weight W to 6 nearest neighbors
knn.reweight(k=6, inplace=True)
# Row standardise weights
knn.transform = "R"

In [None]:
wx = (((gdf >> select(_['n_trips_weekday', 'n_routes_weekday', 'pop_density', 'job_density','pct_not_us_citizen_pop',
                    'pct_youth_pop', 'pct_seniors_pop', 'pct_pop_workers_no_car', 'pct_poverty']
        # Compute the spatial lag of each of those variables
    ))
    .apply(
        lambda y: weights.spatial_lag.lag_spatial(knn, y)
        # Rename the spatial lag, adding w_ to the original name
    )
    .rename(
        columns=lambda c: "w_"
        + c
    )
) >> select(-_.w_n_trips_weekday))

wx.head()

In [None]:
gdf_exog = gdf[explanatory_vars].join(wx)

In [None]:
# log model with more vars
m_log_slx = spreg.OLS(gdf[['log_weekday_ons']].values, gdf_exog.values, w=knn,
                  name_y = 'log_weekday_ons', name_x = gdf_exog.columns.tolist(), name_w="KNN")
print(m_log_slx.summary)

Conclusion: use distance decay weights matrix, get rid of non-significant variables iteratively