In [1]:
#https://pysal.org/libpysal/notebooks/weights.html
import os
os.environ["CALITP_BQ_MAX_BYTES"] = str(800_000_000_000) ## 800GB?

from libpysal.weights import Queen, Rook, KNN

from pysal.lib import weights
from pysal.explore import esda
from pysal.model import spreg

from shapely import wkb
from sklearn.cluster import KMeans

import matplotlib.pyplot as plt
import statsmodels.formula.api as sm

# import contextily
import geopandas as gpd
import numpy as np
import pandas as pd
import seaborn as sns

import branca
import folium
import shared_utils

from siuba import *

pd.set_option('display.max_columns', None) 

OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


ImportError: cannot import name 'tbl' from 'calitp.tables' (/opt/conda/lib/python3.10/site-packages/calitp/tables.py)

In [None]:
gdf = gpd.read_parquet('gs://calitp-analytics-data/data-analyses/ahsc_grant/analytical_tbl.parquet')
gdf = gdf.dropna()
gdf

In [None]:
gdf['weekday_ons'].describe()

In [None]:
sns.histplot(gdf['weekday_ons'])

In [None]:
sns.histplot(np.log(gdf['weekday_ons']))

In [None]:
sns.histplot(gdf['pop_density'])

In [None]:
sns.histplot(np.log(gdf['pop_density']))

In [None]:
sns.histplot(np.log(gdf['job_density']))

In [None]:
gdf.columns

In [None]:
# Multiply all percentage columns by 100 for easier OLS interpretation
for column in gdf.columns:
    if 'pct' in column:
        gdf[column] = gdf[column] * 100
gdf.head()

In [None]:
# add constant to ridership to avoid log(0) - small bias but empirically supported
gdf['log_weekday_ons'] = np.log(gdf['weekday_ons']+1)
#gdf['log_weekday_ons'] = gdf['log_weekday_ons'].replace(np.NINF, 0) #replace undefined values produced from np.log(0) w/ 0

# also log pop and job density
gdf['log_pop_density'] = np.log(gdf['pop_density']+1)
gdf['log_job_density'] = np.log(gdf['job_density']+1)

In [None]:
# Weekday_ons ~ n_trips_weekday + pop_density + job_density + pct: 
# Intuition is to only use use pct variables because we don’t want to double count people in areas around multiple stops. Whereas, trips are characteristic of the stop itself

explanatory_vars = ['n_trips_weekday','pop_density', 'job_density','pct_not_us_citizen_pop',
                    'pct_youth_pop', 'pct_seniors_pop', 'pct_pop_workers_no_car', 'pct_poverty']

explanatory_vars_short = ['n_trips_weekday','pop_density', 'pct_not_us_citizen_pop',
                    'pct_youth_pop', 'pct_seniors_pop', 'pct_pop_workers_no_car']


Comparing version with job_density and poverty to one without, to evaluate multicollinearity. Had race variables, but dropped them due to lack of correlation with ridership.

### OLS Untransformed model

In [None]:
# model with more explanatory variables
m = spreg.OLS(gdf[['weekday_ons']].values, gdf[explanatory_vars].values, 
                  name_y = 'weekday_ons', name_x = explanatory_vars)
print(m.summary)

In [None]:
# model with fewer explanatory variables
m2 = spreg.OLS(gdf[['weekday_ons']].values, gdf[explanatory_vars_short].values, 
                  name_y = 'weekday_ons', name_x = explanatory_vars_short)
print(m2.summary)

### OLS Log-Linear model

In [None]:
gdf['log_weekday_ons'].replace(np.NINF, 0).value_counts()

In [None]:
# log model with more vars
m_log = spreg.OLS(gdf[['log_weekday_ons']].values, gdf[explanatory_vars].values, 
                  name_y = 'log_weekday_ons', name_x = explanatory_vars)
print(m_log.summary)

In [None]:
# log model with fewer vars
m_log2 = spreg.OLS(gdf[['log_weekday_ons']].values, gdf[explanatory_vars_short].values, 
                  name_y = 'log_weekday_ons', name_x = explanatory_vars_short)
print(m_log2.summary)

In [None]:
#exponentiate coefficients to interpret
import math

m_log_df = pd.DataFrame() 
m_log_df['Variable']=m_log.name_x
m_log_df['Coefficient']=m_log.betas
m_log_df.dtypes

In [None]:
pd.set_option('display.float_format', lambda x: '%.5f' % x)

m_log_df['Coeff_Exp']=(np.exp(m_log_df['Coefficient'])-1)*100
m_log_df

In [None]:
# testing ons-per-capita - doesn't work
gdf = (gdf
       >> mutate(weekday_ons_per_capita = _.weekday_ons/_.sum_total_pop)
      )

In [None]:
m_log = spreg.OLS(gdf[['weekday_ons_per_capita']].values, gdf[explanatory_vars].values, 
                  name_y = 'weekday_ons_per_capita', name_x = explanatory_vars)
print(m_log.summary)

## Spatial Weights

In [None]:
# 3 nearest neighbors
knn = weights.KNN.from_dataframe(gdf, k=3)

In [None]:
# chart indicates that errors are spatially dependent
lag_residual = weights.spatial_lag.lag_spatial(knn, m.u)
ax = sns.regplot(
    m.u.flatten(),
    lag_residual.flatten(),
    line_kws=dict(color="orangered"),
    ci=None,
)
ax.set_xlabel("Model Residuals - $u$")
ax.set_ylabel("Spatial Lag of Model (1) Residuals - $W u$");

In [None]:
lag_residual = weights.spatial_lag.lag_spatial(knn, m_log.u)
ax = sns.regplot(
    m_log.u.flatten(),
    lag_residual.flatten(),
    line_kws=dict(color="orangered"),
    ci=None,
)
ax.set_xlabel("Model Residuals - $u$")
ax.set_ylabel("Spatial Lag of Log Model Residuals - $W u$");

From context of https://geographicdata.science/book/notebooks/11_regression.html

In [None]:
# Re-weight W to 3 nearest neighbors
knn.reweight(k=3, inplace=True)
# Row standardise weights
knn.transform = "R"

In [None]:
# Run LISA on residuals
outliers = esda.moran.Moran_Local(m_log.u, knn, permutations=9999)
# Select only LISA cluster cores
error_clusters = outliers.q % 2 == 1
# Filter out non-significant clusters
error_clusters &= outliers.p_sim <= 0.001
# Add `error_clusters` and `local_I` columns
ax = (
    gdf.assign(
        error_clusters=error_clusters,
        local_I=outliers.Is
        # Retain error clusters only
    )
    .query(
        "error_clusters"
        # Sort by I value to largest plot on top
    )
    .sort_values(
        "local_I"
        # Plot I values
    )
    .plot("local_I", cmap="bwr", marker=".")
)
# Add basemap
contextily.add_basemap(ax, crs=gdf.crs)
# Remove axes
ax.set_axis_off();

And then fit the model using the OLS class in Pysal's `spreg`:

In [None]:
# Fit spatial error model with `spreg`
# (GMM estimation allowing for heteroskedasticity)
m_knn_err = spreg.GM_Error_Het(
    # Dependent variable
    gdf[["log_weekday_ons"]].values,
    # Independent variables
    gdf[explanatory_vars].values,
    # Spatial weights matrix
    w=knn,#K=3
    # Dependent variable name
    name_y="log_weekday_ons",
    # Independent variables names
    name_x=explanatory_vars,
)
print(m_knn_err.summary)

In [36]:
# Fit spatial lag model with `spreg`
# (GMM estimation)
m_knn = spreg.GM_Lag(
    # Dependent variable
    gdf[["log_weekday_ons"]].values,
    # Independent variables
    gdf[explanatory_vars].values,
    # Spatial weights matrix
    w=knn,#K=3
    # Dependent variable name
    name_y="log_weekday_ons",
    # Independent variables names
    name_x=explanatory_vars,
)
print(m_knn.summary)

REGRESSION
----------
SUMMARY OF OUTPUT: SPATIAL TWO STAGE LEAST SQUARES
--------------------------------------------------
Data set            :     unknown
Weights matrix      :     unknown
Dependent Variable  :log_weekday_ons                Number of Observations:       13723
Mean dependent var  :      7.5420                Number of Variables   :          10
S.D. dependent var  :      2.2350                Degrees of Freedom    :       13713
Pseudo R-squared    :      0.5050
Spatial Pseudo R-squared:  0.4074

------------------------------------------------------------------------------------
            Variable     Coefficient       Std.Error     z-Statistic     Probability
------------------------------------------------------------------------------------
            CONSTANT       4.2757066       0.1964463      21.7652675       0.0000000
     n_trips_weekday       0.0154517       0.0004115      37.5534305       0.0000000
         pop_density       0.0000986       0.0000076    

In [37]:
#exponentiate coefficients to interpret
import math

m_knn_df = pd.DataFrame() 
m_knn_df['Variable']=m_knn.name_x
m_knn_df['Coefficient']=m_knn.betas
m_knn_df.dtypes

ValueError: Length of values (10) does not match length of index (9)

Stop here: below spatial weights exploration is for polygons
To do: fixed distance weights maybe with distance decay?

In [3]:
gdf.crs

NameError: name 'gdf' is not defined

In [None]:

w_dist = knn = weights.DistanceBand.from_dataframe(gdf, k=3)

In [None]:
ax = gdf.plot()
ax.set_axis_off()

In [None]:
#Rook neighbors are those states that share an edge on their respective borders:
#This shouldn't translate to stops since there aren't polygons/borders, but points?
w_rook = Rook.from_dataframe(gdf)
w_rook.n

In [None]:
w_rook.pct_nonzero

In [None]:
# Fit spatial lag model with `spreg`
# (GMM estimation)
m_rook = spreg.GM_Lag(
    # Dependent variable
    gdf[["log_weekday_ons"]].values,
    # Independent variables
    gdf[explanatory_vars].values,
    # Spatial weights matrix
    w=w_rook,
    # Dependent variable name
    name_y="log_weekday_ons",
    # Independent variables names
    name_x=explanatory_vars,
)
print(m_rook.summary)

In [None]:
w_queen = Queen.from_dataframe(gdf)
w_queen.n

In [None]:
m_queen = spreg.GM_Lag(
    # Dependent variable
    gdf[["log_weekday_ons"]].values,
    # Independent variables
    gdf[explanatory_vars].values,
    # Spatial weights matrix
    w=w_queen,
    # Dependent variable name
    name_y="log_weekday_ons",
    # Independent variables names
    name_x=explanatory_vars,
)
print(m_queen.summary)