In [None]:
#https://geographicdata.science/book/notebooks/11_regression.html

In [None]:
!pip install pysal contextily

In [None]:
import warnings

warnings.filterwarnings("ignore")

In [None]:
#https://pysal.org/libpysal/notebooks/weights.html
from libpysal.weights import Queen, Rook, KNN

from pysal.lib import weights
from pysal.explore import esda
from pysal.model import spreg

from shapely import wkb
from sklearn.cluster import KMeans

import matplotlib.pyplot as plt
import statsmodels.formula.api as sm

import contextily
import geopandas
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
# Read the analytical table from cloud storage and replace every missing
# value with 0 in one chained expression, then display the result.
gdf = geopandas.read_parquet(
    'gs://calitp-analytics-data/data-analyses/ahsc_grant/analytical_tbl.parquet'
).fillna(0)
gdf

In [None]:
# Summary statistics of the dependent variable (`weekday_ons`).
gdf['weekday_ons'].describe()

In [None]:
# Distribution of the dependent variable on its original scale.
sns.histplot(data=gdf['weekday_ons'])

In [None]:
# Distribution of the dependent variable on the natural-log scale.
# NOTE(review): rows where `weekday_ons` is 0 become -inf under np.log;
# confirm how the histogram treats them.
sns.histplot(data=np.log(gdf['weekday_ons']))

In [None]:
# List the column names available in `gdf`.
gdf.columns

In [None]:
# Rescale every column whose name contains 'pct' by a factor of 100 so that
# OLS coefficients read as "per percentage point".
# NOTE(review): this overwrites the columns in place, so running the cell a
# second time multiplies by 100 again.
pct_columns = [name for name in gdf.columns if 'pct' in name]
for name in pct_columns:
    gdf[name] = gdf[name].mul(100)
gdf.head()

In [None]:
# Model specification:
#   weekday_ons ~ n_trips_ampeak + n_trips_midday + n_trips_pmpeak
#                 + pop_density + job_density + pct_* variables
# Intuition: only use pct variables for the demographics, because we don't
# want to double count people in areas around multiple stops, whereas trip
# counts are a characteristic of the stop itself.
explanatory_vars = [
    # Service supplied at the stop
    'n_trips_ampeak',
    'n_trips_midday',
    'n_trips_pmpeak',
    # Densities around the stop
    'pop_density',
    'job_density',
    # Demographic shares
    'pct_not_us_citizen_pop',
    'pct_black_pop',
    'pct_inc_extremelylow',
    'pct_inc_verylow',
    'pct_inc_low',
    'pct_poverty',
    'pct_pop_workers_no_car',
]


In [None]:
# Model (1): non-spatial OLS of raw weekday boardings on the explanatory
# variables; the names are passed through so the summary is labelled.
m = spreg.OLS(
    y=gdf[['weekday_ons']].to_numpy(),
    x=gdf[explanatory_vars].to_numpy(),
    name_y='weekday_ons',
    name_x=explanatory_vars,
)
print(m.summary)

In [None]:
# Log-transform the dependent variable. np.log(0) evaluates to -inf, so those
# rows are set back to 0 afterwards.
# `-np.inf` is used instead of the `np.NINF` alias, which was removed in
# NumPy 2.0 and raises AttributeError there.
# NOTE(review): after this replacement 0 and 1 boardings both map to 0
# (log(1) == 0); confirm that is acceptable for the model.
gdf['log_weekday_ons'] = np.log(gdf['weekday_ons']).replace(-np.inf, 0)

In [None]:
# Frequency of each log-transformed value, with any -inf shown as 0.
# `-np.inf` replaces the `np.NINF` alias that NumPy 2.0 removed.
gdf['log_weekday_ons'].replace(-np.inf, 0).value_counts()

In [None]:
# Model (2): non-spatial OLS with the log-transformed dependent variable and
# the same explanatory variables as model (1).
m_log = spreg.OLS(
    y=gdf[['log_weekday_ons']].to_numpy(),
    x=gdf[explanatory_vars].to_numpy(),
    name_y='log_weekday_ons',
    name_x=explanatory_vars,
)
print(m_log.summary)

In [None]:
# Spatial weights: each observation is linked to its 3 nearest neighbours.
knn = weights.KNN.from_dataframe(df=gdf, k=3)

In [None]:
# Spatial lag of the model (1) residuals under the KNN weights, plotted
# against the residuals themselves to look for spatial structure.
# NOTE(review): the result depends on whether `knn` has already been
# row-standardised (`knn.transform = "R"`) when this cell runs.
lag_residual = weights.spatial_lag.lag_spatial(knn, m.u)
# seaborn >= 0.12 only accepts `x` and `y` as keyword arguments; the earlier
# positional call raises TypeError on current releases.
ax = sns.regplot(
    x=m.u.flatten(),
    y=lag_residual.flatten(),
    line_kws=dict(color="orangered"),
    ci=None,
)
ax.set_xlabel("Model Residuals - $u$")
ax.set_ylabel("Spatial Lag of Model (1) Residuals - $W u$");

In [None]:
# Spatial lag of the model (2) (log model) residuals under the KNN weights,
# plotted against the residuals themselves to look for spatial structure.
# NOTE(review): the result depends on whether `knn` has already been
# row-standardised (`knn.transform = "R"`) when this cell runs.
lag_residual = weights.spatial_lag.lag_spatial(knn, m_log.u)
# seaborn >= 0.12 only accepts `x` and `y` as keyword arguments; the earlier
# positional call raises TypeError on current releases.
ax = sns.regplot(
    x=m_log.u.flatten(),
    y=lag_residual.flatten(),
    line_kws=dict(color="orangered"),
    ci=None,
)
ax.set_xlabel("Model Residuals - $u$")
ax.set_ylabel("Spatial Lag of Model (2) Residuals - $W u$");

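The next cell is adapted from https://geographicdata.science/book/notebooks/11_regression.html: it runs a local Moran's I (LISA) analysis on the residuals of the log model to locate significant clusters of over- and under-prediction.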

In [None]:
# Rebuild W with 3 nearest neighbours and row-standardise it. Both changes are
# applied in place, so they also affect every later cell that uses `knn`.
knn.reweight(k=3, inplace=True)
knn.transform = "R"
# Run LISA (local Moran's I) on the log-model residuals. The fixed seed makes
# the permutation-based pseudo p-values reproducible from run to run.
outliers = esda.moran.Moran_Local(m_log.u, knn, permutations=9999, seed=12345)
# Select only LISA cluster cores: odd quadrant codes, i.e. 1 (high-high) and
# 3 (low-low)
error_clusters = outliers.q % 2 == 1
# Filter out non-significant clusters
error_clusters &= outliers.p_sim <= 0.001
ax = (
    # Add `error_clusters` and `local_I` columns
    gdf.assign(error_clusters=error_clusters, local_I=outliers.Is)
    # Retain error clusters only
    .query("error_clusters")
    # Sort by I value so the largest values are plotted on top
    .sort_values("local_I")
    # Plot I values
    .plot("local_I", cmap="bwr", marker=".")
)
# Add basemap
contextily.add_basemap(ax, crs=gdf.crs)
# Remove axes
ax.set_axis_off();

And then fit a spatial error model using the `GM_Error_Het` class in PySAL's `spreg`:

In [None]:
# Fit spatial error model with `spreg`
# (GMM estimation allowing for heteroskedasticity)
# Spatial error model (GMM estimation allowing for heteroskedasticity) of the
# log-transformed dependent variable, using the KNN (k=3) weights.
m_knn_err = spreg.GM_Error_Het(
    y=gdf[["log_weekday_ons"]].to_numpy(),
    x=gdf[explanatory_vars].to_numpy(),
    w=knn,
    name_y="log_weekday_ons",
    name_x=explanatory_vars,
)
print(m_knn_err.summary)

In [None]:
# Fit spatial lag model with `spreg`
# (GMM estimation)
# Spatial lag model (GMM estimation) of the log-transformed dependent
# variable, using the KNN (k=3) weights.
m_knn = spreg.GM_Lag(
    y=gdf[["log_weekday_ons"]].to_numpy(),
    x=gdf[explanatory_vars].to_numpy(),
    w=knn,
    name_y="log_weekday_ons",
    name_x=explanatory_vars,
)
print(m_knn.summary)

In [None]:
# Draw the geometries on an explicitly created axes and hide the axis frame.
fig, ax = plt.subplots()
gdf.plot(ax=ax)
ax.set_axis_off()

In [None]:
# Rook contiguity: two geometries are neighbours when they share an edge.
# TODO(review): contiguity is defined for polygons with borders. If `gdf`
# holds stop points rather than polygons, confirm these weights are
# meaningful.
w_rook = Rook.from_dataframe(df=gdf)
w_rook.n

In [None]:
# Share of non-zero entries in the Rook weights matrix (its density).
w_rook.pct_nonzero

In [None]:
# Fit spatial lag model with `spreg`
# (GMM estimation)
# Spatial lag model (GMM estimation) of the log-transformed dependent
# variable, using the Rook contiguity weights.
# NOTE(review): `w_rook` is used as built here, whereas `knn` was
# row-standardised before its models were fitted; confirm this is intended.
m_rook = spreg.GM_Lag(
    y=gdf[["log_weekday_ons"]].to_numpy(),
    x=gdf[explanatory_vars].to_numpy(),
    w=w_rook,
    name_y="log_weekday_ons",
    name_x=explanatory_vars,
)
print(m_rook.summary)

In [None]:
# Queen contiguity weights built from the geometries in `gdf`; display the
# number of observations in the resulting matrix.
w_queen = Queen.from_dataframe(df=gdf)
w_queen.n

In [None]:
# Spatial lag model (GMM estimation) of the log-transformed dependent
# variable, using the Queen contiguity weights.
# NOTE(review): `w_queen` is used as built here, whereas `knn` was
# row-standardised before its models were fitted; confirm this is intended.
m_queen = spreg.GM_Lag(
    y=gdf[["log_weekday_ons"]].to_numpy(),
    x=gdf[explanatory_vars].to_numpy(),
    w=w_queen,
    name_y="log_weekday_ons",
    name_x=explanatory_vars,
)
print(m_queen.summary)