In [None]:
import pandas as pd
import numpy as np
import geopandas as gpd
from calitp_data_analysis.gcs_geopandas import GCSGeoPandas
# GCS-backed GeoPandas helper; used for the `read_parquet` calls in the
# "Spatial Join" section.
gcsgp = GCSGeoPandas()

# High Quality Transit Areas export (dated 2024-09-18 in the path) on GCS.
HQTA_PATH = 'gs://calitp-analytics-data/data-analyses/high_quality_transit_areas/export/2024-09-18/ca_hq_transit_areas.parquet'

In [None]:
# GCS folder holding the processed census/jobs block file written and re-read below.
processed_census_path = 'gs://calitp-analytics-data/data-analyses/sb125/quick_mts_jobs/'

In [None]:
from calitp_data_analysis.utils import geoparquet_gcs_export

# Quick counts of people and jobs within ~0.5mi of a Major Transit Stop

Just wrapped this up; attached are two CSVs. One summarizes jobs and population within a half mile of the various types of Major Transit Stop per PRC 21155 and PRC 21064.3; the other additionally separates them out by transit agency.

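Used 2020 Census LEHD Workplace Area Characteristics to get all primary jobs by Census block (note: the LEHD file read below is named `ca_wac_S000_JT01_2021.csv.gz`, so confirm the data vintage); population by block is from the 2020 decennial Census. The spatial join was a simple intersection of each half-mile buffer around a major transit stop with all Census blocks; jobs and people were then summed across all of those blocks to get a single figure per major stop. Every intersecting block is counted in full rather than being apportioned by overlap area.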


## Read and clean Census data (already done; output is on GCS)

In [None]:
import gzip
# LEHD Workplace Area Characteristics (WAC) extract, read from a local gzip file.
# all workers, all primary jobs
with gzip.open('./ca_wac_S000_JT01_2021.csv.gz', 'rb') as f:
    df = pd.read_csv(f)

In [None]:
# Keep only the first two columns — presumably `w_geocode` and `C000`, the two
# WAC columns referenced further down; confirm with df.head().
df = df.iloc[:, :2]

In [None]:
import zipfile

In [None]:
# Decennial Census P1 table, read straight out of the downloaded zip archive.
with zipfile.ZipFile('./DECENNIALPL2020.P1_2024-10-16T105517.zip', 'r') as myzip:
    with myzip.open('DECENNIALPL2020.P1-Data.csv') as f:
        df2 = pd.read_csv(f)

In [None]:
# Drop the first data row and keep the first three columns. Presumably row 0 is
# a descriptive label row in the Census download — TODO confirm.
# Not idempotent: re-running this cell drops one more row each time.
df2 = df2.iloc[1:,:3]

In [None]:
# Strip the first 10 characters of each GEO_ID, leaving the trailing block
# identifier. Presumably this lines it up with `w_geocode` once both are
# integers — confirm against the head() outputs below.
df2.GEO_ID = df2.GEO_ID.map(lambda x: x[10:])

In [None]:
# Integer keys so the merge below compares like with like.
df2.GEO_ID = df2.GEO_ID.astype(int)

In [None]:
# Population count arrives as text (the dropped first row forced an object
# dtype, presumably); cast to int.
df2.P1_001N = df2.P1_001N.astype(int)

In [None]:
# Eyeball both tables and their dtypes before joining.
df2.head(3)

In [None]:
df2.dtypes

In [None]:
df.head(3)

In [None]:
df.dtypes

In [None]:
# Inner join (pandas' default) of population rows to job rows on the block id.
# this will result in table having both GEO_ID and w_geocode columns with redundant data. Could instead first rename one of the columns.
joined = df2.merge(df, left_on='GEO_ID', right_on='w_geocode')

In [None]:
# Display the merged table.
joined

In [None]:
# NOTE: an `hqta.head(3)` preview used to sit here, but `hqta` is only loaded in
# the "Spatial Join" section, so on a fresh kernel (Restart & Run All) it raised
# NameError. Preview `hqta` after it has been read from HQTA_PATH instead.

In [None]:
# Census block geometries from a local TIGER/Line-style zip
# (file name suggests 2020 tabulation blocks for state 06 — confirm).
blocks = gpd.read_file('./tl_2020_06_tabblock20.zip')

In [None]:
# Keep only the block id, population, housing-unit count and geometry.
blocks = blocks[['GEOID20', 'POP20', 'HOUSING20', 'geometry']]

In [None]:
# Integer block id, matching the integer GEO_ID built for the tabular data.
blocks.GEOID20 = blocks.GEOID20.astype(int)

In [None]:
# Same key name on both sides so the merge can use `on='GEO_ID'`.
blocks = blocks.rename(columns={'GEOID20': 'GEO_ID'})

In [None]:
# Attach population/jobs attributes to block geometries (inner join on GEO_ID).
# `blocks` is the left frame, so the result keeps its geometry column.
joined = blocks.merge(joined, on='GEO_ID')

In [None]:
# Display the merged geo table.
joined

In [None]:
# Standardize column names, then keep only the export columns.
# `DataFrame.rename` returns a new frame rather than modifying in place, so the
# result must be assigned back; without that the selection below raises
# KeyError because 'geoid', 'pop', 'jobs' and 'housing' do not exist yet.
joined = joined.rename(
    columns={'GEO_ID': 'geoid', 'POP20': 'pop', 'C000': 'jobs', 'HOUSING20': 'housing'}
)
joined = joined[['geoid', 'pop', 'jobs', 'housing', 'geometry']]

In [None]:
# Local copy of the cleaned block-level table.
joined.to_parquet('census_jobs_blocks.parquet')

In [None]:
# Upload the same table to the processed_census_path folder on GCS; this is the
# file the "Spatial Join" section reads back.
# NOTE(review): exact file naming/cleanup is up to geoparquet_gcs_export — see
# calitp_data_analysis.utils.
geoparquet_gcs_export(joined, processed_census_path, 'census_jobs_blocks.parquet')

## Spatial Join (start here)

In [None]:
# Re-load the processed block table from GCS. 'pop' is renamed to 'people'
# (a column called 'pop' is shadowed by DataFrame.pop under attribute access).
census = gcsgp.read_parquet(f'{processed_census_path}census_jobs_blocks.parquet').rename(columns={'pop': 'people'})

In [None]:
# High Quality Transit Area geometries.
hqta = gcsgp.read_parquet(HQTA_PATH)

In [None]:
# Keep agency, type and geometry only, and drop rows typed "hq_corridor_bus"
# so that the remaining rows are the other HQTA types (the major-stop types
# this analysis is about, presumably — confirm against hqta_type values).
hqta = hqta[['agency_primary', 'hqta_type', 'geometry']].query('hqta_type != "hq_corridor_bus"')

In [None]:
from calitp_data_analysis import geography_utils

In [None]:
# Put both layers in the same projected CRS before joining / measuring area.
hqta = hqta.to_crs(geography_utils.CA_NAD83Albers_m)

In [None]:
census = census.to_crs(geography_utils.CA_NAD83Albers_m)

In [None]:
# One output row per (census block, HQTA geometry) pair that intersects.
# `census` is the left frame, so each row carries the block's geometry and
# 'index_right' is the index of the matching hqta row (renamed to 'hqta_ix').
sjoined = census.sjoin(hqta, how="inner", predicate='intersects') # flip direction to calculate area
sjoined = sjoined.rename(columns={'index_right': 'hqta_ix'})

In [None]:
# hqta = hqta.reset_index().rename(columns={'index':'hqta_ix'}) #  might actually have use for an ix
# sjoined = hqta.sjoin(census, how="inner", predicate='intersects')

In [None]:
from shared_utils.rt_utils import METERS_PER_MILE
# Conversion factor from square metres to square miles.
sq_m_per_sq_mi = METERS_PER_MILE**2

In [None]:
# Display the factor as a sanity check.
sq_m_per_sq_mi

In [None]:
# Area of each joined row's geometry, converted to square miles with
# sq_m_per_sq_mi. `GeoSeries.area` is vectorized, so no per-row Python lambda
# is needed; it also yields NaN instead of raising on a missing geometry.
sjoined = sjoined.assign(area_sq_mi=sjoined['geometry'].area / sq_m_per_sq_mi)

In [None]:
# Keep a copy that still has geometry for the map-based spot-checks later on.
sj_geo = sjoined.copy()

In [None]:
# Only 'geometry' is dropped here: 'index_right' no longer exists because it
# was renamed to 'hqta_ix' right after the spatial join. (The earlier
# siuba-style `>> select` apparently did not raise when a column was missing —
# TODO confirm.)
sjoined = sjoined.drop(['geometry'], axis='columns', inplace=False)

In [None]:
sjoined.head(3)

In [None]:
# Sanity check: de-duplicate blocks that matched more than one HQTA geometry,
# then sum. Only the 'people' figure is meaningful; 'geoid' is numeric at this
# point, so it gets summed too.
sjoined.drop_duplicates(subset=['geoid', 'people'], inplace=False).filter(items=['geoid', 'people']).sum(numeric_only=True)

In [None]:
# Per-stop totals: one row per (hqta_ix, agency_primary, hqta_type).
# dropna=False keeps stops whose agency or type is missing.
# `n` is the group size, i.e. how many joined census-block rows sit behind each
# stop (the pandas equivalent of the `shape[0]` count the old TODO asked for).
stop_grouped = (
    sjoined
    .groupby(by=['hqta_ix', 'agency_primary', 'hqta_type'], dropna=False, group_keys=True)
    .agg(
        n=('geoid', 'size'),
        total_jobs=('jobs', 'sum'),
        total_pop=('people', 'sum'),
        total_housing=('housing', 'sum'),
        total_sq_mi=('area_sq_mi', 'sum'),
    )
    .reset_index()
)

In [None]:
# Spot-check: numeric column sums over the joined rows of one stop
# (hqta_ix 8241), to compare by eye with its row in stop_grouped.
sjoined[sjoined['hqta_ix'] == 8241].sum(numeric_only=True)

In [None]:
# Stops ranked by total population, largest first (display only).
stop_grouped.sort_values(by='total_pop', ascending=False, inplace=False)

In [None]:
# Distribution summary of the per-stop totals.
stop_grouped.describe()

In [None]:
def group_calculate(df, groups, normalize=False):
    """
    Summarize per-stop totals as percentiles within each group.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per stop, with `total_pop`, `total_jobs` and `total_housing`
        columns, every column named in `groups`, and `total_sq_mi` when
        `normalize` is True. The frame is never modified.
    groups : list of str
        Column names to group by.
    normalize : bool, default False
        If True, each total is divided by `total_sq_mi` before the percentiles
        are taken (i.e. densities), and every output column name gets a
        `_per_sq_mi` suffix.

    Returns
    -------
    pandas.DataFrame
        One row per group with the 5th/20th/50th/80th/95th percentile of
        population, jobs and housing (`p5_pop` ... `p95_housing`), rounded to
        one decimal place.
    """
    def percentile(n):
        def percentile_(x):
            return x.quantile(n)

        return percentile_

    measures = ('pop', 'jobs', 'housing')
    percents = (5, 20, 50, 80, 95)

    if normalize:
        # assign() builds a new frame, so the caller's `df` keeps its raw
        # totals. Overwriting the columns in place would make every later call
        # on the same frame work on already-normalized values.
        df = df.assign(
            **{f'total_{m}': df[f'total_{m}'] / df['total_sq_mi'] for m in measures}
        )

    # Output columns are ordered measure by measure: p5_pop ... p95_pop,
    # p5_jobs ... p95_jobs, p5_housing ... p95_housing.
    agg_spec = {
        f'p{pct}_{m}': (f'total_{m}', percentile(pct / 100))
        for m in measures
        for pct in percents
    }
    new_df = (
        df.groupby(by=groups, dropna=False, group_keys=True)
        .agg(**agg_spec)
        .reset_index()
    )

    if normalize:
        # The suffix goes on every column, grouping columns included
        # (e.g. `hqta_type_per_sq_mi`); kept as-is so existing outputs keep
        # their column names.
        new_df = new_df.rename(columns=lambda name: name + '_per_sq_mi')
    return new_df.round(1)

In [None]:
# Percentiles of raw per-stop totals, by stop type.
by_type = group_calculate(stop_grouped, ['hqta_type'], normalize=False)
by_type

In [None]:
# Same, but on densities (totals divided by total_sq_mi).
by_type_norm = group_calculate(stop_grouped, ['hqta_type'], normalize=True)
by_type_norm

In [None]:
# Raw totals again, now split by stop type and agency.
# NOTE(review): this relies on stop_grouped still holding raw totals after the
# normalize=True call above — confirm group_calculate leaves its input unchanged.
by_agency = group_calculate(stop_grouped, ['hqta_type', 'agency_primary'], normalize=False)
by_agency

In [None]:
# Densities by stop type and agency.
by_agency_norm = group_calculate(stop_grouped, ['hqta_type', 'agency_primary'], normalize=True)
by_agency_norm

In [None]:
# by_type.to_csv('pop_jobs_housing_by_major_stop_type.csv')
# by_type_norm.to_csv('pop_jobs_housing_by_major_stop_type_sq_mi.csv')
# by_agency.to_csv('pop_jobs_housing_by_major_stop_type_and_agency.csv')
# by_agency_norm.to_csv('pop_jobs_housing_by_major_stop_type_and_agency_sq_mi.csv')

## spot-checks

In [None]:
# Treat block ids as text so they can be searched by substring below.
sjoined.geoid = sjoined.geoid.astype(str)

In [None]:
sjoined

In [None]:
import numpy as np

In [None]:
# Id fragment to spot-check; presumably state + county + tract digits with the
# leading zero lost in the earlier int cast — TODO confirm.
test_tract = '6037271801'

In [None]:
# Rows whose geoid contains the fragment. `str.contains` matches anywhere in
# the string, not only at the start.
test = sjoined[sjoined['geoid'].astype(str).str.contains(test_tract)]

In [None]:
# One row per block (a block appears once per HQTA geometry it intersects).
test.drop_duplicates(subset=['geoid'], inplace=False)

In [None]:
# Totals over the distinct blocks in the test tract. numeric_only=True matches
# the other `.sum()` checks in this notebook and stops the text columns
# (geoid, agency_primary, hqta_type) from being concatenated into the output.
test.drop_duplicates(subset=['geoid'], inplace=False).sum(numeric_only=True)

In [None]:
# Hand-computed density (people / sq mi); the two numbers are presumably
# copied from the sums displayed above — TODO confirm.
3666 / 0.17768

In [None]:
# Per-row density on the copy that still has geometry: block population over
# the area_sq_mi computed for that row.
sj_geo['pop_sq_mi'] = sj_geo.people / sj_geo.area_sq_mi

In [None]:
sj_geo.head(3)

In [None]:
# Densest rows first (display only).
sj_geo.sort_values(by='pop_sq_mi', ascending=False, inplace=False)

In [None]:
# (sj_geo >> filter(_.hqta_ix == 38209)).explore()

In [None]:
# Numeric totals over every block joined to stop 38209.
(sj_geo.query('hqta_ix == 38209')).sum(numeric_only=True)

In [None]:
# Hand-computed density for that stop; numbers presumably taken from the
# output above — TODO confirm.
23721 / 0.896213

In [None]:
# Merge all block geometries per stop and sum their attributes.
# The string alias 'sum' replaces np.sum: pandas 2.1+ emits a FutureWarning
# when NumPy callables are passed to agg. The aggfunc stays wrapped in a list
# so the resulting column labels remain (name, func) pairs, which the
# rename step that follows relies on.
stop_dis = sj_geo.dissolve(by='hqta_ix', aggfunc=['sum'])

In [None]:
# Inspect the column labels produced by the dissolve.
stop_dis.columns

In [None]:
# Map every column label to its first element — for (name, func) pair labels
# that is the plain column name. 'geometry' is a plain string label, so
# `col[0]` would turn it into 'g'; it is reset to itself explicitly.
rename_dict = dict(zip(stop_dis.columns, [col[0] for col in stop_dis.columns]))
rename_dict['geometry'] = 'geometry'

In [None]:
stop_dis = stop_dis.rename(columns=rename_dict)

In [None]:
# Summed text columns are not meaningful; drop them.
stop_dis = stop_dis.drop(['agency_primary', 'hqta_type'], axis='columns', inplace=False)

In [None]:
# Recompute density from the summed totals. The column carried over from
# sj_geo is a sum of per-block densities, which is not a density.
# Bracket assignment always writes a column; attribute-style assignment only
# does so when the column already exists and otherwise silently sets a plain
# Python attribute.
stop_dis['pop_sq_mi'] = stop_dis.people / stop_dis.area_sq_mi

In [None]:
# 80th percentile of per-stop population density.
stop_dis.pop_sq_mi.quantile(.8)

In [None]:
# Histogram of per-stop population density.
stop_dis.pop_sq_mi.hist()

In [None]:
# (stop_dis >> filter(_.pop_sq_mi > 50000)).explore()

In [None]:
stop_dis.pop_sq_mi.max()

In [None]:
# (stop_dis >> filter(_.pop_sq_mi > 70000)).explore()

In [None]:
# All dissolved attributes for stop 8241.
stop_dis.loc[8241,:]

In [None]:
# Area of the dissolved (unioned) geometry for stops 8732 and 8733 in sq mi
# (label slices with .loc include both ends). Comparable against area_sq_mi,
# which is a sum of individual block areas.
stop_dis.loc[8732:8733,:].geometry.area / sq_m_per_sq_mi

In [None]:
# Densest stops first (display only).
stop_dis.sort_values(by='pop_sq_mi', ascending=False, inplace=False)

In [None]:
from shared_utils.rt_utils import show_full_df

In [None]:
# Density per stop from the grouped totals; adds a column to stop_grouped.
# NOTE(review): assumes total_pop and total_sq_mi still hold the raw grouped
# totals — confirm nothing upstream has overwritten them.
stop_grouped['pop_sq_mi'] = stop_grouped.total_pop / stop_grouped.total_sq_mi

In [None]:
stop_grouped.sort_values(by='pop_sq_mi', ascending=False, inplace=False)

In [None]:
# show_full_df(sjoined >> filter(_.hqta_ix == 8732))