In [1]:
import pandas as pd
import numpy as np
import geopandas as gpd
from calitp_data_analysis.gcs_geopandas import GCSGeoPandas
gcsgp = GCSGeoPandas()

HQTA_PATH = 'gs://calitp-analytics-data/data-analyses/high_quality_transit_areas/export/2024-09-18/ca_hq_transit_areas.parquet'

In [2]:
processed_census_path = 'gs://calitp-analytics-data/data-analyses/sb125/quick_mts_jobs/'

In [3]:
from calitp_data_analysis.utils import geoparquet_gcs_export

# Quick counts of people and jobs within ~0.5mi of a Major Transit Stop

Just wrapped this up, attached are two csvs. One summarizes jobs and population within a half mile of the various types of Major Transit Stop per PRC 21155 and PRC 21064.3, the other additionally separates them out by transit agency.

Used 2020 Census LEHD Workplace Area Characteristics to get all primary jobs by Census block, population by block is from the 2020 decennial Census. Spatial join was a simple intersection of each half-mile buffer around a major transit stop with all Census blocks, then jobs and people were summed across all those blocks to get a single figure per major stop.


## read clean census (already done, output on GCS)

In [None]:
import gzip
# all workers, all primary jobs
with gzip.open('./ca_wac_S000_JT01_2021.csv.gz', 'rb') as f:
    df = pd.read_csv(f)

In [None]:
df = df.iloc[:, :2]

In [None]:
import zipfile

In [None]:
with zipfile.ZipFile('./DECENNIALPL2020.P1_2024-10-16T105517.zip', 'r') as myzip:
    with myzip.open('DECENNIALPL2020.P1-Data.csv') as f:
        df2 = pd.read_csv(f)

In [None]:
df2 = df2.iloc[1:,:3]

In [None]:
df2.GEO_ID = df2.GEO_ID.map(lambda x: x[10:])

In [None]:
df2.GEO_ID = df2.GEO_ID.astype(int)

In [None]:
df2.P1_001N = df2.P1_001N.astype(int)

In [None]:
df2.head(3)

In [None]:
df2.dtypes

In [None]:
df.head(3)

In [None]:
df.dtypes

In [None]:
# this will result in table having both GEO_ID and w_geocode columns with redundant data. Could instead first rename one of the columns.
joined = df2.merge(df, left_on='GEO_ID', right_on='w_geocode')

In [89]:
joined

NameError: name 'joined' is not defined

In [None]:
hqta.head(3)

In [None]:
blocks = gpd.read_file('./tl_2020_06_tabblock20.zip')

In [None]:
blocks = blocks[['GEOID20', 'POP20', 'HOUSING20', 'geometry']]

In [111]:
blocks.GEOID20 = blocks.GEOID20.astype(int)

NameError: name 'blocks' is not defined

In [None]:
blocks = blocks.rename(columns={'GEOID20': 'GEO_ID'})

In [None]:
joined = blocks.merge(joined, on='GEO_ID')

In [None]:
joined

In [None]:
joined.rename(columns={'GEO_ID': 'geoid', 'POP20': 'pop', 'C000': 'jobs', 'HOUSING20': 'housing'})
joined = joined[['geoid', 'pop', 'jobs', 'housing', 'geometry']]

In [None]:
joined.to_parquet('census_jobs_blocks.parquet')

In [None]:
geoparquet_gcs_export(joined, processed_census_path, 'census_jobs_blocks.parquet')

## Spatial Join (start here)

In [67]:
census = gcsgp.read_parquet(f'{processed_census_path}census_jobs_blocks.parquet').rename(columns={'pop': 'people'})

In [68]:
hqta = gcsgp.read_parquet(HQTA_PATH)

In [69]:
hqta = hqta[['agency_primary', 'hqta_type', 'geometry']].query('hqta_type != "hq_corridor_bus"')

In [70]:
from calitp_data_analysis import geography_utils

In [71]:
hqta = hqta.to_crs(geography_utils.CA_NAD83Albers_m)

In [72]:
census = census.to_crs(geography_utils.CA_NAD83Albers_m)

In [73]:
sjoined = census.sjoin(hqta, how="inner", predicate='intersects') # flip direction to calculate area
sjoined = sjoined.rename(columns={'index_right': 'hqta_ix'})

In [74]:
# hqta = hqta.reset_index().rename(columns={'index':'hqta_ix'}) #  might actually have use for an ix
# sjoined = hqta.sjoin(census, how="inner", predicate='intersects')

In [75]:
from shared_utils.rt_utils import METERS_PER_MILE
sq_m_per_sq_mi = METERS_PER_MILE**2

In [76]:
sq_m_per_sq_mi

2589975.2356

In [77]:
sjoined = sjoined.assign(area_sq_mi = sjoined['geometry'].map(lambda x: x.area / sq_m_per_sq_mi))

In [78]:
sj_geo = sjoined.copy()

In [79]:
# column 'index_right' does not exist, the previous `>> select` did surface when
# columns didn't exist
sjoined = sjoined.drop(['geometry'], axis='columns', inplace=False)

In [80]:
sjoined.head(3)

Unnamed: 0,geoid,people,jobs,housing,hqta_ix,agency_primary,hqta_type,area_sq_mi
1,61130101032011,0,503,0,46899,Yolo County Transportation District,major_stop_bus,0.058746
3933,61130102011037,0,24,0,46899,Yolo County Transportation District,major_stop_bus,0.031665
4436,61130102042002,53,12,23,46899,Yolo County Transportation District,major_stop_bus,0.009574


In [81]:
sjoined.drop_duplicates(subset=['geoid', 'people'], inplace=False).filter(items=['geoid', 'people']).sum(numeric_only=True)

geoid     6217363199145780985
people               15073821
dtype: int64

In [82]:
# TODO figure out how to incorporate the n column based on shape[0]
stop_grouped = sjoined.groupby(
                    by=['hqta_ix', 'agency_primary', 'hqta_type'], dropna=False, group_keys=True
                ).agg(
                    total_jobs=('jobs', 'sum'),
                    total_pop=('people', 'sum'),
                    total_housing=('housing', 'sum'),
                    total_sq_mi=('area_sq_mi', 'sum')
                ).reset_index()

In [83]:
sjoined[sjoined['hqta_ix'] == 8241].sum(numeric_only=True)

geoid         1.221078e+16
people        7.122400e+04
jobs          2.787700e+04
housing       4.523000e+04
hqta_ix       1.656441e+06
area_sq_mi    9.501708e-01
dtype: float64

In [84]:
stop_grouped.sort_values(by='total_pop', ascending=False, inplace=False)

Unnamed: 0,hqta_ix,agency_primary,hqta_type,total_jobs,total_pop,total_housing,total_sq_mi
5672,8241,City and County of San Francisco,major_stop_bus,27877,71224,45230,0.950171
5907,8476,City and County of San Francisco,major_stop_bus,28269,70226,45748,0.917606
5673,8242,City and County of San Francisco,major_stop_bus,40235,68901,43811,0.934889
5929,8498,City and County of San Francisco,major_stop_bus,43298,68639,43643,0.938074
5927,8496,City and County of San Francisco,major_stop_bus,49917,68386,43665,0.949875
...,...,...,...,...,...,...,...
46387,48957,San Francisco International Airport,major_stop_rail,8801,0,0,2.908523
30791,33360,Los Angeles World Airports,major_stop_bus,32073,0,0,3.188587
30790,33359,Los Angeles World Airports,major_stop_bus,32073,0,0,3.188587
46388,48958,San Francisco International Airport,major_stop_rail,8801,0,0,2.908523


In [85]:
stop_grouped.describe()

Unnamed: 0,hqta_ix,total_jobs,total_pop,total_housing,total_sq_mi
count,46660.0,46660.0,46660.0,46660.0,46660.0
mean,25898.547342,16830.93174,12320.326704,5608.475439,1.20781
std,13469.804237,36277.232701,9483.377655,5853.560702,0.95201
min,2569.0,5.0,0.0,0.0,0.1556
25%,14233.75,2483.0,6258.75,2326.0,0.921196
50%,25898.5,5049.0,9988.0,3666.0,1.039169
75%,37563.25,11998.75,15475.0,6333.0,1.293047
max,49229.0,255743.0,71224.0,45748.0,153.077307


In [108]:
def group_calculate(df, groups, normalize=False):
    def percentile(n):
        def percentile_(x):
            return x.quantile(n)

        return percentile_


    if normalize:
        df.total_pop = df.total_pop / df.total_sq_mi
        df.total_jobs = df.total_jobs / df.total_sq_mi
        df.total_housing = df.total_housing / df.total_sq_mi

    new_df = df.groupby(by=groups, dropna=False, group_keys=True).agg(
                            p5_pop = ('total_pop', percentile(.05)),
                            p20_pop = ('total_pop', percentile(.2)),
                            p50_pop = ('total_pop', percentile(.5)),
                            p80_pop = ('total_pop', percentile(.8)),
                            p95_pop = ('total_pop', percentile(.95)),
                            p5_jobs = ('total_jobs', percentile(.05)),
                            p20_jobs = ('total_jobs', percentile(.2)),
                            p50_jobs = ('total_jobs', percentile(.5)),
                            p80_jobs = ('total_jobs', percentile(.8)),
                            p95_jobs = ('total_jobs', percentile(.95)),
                            p5_housing = ('total_housing', percentile(.05)),
                            p20_housing = ('total_housing', percentile(.2)),
                            p50_housing = ('total_housing', percentile(.5)),
                            p80_housing = ('total_housing', percentile(.8)),
                            p95_housing = ('total_housing', percentile(.95))
                        ).reset_index()
    if normalize:
        new_names = new_df.columns.map(lambda x: x+'_per_sq_mi')
        rename_dict = dict(zip(new_df.columns, new_names))
        new_df = new_df.rename(columns=rename_dict)
    return new_df.round(1)

In [109]:
by_type = group_calculate(stop_grouped, ['hqta_type'], normalize=False)
by_type

Unnamed: 0,hqta_type,p5_pop,p20_pop,p50_pop,p80_pop,p95_pop,p5_jobs,p20_jobs,p50_jobs,p80_jobs,p95_jobs,p5_housing,p20_housing,p50_housing,p80_housing,p95_housing
0,major_stop_brt,7201.0,14900.6,23434.5,35540.0,58089.4,1308.4,3961.4,12188.5,80472.4,204147.0,2334.8,4954.4,11311.0,21936.0,36266.9
1,major_stop_bus,2505.0,5479.0,9829.0,16966.0,27301.0,1028.0,2119.0,4878.0,14676.0,77209.0,953.0,2090.0,3606.0,7134.0,17160.0
2,major_stop_ferry,507.8,3172.0,6880.0,10316.2,11707.0,1033.4,1679.0,14020.0,123292.6,135427.7,210.4,1704.0,4143.0,6906.2,7761.0
3,major_stop_rail,2227.7,6191.0,11971.0,25451.6,56594.0,1377.7,3110.8,8994.0,45033.0,217600.0,855.3,2482.0,5220.0,15627.0,32754.0


In [110]:
by_type_norm = group_calculate(stop_grouped, ['hqta_type'], normalize=True)
by_type_norm

Unnamed: 0,hqta_type_per_sq_mi,p5_pop_per_sq_mi,p20_pop_per_sq_mi,p50_pop_per_sq_mi,p80_pop_per_sq_mi,p95_pop_per_sq_mi,p5_jobs_per_sq_mi,p20_jobs_per_sq_mi,p50_jobs_per_sq_mi,p80_jobs_per_sq_mi,p95_jobs_per_sq_mi,p5_housing_per_sq_mi,p20_housing_per_sq_mi,p50_housing_per_sq_mi,p80_housing_per_sq_mi,p95_housing_per_sq_mi
0,major_stop_brt,6640.1,15436.7,24445.2,37133.6,63053.3,1299.3,4303.8,11998.4,85633.4,218470.9,2413.7,5149.8,12596.6,22679.6,38742.2
1,major_stop_bus,1611.4,4462.0,8929.4,17290.6,29238.9,825.8,1826.6,4329.2,13040.5,84565.7,654.9,1696.9,3242.9,7230.9,18680.4
2,major_stop_ferry,421.7,2915.8,14225.9,22488.0,25607.5,669.3,2732.9,22290.7,274160.5,287714.1,194.5,1521.3,9045.7,15133.4,16976.2
3,major_stop_rail,1542.4,4661.7,12003.7,28246.4,60329.3,1160.2,2823.7,7614.2,50991.3,258973.0,524.7,1888.5,4844.0,17819.6,34478.3


In [111]:
by_agency = group_calculate(stop_grouped, ['hqta_type', 'agency_primary'], normalize=False)
by_agency

Unnamed: 0,hqta_type,agency_primary,p5_pop,p20_pop,p50_pop,p80_pop,p95_pop,p5_jobs,p20_jobs,p50_jobs,p80_jobs,p95_jobs,p5_housing,p20_housing,p50_housing,p80_housing,p95_housing
0,major_stop_brt,Alameda-Contra Costa Transit District,11384.2,13029.7,17101.5,18747.5,22483.9,1064.5,1313.9,4300.4,7434.9,72900.3,3414.5,4603.6,5160.4,8453.5,13862.3
1,major_stop_brt,City and County of San Francisco,16828.8,24270.7,30182.5,51135.6,68909.1,4340.7,8158.7,18413.6,98015.6,236992.6,8523.5,11815.6,17318.7,28692.2,41951.5
2,major_stop_brt,Los Angeles County Metropolitan Transportation...,3355.4,6342.8,13073.8,18115.8,23438.5,986.6,2049.0,5015.2,12672.1,125966.3,1295.5,2278.6,4777.3,6766.8,14945.4
3,major_stop_bus,Alameda-Contra Costa Transit District,3837.3,8035.7,12197.4,17433.2,23554.1,684.3,1377.9,3273.0,8774.5,60164.1,1517.4,3070.5,4750.7,7521.4,13572.8
4,major_stop_bus,Amador Regional Transit System,98.7,133.9,133.9,139.8,8752.9,141.0,142.2,142.2,146.2,72964.8,64.6,82.3,82.3,86.4,4404.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
157,major_stop_rail,San Francisco International Airport,0.0,0.0,200.9,1103.0,1540.1,2850.7,2873.6,3025.9,3954.0,4080.3,0.0,0.0,61.2,358.3,506.9
158,major_stop_rail,San Joaquin Regional Rail Commission,626.6,1957.2,4135.1,6603.9,12333.9,758.3,1390.4,4281.7,9627.7,15993.5,244.3,519.0,1642.1,2644.1,5345.9
159,major_stop_rail,Santa Clara Valley Transportation Authority,256.1,2416.4,7539.0,11574.3,15072.2,1202.0,3695.0,9838.7,24311.4,41230.6,73.0,1189.9,3157.9,5318.4,8695.6
160,major_stop_rail,Sonoma-Marin Area Rail Transit District,764.3,1846.8,3569.5,6167.1,9330.7,463.4,1074.2,2482.0,3833.8,5301.5,353.9,727.1,1579.7,2845.7,3614.0


In [112]:
by_agency_norm = group_calculate(stop_grouped, ['hqta_type', 'agency_primary'], normalize=True)
by_agency_norm

Unnamed: 0,hqta_type_per_sq_mi,agency_primary_per_sq_mi,p5_pop_per_sq_mi,p20_pop_per_sq_mi,p50_pop_per_sq_mi,p80_pop_per_sq_mi,p95_pop_per_sq_mi,p5_jobs_per_sq_mi,p20_jobs_per_sq_mi,p50_jobs_per_sq_mi,p80_jobs_per_sq_mi,p95_jobs_per_sq_mi,p5_housing_per_sq_mi,p20_housing_per_sq_mi,p50_housing_per_sq_mi,p80_housing_per_sq_mi,p95_housing_per_sq_mi
0,major_stop_brt,Alameda-Contra Costa Transit District,10542.6,13056.2,17563.1,20945.3,28295.8,1125.7,1372.9,4501.8,8073.8,91353.2,3002.5,4809.1,5428.3,9640.5,17202.4
1,major_stop_brt,City and County of San Francisco,16634.4,26624.2,33003.2,53751.4,74513.8,3913.7,9214.3,19937.6,106585.2,254559.6,7743.7,12936.4,18731.6,31047.9,46269.3
2,major_stop_brt,Los Angeles County Metropolitan Transportation...,3120.4,6085.7,14222.1,21294.7,30231.7,999.0,2104.1,4754.1,13556.5,133587.0,1204.8,2118.9,4809.0,6676.8,17633.7
3,major_stop_bus,Alameda-Contra Costa Transit District,2395.4,6745.5,12696.7,19940.5,27670.7,435.5,1257.5,3197.9,9683.5,75937.8,886.6,2593.1,5076.9,8692.7,16396.4
4,major_stop_bus,Amador Regional Transit System,14.1,18.8,18.8,19.4,10667.0,20.0,20.0,20.1,20.3,88949.1,9.2,11.6,11.6,12.0,5366.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
157,major_stop_rail,San Francisco International Airport,0.0,0.0,65.0,440.0,786.2,914.9,930.1,1040.4,1806.9,2065.3,0.0,0.0,19.8,147.2,258.8
158,major_stop_rail,San Joaquin Regional Rail Commission,430.4,949.7,2656.3,7853.0,13238.8,357.9,1133.4,2672.6,7719.8,18920.3,135.6,283.5,1058.5,3214.0,5925.5
159,major_stop_rail,Santa Clara Valley Transportation Authority,106.4,1463.6,6257.9,12088.5,19079.9,883.1,2522.2,7111.7,21609.1,53574.0,37.2,753.7,2385.7,5602.9,11049.7
160,major_stop_rail,Sonoma-Marin Area Rail Transit District,271.4,562.8,2318.2,4035.3,7803.2,120.7,368.8,1183.0,3221.8,3774.3,116.6,221.6,1020.7,1862.0,3019.5


In [None]:
# by_type.to_csv('pop_jobs_housing_by_major_stop_type.csv')
# by_type_norm.to_csv('pop_jobs_housing_by_major_stop_type_sq_mi.csv')
# by_agency.to_csv('pop_jobs_housing_by_major_stop_type_and_agency.csv')
# by_agency_norm.to_csv('pop_jobs_housing_by_major_stop_type_and_agency_sq_mi.csv')

## spot-checks

In [None]:
sjoined.geoid = sjoined.geoid.astype(str)

In [None]:
sjoined

In [None]:
import numpy as np

In [None]:
test_tract = '6037271801'

In [None]:
test = sjoined[sjoined['geoid'].astype(str).str.contains(test_tract)]

In [None]:
test.drop_duplicates(subset=['geoid'], inplace=False)

In [None]:
(test.drop_duplicates(subset=['geoid'], inplace=False)).sum()

In [None]:
3666 / 0.17768

In [None]:
sj_geo['pop_sq_mi'] = sj_geo.people / sj_geo.area_sq_mi

In [None]:
sj_geo.head(3)

In [None]:
sj_geo.sort_values(by='pop_sq_mi', ascending=False, inplace=False)

In [None]:
# (sj_geo >> filter(_.hqta_ix == 38209)).explore()

In [None]:
(sj_geo.query('hqta_ix == 38209')).sum(numeric_only=True)

In [None]:
23721 / 0.896213

In [None]:
stop_dis = sj_geo.dissolve(by='hqta_ix', aggfunc=[np.sum])

In [None]:
stop_dis.columns

In [None]:
rename_dict = dict(zip(stop_dis.columns, [col[0] for col in stop_dis.columns]))
rename_dict['geometry'] = 'geometry'

In [None]:
stop_dis = stop_dis.rename(columns=rename_dict)

In [None]:
stop_dis = stop_dis.drop(['agency_primary', 'hqta_type'], axis='columns', inplace=False)

In [None]:
stop_dis.pop_sq_mi = stop_dis.people / stop_dis.area_sq_mi

In [None]:
stop_dis.pop_sq_mi.quantile(.8)

In [None]:
stop_dis.pop_sq_mi.hist()

In [None]:
# (stop_dis >> filter(_.pop_sq_mi > 50000)).explore()

In [None]:
stop_dis.pop_sq_mi.max()

In [None]:
# (stop_dis >> filter(_.pop_sq_mi > 70000)).explore()

In [None]:
stop_dis.loc[8241,:]

In [None]:
stop_dis.loc[8732:8733,:].geometry.area / sq_m_per_sq_mi

In [None]:
stop_dis.sort_values(by='pop_sq_mi', ascending=False, inplace=False)

In [None]:
from shared_utils.rt_utils import show_full_df

In [None]:
stop_grouped['pop_sq_mi'] = stop_grouped.total_pop / stop_grouped.total_sq_mi

In [None]:
stop_grouped.sort_values(by='pop_sq_mi', ascending=False, inplace=False)

In [None]:
# show_full_df(sjoined >> filter(_.hqta_ix == 8732))