In [1]:
import os
import pandas as pd
import geopandas as gpd

os.environ["CALITP_BQ_MAX_BYTES"] = str(100_000_000_000)

from calitp.tables import tbl
from siuba import *
from IPython.display import Markdown

import create_coverage_data
import shared_utils
from utils import *



2 geographies:
* blocks (area, pop)
* tracts (employment)

Stop files:
* all stops
* accessible stops

RT files
* all stops, inner join with rt_complete
* accessible stops, inner join with rt_complete

1st spatial join:
1. blocks, all stops
1. blocks, accessible stops
1. blocks, all stops, rt
1. blocks, accessible stops, rt

2nd spatial join:
1. tracts, all stops
1. tracts, accessible stops
1. tracts, all stops, rt (but this is using the block file? polygon on polygon)
1. tracts, accessible stops, rt (also using block file?)

Address the polygon-on-polygon join either with a crosswalk between blocks and tracts (then use `pd.merge`), or by creating point geometry from the stops / RT files.

In [None]:
#create_coverage_data.save_initial_data()

# This one requires rt_complete to be saved in GCS
#create_coverage_data.save_spatial_joined_data()

In [2]:
# Read in data from queries
ca_block_joined = shared_utils.utils.download_geoparquet(GCS_FILE_PATH, 'block_population_joined')
rt_complete = pd.read_parquet(f"{GCS_FILE_PATH}rt_complete.parquet")
all_stops = shared_utils.utils.download_geoparquet(GCS_FILE_PATH, 'all_stops')
accessible_stops_trips = shared_utils.utils.download_geoparquet(GCS_FILE_PATH, 'accessible_stops_trips')

# Read in employment data by tract
tract_pop_employ_filtered = create_coverage_data.get_employment_tract_data()

Proportion of jobs in tracts < 4 sq km: jobs    0.605378
dtype: float64


After reading in queries, loop through the dictionary items to create the block-stop spatial joins, and also add in rt data.

In [3]:
def spatial_join_to_stops(ca_block, stops_dfs, rt_df):
    """
    Spatially join census blocks to each set of stops, then attach RT data.

    ca_block: geopandas.GeoDataFrame
        base geography file, by blocks
    stops_dfs: dict
        maps a label (e.g. "all_stops", "accessible_stops") to the
        stops dataframe that is spatially joined to the blocks
    rt_df: pandas.DataFrame
        inner-joined onto every block-stop result on
        `calitp_itp_id` and `calitp_url_number`

    Returns a dict with two entries per label:
    `block_{label}` (blocks intersecting the stops) and
    `block_{label}_rt` (those rows that also match `rt_df`).
    """
    rt_join_cols = ['calitp_itp_id', 'calitp_url_number']

    # Collect every join result here, keyed by name
    results = {}

    for label, stops in stops_dfs.items():
        # Blocks that intersect this set of stops (GTFS schedule)
        block_stops = ca_block.sjoin(stops, how = 'inner', predicate='intersects')
        block_stops = block_stops.drop(columns = 'index_right')

        if label == "accessible_stops":
            # Important at block level to avoid double counts
            block_stops = block_stops.drop_duplicates(subset=['geo_id'])

        results[f"block_{label}"] = block_stops

        # Keep only the rows that have RT availability
        results[f"block_{label}_rt"] = (
            block_stops >> inner_join(_, rt_df, on = rt_join_cols)
        )

    return results
    

In [4]:
stops_dfs = {
    "all_stops": all_stops,
    "accessible_stops": accessible_stops_trips,
}

sjoin_blocks = spatial_join_to_stops(ca_block_joined, stops_dfs, rt_complete)

In [5]:
for key, value in sjoin_blocks.items():
    print(key)
    #display(value.head(2))

block_all_stops
block_all_stops_rt
block_accessible_stops
block_accessible_stops_rt


Do the second join, which is to join tract to stops.

For tract-all_stops and tract-accessible_stops, this works.

But `all_stops_rt` has polygon geometry in the original notebook, because it inherits the polygon geometry of the block. The resulting polygon-on-polygon spatial join makes the notebook crash.

* Use a crosswalk and link block to tract (and if a block is linked to multiple tracts, keep the one with the largest area).
* Use `gpd.overlay` to get the overlay geometry and keep the one with largest area.
* With crosswalk, use `pd.merge` as intermediate file, and this should be faster.
* Alternatively, recreate the point geometry in `all_stops_rt` and use that for a point-in-polygon sjoin.

In [6]:
def make_tract_block_crosswalk(block_df, tract_df):
    """
    Build a block-to-tract crosswalk with one row per block.

    Blocks are intersected with tracts using `gpd.overlay`; where a block
    overlaps several tracts, only the tract with the largest overlap
    area is kept.

    block_df: needs `geo_id` and `geometry` columns
    tract_df: needs `Tract` and `geometry` columns

    Returns the overlay result with the area and geometry columns
    dropped and the index reset.
    """
    blocks = block_df[["geo_id", "geometry"]].assign(
        block_area = block_df.geometry.area
    )
    tracts = tract_df[["Tract", "geometry"]]

    # Pieces of each block that fall inside each tract
    pieces = gpd.overlay(blocks, tracts, how = 'intersection')
    pieces = pieces.assign(overlap_area = pieces.geometry.area)

    # Largest overlap first within each block, so the first row wins
    ranked = pieces.sort_values(
        ['geo_id', 'overlap_area'], ascending=[True, False]
    )
    one_tract_per_block = ranked.drop_duplicates(subset=['geo_id'])

    return (one_tract_per_block
            .drop(columns = ['block_area', 'overlap_area', 'geometry'])
            .reset_index(drop=True)
           )

In [7]:
crosswalk = make_tract_block_crosswalk(ca_block_joined, tract_pop_employ_filtered)

In [8]:
crosswalk2 = gpd.sjoin(ca_block_joined[["geo_id", "geometry"]],
                      tract_pop_employ_filtered[["Tract", "geometry"]],
                      how = 'inner',
                      predicate = 'intersects'
         ).drop(columns = 'index_right')

In [9]:
crosswalk2.geo_id.value_counts()

60014034011001    8
60371871022021    8
60730099021000    7
60371235101000    6
60730091021000    6
                 ..
60379010102000    1
60379011023014    1
60379010102002    1
60379010101012    1
60411220001007    1
Name: geo_id, Length: 249626, dtype: int64

In [10]:
crosswalk3 = gpd.overlay(ca_block_joined[["geo_id", "geometry"]].assign(
                            block_area = ca_block_joined.geometry.area
                        ),
                      tract_pop_employ_filtered[["Tract", "geometry"]],
                      how = 'intersection',
         )

crosswalk3 = crosswalk3.assign(
    overlap_area = crosswalk3.geometry.area
)

crosswalk3[crosswalk3.geo_id==60014034011001].sort_values(['geo_id', 'overlap_area'],
                                                         ascending=[True, False])

Unnamed: 0,geo_id,block_area,Tract,geometry,overlap_area
282969,60014034011001,228523.252674,6001403400,"MULTIPOLYGON (((-198898.146 -20471.294, -19887...",210279.7
296731,60014034011001,228523.252674,6001403702,"MULTIPOLYGON (((-197805.431 -20699.558, -19773...",8262.552
185538,60014034011001,228523.252674,6001405301,"MULTIPOLYGON (((-197897.998 -21190.413, -19790...",7427.86
311817,60014034011001,228523.252674,6001405302,"MULTIPOLYGON (((-198234.588 -21652.560, -19823...",3328.422
10287,60014034011001,228523.252674,6001405200,"POLYGON ((-197816.686 -20985.943, -197742.129 ...",373.485
101809,60014034011001,228523.252674,6001403501,"POLYGON ((-198896.123 -20462.654, -198896.123 ...",4.756912e-10
319377,60014034011001,228523.252674,6001403701,"POLYGON ((-198896.123 -20462.654, -198896.123 ...",3.834927e-10
54985,60014034011001,228523.252674,6001402900,"POLYGON ((-198896.123 -20462.654, -198896.123 ...",9.628966e-11


Spot check of the crosswalk: yes, using the overlap area correctly grabs the tract that matches the block. Parsing the block ID and picking out the middle portion as the tract isn't perfect... some have just 3 characters.

In [11]:
def employment_spatial_joins(tract_employ_df, stop_dfs, crosswalk_block_tract):
    """
    Join tract-level employment data to each set of stops.

    tract_employ_df: pandas.DataFrame
        base geography file, by tracts
    stop_dfs: dict
        maps a label to a dataframe of stops. Labels containing "rt"
        are treated as block-level frames and merged through the
        crosswalk; all other labels are spatially joined to the tracts.
    crosswalk_block_tract: pandas.DataFrame
        links each block `geo_id` to a `Tract`

    Returns a dict keyed by `tract_{label}`, with any `block_` removed
    from the label.
    """
    # Collect every join result here, keyed by name
    results = {}

    for label, stops in stop_dfs.items():
        if "rt" in label:
            # With RT data, the block geometry is included, so go
            # through the crosswalk instead of a spatial join.

            # First, merge in the crosswalk to get each block's geo_id
            tract_blocks = pd.merge(
                tract_employ_df,
                crosswalk_block_tract,
                on = "Tract",
                how = "inner"
            )

            # Now, merge in the block level data with geo_id
            block_level = stops.drop(columns = ["area", "geometry"])
            joined = pd.merge(
                tract_blocks,
                block_level,
                on = 'geo_id',
                how = 'inner'
            )

            # No de-duplication on Tract here: multiple blocks are
            # expected per tract, and the crosswalk already ensures no
            # block is joined with multiple tracts.
        else:
            joined = tract_employ_df.sjoin(
                stops, how='inner', predicate='intersects'
            ).drop(columns = 'index_right')

        results["tract_" + label.replace('block_', '')] = joined

    return results


In [12]:
stops_dfs2 = {
    "all_stops": all_stops,
    "accessible_stops": accessible_stops_trips,
    "all_stops_rt": sjoin_blocks["block_all_stops_rt"],
    "accessible_stops_rt": sjoin_blocks["block_accessible_stops_rt"],
}

sjoin_tracts = employment_spatial_joins(tract_pop_employ_filtered, stops_dfs2, crosswalk)

In [13]:
for key, value in sjoin_tracts.items():
    print(key)
    #display(value.head(2))

tract_all_stops
tract_accessible_stops
tract_all_stops_rt
tract_accessible_stops_rt


# Accessible Transit Metrics

### New Accessibility Metric (Area)

The % of non-water area of California that is within 1/2 mi of a bus stop or 1 mi of a ferry/rail stop that is explicitly wheelchair accessible (and, if in a station, that station has explicit pathways coding), and that is served by a publicly funded, open-to-the-general-public transit service with GTFS Schedule data whose service is explicitly wheelchair accessible.

### New Accessibility Metric (Population)

The % of Californians that are within 1/2 mi of a bus stop or 1 mi of a ferry/rail stop that is explicitly wheelchair accessible (and, if in a station, that station has explicit pathways coding), and that is served by a publicly funded, open-to-the-general-public transit service with GTFS Schedule data whose service is explicitly wheelchair accessible.

### Notes and Caveats:

* The vast majority of accessible services don't appear to provide accessibility data in GTFS
* Pathways appears to be empty in data warehouse for now...

## Block Level Metrics

#### New General Metric (by area):

The % of non-water area of California that is within 1/2 mi of a bus stop or 1 mi of a ferry/rail stop that is served by a publicly funded, open-to-the-general-public transit service with GTFS Schedule data.


#### New General Metric (by population):

The % of Californians that live within 1/2 mi of a bus stop or 1 mi of a ferry/rail stop that is served by a publicly funded, open-to-the-general-public transit service with GTFS Schedule data.

In [14]:
by_metrics = ['block_pop', 'area'] 

for name, df in sjoin_blocks.items():
    
    # e.g. "block_all_stops_rt" -> "All Stops RT";
    # frames without RT get a " Static" suffix instead
    header = name.replace('block_', '').replace('_', ' ').title()
    if "Rt" in header:
        header = header.replace('Rt', 'RT')
    else:
        header = header + " Static"
    
    display(Markdown(f"### {header}"))
    
    # Count each block once. De-duplicate once per frame rather than
    # once per metric (assumes calculate_access_proportion does not
    # modify its input -- TODO confirm).
    unique_blocks = df.drop_duplicates(subset=['geo_id'])
    
    for metric in by_metrics:
        p = calculate_access_proportion(unique_blocks, ca_block_joined, metric)
        print(f"access by {metric}: {p}")

### All Stops Static

access by block_pop: 85.95
access by area: 11.58


### All Stops RT

access by block_pop: 59.65
access by area: 5.33


### Accessible Stops Static

access by block_pop: 10.02
access by area: 0.93


### Accessible Stops RT

access by block_pop: 3.67
access by area: 0.42


### RT Milestone (Partial)

In [15]:
milestone_ids = [13, 121, 323, 228, 325, 56, 93, 254, 251, 81, 122, 76, 199,
 312, 344, 372, 280, 327, 112, 17, 118, 289, 120, 33, 356,
 251, 351, 232, 6, 14, 271, 296, 389, 256]

In [16]:
# Feed ids that already appear in the RT join; computed once here
# instead of once per milestone id inside the comprehension
rt_itp_ids = set(sjoin_blocks["block_all_stops_rt"].calitp_itp_id.unique())

# Keep only the milestone feeds that are not in the RT join
milestone_ids = [mid for mid in milestone_ids if mid not in rt_itp_ids]

### Export for Mapping

In [17]:
def get_status(df, new_col):
    """
    Flag every distinct (geo_id, calitp_itp_id) pair found in `df`.

    Returns the unique `geo_id` / `calitp_itp_id` pairs with an extra
    `new_col` column set to True on every row.
    """
    unique_pairs = df.drop_duplicates(subset=['geo_id', 'calitp_itp_id']).copy()
    status = unique_pairs >> select(_.geo_id, _.calitp_itp_id)
    status[new_col] = True
    
    return status


rt_acc_status = get_status(sjoin_blocks["block_accessible_stops_rt"], "rt_accessible")
rt_status = get_status(sjoin_blocks["block_all_stops_rt"], "rt")
acc_status = get_status(sjoin_blocks["block_accessible_stops"], "accessible")
static_status = get_status(sjoin_blocks["block_all_stops"], "static")
rt_milestones = get_status((sjoin_blocks["block_all_stops"]
                            >> filter(_.calitp_itp_id.isin(milestone_ids))
                           ), "rt_milestone")

In [18]:
all_df = pd.concat([rt_acc_status, rt_status, acc_status, static_status, rt_milestones])

In [19]:
all_df = all_df.fillna(False)
all_df = all_df.groupby(['geo_id', 'calitp_itp_id']).any().reset_index()

In [20]:
# Attach the per-block, per-feed status flags to the block geometries.
# geo_id is cast to int64 before the join (presumably so it matches the
# dtype of all_df.geo_id -- TODO confirm). The inner join keeps only
# blocks that have at least one row in all_df.
for_mapping = (ca_block_joined.astype({'geo_id':'int64'})
 >> inner_join(_, all_df, on = ['geo_id'])
 # >> left_join(_, rt_status, on = ['geo_id', 'calitp_itp_id'])
 # >> left_join(_, acc_status, on = ['geo_id', 'calitp_itp_id'])
 # >> left_join(_, static_status, on = ['geo_id', 'calitp_itp_id'])
)
# Store the feed id as int64; any missing id is replaced with 0 first
for_mapping['calitp_itp_id'] = for_mapping.calitp_itp_id.fillna(0).astype('int64')
# for_mapping = for_mapping.fillna(False)

In [21]:
DATA_PATH = "./data/"
for_mapping.to_parquet(f"{DATA_PATH}block_accessibility_metrics.parquet")


This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.



In [22]:
# These checks pass - calculated access proportion the same at aggregate level (above)
# sjoins are also the same
def compare_block_accessibility(DATA_PATH):
    """
    Print the row count and per-flag True counts of the saved block metrics.

    DATA_PATH: str
        prefix placed directly in front of
        "block_accessibility_metrics.parquet" to build the file path
    """
    metrics = gpd.read_parquet(f"{DATA_PATH}block_accessibility_metrics.parquet")
    print(f"# rows: {len(metrics)}")

    for flag in ("rt_accessible", "rt", "accessible", "static", "rt_milestone"):
        flagged = metrics.loc[metrics[flag] == True]
        print(f"{flag} is True: {len(flagged)}")

In [23]:
compare_block_accessibility(GCS_FILE_PATH)

# rows: 854636
rt_accessible is True: 13913
rt is True: 390398
accessible is True: 35049
static is True: 854636
rt_milestone is True: 104476


In [24]:
compare_block_accessibility(DATA_PATH)

# rows: 854636
rt_accessible is True: 13913
rt is True: 390398
accessible is True: 35049
static is True: 854636
rt_milestone is True: 104476


In [25]:
for_mapping.head(3)

Unnamed: 0,county,tract,block,geo_id,geometry,block_pop,area,calitp_itp_id,rt_accessible,rt,accessible,static,rt_milestone
0,65,42628,4008,60650426284008,"POLYGON ((257831.965 -463783.472, 258008.040 -...",103,15986.958897,269,False,True,False,True,False
1,65,42711,3025,60650427113025,"POLYGON ((259577.302 -476173.973, 259651.066 -...",42,29609.218158,269,False,True,False,True,False
2,65,43401,1023,60650434011023,"POLYGON ((281412.631 -469351.296, 281613.718 -...",21,19971.850439,13,False,False,False,True,True


In [None]:
#shared_utils.utils.geoparquet_gcs_export(for_mapping, GCS_FILE_PATH, 'block_accessibility_metrics')

## Employment Metrics (Tract Level)

In [26]:
## all stops employment access, jobs
metric = "num_jobs"

for key, value in sjoin_tracts.items():
    print(key)
    
    p = calculate_access_proportion(value.drop_duplicates(
        subset=['Tract']), tract_pop_employ_filtered, metric)
    
    print(f"employment access by {metric}: {p}")

tract_all_stops
employment access by num_jobs: 99.32
tract_accessible_stops
employment access by num_jobs: 18.61
tract_all_stops_rt
employment access by num_jobs: 82.63
tract_accessible_stops_rt
employment access by num_jobs: 11.61


In [None]:
#shared_utils.utils.geoparquet_gcs_export(tract_pop_employ_filtered, GCS_FILE_PATH, 'tract_pop_employ_filtered')

### Summarizing Coverage

In [38]:
def make_coverage_summary(sjoin_blocks, sjoin_tracts, rt_df):
    """
    Summarize coverage per feed (`calitp_itp_id`): area, population, jobs.

    sjoin_blocks: dict
        must contain "block_all_stops" and "block_accessible_stops"
    sjoin_tracts: dict
        must contain "tract_all_stops"
    rt_df: pandas.DataFrame
        needs a `calitp_itp_id` column; used to fill `any_rt`

    Returns one row per `calitp_itp_id` present in both the block and
    the tract summaries, with columns `total_sq_mi`, `total_pop`,
    `total_jobs`, `any_accessible_stops_trips` and `any_rt`.
    """
    ## since employment data is tract-level, only includes tracts < 4 sq km (~60% of jobs)
    employment_summary = (sjoin_tracts["tract_all_stops"]
                          >> group_by(_.calitp_itp_id)
                          >> distinct(_.Tract, _keep_all=True) 
                          ## each Tract is counted once per feed...
                          >> summarize(total_jobs = _.num_jobs.sum())
    )


    # Conversion factor, square meters -> square miles
    # (assumes `area` is in square meters -- TODO confirm)
    SQ_MI_PER_SQ_M = 3.86e-7

    coverage_summary = (sjoin_blocks["block_all_stops"]
                        >> group_by(_.calitp_itp_id)
                        >> distinct(_.geo_id, _keep_all=True) ##geo_id must be distinct per feed...
                        >> summarize(total_sq_mi = (_.area.sum() * SQ_MI_PER_SQ_M).astype('int64'),
                                     total_pop = _.block_pop.sum(),)
                       )
    
    # Merge together; only feeds present in both summaries are kept
    df = (coverage_summary 
                    >> inner_join(_, employment_summary, on ='calitp_itp_id')
                   )
    
    
    # Vectorized membership tests: the unique ids are computed once
    # instead of once per row, and an empty `df` still yields a
    # well-formed (empty) boolean column.
    accessible_ids = sjoin_blocks["block_accessible_stops"].calitp_itp_id.unique()
    rt_ids = rt_df.calitp_itp_id.unique()
    df['any_accessible_stops_trips'] = df.calitp_itp_id.isin(accessible_ids)
    df['any_rt'] = df.calitp_itp_id.isin(rt_ids)
    
    # NOTE(review): df2 is computed (including a query against
    # tbl.gtfs_schedule.agency()) but is not returned -- see the open
    # question below before removing or returning it.
    df2 = (df 
           >> filter(_.calitp_itp_id.apply(lambda x: x not in [0,1,2,3,8,6])) 
           ## see issue 988
          )
    df2 = df2 >> inner_join(_, tbl.gtfs_schedule.agency() 
                             >> distinct(_.agency_name, _.calitp_itp_id)
                             >> collect(), 
                             on = 'calitp_itp_id')
    
    # Should this return df or df2?
    # Should this be coverage_summary that filters out certain ITP_IDs or with_name?
    # returning df gives more similar observations to Eric's
    # differences come because the spatial join for accessible_stop_trips is slightly different with the crosswalk
    return df

In [39]:
coverage_summary = make_coverage_summary(sjoin_blocks, sjoin_tracts, rt_complete)

In [30]:
coverage_summary.to_parquet(f"{DATA_PATH}draft_feed_metrics.parquet")

In [None]:
# with_name = with_name >> filter(_.calitp_itp_id.apply(lamb) != 1 0, _.calitp_itp_id != 8,
#                                 _.calitp_itp_id != 2, _.calitp_itp_id != 3) ## see issue 988

#coverage_summary = coverage_summary >> filter(_.calitp_itp_id.apply(lambda x: x not in [0,1,2,3,8,6])) ## see issue 988

In [None]:
#coverage_summary.to_csv('draft_feed_metrics.csv')

In [31]:
eric = pd.read_csv("./draft_feed_metrics.csv")

In [42]:
def compare_coverage_summary(df):
    """
    Print the row count of `df` and how many rows are True for each of
    the `any_accessible_stops_trips` and `any_rt` columns.
    """
    print(f"# rows: {len(df)}")

    for flag in ("any_accessible_stops_trips", "any_rt"):
        flagged = df.loc[df[flag] == True]
        print(f"{flag}: {len(flagged)}")


In [43]:
compare_coverage_summary(eric)

# rows: 183
any_accessible_stops_trips: 13
any_rt: 61


In [44]:
compare_coverage_summary(coverage_summary)

# rows: 190
any_accessible_stops_trips: 14
any_rt: 62
