In [1]:
import os
import pandas as pd
import geopandas as gpd

os.environ["CALITP_BQ_MAX_BYTES"] = str(100_000_000_000)

from calitp.tables import tbl
from siuba import *

import create_coverage_data
import shared_utils
from utils import *



ModuleNotFoundError: No module named 'shared_utils'

2 geographies:
* blocks (area, pop)
* tracts (employment)

Stop files:
* all stops
* accessible stops

RT files
* all stops, inner join with rt_complete
* accessible stops, inner join with rt_complete

1st spatial join:
1. blocks, all stops
1. blocks, accessible stops
1. blocks, all stops, rt
1. blocks, accessible stops, rt

2nd spatial join:
1. tracts, all stops
1. tracts, accessible stops
1. tracts, all stops, rt (but this is using the block file? polygon on polygon)
1. tracts, accessible stops, rt (also using block file?)

Address the polygon-on-polygon join either with a crosswalk between blocks and tracts (then use `pd.merge`), or by creating point geometry for the stops / RT files.

In [None]:
#create_coverage_data.save_initial_data()

# This one requires rt_complete to be saved in GCS
#create_coverage_data.save_spatial_joined_data()

In [None]:
# Read in data from queries
# NOTE(review): GCS_FILE_PATH is not assigned in this notebook; presumably it
# comes from `from utils import *` -- confirm.
# Block-level geography (geoparquet)
ca_block_joined = shared_utils.utils.download_geoparquet(GCS_FILE_PATH, 'block_population_joined')
# RT table (plain parquet, no geometry); later joined on calitp_itp_id / calitp_url_number
rt_complete = pd.read_parquet(f"{GCS_FILE_PATH}rt_complete.parquet")
# Stop layers (geoparquet): every stop, and the accessible subset
all_stops = shared_utils.utils.download_geoparquet(GCS_FILE_PATH, 'all_stops')
accessible_stops_trips = shared_utils.utils.download_geoparquet(GCS_FILE_PATH, 'accessible_stops_trips')

# Read in employment data by tract
tract_pop_employ_filtered = create_coverage_data.get_employment_tract_data()

After reading in queries, loop through the dictionary items to create the block-stop spatial joins, and also add in rt data.

In [None]:
def spatial_join_to_stops(ca_block, stops_dfs, rt_df):
    """
    Spatially join the block geography to each set of stops, then attach
    RT availability to every joined result.

    ca_block: geopandas.GeoDataFrame
        base geography file, by blocks (must support `.sjoin`)
    stops_dfs: dict
        maps a label (e.g. "all_stops", "accessible_stops") to the stops
        frame to join against
    rt_df: pandas.DataFrame
        RT table, inner-joined on calitp_itp_id and calitp_url_number

    Returns a dict holding, for every label, the two entries
    "block_{label}" and "block_{label}_rt".
    """
    rt_join_cols = ['calitp_itp_id', 'calitp_url_number']
    results = {}

    for label, stops in stops_dfs.items():
        # GTFS schedule stops (all / accessible) intersecting each block
        block_stops = ca_block.sjoin(stops, how='inner', predicate='intersects')
        block_stops = block_stops.drop(columns='index_right')

        # Important at block level to avoid double counts
        if label == "accessible_stops":
            block_stops = block_stops.drop_duplicates(subset=['geo_id'])

        results[f"block_{label}"] = block_stops

        # Keep only the rows that also have a match in the RT table
        results[f"block_{label}_rt"] = (
            block_stops >> inner_join(_, rt_df, on=rt_join_cols)
        )

    return results
    

In [None]:
stops_dfs = {
    "all_stops": all_stops,
    "accessible_stops": accessible_stops_trips,
}

sjoin_blocks = spatial_join_to_stops(ca_block_joined, stops_dfs, rt_complete)

# The metric cells further down refer to the block-level joins by standalone
# names rather than through the dict. Bind those names here so the notebook
# survives Restart Kernel -> Run All.
# NOTE(review): the mapping of the two non-RT names to dict keys is inferred
# from the join list at the top of the notebook -- confirm.
block_level_static = sjoin_blocks["block_all_stops"]
block_level_accessible = sjoin_blocks["block_accessible_stops"]
all_stops_rt = sjoin_blocks["block_all_stops_rt"]
accessible_stops_trips_rt = sjoin_blocks["block_accessible_stops_rt"]

In [None]:
# List the keys of the block-level joins; uncomment display() to preview each frame
for key, value in sjoin_blocks.items():
    print(key)
    #display(value.head(2))

Do the second join, which is to join tract to stops.

For tract-all_stops and tract-accessible_stops, this works.

But `all_stops_rt` has polygon geometry in the original notebook, because it inherits the polygon geometry of the block. The resulting polygon-on-polygon join makes the notebook crash.

* Use a crosswalk and link block to tract (and if a block is linked to multiple tracts, keep the one with the largest area).
* Use `gpd.overlay` to get the overlay geometry and keep the one with largest area.
* With crosswalk, use `pd.merge` as intermediate file, and this should be faster.
* Alternatively, create the point geometry in `all_stops_rt` again and use that in a point-in-polygon sjoin.

In [None]:
def make_tract_block_crosswalk(block_df, tract_df):
    """
    Build a block-to-tract crosswalk with one row per block.

    block_df: geopandas.GeoDataFrame
        block geography; needs `geo_id` and `geometry`
    tract_df: geopandas.GeoDataFrame
        tract geography; needs `Tract` and `geometry`

    Each block is overlaid on the tracts and assigned to the tract with
    which it shares the largest intersection area.

    Returns a table with the columns `geo_id` and `Tract` (geometry dropped).
    """
    # Intersect blocks with tracts: one row per (block, tract) overlap piece
    overlaps = gpd.overlay(
        block_df[["geo_id", "geometry"]],
        tract_df[["Tract", "geometry"]],
        how = 'intersection',
    )

    # Area of each overlap piece, in the units of the layers' CRS
    overlaps = overlaps.assign(overlap_area = overlaps.geometry.area)

    # Sort the largest overlap first within each block, then keep that row
    crosswalk = (overlaps.sort_values(['geo_id', 'overlap_area'],
                                      ascending=[True, False])
                 .drop_duplicates(subset=['geo_id'])
                 .drop(columns = ['overlap_area', 'geometry'])
                 .reset_index(drop=True)
                )

    return crosswalk

In [None]:
# Assign every block to the tract it overlaps the most
crosswalk = make_tract_block_crosswalk(ca_block_joined, tract_pop_employ_filtered)

In [None]:
def employment_spatial_joins(tract_employ_df, stop_dfs, crosswalk_block_tract):
    """
    Attach stops to the tract-level employment geography.

    tract_employ_df: pandas.DataFrame
        base geography file, by tracts
    stop_dfs: dict
        maps a label to a stops table; labels containing "rt" are treated
        as block-level join results that carry `geo_id`
    crosswalk_block_tract: pandas.DataFrame
        links each block `geo_id` to its `Tract`

    Returns a dict keyed by "tract_{label}", with any "block_" removed
    from the label.
    """
    results = {}

    for label, stops in stop_dfs.items():
        if "rt" in label:
            # With RT data, the block geometry is included, so go through
            # the crosswalk instead of a spatial join.
            # Step 1: tract -> block geo_id
            tract_blocks = pd.merge(
                tract_employ_df, crosswalk_block_tract, on="Tract", how="inner"
            )
            # Step 2: block geo_id -> block-level attributes
            block_attrs = stops.drop(columns=["area", "geometry"])
            joined = pd.merge(tract_blocks, block_attrs, on='geo_id', how='inner')
            # No de-duplication on Tract here: several blocks per tract are
            # expected, and the crosswalk already gives each block one tract.
        else:
            joined = tract_employ_df.sjoin(
                stops, how='inner', predicate='intersects'
            ).drop(columns='index_right')

        results["tract_" + label.replace('block_', '')] = joined

    return results


In [None]:
# Inputs for the tract-level joins. The two "_rt" entries are the block-level
# join results from sjoin_blocks, so they are linked to tracts through the
# crosswalk rather than spatially.
stops_dfs2 = {
    "all_stops": all_stops,
    "accessible_stops": accessible_stops_trips,
    "all_stops_rt": sjoin_blocks["block_all_stops_rt"],
    "accessible_stops_rt": sjoin_blocks["block_accessible_stops_rt"],
}

sjoin_tracts = employment_spatial_joins(tract_pop_employ_filtered, stops_dfs2, crosswalk)

In [None]:
# List the keys of the tract-level joins; uncomment display() to preview each frame
for key, value in sjoin_tracts.items():
    print(key)
    #display(value.head(2))

# Accessible Transit Metrics

### New Accessibility Metric (Area)

The % of non-water area of California that is within 1/2 mi of a bus stop or 1 mi of a ferry/rail stop that is explicitly wheelchair accessible (and if in a station, that station has explicit pathways coding), and that is served by a publicly funded, open-to-the-general-public transit service with GTFS Schedule data, where that service is itself explicitly wheelchair accessible.

### New Accessibility Metric (Population)

The % of Californians who are within 1/2 mi of a bus stop or 1 mi of a ferry/rail stop that is explicitly wheelchair accessible (and if in a station, that station has explicit pathways coding), and that is served by a publicly funded, open-to-the-general-public transit service with GTFS Schedule data, where that service is itself explicitly wheelchair accessible.

### Notes and Caveats:

* The vast majority of accessible services don't appear to provide accessibility data in GTFS
* Pathways appears to be empty in data warehouse for now...

## Block Level Metrics

### Accessible Static

In [None]:
# Accessible static stops, by block population
# (calculate_access_proportion presumably comes from `from utils import *` -- confirm)
calculate_access_proportion(block_level_accessible, ca_block_joined, 'block_pop')

In [None]:
# simple_map(accessible_stops_trips, 'calitp_itp_id')

### All Static

#### New General Metric (by area):

The % of non-water area of California that is within 1/2 mi of a bus stop or 1 mi of a ferry/rail stop that is served by a publicly funded, open-to-the-general-public transit service with GTFS Schedule data


#### New General Metric (by population):

The % of Californians who live within 1/2 mi of a bus stop or 1 mi of a ferry/rail stop that is served by a publicly funded, open-to-the-general-public transit service with GTFS Schedule data


In [None]:
# Columns of the block file to measure coverage by
by_metrics = ['block_pop', 'area']

for metric in by_metrics:
    # Count each block once, however many stops fall inside it
    unique_blocks = block_level_static.drop_duplicates(subset=['geo_id'])
    proportion = calculate_access_proportion(unique_blocks, ca_block_joined, metric)
    print(f"access by {metric}: {proportion}")

### All RT

In [None]:
for metric in by_metrics:
    # Count each block once, however many RT stops fall inside it
    unique_blocks = all_stops_rt.drop_duplicates(subset=['geo_id'])
    proportion = calculate_access_proportion(unique_blocks, ca_block_joined, metric)
    print(f"access by {metric}: {proportion}")

### Accessible RT

In [None]:
for metric in by_metrics:
    # Count each block once, however many accessible RT stops fall inside it
    unique_blocks = accessible_stops_trips_rt.drop_duplicates(subset=['geo_id'])
    proportion = calculate_access_proportion(unique_blocks, ca_block_joined, metric)
    print(f"access by {metric}: {proportion}")

### RT Milestone (Partial)

In [None]:
# calitp_itp_id values counted towards the RT milestone.
# NOTE(review): 251 is listed twice; harmless for membership tests, but
# confirm that no other id was intended.
milestone_ids = [
    13, 121, 323, 228, 325, 56, 93, 254, 251, 81,
    122, 76, 199, 312, 344, 372, 280, 327, 112, 17,
    118, 289, 120, 33, 356, 251, 351, 232, 6, 14,
    271, 296, 389, 256,
]

In [None]:
# Keep only the milestone ids that do not already appear in the RT join.
# The RT ids are collected once into a set instead of calling .unique()
# again for every element of the list.
rt_itp_ids = set(all_stops_rt.calitp_itp_id.unique())
milestone_ids = [mid for mid in milestone_ids if mid not in rt_itp_ids]

### Export for Mapping

In [None]:
# Preview the first 3 rows before building the export
accessible_stops_trips_rt >> head(3)

In [None]:
def get_status(df, new_col):
    """
    Flag every distinct (geo_id, calitp_itp_id) pair present in `df`.

    df: pandas.DataFrame
        must contain the columns `geo_id` and `calitp_itp_id`
    new_col: str
        name of the flag column to add

    Returns a new frame with the columns geo_id, calitp_itp_id and
    `new_col`, one row per distinct pair, with `new_col` set to True on
    every row. `df` itself is not modified.
    """
    pair_cols = ['geo_id', 'calitp_itp_id']

    # Select and de-duplicate first, then add the flag with assign(), so the
    # flag is never written into a selection of the caller's frame.
    return df[pair_cols].drop_duplicates().assign(**{new_col: True})


# One flag frame per category; each holds the distinct (geo_id, calitp_itp_id)
# pairs of its source frame with a single True column.
rt_acc_status = get_status(accessible_stops_trips_rt, "rt_accessible")
rt_status = get_status(all_stops_rt, "rt")
acc_status = get_status(block_level_accessible, "accessible")
static_status = get_status(block_level_static, "static")
# Static blocks restricted to the ids left in milestone_ids
rt_milestones = get_status((block_level_static 
                            >> filter(_.calitp_itp_id.isin(milestone_ids))
                           ), "rt_milestone")

In [None]:
# Stack the flag frames; a flag column absent from a given frame is left
# missing on that frame's rows.
all_df = pd.concat([rt_acc_status, rt_status, acc_status, static_status, rt_milestones])

In [None]:
# Missing flags mean "not in that category". Note that fillna is applied to
# the whole frame, key columns included.
all_df = all_df.fillna(False)
# Collapse to one row per (geo_id, calitp_itp_id): a flag is True if any of
# the stacked rows had it set.
all_df = all_df.groupby(['geo_id', 'calitp_itp_id']).any().reset_index()

In [None]:
# Attach the status flags to the block geography for mapping.
# NOTE(review): geo_id is cast to int64 on the block side only; this assumes
# all_df.geo_id has a compatible dtype -- confirm.
for_mapping = (ca_block_joined.astype({'geo_id':'int64'})
 >> inner_join(_, all_df, on = ['geo_id'])
 # >> left_join(_, rt_status, on = ['geo_id', 'calitp_itp_id'])
 # >> left_join(_, acc_status, on = ['geo_id', 'calitp_itp_id'])
 # >> left_join(_, static_status, on = ['geo_id', 'calitp_itp_id'])
)
# Missing ids become 0 so the column can be stored as int64
for_mapping['calitp_itp_id'] = for_mapping.calitp_itp_id.fillna(0).astype('int64')
# for_mapping = for_mapping.fillna(False)

In [None]:
# Preview the export frame
for_mapping.head(3)

In [None]:
#shared_utils.utils.geoparquet_gcs_export(for_mapping, GCS_FILE_PATH, 'block_accessibility_metrics')

## Employment Metrics (Tract Level)

In [None]:
def employment_spatial_joins(tract_pop_employ_filtered, all_stops, accessible_stops_trips):
    """
    Spatially join the tract-level employment geography to four stop layers.

    NOTE(review): this redefines `employment_spatial_joins`, which is already
    defined earlier in this notebook with a different signature
    (tract_employ_df, stop_dfs, crosswalk_block_tract). Once this cell runs,
    the earlier definition is shadowed.

    tract_pop_employ_filtered: base geography, by tracts
    all_stops: all stops
    accessible_stops_trips: accessible stops

    The two RT layers (`accessible_stops_trips_rt`, `all_stops_rt`) are NOT
    parameters; they are read from the enclosing (notebook) scope.

    Returns a 4-tuple: (all_employment_joined, accessible_employment_joined,
    acc_rt_employ, all_rt_employ). Only the two RT results are de-duplicated
    on `Tract` here.
    """

    # Tracts x all stops; helper index columns removed after the join
    all_employment_joined = (tract_pop_employ_filtered
                    .sjoin(all_stops, how='inner', predicate='intersects')
                    # .drop_duplicates(subset=['Tract'])
                   ) >> select(-_.index_right, -_.index_left) 
    
    # Tracts x accessible stops
    accessible_employment_joined = (tract_pop_employ_filtered
                    .sjoin(accessible_stops_trips, how='inner', predicate='intersects')
                    # .drop_duplicates(subset=['Tract'])
                   ) >> select(-_.index_right, -_.index_left)
    
    
    # Tracts x accessible RT layer (read from notebook scope), one row per tract
    acc_rt_employ = (tract_pop_employ_filtered
                    .sjoin(accessible_stops_trips_rt >> select(-_.index_right, -_.index_left), 
                           how='inner', predicate='intersects')
                    .drop_duplicates(subset=['Tract'])
                   )
    
        
    # Tracts x all RT layer (read from notebook scope), one row per tract
    all_rt_employ = (tract_pop_employ_filtered
                    .sjoin(all_stops_rt >> select(-_.index_right, -_.index_left), 
                           how='inner', predicate='intersects')
                    .drop_duplicates(subset=['Tract'])
                   )
    
    return (all_employment_joined, accessible_employment_joined, 
            acc_rt_employ, all_rt_employ)

In [None]:
# Reuse the tract-level joins already held in `sjoin_tracts` instead of
# re-running the spatial joins. The previous call passed `tract_pop_employ`,
# which is never assigned in this notebook, and unpacked five names from a
# function that returns four values.
all_employment_joined = sjoin_tracts["tract_all_stops"]
accessible_employment_joined = sjoin_tracts["tract_accessible_stops"]

# The RT frames are passed to calculate_access_proportion below without any
# further de-duplication, so reduce them to one row per tract here.
acc_rt_employ = sjoin_tracts["tract_accessible_stops_rt"].drop_duplicates(subset=['Tract'])
all_rt_employ = sjoin_tracts["tract_all_stops_rt"].drop_duplicates(subset=['Tract'])

In [None]:
#shared_utils.utils.geoparquet_gcs_export(tract_pop_employ_filtered, GCS_FILE_PATH, 'tract_pop_employ_filtered')

In [None]:
## all stops employment access, jobs (each tract counted once)
calculate_access_proportion(all_employment_joined.drop_duplicates(subset=['Tract']), tract_pop_employ_filtered, 'num_jobs')

In [None]:
## accessible stops employment access, jobs (each tract counted once)
calculate_access_proportion(accessible_employment_joined.drop_duplicates(subset=['Tract']), tract_pop_employ_filtered, 'num_jobs')

In [None]:
## accessible with RT stops employment access, jobs
calculate_access_proportion(acc_rt_employ, tract_pop_employ_filtered, 'num_jobs')

In [None]:
## all RT stops employment access, jobs
calculate_access_proportion(all_rt_employ, tract_pop_employ_filtered, 'num_jobs')

### Summarizing Coverage

In [None]:
## since employment data is tract-level, only includes tracts < 4 sq km (~60% of jobs)
# Total jobs per feed, counting each tract once within a feed
employment_summary = (all_employment_joined
 >> group_by(_.calitp_itp_id)
 >> distinct(_.Tract, _keep_all=True) ##Tract must be distinct per feed...
 >> summarize(total_jobs = _.num_jobs.sum())
)

In [None]:
# Square miles per square metre
# NOTE(review): assumes the `area` column is in square metres -- confirm.
SQ_MI_PER_SQ_M = 3.86e-7

# Area and population covered per feed, counting each block once within a feed
coverage_summary = (block_level_static
 >> group_by(_.calitp_itp_id)
 >> distinct(_.geo_id, _keep_all=True) ##geo_id must be distinct per feed...
 # the int64 cast truncates the square-mile total to a whole number
 >> summarize(total_sq_mi = (_.area.sum() * SQ_MI_PER_SQ_M).astype('int64'),
             total_pop = _.block_pop.sum(),
             ))

In [None]:
def clean_up_summary(coverage_summary):
    """
    Combine per-feed coverage with employment totals, flag each feed,
    drop the excluded ids and attach agency names.

    coverage_summary: pandas.DataFrame
        one row per calitp_itp_id

    Also reads these notebook-level names: `employment_summary`,
    `block_level_accessible`, `rt_complete` and `tbl`.

    Returns the joined frame with the added boolean columns
    `any_accessible_stops_trips` and `any_rt`, plus `agency_name`.
    """
    # see issue 988
    excluded_itp_ids = [0, 1, 2, 3, 8, 6]

    df = (coverage_summary
          >> inner_join(_, employment_summary, on ='calitp_itp_id')
         )

    # Vectorized membership tests. Unlike a row-wise apply(axis=1), these
    # also work when the joined frame is empty.
    df['any_accessible_stops_trips'] = df.calitp_itp_id.isin(
        block_level_accessible.calitp_itp_id.unique())
    df['any_rt'] = df.calitp_itp_id.isin(rt_complete.calitp_itp_id.unique())

    df2 = df[~df.calitp_itp_id.isin(excluded_itp_ids)]

    # Distinct (agency_name, calitp_itp_id) pairs from the warehouse
    agency_names = (tbl.gtfs_schedule.agency()
                    >> distinct(_.agency_name, _.calitp_itp_id)
                    >> collect()
                   )

    df2 = df2 >> inner_join(_, agency_names, on = 'calitp_itp_id')

    return df2

In [None]:
# NOTE: this rebinds coverage_summary, so re-running this cell alone feeds
# the already-cleaned frame back into clean_up_summary.
coverage_summary = clean_up_summary(coverage_summary)

In [None]:
# Drop the excluded calitp_itp_ids.
# NOTE(review): clean_up_summary already applies this same exclusion list,
# so this filter looks redundant -- confirm before removing.
coverage_summary = coverage_summary >> filter(_.calitp_itp_id.apply(lambda x: x not in [0,1,2,3,8,6])) ## see issue 988

In [None]:
#coverage_summary.to_csv('draft_feed_metrics.csv')