In [1]:
import os
import pandas as pd
import geopandas as gpd

os.environ["CALITP_BQ_MAX_BYTES"] = str(100_000_000_000)

from calitp_data_analysis.tables import tbls
from siuba import *
from IPython.display import Markdown

from utils import *



In [2]:
# To regenerate the parquets read below from the warehouse queries, run:
#create_coverage_data.save_initial_data()

# Cached query outputs stored under GCS_FILE_PATH
ca_block_joined = gpd.read_parquet(f"{GCS_FILE_PATH}block_population_joined.parquet")
rt_complete = pd.read_parquet(f"{GCS_FILE_PATH}rt_complete.parquet")
all_stops = gpd.read_parquet(f"{GCS_FILE_PATH}all_stops.parquet")
accessible_stops_trips = gpd.read_parquet(f"{GCS_FILE_PATH}accessible_stops_trips.parquet")

# Tract-level employment data. To rebuild and re-export it, run:
#tract_pop_employ_filtered = create_coverage_data.get_employment_tract_data()
#shared_utils.utils.geoparquet_gcs_export(tract_pop_employ_filtered,
#                                         GCS_FILE_PATH, 'tract_pop_employ_filtered')
tract_pop_employ_filtered = gpd.read_parquet(f"{GCS_FILE_PATH}tract_pop_employ_filtered.parquet")

In [3]:
# This stashes intermediate parquets into GCS
# Read it again and put in dictionary
#create_coverage_data.spatial_joins_to_blocks_and_tracts()

In [4]:
# Notebook dataset key -> stem of the parquet file stashed under GCS_FILE_PATH.
# "block_all_stops" is commented out here; block_level_static is read on its
# own in a later cell.
rename_block_files = {
    #"block_all_stops": "block_level_static",
    "block_accessible_stops": "block_level_accessible",
    "block_all_stops_rt": "all_stops_rt",
    "block_accessible_stops_rt": "accessible_stops_trips_rt",
}

sjoin_blocks = {}

for dataset_key, parquet_stem in rename_block_files.items():
    print(dataset_key)
    parquet_path = f"{GCS_FILE_PATH}{parquet_stem}.parquet"
    sjoin_blocks[dataset_key] = gpd.read_parquet(parquet_path)

block_accessible_stops
block_all_stops_rt
block_accessible_stops_rt


# Accessible Transit Metrics

### New Accessibility Metric (Area)

The % of non-water area of California that is within 1/2 mi of a bus stop or 1 mi of a ferry/rail stop that is explicitly wheelchair accessible (and, if in a station, that station has explicit pathways coding), and that is served by a publicly funded transit service, open to the general public, with GTFS Schedule data, where that service is itself explicitly wheelchair accessible

### New Accessibility Metric (Population)

The % of Californians who are within 1/2 mi of a bus stop or 1 mi of a ferry/rail stop that is explicitly wheelchair accessible (and, if in a station, that station has explicit pathways coding), and that is served by a publicly funded transit service, open to the general public, with GTFS Schedule data, where that service is itself explicitly wheelchair accessible
 
### Notes and Caveats:

* The vast majority of accessible services don't appear to provide accessibility data in GTFS
* Pathways appears to be empty in data warehouse for now...

### All Static

#### New General Metric (by area):

The % of non-water area of California that is within 1/2 mi of a bus stop or 1 mi of a ferry/rail stop that is served by a publicly funded transit service, open to the general public, with GTFS Schedule data


#### New General Metric (by population):

The % of Californians who live within 1/2 mi of a bus stop or 1 mi of a ferry/rail stop that is served by a publicly funded transit service, open to the general public, with GTFS Schedule data

## Block Level Metrics

In [5]:
# Columns of ca_block_joined used as the weighting for the access proportion.
by_metrics = ['block_pop', 'area']

for name, df in sjoin_blocks.items():

    # Turn e.g. "block_all_stops_rt" into "All Stops RT"; datasets without an
    # RT suffix are labelled as static.
    header = name.replace('block_', '').replace('_', ' ').title()
    header = header.replace('Rt', 'RT') if "Rt" in header else header + " Static"

    display(Markdown(f"### {header}"))

    for metric in by_metrics:
        # One row per block, so a block reached by several stops/operators
        # is only passed to the helper once.
        unique_blocks = df.drop_duplicates(subset=['geo_id'])
        p = calculate_access_proportion(unique_blocks, ca_block_joined, metric)
        print(f"access by {metric}: {p}")

### Accessible Stops Static

access by block_pop: 10.02
access by area: 0.93


### All Stops RT

access by block_pop: 59.65
access by area: 5.33


### Accessible Stops RT

access by block_pop: 3.67
access by area: 0.42


In [6]:
# The all-stops static join is read separately from the sjoin_blocks datasets.
block_level_static = gpd.read_parquet(f"{GCS_FILE_PATH}block_level_static.parquet")

display(Markdown("### All Stops Static"))

for metric in by_metrics:
    # De-duplicate to one row per block before computing the proportion.
    unique_static_blocks = block_level_static.drop_duplicates(subset=['geo_id'])
    p = calculate_access_proportion(unique_static_blocks, ca_block_joined, metric)
    print(f"access by {metric}: {p}")

### All Stops Static

access by block_pop: 85.95
access by area: 11.58


### RT Milestone (Partial)

In [7]:
# calitp_itp_id values counted toward the "RT Milestone (Partial)" section;
# the next cell filters this list against operators already in the RT join.
# NOTE(review): 251 appears twice in this list — harmless for membership
# checks, but confirm it is not a typo for a different ID.
milestone_ids = [13, 121, 323, 228, 325, 56, 93, 254, 251, 81, 122, 76, 199,
 312, 344, 372, 280, 327, 112, 17, 118, 289, 120, 33, 356,
 251, 351, 232, 6, 14, 271, 296, 389, 256]

In [8]:
# Keep only the milestone operators that do NOT already appear in the
# all-stops RT block join. The set of operators with RT is built once up
# front rather than re-running .unique() for every candidate ID.
ids_already_rt = set(sjoin_blocks["block_all_stops_rt"].calitp_itp_id.unique())
milestone_ids = [mid for mid in milestone_ids if mid not in ids_already_rt]

### Export for Mapping

In [11]:
def get_status(df, new_col):
    """Return the distinct (geo_id, calitp_itp_id) pairs of ``df`` flagged True.

    Parameters
    ----------
    df : DataFrame (or GeoDataFrame)
        Must contain ``geo_id`` and ``calitp_itp_id`` columns; any other
        columns are ignored.
    new_col : str
        Name of the boolean status column to add.

    Returns
    -------
    DataFrame
        Columns ``geo_id``, ``calitp_itp_id`` and ``new_col`` (always True),
        one row per distinct pair, keeping the first occurrence and its
        original index label. ``df`` itself is not modified.
    """
    # Select the two key columns *before* de-duplicating and copying, so the
    # remaining columns (e.g. geometry) are never duplicated in memory.
    status = df[['geo_id', 'calitp_itp_id']].drop_duplicates().copy()

    status[new_col] = True

    return status


# One status frame per dataset: each marks the (block, operator) pairs that
# the dataset covers with a True flag in its own column.
rt_acc_status = get_status(
    sjoin_blocks["block_accessible_stops_rt"], "rt_accessible")
rt_status = get_status(
    sjoin_blocks["block_all_stops_rt"], "rt")
acc_status = get_status(
    sjoin_blocks["block_accessible_stops"], "accessible")
static_status = get_status(block_level_static, "static")

# Static coverage restricted to the remaining milestone operators.
milestone_blocks = block_level_static[
    block_level_static.calitp_itp_id.isin(milestone_ids)]
rt_milestones = get_status(milestone_blocks, "rt_milestone")

In [12]:
# Stack the status frames, then collapse to one row per (block, operator):
# a flag missing from a given frame becomes False, and .any() turns each flag
# True if any of the stacked rows set it.
status_frames = [rt_acc_status, rt_status, acc_status,
                 static_status, rt_milestones]

all_df = (
    pd.concat(status_frames)
    .fillna(False)
    .groupby(['geo_id', 'calitp_itp_id'])
    .any()
    .reset_index()
)

In [13]:
# Attach the status flags to the block geometries. geo_id is cast to int64
# before joining; the inner join keeps only blocks that have at least one
# status row in all_df.
for_mapping = (
    ca_block_joined.astype({'geo_id': 'int64'})
    >> inner_join(_, all_df, on=['geo_id'])
)

# Normalise the operator ID to a plain integer column (missing -> 0).
itp_id_as_int = for_mapping.calitp_itp_id.fillna(0).astype('int64')
for_mapping['calitp_itp_id'] = itp_id_as_int

In [14]:
# Preview the first rows of the joined frame before exporting it.
for_mapping.head(3)

Unnamed: 0,county,tract,block,geo_id,geometry,block_pop,area,calitp_itp_id,rt_accessible,rt,accessible,static,rt_milestone
0,65,42628,4008,60650426284008,"POLYGON ((257831.965 -463783.472, 258008.040 -...",103,15986.958897,269,False,True,False,True,False
1,65,42711,3025,60650427113025,"POLYGON ((259577.302 -476173.973, 259651.066 -...",42,29609.218158,269,False,True,False,True,False
2,65,43401,1023,60650434011023,"POLYGON ((281412.631 -469351.296, 281613.718 -...",21,19971.850439,13,False,False,False,True,True


In [15]:
# Export the mapping frame to GCS as 'block_accessibility_metrics'.
# NOTE(review): `shared_utils` is not imported in the import cell above —
# presumably it arrives via `from utils import *`; confirm, or import it
# explicitly so this cell survives Restart & Run All.
shared_utils.utils.geoparquet_gcs_export(for_mapping, GCS_FILE_PATH, 'block_accessibility_metrics')


This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.



## Employment Metrics (Tract Level)

In [16]:
# Stems of the tract-level spatial-join parquets stashed under GCS_FILE_PATH.
tract_files = [
    "tract_all_stops",
    "tract_all_stops_rt",
    "tract_accessible_stops",
    "tract_accessible_stops_rt",
]

sjoin_tracts = {}

for tract_file in tract_files:
    print(tract_file)
    tract_path = f"{GCS_FILE_PATH}{tract_file}.parquet"
    sjoin_tracts[tract_file] = gpd.read_parquet(tract_path)

tract_all_stops
tract_all_stops_rt
tract_accessible_stops
tract_accessible_stops_rt


In [17]:
## Employment access (jobs) for every tract-level dataset
metric = "num_jobs"

for tract_key, tract_sjoin in sjoin_tracts.items():
    print(tract_key)

    # One row per tract, so a tract is only passed to the helper once.
    unique_tracts = tract_sjoin.drop_duplicates(subset=['Tract'])
    p = calculate_access_proportion(
        unique_tracts, tract_pop_employ_filtered, metric)

    print(f"employment access by {metric}: {p}")

tract_all_stops
employment access by num_jobs: 99.32
tract_all_stops_rt
employment access by num_jobs: 82.63
tract_accessible_stops
employment access by num_jobs: 18.61
tract_accessible_stops_rt
employment access by num_jobs: 11.61


### Summarizing Coverage

In [None]:
def make_coverage_summary():
    """Build a per-feed coverage summary (area, population, jobs, data flags).

    Reads ``tract_all_stops``, ``block_level_static`` and
    ``block_level_accessible`` parquets from ``GCS_FILE_PATH`` and also uses
    the notebook-level ``rt_complete`` frame, which must already be loaded.

    Returns
    -------
    DataFrame
        One row per ``calitp_itp_id`` present in both the block-level and the
        tract-level summaries (inner join), with columns:

        * ``total_sq_mi`` - summed block ``area`` scaled by ``SQ_MI_PER_SQ_M``
          and cast to int64 (assumes ``area`` is in square meters - TODO
          confirm against the block data).
        * ``total_pop`` - summed ``block_pop`` over distinct blocks.
        * ``total_jobs`` - summed ``num_jobs`` over distinct tracts.
        * ``any_accessible_stops_trips`` - True if the feed appears in
          ``block_level_accessible``.
        * ``any_rt`` - True if the feed appears in ``rt_complete``.
    """
    SQ_MI_PER_SQ_M = 3.86e-7

    tract_df = gpd.read_parquet(
        f"{GCS_FILE_PATH}tract_all_stops.parquet")

    block_level_static = gpd.read_parquet(
        f"{GCS_FILE_PATH}block_level_static.parquet")

    block_level_accessible = gpd.read_parquet(
        f"{GCS_FILE_PATH}block_level_accessible.parquet")

    ## since employment data is tract-level, only includes tracts < 4 sq km (~60% of jobs)
    employment_summary = (tract_df
                          >> group_by(_.calitp_itp_id)
                          ## each tract must be counted once per feed
                          >> distinct(_.Tract, _keep_all=True)
                          >> summarize(total_jobs = _.num_jobs.sum())
    )

    coverage_summary = (block_level_static
                        >> group_by(_.calitp_itp_id)
                        ## each block (geo_id) must be counted once per feed
                        >> distinct(_.geo_id, _keep_all=True)
                        >> summarize(
                            total_sq_mi = (_.area.sum() * SQ_MI_PER_SQ_M).astype('int64'),
                            total_pop = _.block_pop.sum(),)
                       )

    # Merge block-level and tract-level summaries; feeds missing from either
    # side are dropped by the inner join.
    df = (coverage_summary
          >> inner_join(_, employment_summary, on ='calitp_itp_id')
         )

    # Vectorised membership flags. Series.isin replaces a row-wise
    # DataFrame.apply that recomputed .unique() for every row and does not
    # yield a usable column when df is empty.
    accessible_ids = block_level_accessible.calitp_itp_id.unique()
    rt_ids = rt_complete.calitp_itp_id.unique()

    df['any_accessible_stops_trips'] = df.calitp_itp_id.isin(accessible_ids)
    df['any_rt'] = df.calitp_itp_id.isin(rt_ids)

    # TODO (open question from the original author): should the returned
    # summary exclude calitp_itp_id values [0, 1, 2, 3, 8, 6] (see issue 988)
    # and/or be joined to agency names from gtfs_schedule.agency? A filtered
    # copy used to be computed here but was never returned, so it has been
    # removed; the unfiltered summary is still what callers receive.
    return df


coverage_summary = make_coverage_summary()

In [None]:
# Write the draft per-feed summary to a CSV at a relative path. With
# to_csv's default settings the DataFrame index is written as the first,
# unnamed column.
coverage_summary.to_csv('draft_feed_metrics.csv')