In [1]:
import pandas as pd
import geopandas as gpd

from calitp.tables import tbl
from siuba import *
from IPython.display import Markdown

import create_coverage_data
import shared_utils
from utils import *



2 geographies:
* blocks (area, pop)
* tracts (employment)

Stop files:
* all stops
* accessible stops

RT files
* all stops, inner join with rt_complete
* accessible stops, inner join with rt_complete

1st spatial join:
1. blocks, all stops
1. blocks, accessible stops
1. blocks, all stops, rt
1. blocks, accessible stops, rt

2nd spatial join:
1. tracts, all stops
1. tracts, accessible stops
1. tracts, all stops, rt (but this is using the block file? polygon on polygon)
1. tracts, accessible stops, rt (also using block file?)

Address the polygon-on-polygon join either with a crosswalk between blocks and tracts (then use `pd.merge`), or by creating point geometry from the stops and RT files.

In [None]:
# Uncomment to regenerate the stored inputs.
# NOTE(review): presumably re-runs the source queries and saves them -- confirm before use.
#create_coverage_data.save_initial_data()

# Inputs saved from the queries. The geoparquet reads all go through the same helper.
fetch_geoparquet = shared_utils.utils.download_geoparquet

ca_block_joined = fetch_geoparquet(GCS_FILE_PATH, "block_population_joined")
rt_complete = pd.read_parquet(f"{GCS_FILE_PATH}rt_complete.parquet")
all_stops = fetch_geoparquet(GCS_FILE_PATH, "all_stops")
accessible_stops_trips = fetch_geoparquet(GCS_FILE_PATH, "accessible_stops_trips")

# Tract-level employment data
tract_pop_employ_filtered = create_coverage_data.get_employment_tract_data()
# Optional export of the tract table (left disabled):
#shared_utils.utils.geoparquet_gcs_export(tract_pop_employ_filtered,
#                                         GCS_FILE_PATH, 'tract_pop_employ_filtered')

In [None]:
# Run the block- and tract-level spatial joins and KEEP the results.
# Later cells read `sjoin_blocks` and `sjoin_tracts`, so discarding the return
# value makes the notebook fail with a NameError on a fresh kernel.
# NOTE(review): assumes the function returns a (blocks dict, tracts dict) pair,
# as the previously commented-out assignment indicated -- confirm.
sjoin_blocks, sjoin_tracts = create_coverage_data.spatial_joins_to_blocks_and_tracts()

Proportion of jobs in tracts < 4 sq km: jobs    0.605378
dtype: float64


In [None]:
# Report the shape of every spatial-join result: block-level first, then tract-level.
for joined in (sjoin_blocks, sjoin_tracts):
    for join_name, join_gdf in joined.items():
        print(join_name)
        print(join_gdf.shape)

In [None]:
# Re-read the locally stashed block-level parquets and report their shapes.
keys = [
    "all_stops",
    "accessible_stops",
    "all_stops_rt",
    "accessible_stops_rt",
]
for key in keys:
    block_gdf = gpd.read_parquet(f"./data/block_{key}.parquet")
    print(key, block_gdf.shape, sep="\n")

# Accessible Transit Metrics

### New Accessibility Metric (Area)

The % of non-water area of California that is within 1/2 mi of a bus stop or 1 mi of a ferry/rail stop that is explicitly wheelchair accessible (and, if in a station, that station has explicit pathways coding), where the stop is served by a publicly funded, open-to-the-general-public transit service with GTFS Schedule data, and that service is explicitly wheelchair accessible.

### New Accessibility Metric (Population)

The % of Californians who are within 1/2 mi of a bus stop or 1 mi of a ferry/rail stop that is explicitly wheelchair accessible (and, if in a station, that station has explicit pathways coding), where the stop is served by a publicly funded, open-to-the-general-public transit service with GTFS Schedule data, and that service is explicitly wheelchair accessible.
 
### Notes and Caveats:

* The vast majority of accessible services don't appear to provide accessibility data in GTFS
* Pathways appears to be empty in data warehouse for now...

### All Static

#### New General Metric (by area):

The % of non-water area of California that is within 1/2 mi of a bus stop or 1 mi of a ferry/rail stop that is served by a publicly funded, open-to-the-general-public transit service with GTFS Schedule data.


#### New General Metric (by population):

The % of Californians who live within 1/2 mi of a bus stop or 1 mi of a ferry/rail stop that is served by a publicly funded, open-to-the-general-public transit service with GTFS Schedule data.

## Block Level Metrics

In [None]:
by_metrics = ['block_pop', 'area']

for name, df in sjoin_blocks.items():
    # Turn the dict key into a section title, e.g. "block_all_stops_rt" -> "All Stops RT".
    header = name.replace('block_', '').replace('_', ' ').title()
    # RT joins get an upper-cased "RT"; everything else is labelled as static.
    header = header.replace('Rt', 'RT') if "Rt" in header else header + " Static"

    display(Markdown(f"### {header}"))

    # Each block counts once, however many stops or feeds reach it.
    for metric in by_metrics:
        unique_blocks = df.drop_duplicates(subset=['geo_id'])
        p = calculate_access_proportion(unique_blocks, ca_block_joined, metric)
        print(f"access by {metric}: {p}")

In [None]:
# Use dict to rename files to match what is already produced
# Stash intermediate parquets
# Map each block-level join key to the name of the file already produced upstream.
rename_files = dict(
    block_all_stops="block_level_static",
    block_accessible_stops="block_level_accessible",
    block_all_stops_rt="all_stops_rt",
    block_accessible_stops_rt="accessible_stops_trips_rt",
)

# Stash intermediate parquets locally. Local files are named after the join key;
# the mapped name is only printed here.
for key in sjoin_blocks:
    value = sjoin_blocks[key]
    new_name = rename_files[key]
    print(f"{key}: {new_name}")
    value.to_parquet(f"./data/{key}.parquet")
    # GCS export under the mapped name (left disabled):
    #shared_utils.utils.geoparquet_gcs_export(value, GCS_FILE_PATH, f"{new_name}")

In [None]:
# Resolve the mapped file name for every block-level join.
# Nothing is written or exported here; an unmapped key raises KeyError.
for join_name in sjoin_blocks:
    new_name = rename_files[join_name]

### RT Milestone (Partial)

In [None]:
# ITP IDs considered for the RT milestone, in their original order.
# 251 appears twice in this list.
milestone_ids = [
    13, 121, 323, 228, 325, 56, 93, 254, 251, 81,
    122, 76, 199, 312, 344, 372, 280, 327, 112, 17,
    118, 289, 120, 33, 356, 251, 351, 232, 6, 14,
    271, 296, 389, 256,
]

In [None]:
# Keep only the milestone ITP IDs that are NOT already present in the RT block join.
# The IDs present are collected once into a set, instead of recomputing
# `.unique()` and scanning the array for every candidate ID.
rt_itp_ids = set(sjoin_blocks["block_all_stops_rt"].calitp_itp_id.unique().tolist())
milestone_ids = [mid for mid in milestone_ids if mid not in rt_itp_ids]

### Export for Mapping

In [None]:
def get_status(df, new_col):
    """Return the distinct (geo_id, calitp_itp_id) pairs of ``df`` flagged with ``new_col``.

    Parameters
    ----------
    df : pandas.DataFrame or geopandas.GeoDataFrame
        Must contain ``geo_id`` and ``calitp_itp_id`` columns; other columns
        (including any geometry) are ignored.
    new_col : str
        Name of the boolean flag column to add; every returned row gets ``True``.

    Returns
    -------
    pandas.DataFrame
        Columns ``geo_id``, ``calitp_itp_id`` and ``new_col``, one row per
        distinct pair, keeping the index label of each pair's first occurrence.
        The input frame is not modified.
    """
    key_cols = ['geo_id', 'calitp_itp_id']

    # Select the two key columns BEFORE de-duplicating, so the remaining
    # columns are never copied just to be thrown away.
    pairs = df[key_cols].drop_duplicates(subset=key_cols)

    return pairs.assign(**{new_col: True})


# One status frame per block-level join; each carries its own True-valued flag column.
rt_acc_status = get_status(
    sjoin_blocks["block_accessible_stops_rt"], "rt_accessible"
)
rt_status = get_status(
    sjoin_blocks["block_all_stops_rt"], "rt"
)
acc_status = get_status(
    sjoin_blocks["block_accessible_stops"], "accessible"
)
static_status = get_status(
    sjoin_blocks["block_all_stops"], "static"
)

# Milestone flag: static blocks served by one of the remaining milestone ITP IDs.
milestone_blocks = (
    sjoin_blocks["block_all_stops"]
    >> filter(_.calitp_itp_id.isin(milestone_ids))
)
rt_milestones = get_status(milestone_blocks, "rt_milestone")

In [None]:
# Stack the five status frames, treat a missing flag as False, then collapse to
# one row per (geo_id, calitp_itp_id) where each flag is True if any source set it.
status_frames = [rt_acc_status, rt_status, acc_status, static_status, rt_milestones]

all_df = (
    pd.concat(status_frames)
    .fillna(False)
    .groupby(['geo_id', 'calitp_itp_id'])
    .any()
    .reset_index()
)

In [None]:
# Attach the per-(block, feed) status flags to the block table.
# geo_id is cast to int64 first so it matches the join key in all_df.
blocks_int_id = ca_block_joined.astype({'geo_id': 'int64'})
for_mapping = blocks_int_id >> inner_join(_, all_df, on=['geo_id'])

# Store the feed ID as int64, with 0 standing in for any missing value.
for_mapping['calitp_itp_id'] = (
    for_mapping['calitp_itp_id'].fillna(0).astype('int64')
)

In [None]:
# Preview the first three rows of the mapping table as a quick sanity check.
for_mapping.head(3)

In [None]:
# Export the mapping table under the name 'block_accessibility_metrics'.
# NOTE(review): presumably this uploads a geoparquet to GCS_FILE_PATH -- confirm against shared_utils.
shared_utils.utils.geoparquet_gcs_export(for_mapping, GCS_FILE_PATH, 'block_accessibility_metrics')

## Employment Metrics (Tract Level)

In [None]:
## all stops employment access, jobs
metric = "num_jobs"

for key, value in sjoin_tracts.items():
    print(key)
    
    p = calculate_access_proportion(value.drop_duplicates(
        subset=['Tract']), tract_pop_employ_filtered, metric)
    
    print(f"employment access by {metric}: {p}")

### Summarizing Coverage

In [None]:
def make_coverage_summary(sjoin_blocks, sjoin_tracts, rt_df, with_agency_names=False):
    """Summarize area, population and job coverage per feed (``calitp_itp_id``).

    Parameters
    ----------
    sjoin_blocks : dict
        Block-level join results. Reads ``"block_all_stops"`` (columns
        ``calitp_itp_id``, ``geo_id``, ``area``, ``block_pop``) and the
        ``calitp_itp_id`` column of ``"block_accessible_stops"``.
    sjoin_tracts : dict
        Tract-level join results. Reads ``"tract_all_stops"`` (columns
        ``calitp_itp_id``, ``Tract``, ``num_jobs``).
    rt_df : pandas.DataFrame
        Frame whose ``calitp_itp_id`` column lists the feeds that have RT data.
    with_agency_names : bool, default False
        When False (the previous behavior), return the per-feed summary only
        and do not query the warehouse. When True, drop ITP IDs
        0, 1, 2, 3, 6 and 8 (see issue 988) and inner-join agency names from
        ``tbl.gtfs_schedule.agency()``. A feed with several distinct agency
        names then appears once per name.

    Returns
    -------
    pandas.DataFrame
        Columns ``calitp_itp_id``, ``total_sq_mi``, ``total_pop``,
        ``total_jobs``, ``any_accessible_stops_trips`` and ``any_rt``, plus
        ``agency_name`` when ``with_agency_names`` is True. Only feeds present
        in both the block and the tract summaries are included (inner join).
    """
    ## since employment data is tract-level, only includes tracts < 4 sq km (~60% of jobs)
    employment_summary = (sjoin_tracts["tract_all_stops"]
                          >> group_by(_.calitp_itp_id)
                          >> distinct(_.Tract, _keep_all=True)
                          ## a tract must be counted once per feed...
                          >> summarize(total_jobs = _.num_jobs.sum())
    )

    # NOTE(review): assumes `area` is in square meters -- confirm against the block data.
    SQ_MI_PER_SQ_M = 3.86e-7

    coverage_summary = (sjoin_blocks["block_all_stops"]
                        >> group_by(_.calitp_itp_id)
                        >> distinct(_.geo_id, _keep_all=True) ##geo_id must be distinct per feed...
                        >> summarize(
                            total_sq_mi = (_.area.sum() * SQ_MI_PER_SQ_M).astype('int64'),
                            total_pop = _.block_pop.sum(),)
                       )

    # Merge block-level and tract-level summaries per feed
    df = (coverage_summary
          >> inner_join(_, employment_summary, on ='calitp_itp_id')
         )

    # Vectorized membership flags. The ID sets are computed once instead of
    # once per row, and this also behaves correctly on an empty summary.
    accessible_ids = sjoin_blocks["block_accessible_stops"].calitp_itp_id.unique()
    rt_ids = rt_df.calitp_itp_id.unique()
    df['any_accessible_stops_trips'] = df.calitp_itp_id.isin(accessible_ids)
    df['any_rt'] = df.calitp_itp_id.isin(rt_ids)

    if not with_agency_names:
        # Default: same result as before, without the discarded warehouse query.
        return df

    # see issue 988: these ITP IDs are excluded from the named summary
    excluded_itp_ids = [0, 1, 2, 3, 8, 6]
    named = df[~df.calitp_itp_id.isin(excluded_itp_ids)]

    agency_names = (tbl.gtfs_schedule.agency()
                    >> distinct(_.agency_name, _.calitp_itp_id)
                    >> collect()
                   )

    return named >> inner_join(_, agency_names, on = 'calitp_itp_id')

In [None]:
# Build the per-feed coverage table; rt_complete supplies the feed IDs behind the any_rt flag.
coverage_summary = make_coverage_summary(sjoin_blocks, sjoin_tracts, rt_complete)

In [None]:
# Write the draft per-feed metrics to the working directory.
# With to_csv defaults, the frame's index is written as the first, unnamed column.
coverage_summary.to_csv('draft_feed_metrics.csv')