In [1]:
import os
os.environ["CALITP_BQ_MAX_BYTES"] = str(1_000_000_000_000) ## 1TB?
import sys

from siuba import *
import pandas as pd
import geopandas as gpd
import gcsfs
import datetime as dt
import time
import shapely

from rt_analysis import rt_parser
from rt_analysis import rt_filter_map_plot

import shared_utils
from calitp_data_analysis.tables import tbls

import pyaml
import yaml

from tqdm.notebook import tqdm



In [2]:
# Service date used for every step below (chosen for v2 testing).
analysis_date = dt.date(2023, 3, 15)

In [3]:
# tbls.mart_transit_database.dim_provider_gtfs_data() >> head(3)

In [29]:
def build_speedmap_index(analysis_date: dt.date):
    '''
    Build the index of organizations to process for a given service date.

    Selects dim_provider_gtfs_data rows valid on analysis_date that are
    flagged reports_site_assessed and have a vehicle positions dataset key,
    joins each to its Caltrans district from dim_organizations, and
    collects the result.

    analysis_date: date the index is built for
    returns: collected table (not lazy) with one row per distinct
        combination of organization name, itp_id, source record id,
        district and _is_current, plus an analysis_date column
    raises: AssertionError if any collected field is null
    '''
    # validity windows are compared against midnight of the analysis date
    as_of = dt.datetime.combine(analysis_date, dt.time(0, 0))

    district_lookup = (
        tbls.mart_transit_database.dim_organizations()
        >> filter(_._valid_from <= as_of, _._valid_to > as_of)
        >> select(_.source_record_id, _.caltrans_district)
    )

    # `!= None` is deliberate: the comparison is built as a siuba expression,
    # so it must go through the overloadable operator rather than `is not`
    vp_providers = (
        tbls.mart_transit_database.dim_provider_gtfs_data()
        >> filter(
            _._valid_from <= as_of,
            _._valid_to > as_of,
            _.reports_site_assessed,
            _.vehicle_positions_gtfs_dataset_key != None,
        )
    )

    index_df = (
        vp_providers
        >> inner_join(
            _, district_lookup,
            on = {'organization_source_record_id': 'source_record_id'},
        )
        >> select(
            _.organization_itp_id,
            _.organization_name,
            _.organization_source_record_id,
            _.caltrans_district,
            _._is_current,
            _.vehicle_positions_gtfs_dataset_key,
        )
        >> collect()
    )

    # every selected field is expected to be populated
    assert not index_df.isnull().values.any()
    index_df['analysis_date'] = analysis_date

    # the vehicle positions key is intentionally left out of the distinct,
    # collapsing organizations with several keys into a single row
    return index_df >> distinct(
        _.organization_name,
        _.organization_itp_id,
        _.organization_source_record_id,
        _.caltrans_district,
        _._is_current,
        _.analysis_date,
    )

In [30]:
# Build the organization index for the analysis date (queries the warehouse).
speedmap_index = build_speedmap_index(analysis_date)

In [31]:
# df2 >> filter(_.organization_itp_id.isin([315, 246]))
# # rail operators...

In [32]:
# Spot-check a single operator (ITP ID 4) in the index.
speedmap_index >> filter(_.organization_itp_id == 4)

Unnamed: 0,organization_name,organization_itp_id,organization_source_record_id,caltrans_district,_is_current,analysis_date
36,Alameda-Contra Costa Transit District,4,recOZgevYf7Jimm9L,04 - Oakland,False,2023-03-15


## Check/Run rt_parser

In [33]:
# vp_count = speedmap_index >> count(_.vehicle_positions_gtfs_dataset_key)

# multiples = speedmap_index >> inner_join(_, vp_count, on = 'vehicle_positions_gtfs_dataset_key') >> arrange(-_.n)

# shared_utils.rt_utils.show_full_df(multiples)

In [34]:
def check_intermediate_data(speedmap_index_df):
    '''
    Attach a per-operator processing status to the speedmap index.

    If a local progress file for the index's analysis date exists, it is
    loaded and returned as-is so an interrupted run can resume. Otherwise
    shared_utils.rt_utils.get_operators is asked for the status of every
    itp_id in the index, and the result is joined on as a `status` column.

    speedmap_index_df: index table with at least `analysis_date` and
        `organization_itp_id` columns; the analysis date is read from
        the first row
    returns: DataFrame of the index columns plus `status`
    '''
    analysis_date = speedmap_index_df.analysis_date.iloc[0]
    progress_path = f'./_rt_progress_{analysis_date}.parquet'

    if os.path.exists(progress_path):
        print(f'found {progress_path}, resuming')
        return pd.read_parquet(progress_path)

    ## TODO update util to use v1 or v2 folders on bucket? new util?
    operators_ran = shared_utils.rt_utils.get_operators(
        analysis_date, speedmap_index_df.organization_itp_id.to_list())
    operators_ran_df = pd.DataFrame.from_dict(
        operators_ran, orient='index', columns = ['status'])
    # name the index so it can be used as the join key below
    operators_ran_df.index.name = 'itp_id'

    # join against the argument, not the notebook-level `speedmap_index`,
    # so the function works on whichever index it is handed
    speedmap_index_joined = speedmap_index_df >> inner_join(
        _, operators_ran_df, on={'organization_itp_id': 'itp_id'})
    return speedmap_index_joined

In [35]:
# Add the per-operator status column, resuming from a local progress file if one exists.
speedmap_index_joined = check_intermediate_data(speedmap_index)

In [36]:
# shared_utils.rt_utils.show_full_df(speedmap_index_joined)

In [37]:
# Display the index together with its status column.
speedmap_index_joined

Unnamed: 0,organization_name,organization_itp_id,organization_source_record_id,caltrans_district,_is_current,analysis_date,status
0,Nevada County,221,reczUcQgqgtMpkpKC,03 - Marysville,True,2023-03-15,already_ran
1,Santa Clara Valley Transportation Authority,294,recC5CT95EufmQCXr,04 - Oakland,True,2023-03-15,already_ran
2,City of Union City,350,recmB4uxrVLRXYF3L,04 - Oakland,False,2023-03-15,already_ran
3,City of Elk Grove,105,recaJnArpFEk5QooE,03 - Marysville,True,2023-03-15,already_ran
4,Capitol Corridor Joint Powers Authority,56,recvEBkSBc7UxlarC,04 - Oakland,True,2023-03-15,not_yet_run
...,...,...,...,...,...,...,...
95,Antelope Valley Transit Authority,16,recxsWR0KRrQTdjmg,07 - Los Angeles,True,2023-03-15,already_ran
96,City and County of San Francisco,282,rechaapWbeffO33OX,04 - Oakland,False,2023-03-15,already_ran
97,City of Solvang,312,reckp33bhAuZlmO1M,05 - San Luis Obispo,False,2023-03-15,not_yet_run
98,San Joaquin Regional Transit District,284,recZgWVXkpix390of,10 - Stockton,False,2023-03-15,already_ran


In [44]:
def stage_intermediate_data(row, pbar):
    '''
    Run rt_parser for one operator row and checkpoint the outcome.

    Intended for `speedmap_index_joined.apply(..., axis = 1, args=[pbar])`.
    Rows whose status shows the parser has already produced output are
    skipped. Otherwise the parser is run and its views exported; the
    row's status becomes 'already_ran' on success or 'parser_failed' on
    any exception, and the whole index is written to the progress file.

    The status is written to the notebook-level `speedmap_index_joined`
    frame by index label. The `row` passed in by apply is a temporary
    copy, so assigning to it would never reach the frame or the
    checkpoint.

    row: one row of speedmap_index_joined (needs analysis_date,
        organization_itp_id and status)
    pbar: progress bar passed through to rt_parser.OperatorDayAnalysis
    returns: None
    '''
    analysis_date = row.analysis_date
    progress_path = f'./_rt_progress_{analysis_date}.parquet'

    # 'map_confirmed' / 'map_failed' are only ever set after a successful
    # parser run, so on a resumed session they must not trigger a re-parse
    if row.status in ('already_ran', 'map_confirmed', 'map_failed'):
        return

    try:
        rt_day = rt_parser.OperatorDayAnalysis(row.organization_itp_id,
                                               analysis_date, pbar)
        rt_day.export_views_gcs()
        new_status = 'already_ran'
    except Exception as e:
        # best effort: record the failure and move on to the next operator
        print(f'{row.organization_itp_id} parser failed: {e}')
        new_status = 'parser_failed'

    # row.name is this row's index label in the frame being iterated
    speedmap_index_joined.loc[row.name, 'status'] = new_status
    speedmap_index_joined.to_parquet(progress_path)

    return

In [45]:
# Progress bar passed to stage_intermediate_data through apply's args.
pbar = tqdm()

0it [00:00, ?it/s]

In [46]:
%%capture --no-stdout
# Run the parser step row by row; the function returns None, so the result is discarded.
_ = speedmap_index_joined.apply(stage_intermediate_data, axis = 1, args=[pbar])

found vp parquet at gs://calitp-analytics-data/data-analyses/rt_delay/v2_cached_views/vp_56_2023-03-15.parquet
found trips parquet at gs://calitp-analytics-data/data-analyses/rt_delay/v2_cached_views/trips_56_2023-03-15.parquet
found stop times parquet at gs://calitp-analytics-data/data-analyses/rt_delay/v2_cached_views/st_56_2023-03-15.parquet
found stops parquet at gs://calitp-analytics-data/data-analyses/rt_delay/v2_cached_views/stops_56_2023-03-15.parquet
found shapes parquet at gs://calitp-analytics-data/data-analyses/rt_delay/v2_cached_views/shapes_56_2023-03-15.parquet
56 parser failed: vehicle positions empty, or vp trip ids not in schedule
found vp parquet at gs://calitp-analytics-data/data-analyses/rt_delay/v2_cached_views/vp_76_2023-03-15.parquet
found trips parquet at gs://calitp-analytics-data/data-analyses/rt_delay/v2_cached_views/trips_76_2023-03-15.parquet
found stop times parquet at gs://calitp-analytics-data/data-analyses/rt_delay/v2_cached_views/st_76_2023-03-15.parq



273 parser failed: shapes must not be empty
found vp parquet at gs://calitp-analytics-data/data-analyses/rt_delay/v2_cached_views/vp_489_2023-03-15.parquet
found trips parquet at gs://calitp-analytics-data/data-analyses/rt_delay/v2_cached_views/trips_489_2023-03-15.parquet
found stop times parquet at gs://calitp-analytics-data/data-analyses/rt_delay/v2_cached_views/st_489_2023-03-15.parquet
found stops parquet at gs://calitp-analytics-data/data-analyses/rt_delay/v2_cached_views/stops_489_2023-03-15.parquet




489 parser failed: shapes must not be empty
found vp parquet at gs://calitp-analytics-data/data-analyses/rt_delay/v2_cached_views/vp_372_2023-03-15.parquet
found trips parquet at gs://calitp-analytics-data/data-analyses/rt_delay/v2_cached_views/trips_372_2023-03-15.parquet
found stop times parquet at gs://calitp-analytics-data/data-analyses/rt_delay/v2_cached_views/st_372_2023-03-15.parquet
found stops parquet at gs://calitp-analytics-data/data-analyses/rt_delay/v2_cached_views/stops_372_2023-03-15.parquet
found shapes parquet at gs://calitp-analytics-data/data-analyses/rt_delay/v2_cached_views/shapes_372_2023-03-15.parquet
372 parser failed: vehicle positions empty, or vp trip ids not in schedule
found vp parquet at gs://calitp-analytics-data/data-analyses/rt_delay/v2_cached_views/vp_107_2023-03-15.parquet
found trips parquet at gs://calitp-analytics-data/data-analyses/rt_delay/v2_cached_views/trips_107_2023-03-15.parquet
found stop times parquet at gs://calitp-analytics-data/data-ana



485 parser failed: shapes must not be empty
found vp parquet at gs://calitp-analytics-data/data-analyses/rt_delay/v2_cached_views/vp_481_2023-03-15.parquet
found trips parquet at gs://calitp-analytics-data/data-analyses/rt_delay/v2_cached_views/trips_481_2023-03-15.parquet
found stop times parquet at gs://calitp-analytics-data/data-analyses/rt_delay/v2_cached_views/st_481_2023-03-15.parquet
found stops parquet at gs://calitp-analytics-data/data-analyses/rt_delay/v2_cached_views/stops_481_2023-03-15.parquet
found shapes parquet at gs://calitp-analytics-data/data-analyses/rt_delay/v2_cached_views/shapes_481_2023-03-15.parquet
481 parser failed: vehicle positions empty, or vp trip ids not in schedule
found vp parquet at gs://calitp-analytics-data/data-analyses/rt_delay/v2_cached_views/vp_356_2023-03-15.parquet
found trips parquet at gs://calitp-analytics-data/data-analyses/rt_delay/v2_cached_views/trips_356_2023-03-15.parquet
found stop times parquet at gs://calitp-analytics-data/data-ana



264 parser failed: shapes must not be empty
found vp parquet at gs://calitp-analytics-data/data-analyses/rt_delay/v2_cached_views/vp_312_2023-03-15.parquet
found trips parquet at gs://calitp-analytics-data/data-analyses/rt_delay/v2_cached_views/trips_312_2023-03-15.parquet
found stop times parquet at gs://calitp-analytics-data/data-analyses/rt_delay/v2_cached_views/st_312_2023-03-15.parquet
found stops parquet at gs://calitp-analytics-data/data-analyses/rt_delay/v2_cached_views/stops_312_2023-03-15.parquet




312 parser failed: shapes must not be empty


0     None
1     None
2     None
3     None
4     None
      ... 
95    None
96    None
97    None
98    None
99    None
Length: 100, dtype: object

In [14]:
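## TODO: iterate over itp_ids and attempt each run; if a run fails, change its status to a failed value.
## TODO: consider storing failures in a yml file, along with the error message if there is one.
## Proceed with the build once every operator is either already_ran or failed, and print which ones failed.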

In [48]:
# Re-run the check; this picks up the progress file when one exists (see the printed message).
speedmap_index_joined = check_intermediate_data(speedmap_index)

found ./_rt_progress_2023-03-15.parquet, resuming


In [49]:
# Spot-check the status recorded for a single operator (ITP ID 4).
speedmap_index_joined >> filter(_.organization_itp_id == 4)

Unnamed: 0,organization_name,organization_itp_id,organization_source_record_id,caltrans_district,_is_current,analysis_date,status
36,Alameda-Contra Costa Transit District,4,recOZgevYf7Jimm9L,04 - Oakland,False,2023-03-15,already_ran


## New step: confirm that an all-day map can be generated

In [57]:
def check_map_gen(row, pbar):
    '''
    Confirm that a segment speed map can be generated for one operator row.

    Intended for `speedmap_index_joined.apply(..., axis = 1, args=[pbar])`.
    Rows already marked 'parser_failed' or 'map_confirmed' are skipped.
    Otherwise the operator's data is loaded with
    rt_filter_map_plot.from_gcs and segment_speed_map() is called; the
    row's status becomes 'map_confirmed' on success or 'map_failed' on
    any exception, and the whole index is written to the progress file.

    The status is written to the notebook-level `speedmap_index_joined`
    frame by index label. The `row` passed in by apply is a temporary
    copy, so assigning to it would never reach the frame or the
    checkpoint.

    row: one row of speedmap_index_joined (needs analysis_date,
        organization_itp_id and status)
    pbar: progress bar passed through to rt_filter_map_plot.from_gcs
    returns: None
    '''
    analysis_date = row.analysis_date
    progress_path = f'./_rt_progress_{analysis_date}.parquet'

    if row.status in ('parser_failed', 'map_confirmed'):
        return

    try:
        rt_day = rt_filter_map_plot.from_gcs(row.organization_itp_id,
                                             analysis_date, pbar)
        # the map itself is discarded; only whether it builds matters here
        _m = rt_day.segment_speed_map()
        new_status = 'map_confirmed'
    except Exception as e:
        # best effort: record the failure and move on to the next operator
        print(f'{row.organization_itp_id} map test failed: {e}')
        new_status = 'map_failed'

    # row.name is this row's index label in the frame being iterated
    speedmap_index_joined.loc[row.name, 'status'] = new_status
    speedmap_index_joined.to_parquet(progress_path)

    return

In [59]:
# Fresh progress bar passed to check_map_gen through apply's args.
pbar = tqdm()

0it [00:00, ?it/s]

In [None]:
%%capture --no-stdout
# Run the map check row by row; the function returns None, so the result is discarded.
_ = speedmap_index_joined.apply(check_map_gen, axis = 1, args=[pbar])

found shapes parquet at gs://calitp-analytics-data/data-analyses/rt_delay/v2_cached_views/shapes_221_2023-03-15.parquet
found shapes parquet at gs://calitp-analytics-data/data-analyses/rt_delay/v2_cached_views/shapes_294_2023-03-15.parquet


## Prep portfolio yml for site

In [None]:
def stage_rt_portfolio(speedmap_index_joined,
                       input_path = '../portfolio/sites/rt.yml',
                       output_path = '../portfolio/sites/test_rt.yml'):
    '''
    Write a portfolio site yml with one chapter per Caltrans district.

    Loads the site yml at input_path, replaces its `parts` entry with a
    single part holding one chapter per district (sorted by district),
    each listing the itp_ids of that district's operators that did not
    fail (sorted by organization name), and writes the result to
    output_path.

    speedmap_index_joined: index with caltrans_district, organization_name,
        organization_itp_id and status columns
    input_path: existing site yml used as the template
    output_path: where the staged yml is written
    returns: None
    raises: AssertionError if any operator has not been attempted yet
    '''
    # Status values written by this notebook's steps; plain 'failed' is
    # kept so that older progress files still pass.
    failed_statuses = ['failed', 'parser_failed', 'map_failed']
    attempted_statuses = ['already_ran', 'map_confirmed'] + failed_statuses

    # make sure intermediate data is ran or at least attempted
    not_attempted = speedmap_index_joined[
        ~speedmap_index_joined.status.isin(attempted_statuses)]
    assert not_attempted.empty, (
        'operators not yet attempted: '
        f'{not_attempted.organization_itp_id.to_list()}')

    with open(input_path) as rt_site:
        # NOTE(review): yaml.Loader can construct arbitrary Python objects.
        # Acceptable for a repo-controlled file; switch to yaml.safe_load if
        # the template uses no custom tags.
        rt_site_data = yaml.load(rt_site, yaml.Loader)

    chapters_list = []
    speedmap_index_joined = speedmap_index_joined >> arrange(_.caltrans_district)
    for district in speedmap_index_joined.caltrans_district.unique():
        # skip missing districts (None or NaN) rather than emit a bogus chapter
        if pd.isna(district):
            continue
        filtered = (speedmap_index_joined
                    >> filter(_.caltrans_district == district,
                              ~_.status.isin(failed_statuses))
                    >> arrange(_.organization_name)
                   )
        chapters_list.append({
            'caption': f'District {district}',
            'params': {'district': district},
            'sections': [{'itp_id': itp_id}
                         for itp_id in filtered.organization_itp_id.to_list()],
        })

    rt_site_data['parts'] = [{'chapters': chapters_list}]

    output = pyaml.dump(rt_site_data)
    with open(output_path, 'w') as rt_site:
        rt_site.write(output)

    print(f'portfolio yml staged to {output_path}')
    return

In [None]:
# Stage the site yml using the default input and output paths.
stage_rt_portfolio(speedmap_index_joined)

## run portfolio?

In [None]:
## os chdir ..
## run script