In [13]:
import os
os.environ["CALITP_BQ_MAX_BYTES"] = str(1_000_000_000_000) ## 1TB?
import sys

from siuba import *
import pandas as pd
import geopandas as gpd
import gcsfs
import datetime as dt
import time
import shapely

from rt_analysis import rt_parser
from rt_analysis import rt_filter_map_plot

import shared_utils
from calitp_data_analysis.tables import tbls

import pyaml
import yaml

from tqdm.notebook import tqdm

In [2]:
# date for v2 testing
analysis_date = dt.date(2023, 3, 15)

In [3]:
# tbls.mart_transit_database.dim_provider_gtfs_data() >> head(3)

In [4]:
def build_speedmap_index(analysis_date: dt.date):
    '''
    Index of organizations that have a vehicle positions feed, used to
    track down each org's feeds for the speedmap run.

    analysis_date: date the index is built for; warehouse validity windows
    (_valid_from / _valid_to) are compared against midnight at the start
    of this date.

    The query is collected, so this returns a DataFrame (not a LazyTbl)
    with one distinct row per combination of organization_name,
    organization_itp_id, organization_source_record_id, caltrans_district,
    _is_current, analysis_date and vehicle_positions_gtfs_dataset_key.
    Raises AssertionError if any collected value is null.
    '''
    midnight = dt.datetime.combine(analysis_date, dt.time(0, 0))

    # district lookup for organization records valid on the analysis date
    districts_by_org = (
        tbls.mart_transit_database.dim_organizations()
        >> filter(_._valid_from <= midnight, _._valid_to > midnight)
        >> select(_.source_record_id, _.caltrans_district)
    )

    # `!= None` is deliberate: the comparison is handed to the lazy table
    # (presumably translated to a null check in the query), whereas
    # `is not None` would be evaluated by Python immediately
    vp_providers = (
        tbls.mart_transit_database.dim_provider_gtfs_data()
        >> filter(
            _._valid_from <= midnight,
            _._valid_to > midnight,
            _.reports_site_assessed,
            _.vehicle_positions_gtfs_dataset_key != None,
        )
    )

    index_df = (
        vp_providers
        >> inner_join(_, districts_by_org,
                      on = {'organization_source_record_id': 'source_record_id'})
        >> select(
            _.organization_itp_id,
            _.organization_name,
            _.organization_source_record_id,
            _.caltrans_district,
            _._is_current,
            _.vehicle_positions_gtfs_dataset_key,
        )
        >> collect()
    )

    # every selected column must be populated before continuing
    assert not index_df.isnull().values.any()
    index_df['analysis_date'] = analysis_date

    return index_df >> distinct(
        _.organization_name,
        _.organization_itp_id,
        _.organization_source_record_id,
        _.caltrans_district,
        _._is_current,
        _.analysis_date,
        _.vehicle_positions_gtfs_dataset_key,
    )

In [5]:
# build the organization / vehicle positions feed index for analysis_date
speedmap_index = build_speedmap_index(analysis_date)

In [6]:
# df2 >> filter(_.organization_itp_id.isin([315, 246]))
# # rail operators...

## Check/Run rt_parser

In [7]:
# vp_count = speedmap_index >> count(_.vehicle_positions_gtfs_dataset_key)

# multiples = speedmap_index >> inner_join(_, vp_count, on = 'vehicle_positions_gtfs_dataset_key') >> arrange(-_.n)

# shared_utils.rt_utils.show_full_df(multiples)

In [8]:
def check_intermediate_data(speedmap_index_df):
    '''
    Attach a run status to the rows of a speedmap index.

    speedmap_index_df: DataFrame with analysis_date and organization_itp_id
    columns; the analysis date is read from its first row.

    If a progress file (./_rt_progress_{analysis_date}.parquet) already
    exists it is read and returned as-is, so a previous run can be resumed.
    Otherwise statuses are requested from shared_utils.rt_utils.get_operators
    and inner joined onto speedmap_index_df by itp_id.

    returns DataFrame with a status column
    '''
    analysis_date = speedmap_index_df.analysis_date.iloc[0]
    progress_path = f'./_rt_progress_{analysis_date}.parquet'

    if os.path.exists(progress_path):
        print(f'found {progress_path}, resuming')
        return pd.read_parquet(progress_path)

    ## TODO update util to use v1 or v2 folders on bucket? new util?
    operators_ran = shared_utils.rt_utils.get_operators(
        analysis_date, speedmap_index_df.organization_itp_id.to_list())
    # get_operators' result is treated as a mapping of itp_id -> status
    operators_ran_df = pd.DataFrame.from_dict(
        operators_ran, orient='index', columns = ['status'])
    operators_ran_df.index.name = 'itp_id'

    # join onto the frame that was passed in, not the notebook-level
    # speedmap_index global, so the argument is actually honored
    speedmap_index_joined = speedmap_index_df >> inner_join(
        _, operators_ran_df, on={'organization_itp_id': 'itp_id'})
    return speedmap_index_joined

In [9]:
# resume from the progress parquet if one exists, otherwise look up statuses
speedmap_index_joined = check_intermediate_data(speedmap_index)

In [10]:
# inspect the index with statuses (show_full_df presumably displays the
# frame without truncation -- TODO confirm)
shared_utils.rt_utils.show_full_df(speedmap_index_joined)

Unnamed: 0,organization_name,organization_itp_id,organization_source_record_id,caltrans_district,_is_current,analysis_date,vehicle_positions_gtfs_dataset_key,status
0,Nevada County,221,reczUcQgqgtMpkpKC,03 - Marysville,True,2023-03-15,00e412908245377894949d292fb79610,already_ran
1,Santa Clara Valley Transportation Authority,294,recC5CT95EufmQCXr,04 - Oakland,True,2023-03-15,03cadbffef6f3cd4ce839b218bc65e82,already_ran
2,City of Union City,350,recmB4uxrVLRXYF3L,04 - Oakland,False,2023-03-15,08c4e0555b1b90cd7464ac801157a167,already_ran
3,City of Elk Grove,105,recaJnArpFEk5QooE,03 - Marysville,True,2023-03-15,0af37e731f00a843fb9a0fe286f8d958,already_ran
4,Capitol Corridor Joint Powers Authority,56,recvEBkSBc7UxlarC,04 - Oakland,True,2023-03-15,0ba702b1ffdd7b9b8c830ae23edd197b,not_yet_run
5,Mountain View Transportation Management Associ...,217,recIeP8mUucOsbvbz,04 - Oakland,True,2023-03-15,0da5a62ebe57a31bce87abf9fbb2857a,already_ran
6,Orange County Transportation Authority,235,recfma7GNR5lQTTTg,12 - Irvine,False,2023-03-15,0f68ec4d97d179570b494c74b6fca55d,already_ran
7,Emeryville Transportation Management Agency,106,rec75nESVRqpwNiLv,04 - Oakland,True,2023-03-15,0faa34840bb65e96b7f83b7f379c2edd,already_ran
8,Anaheim Transportation Network,14,recsrIZdx5Wt6n3ol,12 - Irvine,True,2023-03-15,1051e067d18a76b0a79789f051d5c22b,already_ran
9,City of Burbank,45,rec6z2ivjTxc8Sag3,07 - Los Angeles,True,2023-03-15,12f1c1a8041f305a2aed1f39055be2d2,already_ran


In [11]:
def stage_intermediate_data(speedmap_index_joined, pbar):
    '''
    Run rt_parser for every operator not yet marked 'already_ran' and
    export its views to GCS, recording the outcome as it goes.

    speedmap_index_joined: DataFrame with analysis_date, organization_itp_id
    and status columns; the analysis date is read from its first row.
    pbar: progress bar object handed through to rt_parser.OperatorDayAnalysis.

    After each attempt the operator's status becomes 'already_ran' or
    'parser_failed' and the whole frame is written to
    ./_rt_progress_{analysis_date}.parquet, so an interrupted run can resume.
    The caller's frame is not modified and nothing is returned; reload the
    parquet to see updated statuses.
    '''
    analysis_date = speedmap_index_joined.analysis_date.iloc[0]
    progress_path = f'./_rt_progress_{analysis_date}.parquet'
    # private copy: statuses are updated here and persisted, never on the input
    progress_df = speedmap_index_joined.copy().reset_index(drop = True)

    # An itp_id can appear on several rows. Visit each id once and update the
    # status column directly; re-joining a status frame on a repeated id would
    # multiply rows on every pass.
    for itp_id in progress_df.organization_itp_id.drop_duplicates():
        is_operator = progress_df.organization_itp_id == itp_id
        if progress_df.loc[is_operator, 'status'].iloc[0] == 'already_ran':
            continue

        try:
            # how to pbar in cli?
            rt_day = rt_parser.OperatorDayAnalysis(itp_id,
                                                   analysis_date, pbar)
            rt_day.export_views_gcs()
            new_status = 'already_ran'
        except Exception as e:
            # best effort: note the failure and carry on with other operators
            print(f'{itp_id} parser failed: {e}')
            new_status = 'parser_failed'

        progress_df.loc[is_operator, 'status'] = new_status
        # save after every attempt so progress survives an interruption
        progress_df.to_parquet(progress_path)

    return

In [14]:
# progress bar created without an iterable; it is passed into the parser run below
pbar = tqdm()

0it [00:00, ?it/s]

In [None]:
%%capture --no-stdout
# runs the parser for operators not yet marked already_ran; results are
# written to the progress parquet rather than returned
stage_intermediate_data(speedmap_index_joined, pbar)

found vp parquet at gs://calitp-analytics-data/data-analyses/rt_delay/v2_cached_views/vp_56_2023-03-15.parquet
found trips parquet at gs://calitp-analytics-data/data-analyses/rt_delay/v2_cached_views/trips_56_2023-03-15.parquet
found stop times parquet at gs://calitp-analytics-data/data-analyses/rt_delay/v2_cached_views/st_56_2023-03-15.parquet
found stops parquet at gs://calitp-analytics-data/data-analyses/rt_delay/v2_cached_views/stops_56_2023-03-15.parquet
found shapes parquet at gs://calitp-analytics-data/data-analyses/rt_delay/v2_cached_views/shapes_56_2023-03-15.parquet
56 parser failed: vehicle positions empty, or vp trip ids not in schedule
found vp parquet at gs://calitp-analytics-data/data-analyses/rt_delay/v2_cached_views/vp_76_2023-03-15.parquet
found trips parquet at gs://calitp-analytics-data/data-analyses/rt_delay/v2_cached_views/trips_76_2023-03-15.parquet
found stop times parquet at gs://calitp-analytics-data/data-analyses/rt_delay/v2_cached_views/st_76_2023-03-15.parq

In [None]:
## Iterate over itp_ids and attempt to run each one; if a run fails, set its status to failed?
## Store failures in a yml file, along with the error message if there is one?
## Proceed with the build once every operator is already_ran or failed, then print which ones failed.

In [None]:
# reload so the statuses written to the progress parquet by the parser run are picked up
speedmap_index_joined = check_intermediate_data(speedmap_index)

In [None]:
# display the reloaded index and statuses
speedmap_index_joined

## New step: confirm that an all-day map can be generated

In [None]:
def check_map_gen(speedmap_index_joined, pbar):
    '''
    Confirm that a segment speed map can be generated for each operator.

    speedmap_index_joined: DataFrame with analysis_date, organization_itp_id
    and status columns; the analysis date is read from its first row.
    pbar: progress bar object handed through to rt_filter_map_plot.from_gcs.

    Operators whose status is 'parser_failed' or 'map_confirmed' are skipped.
    Every other operator is loaded from GCS and a segment speed map is
    attempted; the status then becomes 'map_confirmed' or 'map_failed' and
    the whole frame is written to ./_rt_progress_{analysis_date}.parquet.
    The caller's frame is not modified and nothing is returned; reload the
    parquet to see updated statuses.
    '''
    analysis_date = speedmap_index_joined.analysis_date.iloc[0]
    progress_path = f'./_rt_progress_{analysis_date}.parquet'
    # private copy: statuses are updated here and persisted, never on the input
    progress_df = speedmap_index_joined.copy().reset_index(drop = True)

    # An itp_id can appear on several rows. Visit each id once and update the
    # status column directly; re-joining a status frame on a repeated id would
    # multiply rows on every pass.
    for itp_id in progress_df.organization_itp_id.drop_duplicates():
        is_operator = progress_df.organization_itp_id == itp_id
        current_status = progress_df.loc[is_operator, 'status'].iloc[0]
        if current_status in ('parser_failed', 'map_confirmed'):
            continue

        try:
            # how to pbar in cli?
            rt_day = rt_filter_map_plot.from_gcs(itp_id,
                                                 analysis_date, pbar)
            # only success/failure matters here; the map itself is discarded
            rt_day.segment_speed_map()
            new_status = 'map_confirmed'
        except Exception as e:
            # best effort: note the failure and carry on with other operators
            print(f'{itp_id} map test failed: {e}')
            new_status = 'map_failed'

        progress_df.loc[is_operator, 'status'] = new_status
        # save after every attempt so progress survives an interruption
        progress_df.to_parquet(progress_path)

    return

In [None]:
# fresh progress bar for the map generation check
pbar = tqdm()

In [None]:
%%capture --no-stdout
# tries a segment speed map per operator; outcomes are written to the
# progress parquet rather than returned
check_map_gen(speedmap_index_joined, pbar)

## Prep portfolio yml for site

In [None]:
def stage_rt_portfolio(speedmap_index_joined,
                       input_path = '../portfolio/sites/rt.yml',
                       output_path = '../portfolio/sites/test_rt.yml'):
    '''
    Stage the portfolio site yml with one chapter per Caltrans district
    and one section per operator.

    speedmap_index_joined: DataFrame with caltrans_district,
    organization_name, organization_itp_id and status columns.
    input_path: existing site yml to use as the template.
    output_path: where the staged yml is written.

    Every row must have been attempted: status must be one of
    'already_ran', 'map_confirmed', 'failed', 'parser_failed' or
    'map_failed', otherwise AssertionError is raised. Operators with a
    failed status are left out of the site. The template's 'parts' entry
    is replaced; its other keys are written back unchanged.
    '''
    # statuses written elsewhere in this notebook, plus the legacy 'failed'
    failed_statuses = ['failed', 'parser_failed', 'map_failed']
    staged_statuses = ['already_ran', 'map_confirmed']

    # make sure intermediate data is ran or at least attempted
    unexpected = speedmap_index_joined.loc[
        ~speedmap_index_joined.status.isin(staged_statuses + failed_statuses),
        'status']
    assert unexpected.empty, \
        f'operators not yet attempted, statuses: {unexpected.unique().tolist()}'

    # NOTE(review): yaml.Loader can construct arbitrary Python objects; fine
    # for a repo-controlled template, consider yaml.safe_load if that changes
    with open(input_path) as rt_site:
        rt_site_data = yaml.load(rt_site, yaml.Loader)

    publishable = speedmap_index_joined[
        ~speedmap_index_joined.status.isin(failed_statuses)]
    # districts in sorted order; rows with no district get no chapter
    districts = (speedmap_index_joined.caltrans_district
                 .dropna().sort_values().unique())

    chapters_list = []
    for district in districts:
        in_district = (publishable[publishable.caltrans_district == district]
                       .sort_values('organization_name'))
        # a section is identified by itp_id alone, so list each id only once
        itp_ids = in_district.organization_itp_id.drop_duplicates().to_list()
        chapters_list.append({
            'caption': f'District {district}',
            'params': {'district': district},
            'sections': [{'itp_id': itp_id} for itp_id in itp_ids],
        })

    rt_site_data['parts'] = [{'chapters': chapters_list}]

    output = pyaml.dump(rt_site_data)
    with open(output_path, 'w') as rt_site:
        rt_site.write(output)

    print(f'portfolio yml staged to {output_path}')
    return

In [None]:
# NOTE(review): check_map_gen only writes its statuses to the progress
# parquet, so speedmap_index_joined here still holds whatever was last
# loaded by check_intermediate_data -- reload first if map results matter
stage_rt_portfolio(speedmap_index_joined)

## run portfolio?

In [None]:
## os chdir ..
## run script