In [1]:
import os
os.environ["CALITP_BQ_MAX_BYTES"] = str(1_000_000_000_000) ## 1TB?
import sys

from siuba import *
import pandas as pd
import geopandas as gpd
import gcsfs
import datetime as dt
import time
import shapely

from rt_analysis import rt_parser
from rt_analysis import rt_filter_map_plot

import shared_utils
from calitp_data_analysis.tables import tbls

import pyaml
import yaml

from tqdm.notebook import tqdm


import os
os.environ['USE_PYGEOS'] = '0'
import geopandas

In a future release, GeoPandas will switch to using Shapely by default. If you are using PyGEOS directly (calling PyGEOS functions on geometries from GeoPandas), this will then stop working and you are encouraged to migrate from PyGEOS to Shapely 2.0 (https://shapely.readthedocs.io/en/latest/migration_pygeos.html).
  import geopandas as gpd


In [2]:
shared_utils.rt_utils.check_intermediate_data()

AttributeError: 'DataFrame' object has no attribute 'analysis_date'

In [None]:
# date for v2 testing
analysis_date = dt.date(2023, 3, 15)

In [4]:
# tbls.mart_transit_database.dim_provider_gtfs_data() >> head(3)

In [5]:
def build_speedmap_index(analysis_date: dt.date):
    '''
    Build the index used to track down each org's schedule/rt feeds
    for a single analysis date.

    Selects dim_provider_gtfs_data rows that are valid at midnight of
    analysis_date, are flagged reports_site_assessed and carry a vehicle
    positions dataset key, then attaches caltrans_district from
    dim_organizations.

    returns the collected (in-memory) table with an analysis_date column
    added, reduced to distinct organization rows
    '''
    # validity windows are compared against midnight at the start of the date
    valid_at = dt.datetime.combine(analysis_date, dt.time(0, 0))

    org_districts = tbls.mart_transit_database.dim_organizations()
    org_districts = org_districts >> filter(_._valid_from <= valid_at,
                                            _._valid_to > valid_at)
    org_districts = org_districts >> select(_.source_record_id, _.caltrans_district)

    # `!= None` (rather than `is not None`) keeps the null check inside the
    # siuba expression so it is evaluated as part of the query
    vp_providers = (
        tbls.mart_transit_database.dim_provider_gtfs_data()
        >> filter(
            _._valid_from <= valid_at,
            _._valid_to > valid_at,
            _.reports_site_assessed,
            _.vehicle_positions_gtfs_dataset_key != None,
        )
    )

    index_df = (
        vp_providers
        >> inner_join(_, org_districts,
                      on = {'organization_source_record_id': 'source_record_id'})
        >> select(_.organization_itp_id, _.organization_name,
                  _.organization_source_record_id, _.caltrans_district,
                  _._is_current, _.vehicle_positions_gtfs_dataset_key)
        >> collect()
    )

    # every selected column is expected to be populated
    assert not index_df.isnull().values.any()
    index_df['analysis_date'] = analysis_date

    # vehicle_positions_gtfs_dataset_key is intentionally not listed here
    return index_df >> distinct(
        _.organization_name,
        _.organization_itp_id,
        _.organization_source_record_id,
        _.caltrans_district,
        _._is_current,
        _.analysis_date,
    )

In [6]:
speedmap_index = build_speedmap_index(analysis_date)

In [7]:
type(speedmap_index.iloc[0])

pandas.core.series.Series

In [None]:
# df2 >> filter(_.organization_itp_id.isin([315, 246]))
# # rail operators...

In [None]:
speedmap_index >> filter(_.organization_itp_id == 4)

## Check/Run rt_parser

In [None]:
# vp_count = speedmap_index >> count(_.vehicle_positions_gtfs_dataset_key)

# multiples = speedmap_index >> inner_join(_, vp_count, on = 'vehicle_positions_gtfs_dataset_key') >> arrange(-_.n)

# shared_utils.rt_utils.show_full_df(multiples)

In [1]:
def check_intermediate_data(speedmap_index_df: pd.DataFrame) -> pd.DataFrame:
    '''
    For speedmap generation scripts in rt_delay.
    Return the speedmap index with a per-operator `status` column.

    If a local progress file for the analysis date exists (process
    partially complete), that file is read and returned so the run can
    resume. Otherwise statuses are looked up with
    shared_utils.rt_utils.get_operators and inner-joined onto
    speedmap_index_df by organization_itp_id.

    speedmap_index_df: index with organization_itp_id and analysis_date
        columns; the analysis date is taken from its first row.
    '''
    analysis_date = speedmap_index_df.analysis_date.iloc[0]
    progress_path = f'./_rt_progress_{analysis_date}.parquet'
    if os.path.exists(progress_path):
        print(f'found {progress_path}, resuming')
        return pd.read_parquet(progress_path)

    # presumably a mapping of itp_id -> status string; verify against get_operators
    operators_ran = shared_utils.rt_utils.get_operators(analysis_date,
                        speedmap_index_df.organization_itp_id.to_list())
    operators_ran_df = pd.DataFrame.from_dict(
                operators_ran, orient='index', columns = ['status'])
    operators_ran_df.index.name = 'itp_id'
    # join onto the frame that was passed in, not the notebook-global
    # speedmap_index, so the function works on whatever index it is given
    speedmap_index_joined = speedmap_index_df >> inner_join(_,
                                operators_ran_df, on={'organization_itp_id': 'itp_id'})
    return speedmap_index_joined

NameError: name 'pd' is not defined

In [None]:
# Attach a status to every operator; resumes from ./_rt_progress_<date>.parquet if that file exists
speedmap_index_joined = check_intermediate_data(speedmap_index)

In [None]:
# shared_utils.rt_utils.show_full_df(speedmap_index_joined)

In [None]:
speedmap_index_joined >> count(_.status)

In [None]:
def stage_intermediate_data(row, pbar):
    '''
    Run the rt_parser for one operator row and checkpoint the outcome.

    Intended for DataFrame.apply(axis=1) over speedmap_index_joined.
    Rows whose status is 'already_ran' are skipped. Any other row gets an
    rt_parser.OperatorDayAnalysis built and exported; status becomes
    'already_ran' on success, or 'parser_failed' if anything raises.
    The updated row is written back into the global speedmap_index_joined
    and the whole frame is saved to the progress parquet.

    row: one row of speedmap_index_joined
    pbar: progress bar handed through to OperatorDayAnalysis
    '''
    global speedmap_index_joined
    analysis_date = row.analysis_date
    progress_path = f'./_rt_progress_{analysis_date}.parquet'

    # nothing to do for operators that already have intermediate data
    if row.status == 'already_ran':
        return

    try:
        rt_parser.OperatorDayAnalysis(row.organization_itp_id,
                                      analysis_date, pbar).export_views_gcs()
        outcome = 'already_ran'
    except Exception as err:
        # best effort: record the failure and keep going with other operators
        print(f'{row.organization_itp_id} parser failed: {err}')
        outcome = 'parser_failed'
    row.status = outcome

    print(row.name)
    # checkpoint after every operator so an interrupted run can resume
    speedmap_index_joined.loc[row.name] = row
    speedmap_index_joined.to_parquet(progress_path)
    return

In [None]:
pbar = tqdm()

In [None]:
%%capture --no-stdout
# Row-wise apply is used only for its side effects: stage_intermediate_data updates the
# global frame and saves the progress parquet itself, so the return value is discarded
_ = speedmap_index_joined.apply(stage_intermediate_data, axis = 1, args=[pbar])

In [None]:
## iterate over itp_ids, attempt running, if failed change value to failed?
## store failures in yml? with error if any??
## proceed with build when all already_ran or failed, again print which failed

In [None]:
# Refresh statuses: re-reads the progress parquet if the staging step above wrote one
speedmap_index_joined = check_intermediate_data(speedmap_index)

In [None]:
speedmap_index_joined >> filter(_.organization_itp_id == 4)

## New step: confirm that an all-day map can be generated?

In [None]:
def check_map_gen(row, pbar):
    '''
    Confirm that a segment speed map can be generated for one operator row.

    Intended for DataFrame.apply(axis=1) over speedmap_index_joined.
    Rows with status 'parser_failed' or 'map_confirmed' are skipped. For
    any other row the operator's data is loaded with
    rt_filter_map_plot.from_gcs and segment_speed_map() is called; status
    becomes 'map_confirmed' on success, or 'map_failed' if anything raises.
    The updated row is written back into the global speedmap_index_joined
    and the whole frame is saved to the progress parquet.

    row: one row of speedmap_index_joined
    pbar: progress bar handed through to from_gcs
    '''
    global speedmap_index_joined
    analysis_date = row.analysis_date
    progress_path = f'./_rt_progress_{analysis_date}.parquet'

    # skip operators with no parsed data, and ones already confirmed
    if row.status in ('parser_failed', 'map_confirmed'):
        return

    try:
        rt_day = rt_filter_map_plot.from_gcs(row.organization_itp_id,
                                             analysis_date, pbar)
        # the map itself is thrown away; only whether it builds matters
        rt_day.segment_speed_map()
        outcome = 'map_confirmed'
    except Exception as err:
        # best effort: record the failure and keep going with other operators
        print(f'{row.organization_itp_id} map test failed: {err}')
        outcome = 'map_failed'
    row.status = outcome

    # checkpoint after every operator so an interrupted run can resume
    speedmap_index_joined.loc[row.name] = row
    speedmap_index_joined.to_parquet(progress_path)
    return

In [None]:
pbar = tqdm()

In [None]:
%%capture --no-stdout
# Row-wise apply is used only for its side effects: check_map_gen updates the
# global frame and saves the progress parquet itself, so the return value is discarded
_ = speedmap_index_joined.apply(check_map_gen, axis = 1, args=[pbar])

In [None]:
# Refresh statuses: re-reads the progress parquet if the map check above wrote one
speedmap_index_joined = check_intermediate_data(speedmap_index)

In [None]:
shared_utils.rt_utils.show_full_df(speedmap_index_joined)

## Prep portfolio yml for site

In [None]:
def stage_rt_portfolio(speedmap_index_joined,
                       input_path = '../portfolio/sites/rt.yml',
                       output_path = '../portfolio/sites/test_rt.yml'):
    '''
    Stage the portfolio site yml with one chapter per Caltrans district.

    Reads the site config at input_path, replaces its 'parts' entry with a
    single part holding one chapter per district, and writes the result to
    output_path. Each chapter lists, ordered by organization name, the
    itp_id of every operator in that district whose status is neither
    'parser_failed' nor 'map_failed'.

    speedmap_index_joined: index with status, caltrans_district,
        organization_name and organization_itp_id columns
    raises AssertionError if any row has a status other than
        'map_confirmed', 'parser_failed' or 'map_failed'
    '''
    # make sure intermediate data is ran or at least attempted
    terminal_statuses = ['map_confirmed', 'parser_failed', 'map_failed']
    assert speedmap_index_joined.status.isin(terminal_statuses).all()

    with open(input_path) as rt_site:
        # NOTE(review): yaml.Loader is the full (non-safe) loader; fine for a
        # trusted project file, do not point input_path at external data
        rt_site_data = yaml.load(rt_site, yaml.Loader)

    by_district = speedmap_index_joined >> arrange(_.caltrans_district)
    failed_statuses = ['parser_failed', 'map_failed']

    chapters_list = []
    for district in by_district.caltrans_district.unique():
        if district is None:
            continue
        # operators in this district that did not fail, alphabetical by name
        mapped_orgs = (
            by_district
            >> filter(_.caltrans_district == district,
                      -_.status.isin(failed_statuses))
            >> arrange(_.organization_name)
        )
        sections = [{'itp_id': itp_id}
                    for itp_id in mapped_orgs.organization_itp_id.to_list()]
        chapters_list.append({
            'caption': f'District {district}',
            'params': {'district': district},
            'sections': sections,
        })

    rt_site_data['parts'] = [{'chapters': chapters_list}]

    # serialize before opening the output so a dump error leaves the file untouched
    output = pyaml.dump(rt_site_data)
    with open(output_path, 'w') as rt_site:
        rt_site.write(output)

    print(f'portfolio yml staged to {output_path}')
    return

In [None]:
stage_rt_portfolio(speedmap_index_joined)

## run portfolio?

In [None]:
shared_utils.rt_utils.show_full_df(speedmap_index_joined)

In [None]:
## os chdir ..
## run script