In [1]:
# 800_000_000_000 bytes = 800 GB (decimal). Judging by the variable name this is
# a BigQuery bytes limit read by calitp.
# NOTE(review): it is set before the calitp imports below, presumably so the
# library sees it when loaded — confirm.
import os
os.environ["CALITP_BQ_MAX_BYTES"] = str(800_000_000_000) ## 800GB?

from calitp.tables import tbl
from calitp import query_sql
import calitp.magics
import branca

# Project-local helper packages/modules.
import shared_utils
import utils

# The star import is what brings `_` and the verbs used further down (select,
# filter, mutate, arrange, distinct, collect) into scope; `filter` therefore
# shadows the Python builtin for the rest of the notebook.
from siuba import *
import pandas as pd
import geopandas as gpd
import shapely

import datetime as dt
import time
from zoneinfo import ZoneInfo

import rt_analysis as rt
import importlib

# Shared Google Cloud Storage filesystem handle used to list existing exports.
import gcsfs
fs = gcsfs.GCSFileSystem()

from tqdm import tqdm_notebook
from tqdm.notebook import trange, tqdm

E0420 21:43:05.029166046    1226 fork_posix.cc:70]           Fork support is only compatible with the epoll1 and poll polling strategies
E0420 21:43:07.519894894    1226 fork_posix.cc:70]           Fork support is only compatible with the epoll1 and poll polling strategies


In [2]:
# Re-import the local helper modules so edits made to them on disk are picked up
# without restarting the kernel.
importlib.reload(utils)
importlib.reload(rt)

<module 'rt_analysis' from '/home/jovyan/data-analyses/rt_delay/rt_analysis.py'>

In [3]:
# Load `airtable_joined.parquet` from the notebook's working directory. In the
# cells shown here it is referenced only by the commented-out loop header
# `air_joined.calitp_itp_id.unique()` in the run-all cell.
air_joined = pd.read_parquet('airtable_joined.parquet')

In [4]:
# Bare notebook progress bar (no iterable attached); it is handed to
# rt.OperatorDayAnalysis in the run-all loop.
pbar = tqdm()

0it [00:00, ?it/s]

#### Run all operators

In [5]:
# List what is already stored under the rt_trips/ folder on GCS; used to work
# out which operators have been processed.
fs_list = fs.ls(f'{utils.GCS_FILE_PATH}rt_trips/')

In [6]:
# Operator ids that already have an rt_trips export. Exports are written as
# '<id>_<month>_<day>.parquet', so the id is the text before the first '_'.
# Listings with nothing after 'rt_trips/' are skipped.
ran_operators = [
    int(file_name.split('_')[0])
    for file_name in (gcs_path.split('rt_trips/')[1] for gcs_path in fs_list)
    if file_name
]

In [7]:
# Operators whose analysis raised; filled in by the loop below.
not_ran_operators = []


def _delay_view_for_export(view, iso_columns=()):
    """Return a copy of a delay view reshaped for parquet export.

    Adds `delay_seconds` (the `.seconds` attribute of every `delay` value),
    replaces each column named in `iso_columns` with its `isoformat()` string,
    and drops the original `delay` column.

    NOTE(review): `.seconds` is the seconds component of a timedelta rather
    than `total_seconds()`; this matches the intent only if delays are never
    negative and never reach a full day — confirm upstream.
    """
    export_view = view.copy()
    export_view['delay_seconds'] = export_view.delay.map(lambda delta: delta.seconds)
    for column in iso_columns:
        export_view[column] = export_view[column].map(lambda stamp: stamp.isoformat())
    return export_view >> select(-_.delay)


# The real iterable is commented out, so the loop is currently disabled:
# for agency in air_joined.calitp_itp_id.unique():
for agency in []:
    if agency in ran_operators:
        print(f'already ran: {agency}')
        continue
    # Default date is Wednesday 2022-03-30 (new tables); SacRT (273) uses 2022-02-08.
    ## https://docs.google.com/spreadsheets/d/16tcL3fPdYkrNajDNneSR1b5ImhFOCJ0gWFnEaXyg16A/edit#gid=0
    analysis_date = dt.date(2022, 2, 8) if agency == 273 else dt.date(2022, 3, 30)
    # Zero-padded two-digit pieces used in the export file names.
    day = f'{analysis_date.day:02d}'
    month = f'{analysis_date.month:02d}'
    print(f'calculating for agency: {agency}...')
    try:
        rt_day = rt.OperatorDayAnalysis(agency, analysis_date, pbar)

        # Stop-level delays: timestamps are stored as ISO strings.
        delay_to_parquet = _delay_view_for_export(
            rt_day.stop_delay_view,
            iso_columns=('arrival_time', 'actual_time'),
        )
        shared_utils.utils.geoparquet_gcs_export(
            delay_to_parquet,
            f'{utils.GCS_FILE_PATH}stop_delay_views/',
            f'{agency}_{month}_{day}',
        )
        # rt_day.set_filter(start_time='15:00', end_time='19:00')
        # m = rt_day.segment_speed_map(how='low_speeds', size = [1300, 700])
        # shared_utils.utils.geoparquet_gcs_export(rt_day.stop_segment_speed_view,
        #                                          f'{utils.GCS_FILE_PATH}segment_speed_views/',
        #                                         f'{agency}_{month}_{day}_pm_peak'
        #                                         )

        # Trip table; the rt_trips/ listing is what marks an operator as already run.
        rt_day.rt_trips.to_parquet(f'{utils.GCS_FILE_PATH}rt_trips/{agency}_{month}_{day}.parquet') ## early step towards scaling
        # m.save(f'./speedmaps/{agency}_02_{day}_pm_peak.html')
        # rt_day.stop_segment_speed_view.to_parquet(f'{utils.GCS_FILE_PATH}segment_speed_views/{agency}_02_{day}_pm_peak.parquet') ## early step towards scaling

        # Endpoint delays: no timestamp columns are converted here.
        delay_to_parquet = _delay_view_for_export(rt_day.endpoint_delay_view)
        shared_utils.utils.geoparquet_gcs_export(
            delay_to_parquet,
            f'{utils.GCS_FILE_PATH}endpoint_delay_views/',
            f'{agency}_{month}_{day}_pm_peak',
        )
        print(f'complete for agency: {agency}')
    except Exception as e:
        # Best effort: note the failure and carry on with the next operator.
        print(f'rt failed for agency {agency}')
        not_ran_operators.append(agency)
        print(e)

In [8]:
import yaml

In [9]:
# Query the california_transit_organizations table, keeping only the columns
# named in select(). collect() ends the siuba pipeline; the result is treated
# as a pandas DataFrame by the following cells (they call .dropna on it).
airtable_organizations = (
    tbl.airtable.california_transit_organizations()
    >> select(_.itp_id, _.name, _.caltrans_district,
              _.website, _.ntp_id, _.drmt_organization_name)
    # Optional single-operator filter, currently disabled:
    # >> filter(_.itp_id == itp_id)
    >> collect()
)

In [10]:
# Drop organizations without an ITP id, then store the id as int64 so it can be
# compared against the integer ids in `ran_operators`.
airtable_organizations = (
    airtable_organizations
    .dropna(subset=['itp_id'])
    .astype({'itp_id': 'int64'})
)

In [11]:
# Keep only operators that already have an rt_trips export (`ran_operators`)
# and order the rows by Caltrans district. `filter` here is the siuba verb from
# the star import, not the Python builtin.
airtable_organizations = airtable_organizations >> filter(_.itp_id.isin(ran_operators)) >> arrange(_.caltrans_district)

In [12]:
# Preview the first three rows after filtering and sorting.
airtable_organizations.head(3)

Unnamed: 0,itp_id,name,caltrans_district,website,ntp_id,drmt_organization_name
663,135,Humboldt Transit Authority,01 - Eureka,,['reccKueWkvBCZoLQs'],['recu9h9nAElLpYoYO']
701,159,Lake Transit Authority,01 - Eureka,,['recF9AnmCtEyxxDTY'],['reck7ddrpXSeA0mH0']
572,259,Redding Area Bus Authority,02 - Redding,,['recoXZjKEOPldH8QJ'],


In [13]:
# Read the current portfolio configuration; a later cell overwrites the 'parts'
# entry of one of its sites and writes the file back.
# NOTE(review): yaml.Loader is PyYAML's full loader and can construct arbitrary
# Python objects. That is acceptable for a repo-local file, but consider
# yaml.safe_load if analyses.yml only ever holds plain data.
with open('../portfolio/analyses.yml') as analyses:
    analyses_data = yaml.load(analyses, yaml.Loader)

In [14]:
# ITP ids to leave out of the generated chapter sections.
# TODO: record why 312 is excluded.
exclude_ids = [312]

In [15]:
# Build one chapter per Caltrans district, each holding one section per
# operator (ITP id) in that district. Districts are visited in the order they
# appear in `airtable_organizations`.
chapters_list = []
for district in airtable_organizations.caltrans_district.unique():
    filtered = (
        airtable_organizations
        >> filter(_.caltrans_district == district)
        # An itp_id can occur on several rows; keep a single row for each.
        >> distinct(_.itp_id, _keep_all=True)
        # `~` is the standard negation for a boolean mask; unary `-` is
        # rejected by numpy for boolean arrays and should not be relied on.
        >> filter(~_.itp_id.isin(exclude_ids))
    )
    chapter_dict = {
        'caption': f'District {district}',
        'params': {'district': district},
        'sections': [{'itp_id': itp_id} for itp_id in filtered.itp_id.to_list()],
    }
    chapters_list.append(chapter_dict)

In [16]:
# Wrap all district chapters in a single part.
parts_list = [{'chapters': chapters_list}]

In [17]:
# Overwrite the 'parts' of the second site entry (index 1) with the regenerated list.
# NOTE(review): this relies on the site's position in analyses.yml — confirm
# index 1 is still the intended site before running.
analyses_data['sites'][1]['parts'] = parts_list

In [18]:
# Serialize the updated configuration to YAML text.
# NOTE(review): PyYAML's dump sorts mapping keys by default, so key order in the
# rewritten file may differ from the hand-written original — confirm that is acceptable.
output = yaml.dump(analyses_data)

In [19]:
# Overwrite analyses.yml in place with the regenerated configuration text.
with open('../portfolio/analyses.yml', 'w') as analyses:
    analyses.write(output)

### Ongoing issues
* 358 (Union City): too many trips without a shape?

#### Parameters
* 159: one or more maps don't render
* 167 map_from_metrics fails