## Refactor GTFS Digest Portfolio work as part of Refactor Summer 2024

In [4]:
from datetime import datetime

import _section1_utils as section1
import _section2_utils as section2
import geopandas as gpd
import numpy as np
import pandas as pd
from IPython.display import HTML, Image, Markdown, display, display_html
from segment_speed_utils import gtfs_schedule_wrangling, helpers, time_series_utils
from segment_speed_utils.project_vars import (
    COMPILED_CACHED_VIEWS,
    GTFS_DATA_DICT,
    PROJECT_CRS,
    RT_SCHED_GCS,
    SCHED_GCS,
    SEGMENT_GCS,
)
from shared_utils import catalog_utils, rt_dates, rt_utils

In [2]:
# Notebook-wide pandas display settings: show up to 100 columns, render floats
# with two decimals, and never truncate rows or cell contents.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

### 6/26 `section1.total_service_hours()` isn't working anymore

In [2]:
# Operator used as the test case for the checks in this section.
organization_name = "Alameda-Contra Costa Transit District"

In [5]:
# Translate the organization name into the name used by the schedule data
# (the resulting value is displayed in the next cell).
name = section1.organization_name_crosswalk(organization_name)

In [6]:
name

'Bay Area 511 AC Transit Schedule'

In [7]:
# Load the route map for this operator. Presumably a GeoDataFrame, given the
# commented-out `.explore()` call in the following cell — TODO confirm.
operator_route_map = section1.load_operator_map(name)

In [11]:
# operator_route_map.explore()

In [9]:
# scheduled_service = section1.total_service_hours_all_months(name)

In [12]:
# NOTE(review): this repeats the `shared_utils` import from the first cell of
# the notebook; it is redundant and can be dropped once the notebook is tidied.
from shared_utils import catalog_utils, rt_dates, rt_utils

In [13]:
# Analysis weeks for April and October 2023. Each call returns seven
# consecutive "YYYY-MM-DD" date strings (see the displayed lists below);
# exclude_wed=False presumably keeps Wednesday in the list — TODO confirm.
apr_23 = rt_dates.get_week(month="apr2023", exclude_wed=False)
oct_23 = rt_dates.get_week(month="oct2023", exclude_wed=False)

In [15]:
# Analysis week for April 2024: seven consecutive "YYYY-MM-DD" date strings
# (displayed in the next cell).
apr_24 = rt_dates.get_week(month="apr2024", exclude_wed=False)

In [16]:
apr_24

['2024-04-15',
 '2024-04-16',
 '2024-04-17',
 '2024-04-18',
 '2024-04-19',
 '2024-04-20',
 '2024-04-21']

In [14]:
apr_23

['2023-04-10',
 '2023-04-11',
 '2023-04-12',
 '2023-04-13',
 '2023-04-14',
 '2023-04-15',
 '2023-04-16']

In [17]:
# Dataset name for the scheduled trips table, looked up from the shared data
# dictionary (evaluates to 'trips', as displayed a few cells below).
FILE = GTFS_DATA_DICT.schedule_downloads.trips

In [19]:
COMPILED_CACHED_VIEWS

'gs://calitp-analytics-data/data-analyses/rt_delay/compiled_cached_views/'

In [18]:
FILE

'trips'

In [27]:
# Path to the 2024-04-16 trips parquet, built from the shared bucket and
# dataset-name constants (both displayed above) rather than a hard-coded URL,
# so it stays in sync with the project configuration.
apr_16_file = f"{COMPILED_CACHED_VIEWS}{FILE}_2024-04-16.parquet"

In [23]:
# Subset of trip columns read by the loads in this notebook.
columns = [
    "name", "service_date", "route_long_name",
    "trip_first_departure_datetime_pacific", "service_hours",
]

In [30]:
# Read only the needed columns from the 2024-04-16 file. Passing `columns=`
# to the reader avoids loading every column and then discarding most of them.
apr_16_df = pd.read_parquet(apr_16_file, columns=columns)

In [31]:
apr_16_df.columns

Index(['name', 'service_date', 'route_long_name',
       'trip_first_departure_datetime_pacific', 'service_hours'],
      dtype='object')

In [32]:
# Path to the 2024-04-17 trips parquet, built from the shared bucket and
# dataset-name constants rather than a hard-coded URL.
apr_17_file = f"{COMPILED_CACHED_VIEWS}{FILE}_2024-04-17.parquet"

In [33]:
# Read only the needed columns from the 2024-04-17 file. Passing `columns=`
# to the reader avoids loading every column and then discarding most of them.
apr_17_df = pd.read_parquet(apr_17_file, columns=columns)

In [34]:
apr_17_df.columns

Index(['name', 'service_date', 'route_long_name',
       'trip_first_departure_datetime_pacific', 'service_hours'],
      dtype='object')

In [35]:
apr_24

['2024-04-15',
 '2024-04-16',
 '2024-04-17',
 '2024-04-18',
 '2024-04-19',
 '2024-04-20',
 '2024-04-21']

In [36]:
# Two dates taken from the April 2024 week, used as a small multi-date load.
date_list = ["2024-04-16", "2024-04-17"]

In [37]:
# Load the trips data for the dates in `date_list` through the shared helper,
# then order the result by service date. The shared `columns` list is reused
# instead of repeating it inline, so every load here reads the same fields.
df = (
    time_series_utils.concatenate_datasets_across_dates(
        COMPILED_CACHED_VIEWS,
        FILE,
        date_list,
        data_type="df",
        columns=columns,
    )
    .sort_values(["service_date"])
    .reset_index(drop=True)
)

In [38]:
# Load the trips data for the full April 2024 week through the shared helper,
# then order the result by service date. The shared `columns` list is reused
# instead of repeating it inline, so every load here reads the same fields.
apr_24_df = (
    time_series_utils.concatenate_datasets_across_dates(
        COMPILED_CACHED_VIEWS,
        FILE,
        apr_24,
        data_type="df",
        columns=columns,
    )
    .sort_values(["service_date"])
    .reset_index(drop=True)
)

In [41]:
apr_24_df.head(2)

Unnamed: 0,name,service_date,route_long_name,trip_first_departure_datetime_pacific,service_hours
0,VCTC GMV Schedule,2024-04-15,Route 18,2024-04-15 15:45:00,0.583333
1,LA Metro Bus Schedule,2024-04-15,Metro Local Line,2024-04-15 07:48:00,1.466667


#### October 2023 and April 2023 stopped working: each October date loads on its own, but concatenating either week raises a `ValueError`.

In [47]:
oct_23

['2023-10-09',
 '2023-10-10',
 '2023-10-11',
 '2023-10-12',
 '2023-10-13',
 '2023-10-14',
 '2023-10-15']

In [50]:
# Read the 2023-10-09 trips file on its own to check that the single date
# loads. The path is built from the shared constants and only the needed
# columns are read, instead of loading everything and subsetting afterwards.
oct_9_23_df = pd.read_parquet(
    f"{COMPILED_CACHED_VIEWS}{FILE}_2023-10-09.parquet", columns=columns
)
oct_9_23_df.head(2)

Unnamed: 0,name,service_date,route_long_name,trip_first_departure_datetime_pacific,service_hours
0,VCTC GMV Schedule,2023-10-09,Route 11,2023-10-09 13:40:00,0.6
1,VCTC GMV Schedule,2023-10-09,Route 11,2023-10-09 12:40:00,0.6


In [51]:
# Read the 2023-10-10 trips file on its own to check that the single date
# loads. The path is built from the shared constants and only the needed
# columns are read, instead of loading everything and subsetting afterwards.
oct_10_23_df = pd.read_parquet(
    f"{COMPILED_CACHED_VIEWS}{FILE}_2023-10-10.parquet", columns=columns
)
oct_10_23_df.head(2)

Unnamed: 0,name,service_date,route_long_name,trip_first_departure_datetime_pacific,service_hours
0,VCTC GMV Schedule,2023-10-10,Route 11,2023-10-10 18:35:00,0.55
1,VCTC GMV Schedule,2023-10-10,Route 11,2023-10-10 08:40:00,0.6


In [52]:
# Read the 2023-10-11 trips file on its own to check that the single date
# loads. The path is built from the shared constants and only the needed
# columns are read, instead of loading everything and subsetting afterwards.
oct_11_23_df = pd.read_parquet(
    f"{COMPILED_CACHED_VIEWS}{FILE}_2023-10-11.parquet", columns=columns
)
oct_11_23_df.head(2)

Unnamed: 0,name,service_date,route_long_name,trip_first_departure_datetime_pacific,service_hours
0,VCTC GMV Schedule,2023-10-11,Route 11,2023-10-11 10:40:00,0.6
1,VCTC GMV Schedule,2023-10-11,Route 11,2023-10-11 17:40:00,0.55


In [53]:
# Read the 2023-10-12 trips file on its own to check that the single date
# loads. The path is built from the shared constants and only the needed
# columns are read, instead of loading everything and subsetting afterwards.
oct_12_23_df = pd.read_parquet(
    f"{COMPILED_CACHED_VIEWS}{FILE}_2023-10-12.parquet", columns=columns
)
oct_12_23_df.head(2)

Unnamed: 0,name,service_date,route_long_name,trip_first_departure_datetime_pacific,service_hours
0,VCTC GMV Schedule,2023-10-12,Route 11,2023-10-12 13:40:00,0.6
1,VCTC GMV Schedule,2023-10-12,Route 11,2023-10-12 18:05:00,0.55


In [54]:
# Read the 2023-10-13 trips file on its own to check that the single date
# loads. The path is built from the shared constants and only the needed
# columns are read, instead of loading everything and subsetting afterwards.
oct_13_23_df = pd.read_parquet(
    f"{COMPILED_CACHED_VIEWS}{FILE}_2023-10-13.parquet", columns=columns
)
oct_13_23_df.head(2)

Unnamed: 0,name,service_date,route_long_name,trip_first_departure_datetime_pacific,service_hours
0,VCTC GMV Schedule,2023-10-13,Route 11,2023-10-13 15:40:00,0.6
1,VCTC GMV Schedule,2023-10-13,Route 11,2023-10-13 18:05:00,0.55


In [48]:
# Read the 2023-10-14 trips file on its own to check that the single date
# loads. The path is built from the shared constants and only the needed
# columns are read, instead of loading everything and subsetting afterwards.
oct_14_23_df = pd.read_parquet(
    f"{COMPILED_CACHED_VIEWS}{FILE}_2023-10-14.parquet", columns=columns
)

In [45]:
oct_14_23_df.head(2)

Unnamed: 0,name,service_date,route_long_name,trip_first_departure_datetime_pacific,service_hours
0,VCTC GMV Schedule,2023-10-14,Route 11,2023-10-14 10:50:00,0.616667
1,VCTC GMV Schedule,2023-10-14,Route 11,2023-10-14 08:00:00,0.583333


In [55]:
# Read the 2023-10-15 trips file on its own to check that the single date
# loads. The path is built from the shared constants and only the needed
# columns are read, instead of loading everything and subsetting afterwards.
oct_15_23_df = pd.read_parquet(
    f"{COMPILED_CACHED_VIEWS}{FILE}_2023-10-15.parquet", columns=columns
)
oct_15_23_df.head(2)

Unnamed: 0,name,service_date,route_long_name,trip_first_departure_datetime_pacific,service_hours
0,VCTC GMV Schedule,2023-10-15,Route 11,2023-10-15 12:20:00,0.566667
1,VCTC GMV Schedule,2023-10-15,Route 11,2023-10-15 19:25:00,0.55


In [46]:
 time_series_utils.concatenate_datasets_across_dates??

Signature:
time_series_utils.concatenate_datasets_across_dates(
    gcs_bucket: str,
    dataset_name: Literal['speeds_route_dir_segments', 'speeds_route_dir'],
    date_list: list,
    data_type: Literal['df', 'gdf'] = 'gdf',
    get_pandas: bool = True,
    **kwargs,
) -> pandas.core.frame.DataFrame

In [40]:
# Try to load the full October 2023 week through the shared helper.
# NOTE(review): this call currently raises a ValueError even though each
# October date loads on its own. The error is caught and printed so the
# failure stays documented without halting a Restart & Run All; `oct_23_df`
# is left undefined when the load fails. The shared `columns` list is reused
# instead of repeating it inline.
try:
    oct_23_df = (
        time_series_utils.concatenate_datasets_across_dates(
            COMPILED_CACHED_VIEWS,
            FILE,
            oct_23,
            data_type="df",
            columns=columns,
        )
        .sort_values(["service_date"])
        .reset_index(drop=True)
    )
except ValueError as err:
    print(f"{type(err).__name__}: {err}")

ValueError: all the input array dimensions for the concatenation axis must match exactly, but along dimension 1, the array at index 0 has size 98412 and the array at index 1 has size 101767

In [39]:
# Try to load the full April 2023 week through the shared helper.
# NOTE(review): this call currently raises a ValueError. The error is caught
# and printed so the failure stays documented without halting a Restart & Run
# All; `apr_23_df` is left undefined when the load fails. The shared `columns`
# list is reused instead of repeating it inline.
try:
    apr_23_df = (
        time_series_utils.concatenate_datasets_across_dates(
            COMPILED_CACHED_VIEWS,
            FILE,
            apr_23,
            data_type="df",
            columns=columns,
        )
        .sort_values(["service_date"])
        .reset_index(drop=True)
    )
except ValueError as err:
    print(f"{type(err).__name__}: {err}")

ValueError: all the input array dimensions for the concatenation axis must match exactly, but along dimension 1, the array at index 0 has size 109006 and the array at index 1 has size 108213