# Desired order for setting up `gtfs_utils_v2`

In [1]:
import os
# NOTE(review): set before the calitp import below, presumably so the library
# picks up this 200 GB (200_000_000_000 bytes) BigQuery cap — confirm how
# calitp reads CALITP_BQ_MAX_BYTES.
os.environ["CALITP_BQ_MAX_BYTES"] = str(200_000_000_000)

import datetime
import pandas as pd
import siuba # type hint

from calitp.tables import tbls
from siuba import *
from typing import Union

import gtfs_utils_v2
# Date passed as `selected_date` to the gtfs_utils_v2 calls below.
analysis_date = datetime.date(2023, 1, 12)



## Step 1: Airtable's `dim_gtfs_datasets` with `fct_daily_schedule_feeds`

* Keep only the datasets that are not deprecated (filter on `data_quality_pipeline == True`)
* Allow any other custom filtering to be done, such as the default of getting scheduled data only
* Analysts now should look at output and decide if there's additional filtering needed. 
* Once filtering is done, merge the filtered df with `fct_daily_schedule_feeds` to get the `feed_key` values
* Use `feed_key` to traverse all the other tables

In [None]:
# The two triple-quoted strings in this cell are disabled code, kept for reference.
# Each one ran a query through `tbls`, collected it, and cached the result to a
# parquet file that later cells read back with pd.read_parquet.
# Remove the quotes to re-run a query and refresh its cache.

# 1) dim_gtfs_datasets rows where data_quality_pipeline is True.
'''
# downloaded 1/3/23
dim_gtfs_datasets = (tbls.mart_transit_database.dim_gtfs_datasets()
>> filter(_.data_quality_pipeline == True) # if True, we can use
>> rename(date = _.calitp_extracted_at)
>> collect()
)

# the date is the date extracted, not the date for which we're running things
dim_gtfs_datasets.to_parquet("dim_gtfs_datasets.parquet")
'''


# 2) fct_daily_schedule_feeds rows for analysis_date only.
'''
fact_daily_schedule_feeds = (tbls.mart_gtfs.fct_daily_schedule_feeds()
                             >> filter(_.date==analysis_date)
                             >> collect()
                            )

fact_daily_schedule_feeds.to_parquet("fct_daily_schedule_feeds.parquet")
'''

In [None]:
# Load the cached dim_gtfs_datasets extract (written by the disabled download code above).
dim_gtfs_datasets = pd.read_parquet("dim_gtfs_datasets.parquet")

## Test feed options

`feed_option` values tested in the cells below: `"customer_facing"`, `"use_subfeeds"`, `"current_feeds"`, `"include_precursor"`, `"include_precursor_and_future"`.

In [None]:
# Pull the feed-to-organization table for analysis_date, keeping all columns
# (keep_cols=None) and asking for a DataFrame back (get_df=True).
# NOTE(review): feed_option="" presumably means "apply no feed-option filter",
# so that num_rows_and_other_stats can apply each option afterwards — confirm
# against gtfs_utils_v2.
full_df = gtfs_utils_v2.schedule_daily_feed_to_organization(
    selected_date = analysis_date,
    keep_cols = None,
    get_df = True,
    feed_option = "")

In [None]:
# Baseline (rows, columns) to compare against the per-option row counts printed below.
full_df.shape

In [None]:
def num_rows_and_other_stats(df: pd.DataFrame, feed_option: str) -> None:
    """
    Print sanity-check stats for one `feed_option` subset of `df`.

    Pipes `df` through `gtfs_utils_v2.filter_feed_options(feed_option)` and
    prints the number of rows kept, followed by the value counts of the
    `regional_feed_type` and `is_future` columns.

    Parameters
    ----------
    df : pd.DataFrame
        Table to subset; must contain `regional_feed_type` and `is_future`.
    feed_option : str
        Passed unchanged to `gtfs_utils_v2.filter_feed_options`.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    subset_df = df >> gtfs_utils_v2.filter_feed_options(feed_option)

    # dropna=False: feeds without a regional_feed_type hold a missing value.
    # The default value_counts() silently drops those rows, so the breakdown
    # would not add up to the "# rows" line.
    regional_counts = subset_df.regional_feed_type.value_counts(dropna=False)
    future_counts = subset_df.is_future.value_counts(dropna=False)

    print(f"# rows: {len(subset_df)}")
    print("---------------")
    print(f"regional_feed_type: {regional_counts}")
    print("---------------")
    print(f"is_future: {future_counts}")

In [None]:
# Summarize each feed option in turn: this cell and the four after it run the
# same helper with one option per cell so the printed stats can be compared.
num_rows_and_other_stats(full_df, "customer_facing")

In [None]:
num_rows_and_other_stats(full_df, "use_subfeeds")

In [None]:
num_rows_and_other_stats(full_df, "current_feeds")

In [None]:
num_rows_and_other_stats(full_df, "include_precursor") 

In [None]:
num_rows_and_other_stats(full_df, "include_precursor_and_future") 

In [2]:
# Dataset names; matched against the `name` column further down.
test_cases = ["Big Blue Bus Schedule", "Metrolink Schedule"]

# `feed_key` values for two specific schedule feeds.
test_feed_keys = [
    "008d5112a7e531d0562d26e34d77869d",  # Sacramento Schedule
    "f8d3bfd9e780aa3b3ce1340b2116513f",  # Long Beach Schedule
]

# A single gtfs_dataset_key (not used by the cells shown in this notebook).
test_gtfs_dataset_keys = [
    "reccgBgsKC5J7BD8K",
]

In [6]:
 gtfs_utils_v2.schedule_daily_feed_to_organization(
        selected_date = analysis_date,
        keep_cols = None,
        get_df = False,
        feed_option = "use_subfeeds"
    )

Unnamed: 0,key,date,feed_key,base64_url,gtfs_dataset_key,is_future,type,regional_feed_type,name
0,9ab28255efd3800d0929e7e6fe71d682,2023-01-12,008d5112a7e531d0562d26e34d77869d,aHR0cHM6Ly9pcG9ydGFsLnNhY3J0LmNvbS9HVEZTL1NSVE...,recbzZQUIdMmFvm1r,False,schedule,,Sacramento Schedule
1,939b1adb1fba85994b3728a7127d8098,2023-01-12,00ce39f3ecbd4c8a065b006e550bdf78,aHR0cHM6Ly9kYXRhLnRyaWxsaXVtdHJhbnNpdC5jb20vZ3...,recI7kYNWSEKTMMiq,False,schedule,Regional Precursor Feed,Fairfield Schedule
2,66afc77ef0b3ca127a0c0ea60fd402b6,2023-01-12,04a2bb9727f58083d3622ca13e5ab97c,aHR0cHM6Ly9kYXRhLnRyaWxsaXVtdHJhbnNpdC5jb20vZ3...,rec2Irh2irjkQUoFt,False,schedule,,Eastern Sierra Schedule
3,ed9ddfbbc11193641b990de62903618e,2023-01-12,062563b11ac99ddec6d3bec6f613b78d,aHR0cHM6Ly9tamNhY3Rpb24uY29tL01KQ19HVEZTX1B1Ym...,recakkevxj5vOACjk,False,schedule,,Morro Bay Schedule
4,f60a157d1f4a11cda99c9bd8a55eea16,2023-01-12,071827dbd8e30629592144127920fdac,aHR0cHM6Ly90dWxhcmVjb2cub3JnL3RjYWcvZGF0YS1naX...,recQ1HOmLyzXsPnEU,False,schedule,,TCRTA Schedule


In [4]:
# Same "use_subfeeds" table, narrowed to the dataset names in test_cases and
# then collected.
df_filter_by_name = (
    gtfs_utils_v2.schedule_daily_feed_to_organization(
        selected_date=analysis_date,
        keep_cols=None,
        get_df=False,
        feed_option="use_subfeeds",
    )
    >> gtfs_utils_v2.filter_operator(test_cases)
    >> collect()
)

df_filter_by_name

Unnamed: 0,key,date,feed_key,base64_url,gtfs_dataset_key,is_future,type,regional_feed_type,name
0,bbf6e6f4ec56bfe600b34e0c1ef17a45,2023-01-12,90e78003416c5b09f77a9de8f266c2be,aHR0cHM6Ly93d3cubWV0cm9saW5rdHJhaW5zLmNvbS9nbG...,recR28oQlTW8GMJue,False,schedule,,Metrolink Schedule
1,04ae8f26cbc038a49fb95f45a128124e,2023-01-12,9d4387dc55091d50c717582348508bae,aHR0cDovL2d0ZnMuYmlnYmx1ZWJ1cy5jb20vY3VycmVudC...,recpN1dPaxhZvZQV0,False,schedule,,Big Blue Bus Schedule


In [5]:
# Same "use_subfeeds" table, narrowed to the feed_key values in test_feed_keys
# and then collected.
df_filter_by_feed_key = (
    gtfs_utils_v2.schedule_daily_feed_to_organization(
        selected_date=analysis_date,
        keep_cols=None,
        get_df=False,
        feed_option="use_subfeeds",
    )
    >> gtfs_utils_v2.filter_operator(test_feed_keys)
    >> collect()
)

df_filter_by_feed_key

Unnamed: 0,key,date,feed_key,base64_url,gtfs_dataset_key,is_future,type,regional_feed_type,name
0,9ab28255efd3800d0929e7e6fe71d682,2023-01-12,008d5112a7e531d0562d26e34d77869d,aHR0cHM6Ly9pcG9ydGFsLnNhY3J0LmNvbS9HVEZTL1NSVE...,recbzZQUIdMmFvm1r,False,schedule,,Sacramento Schedule
1,01314dde88fa8e00f917833b1f1e2513,2023-01-12,f8d3bfd9e780aa3b3ce1340b2116513f,aHR0cHM6Ly9sYnRyYW5zaXQuYm94LmNvbS9zaGFyZWQvc3...,recCv3CF4elAx0dUg,False,schedule,,Long Beach Schedule


In [None]:
# Unique `key` values of the datasets whose `name` is listed in test_cases.
test_keys = (
    dim_gtfs_datasets
    .loc[dim_gtfs_datasets["name"].isin(test_cases), "key"]
    .unique()
    .tolist()
)

In [None]:
# Load the cached fct_daily_schedule_feeds pull (written by the disabled query
# near the top of the notebook, which filtered to analysis_date).
fact_daily_feeds = pd.read_parquet("fct_daily_schedule_feeds.parquet")

In [None]:
# Unique feed_key values of the daily feeds whose gtfs_dataset_key is in test_keys.
test_feeds = (
    fact_daily_feeds
    .loc[fact_daily_feeds["gtfs_dataset_key"].isin(test_keys), "feed_key"]
    .unique()
    .tolist()
)

In [None]:
'''
schedule_datasets = gtfs_utils_v2.schedule_daily_feed_to_organization(
    selected_date = analysis_date, 
    keep_cols = ["date", "feed_key", "type", 
                 "regional_feed_type", "name"],
    get_df = True,
)

feed_types = [None, "Regional Subfeed"]
cols = ["date", "feed_key", "type", 
                 "regional_feed_type", "name"]

(schedule_datasets
 >> filter(_.regional_feed_type.isin(feed_types))
)[cols]
'''

## Step 2: trips

In [None]:
# Don't test Metrolink yet
# relies on columns in the dbt table not yet available
'''
trips_test1 = gtfs_utils_v2.get_trips(
    selected_date = analysis_date,
    operator_feeds= test_feeds,
    trip_cols = ["feed_key", "trip_id", "trip_key", 
                 "route_id", "route_key", 
                 "shape_array_key", #direction_id,
                 "service_hours", "trip_first_departure_sec", 
                 "trip_last_arrival_sec"
                ],
    get_df = True,
)
trips_test1.to_parquet("trips_test1.parquet")
'''

In [None]:
# Load the cached trips result (written by the disabled get_trips call above).
trips_test1 = pd.read_parquet("trips_test1.parquet")

## Step 2: stops

In [None]:
# wait til dbt new table ready

## Step 3: shapes

In [None]:
# wait til dbt new table ready

## Step 4: stop_times

In [None]:
# First five distinct trip_id values, in order of first appearance.
test_trips = trips_test1["trip_id"].unique()[:5].tolist()

In [None]:
# Rows of trips_test1 whose trip_id is one of the sampled test_trips.
sample_trips = trips_test1.loc[trips_test1["trip_id"].isin(test_trips)]

In [None]:
# Disabled code (kept as a string): queries stop_times for test_feeds, limited
# to sample_trips, and caches the result to st_test1.parquet.
# NOTE(review): this is the only place st_test1 is assigned, so the cells below
# that use st_test1 fail on a fresh kernel unless it is loaded from that parquet.
'''
st_test1 = gtfs_utils_v2.get_stop_times(
    selected_date = analysis_date,
    operator_feeds= test_feeds,
    stop_time_cols = ["feed_key", "trip_id", "stop_id", 
                      "stop_sequence", 
                      "arrival_sec", "departure_sec"
                ],
    get_df = True,
    trip_df = sample_trips
)
st_test1.to_parquet("st_test1.parquet")
'''

In [None]:
# st_test1 is only assigned inside the disabled get_stop_times string, so load
# the parquet that code cached; without this the cell raises NameError on a
# fresh kernel (same load-from-cache pattern as trips_test1).
st_test1 = pd.read_parquet("st_test1.parquet")

# Smallest arrival_sec divided by 3600 (hours, assuming the column is in seconds).
st_test1.arrival_sec.min()/3600

In [None]:
# Largest arrival_sec divided by 3600 (hours, assuming the column is in seconds).
st_test1["arrival_sec"].max() / 3600