Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Change GTFS RT / schedule mapping to handle schedule download failures #2899

Merged
merged 2 commits into from
Aug 17, 2023
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
51 changes: 44 additions & 7 deletions warehouse/macros/gtfs_rt_messages_keying.sql
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,42 @@ WITH
fct_daily_schedule_feeds AS (
SELECT *
FROM {{ ref('fct_daily_schedule_feeds') }}
),

-- follow int_transit_database__urls_to_gtfs_datasets example
-- to handle cases where a feed fails to download on a given day
-- schedule models fill in gaps if download later succeeds, but RT data does not
-- because RT models are incremental
-- we want to map RT data to most recent download
-- especially for schedule feed timezone

appearance_duration AS (
SELECT
base64_url,
MAX(_valid_to) AS latest_app,
MIN(_valid_from) AS first_app
FROM dim_schedule_feeds
GROUP BY base64_url
),

extend_schedule_dates AS (
SELECT
dim_schedule_feeds.base64_url,
dim_schedule_feeds.key AS schedule_feed_key,
dim_schedule_feeds.feed_timezone,
-- backdate start of first URL/record relationship so data scraped before record can be mapped
CASE
WHEN dim_schedule_feeds._valid_from = appearance_duration.first_app THEN CAST('1900-01-01' AS TIMESTAMP)
ELSE dim_schedule_feeds._valid_from
END AS _valid_from,
-- forward date end of last URL/record relationship to allow for mapping if schedule feed fails to download while RT still online
CASE
WHEN dim_schedule_feeds._valid_to = appearance_duration.latest_app THEN {{ make_end_of_valid_range('CAST("2099-01-01" AS TIMESTAMP)') }}
ELSE dim_schedule_feeds._valid_to
END AS _valid_to,
FROM dim_schedule_feeds
LEFT JOIN appearance_duration
USING (base64_url)
)

-- if we ever backfill v1 RT data, the reliance on _config_extract_ts for joins in this table may become problematic
Expand All @@ -31,11 +67,12 @@ WITH
schedule_datasets.key AS schedule_gtfs_dataset_key,
schedule_datasets.base64_url AS schedule_base64_url,
schedule_datasets.name AS schedule_name,
COALESCE(dim_schedule_feeds.key, fct_daily_schedule_feeds.feed_key) AS schedule_feed_key,
-- TODO: coalescing to America/Los_Angeles at the end here is a bit of a blunt instrument to ensure the field is populated
COALESCE(extend_schedule_dates.schedule_feed_key, fct_daily_schedule_feeds.feed_key) AS schedule_feed_key,
-- extend_schedule_dates + fallback to daily feed should ensure that timezone is populated
-- but just in case...
-- coalescing to America/Los_Angeles at the end here is a bit of a blunt instrument to ensure the field is populated
-- America/Los_Angeles is the most common time zone by a huge margin, so we assume it's a good guess
-- we could do more advanced imputation eventually; the issue here is cases where RT data was downloaded but schedule wasn't available for some reason
COALESCE(dim_schedule_feeds.feed_timezone, fct_daily_schedule_feeds.feed_timezone, "America/Los_Angeles") AS schedule_feed_timezone,
COALESCE(extend_schedule_dates.feed_timezone, fct_daily_schedule_feeds.feed_timezone, "America/Los_Angeles") AS schedule_feed_timezone,
rt.* EXCEPT(_name)
FROM {{ raw_messages }} AS rt
LEFT JOIN urls_to_gtfs_datasets
Expand All @@ -52,9 +89,9 @@ WITH
AND rt._config_extract_ts BETWEEN schedule_datasets._valid_from AND schedule_datasets._valid_to
-- use _extract_ts for this join because now we're looking at the actual data
-- we want the schedule feed that was in effect when this RT data was actually scraped
LEFT JOIN dim_schedule_feeds
ON schedule_datasets.base64_url = dim_schedule_feeds.base64_url
AND rt._extract_ts BETWEEN dim_schedule_feeds._valid_from AND dim_schedule_feeds._valid_to
LEFT JOIN extend_schedule_dates
ON schedule_datasets.base64_url = extend_schedule_dates.base64_url
AND rt._extract_ts BETWEEN extend_schedule_dates._valid_from AND extend_schedule_dates._valid_to
-- use this as a fallback for cases where the above join fails because the schedule URL changed
-- and RT data was downloaded that references a new URL that hasn't had data downloaded yet
LEFT JOIN fct_daily_schedule_feeds
Expand Down