-
Notifications
You must be signed in to change notification settings - Fork 12
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Break up daily scheduled service index (#2076)
* mock up all scheduled service
* add calendar key
* wip
* working refactor that has discrepancies with prod
* yaml and cleanup
* add more tests, some per pr review
* only build warehouse image on actual changes to image
* I forgot to export requirements

Co-authored-by: Andrew Vaccaro <atvaccaro@gmail.com>
- Loading branch information
1 parent
43fb1ed
commit 333455b
Showing
6 changed files
with
211 additions
and
125 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
58 changes: 58 additions & 0 deletions
58
warehouse/models/intermediate/gtfs/int_gtfs_schedule__all_scheduled_service.sql
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
{{ config(materialized='table') }}

-- Produces one row per (key, service_date, feed_key, service_id) on which
-- service is flagged TRUE, combining the long calendar with calendar_dates
-- overrides. Output columns, in order: key, service_date, feed_key,
-- calendar_key, calendar_dates_key, service_id.

-- Import CTEs: the two upstream models this model reads.
WITH dim_calendar_dates AS (
    SELECT *
    FROM {{ ref('dim_calendar_dates') }}
),

int_gtfs_schedule__long_calendar AS (
    SELECT *
    FROM {{ ref('int_gtfs_schedule__long_calendar') }}
),

-- Recast every calendar_dates row as a boolean service flag:
-- exception_type 1 becomes TRUE, 2 becomes FALSE, anything else is NULL.
calendar_date_flags AS (
    SELECT
        src.feed_key,
        src.service_id,
        src.date AS service_date,
        -- keep the upstream row key so the originating calendar_dates row stays traceable
        src.key AS calendar_dates_key,
        -- join key rebuilt from (feed_key, service_id, date); at time of writing
        -- it is expected to equal `calendar_dates_key`, but it is recomputed
        -- here rather than relying on that
        {{ dbt_utils.surrogate_key(['feed_key', 'service_id', 'date']) }} AS key,
        CASE src.exception_type
            WHEN 1 THEN TRUE
            WHEN 2 THEN FALSE
        END AS service_bool
    FROM dim_calendar_dates AS src
),

-- Full outer join of calendar and calendar_dates on the shared key, so a
-- row survives when it exists on either side.
merged_service AS (
    SELECT
        -- when both sides match these values are identical, so the order
        -- inside each COALESCE is irrelevant
        COALESCE(cal.key, flags.key) AS key,
        COALESCE(cal.service_date, flags.service_date) AS service_date,
        COALESCE(cal.feed_key, flags.feed_key) AS feed_key,
        COALESCE(cal.service_id, flags.service_id) AS service_id,
        -- precedence: a calendar_dates flag overrides the calendar flag;
        -- with no calendar_dates flag the calendar flag is used;
        -- with neither the result is NULL (treated as no service below)
        COALESCE(flags.service_bool, cal.service_bool) AS service_bool,
        cal.calendar_key,
        flags.calendar_dates_key
    FROM int_gtfs_schedule__long_calendar AS cal
    FULL OUTER JOIN calendar_date_flags AS flags
        ON cal.key = flags.key
),

-- Keep only rows whose resolved flag is TRUE (FALSE and NULL are dropped).
int_gtfs_schedule__all_scheduled_service AS (
    SELECT
        merged_service.key,
        merged_service.service_date,
        merged_service.feed_key,
        merged_service.calendar_key,
        merged_service.calendar_dates_key,
        merged_service.service_id
    FROM merged_service
    WHERE merged_service.service_bool IS TRUE
)

SELECT * FROM int_gtfs_schedule__all_scheduled_service
79 changes: 12 additions & 67 deletions
79
warehouse/models/intermediate/gtfs/int_gtfs_schedule__daily_scheduled_service_index.sql
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,83 +1,28 @@ | ||
{{ config(materialized='table') }} | ||
|
||
WITH dim_calendar_dates AS ( | ||
SELECT * | ||
FROM {{ ref('dim_calendar_dates') }} | ||
), | ||
|
||
int_gtfs_schedule__long_calendar AS ( | ||
SELECT * | ||
FROM {{ ref('int_gtfs_schedule__long_calendar') }} | ||
), | ||
|
||
fct_daily_schedule_feeds AS ( | ||
WITH fct_daily_schedule_feeds AS ( | ||
SELECT | ||
*, | ||
EXTRACT(DAYOFWEEK FROM date) AS day_num | ||
FROM {{ ref('fct_daily_schedule_feeds') }} | ||
), | ||
|
||
boolean_calendar_dates AS ( | ||
SELECT | ||
date, | ||
feed_key, | ||
service_id, | ||
CASE | ||
WHEN exception_type = 1 THEN TRUE | ||
WHEN exception_type = 2 THEN FALSE | ||
END AS has_service | ||
FROM dim_calendar_dates | ||
), | ||
|
||
-- decide that exception type 2 trumps exception type 1 | ||
-- i.e., if same date appears twice with two exception types | ||
-- the cancelation wins and we say no service on that date | ||
-- (this generally shouldn't happen) | ||
summarize_calendar_dates AS ( | ||
SELECT | ||
date, | ||
feed_key, | ||
service_id, | ||
LOGICAL_AND(has_service) AS has_service | ||
FROM boolean_calendar_dates | ||
GROUP BY date, feed_key, service_id | ||
), | ||
|
||
daily_services AS ( | ||
|
||
SELECT | ||
daily_feeds.date AS service_date, | ||
cal_dates.date AS cd_date, | ||
daily_feeds.feed_key, | ||
long_cal.service_id AS calendar_service_id, | ||
long_cal.has_service AS calendar_has_service, | ||
cal_dates.service_id AS calendar_dates_service_id, | ||
cal_dates.has_service AS calendar_dates_has_service, | ||
COALESCE(long_cal.service_id, cal_dates.service_id) AS service_id, | ||
-- calendar_dates takes precedence if present: it can modify calendar | ||
-- if no calendar_dates, use calendar | ||
-- if neither, no service | ||
COALESCE(cal_dates.has_service, long_cal.has_service, FALSE) AS has_service | ||
FROM fct_daily_schedule_feeds AS daily_feeds | ||
LEFT JOIN int_gtfs_schedule__long_calendar AS long_cal | ||
ON daily_feeds.feed_key = long_cal.feed_key | ||
AND daily_feeds.day_num = long_cal.day_num | ||
AND daily_feeds.date BETWEEN long_cal.start_date AND long_cal.end_date | ||
LEFT JOIN summarize_calendar_dates AS cal_dates | ||
ON daily_feeds.feed_key = cal_dates.feed_key | ||
AND daily_feeds.date = cal_dates.date | ||
AND (long_cal.service_id = cal_dates.service_id OR long_cal.service_id IS NULL) | ||
all_scheduled_service AS ( | ||
SELECT * | ||
FROM {{ ref('int_gtfs_schedule__all_scheduled_service') }} | ||
), | ||
|
||
int_gtfs_schedule__daily_scheduled_service_index AS ( | ||
SELECT | ||
service_date, | ||
cd_date, | ||
feed_key, | ||
service_id | ||
FROM daily_services | ||
WHERE service_id IS NOT NULL | ||
AND has_service | ||
fct_daily_schedule_feeds.feed_key, | ||
service_id, | ||
calendar_key, | ||
calendar_dates_key | ||
FROM all_scheduled_service | ||
INNER JOIN fct_daily_schedule_feeds | ||
ON all_scheduled_service.feed_key = fct_daily_schedule_feeds.feed_key | ||
AND all_scheduled_service.service_date = fct_daily_schedule_feeds.date | ||
) | ||
|
||
SELECT * FROM int_gtfs_schedule__daily_scheduled_service_index |
42 changes: 22 additions & 20 deletions
42
warehouse/models/intermediate/gtfs/int_gtfs_schedule__long_calendar.sql
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.