Skip to content

Commit

Permalink
working refactor that has discrepancies with prod
Browse files Browse the repository at this point in the history
  • Loading branch information
lauriemerrell committed Dec 14, 2022
1 parent 039858a commit ff7def0
Show file tree
Hide file tree
Showing 2 changed files with 34 additions and 68 deletions.
Original file line number Diff line number Diff line change
@@ -1,12 +1,6 @@
{{ config(materialized='table') }}

-- Date spine with a day-of-week number attached to every date_day.
WITH date_spine AS (
SELECT *,
-- BigQuery DAYOFWEEK numbering runs from 1 (Sunday) through 7 (Saturday)
EXTRACT(DAYOFWEEK FROM date_day) AS day_num
FROM {{ ref('util_gtfs_schedule_v2_date_spine') }}
),

-- Import CTE: every column of the dim_calendar_dates model, unchanged.
-- NOTE(review): two opening lines appear here (with and without WITH); this looks
-- like the old and new version of the same line shown back to back -- confirm.
dim_calendar_dates AS (
WITH dim_calendar_dates AS (
SELECT *
FROM {{ ref('dim_calendar_dates') }}
),
Expand All @@ -18,56 +12,32 @@ int_gtfs_schedule__long_calendar AS (

-- Turns each dim_calendar_dates row's exception_type into a boolean service flag.
-- NOTE(review): several columns appear twice under different aliases
-- (date / service_date, calendar_dates_key / key, has_service / service_bool);
-- these look like old and new versions of the same line -- confirm which set applies.
boolean_calendar_dates AS (
SELECT
date,
date AS service_date,
feed_key,
key AS calendar_dates_key,
key,
service_id,
-- exception_type 1 -> TRUE, 2 -> FALSE (in GTFS calendar_dates.txt, 1 means
-- service added and 2 means service removed); there is no ELSE branch, so any
-- other value yields NULL
CASE
WHEN exception_type = 1 THEN TRUE
WHEN exception_type = 2 THEN FALSE
END AS has_service
END AS service_bool
FROM dim_calendar_dates
),

-- decide that exception type 2 trumps exception type 1
-- i.e., if same date appears twice with two exception types
-- the cancelation wins and we say no service on that date
-- (this generally shouldn't happen)
-- Collapses has_service per (date, feed_key, calendar_dates_key, service_id) group.
-- LOGICAL_AND is TRUE only when every non-NULL has_service in the group is TRUE,
-- so a single FALSE row makes the whole group FALSE.
-- NOTE(review): calendar_dates_key is part of the GROUP BY; if that key is unique
-- per calendar_dates row, each group holds exactly one row and conflicting
-- exception types for the same date are never actually combined -- confirm.
summarize_calendar_dates AS (
SELECT
date,
feed_key,
calendar_dates_key,
service_id,
LOGICAL_AND(has_service) AS has_service
FROM boolean_calendar_dates
GROUP BY date, feed_key, calendar_dates_key, service_id
),

-- Combines the calendar-based service rows (long_cal) with the calendar_dates
-- exceptions (cal_dates) into one service flag per row.
-- NOTE(review): two select lists and two FROM/JOIN sections appear in this span:
-- one driven by date_spine with LEFT JOINs producing has_service, and one using a
-- FULL OUTER JOIN producing service_bool. They look like the old and new versions
-- of the same CTE shown together -- confirm which one is current.
daily_services AS (

SELECT
date_spine.date_day AS service_date,
-- these values will be identical so doesn't matter which is first in coalesce
COALESCE(long_cal.service_date, cal_dates.service_date) AS service_date,
COALESCE(long_cal.feed_key, cal_dates.feed_key) AS feed_key,
calendar_key,
calendar_dates_key,
long_cal.service_id AS calendar_service_id,
long_cal.has_service AS calendar_has_service,
cal_dates.service_id AS calendar_dates_service_id,
cal_dates.has_service AS calendar_dates_has_service,
COALESCE(long_cal.service_id, cal_dates.service_id) AS service_id,
-- calendar_dates takes precedence if present: it can modify calendar
-- if no calendar_dates, use calendar
-- if neither, no service
COALESCE(cal_dates.has_service, long_cal.has_service, FALSE) AS has_service
FROM date_spine
-- match calendar rows on day of week, restricted to dates inside the
-- calendar's start_date..end_date range (BETWEEN is inclusive on both ends)
LEFT JOIN int_gtfs_schedule__long_calendar AS long_cal
ON date_spine.day_num = long_cal.day_num
AND date_spine.date_day BETWEEN long_cal.start_date AND long_cal.end_date
-- the IS NULL alternatives let a calendar_dates row attach on date alone
-- when no calendar row matched for that spine date
LEFT JOIN summarize_calendar_dates AS cal_dates
ON date_spine.date_day = cal_dates.date
AND (long_cal.feed_key = cal_dates.feed_key OR long_cal.feed_key IS NULL)
AND (long_cal.service_id = cal_dates.service_id OR long_cal.service_id IS NULL)
-- calendar_dates value wins when present, otherwise the calendar value is used;
-- unlike has_service there is no FALSE default, so this is NULL when both are NULL
COALESCE(cal_dates.service_bool, long_cal.service_bool) AS service_bool,
calendar_key,
cal_dates.key AS calendar_dates_key
FROM int_gtfs_schedule__long_calendar AS long_cal
-- FULL OUTER JOIN keeps rows that exist on only one side
-- NOTE(review): USING (key) only pairs rows if `key` is built from the same
-- inputs in both relations -- confirm how each side derives its key.
FULL OUTER JOIN boolean_calendar_dates AS cal_dates
USING (key)
),

int_gtfs_schedule__all_scheduled_service AS (
Expand All @@ -78,8 +48,7 @@ int_gtfs_schedule__all_scheduled_service AS (
calendar_dates_key,
service_id
FROM daily_services
WHERE service_id IS NOT NULL
AND has_service
WHERE service_bool
)

SELECT * FROM int_gtfs_schedule__all_scheduled_service
Original file line number Diff line number Diff line change
@@ -1,35 +1,32 @@
{{ config(materialized='table',
cluster_by = ['day_num', 'start_date']
) }}
{{ config(materialized='table') }}

-- TODO: make an intermediate calendar and use that instead of the dimension
-- Import CTE: every column of the dim_calendar model, unchanged.
WITH dim_calendar AS (
SELECT *
FROM {{ ref('dim_calendar') }}
),

-- TODO: see if this can be refactored using UNPIVOT (logic inherited from v1 warehouse, wondering if it should be revisited)
-- Reshapes dim_calendar's per-weekday columns (monday .. sunday) into rows.
-- NOTE(review): two bodies appear in this span: a Jinja loop that UNION ALLs one
-- SELECT per weekday, followed by a single SELECT that expands each calendar row
-- into one row per date. They look like the old and new versions of the same CTE
-- shown together -- confirm which one is current.
int_gtfs_schedule__long_calendar AS (
-- Note that you can unnest values easily in SQL, but getting the column names
-- is weirdly hard. To work around this, we just UNION ALL.
-- Each pair below is (weekday column name, day_num emitted for that weekday).
-- NOTE(review): sunday is given day_num 0 while the other days use 2-7; BigQuery's
-- EXTRACT(DAYOFWEEK ...) returns 1 for Sunday, so a join against a DAYOFWEEK-based
-- day_num would never match the sunday rows -- confirm.
{% for dow in [("monday", 2), ("tuesday", 3), ("wednesday", 4), ("thursday", 5), ("friday", 6), ("saturday", 7), ("sunday", 0)] %}

{% if not loop.first %}
UNION ALL
{% endif %}

SELECT
base64_url,
key AS calendar_key,
feed_key,
service_id,
start_date,
end_date,
"{{ dow[0] }}" AS day_name,
{{ dow[1] }} AS day_num,
CAST({{ dow[0] }} AS boolean) AS has_service
FROM dim_calendar
{% endfor %}
SELECT
feed_key,
-- row key derived from feed_key, service_id and the generated date dt
{{ dbt_utils.surrogate_key(['feed_key', 'service_id', 'dt']) }} AS key,
service_id,
dt AS service_date,
EXTRACT(DAYOFWEEK FROM dt) AS day_num,
-- pick the weekday column matching dt; BigQuery DAYOFWEEK is 1 (Sunday)
-- through 7 (Saturday). No ELSE branch, so a NULL dt yields NULL.
CASE
WHEN EXTRACT(DAYOFWEEK FROM dt) = 1 THEN CAST(sunday AS bool)
WHEN EXTRACT(DAYOFWEEK FROM dt) = 2 THEN CAST(monday AS bool)
WHEN EXTRACT(DAYOFWEEK FROM dt) = 3 THEN CAST(tuesday AS bool)
WHEN EXTRACT(DAYOFWEEK FROM dt) = 4 THEN CAST(wednesday AS bool)
WHEN EXTRACT(DAYOFWEEK FROM dt) = 5 THEN CAST(thursday AS bool)
WHEN EXTRACT(DAYOFWEEK FROM dt) = 6 THEN CAST(friday AS bool)
WHEN EXTRACT(DAYOFWEEK FROM dt) = 7 THEN CAST(saturday AS bool)
END AS service_bool,
key AS calendar_key
FROM dim_calendar
-- one row per day between calendar service start and end date
-- https://stackoverflow.com/questions/38694040/how-to-generate-date-series-to-occupy-absent-dates-in-google-biqquery/58169269#58169269
-- the end of the generated range is capped at one year after the current date
-- NOTE(review): presumably the LEFT JOIN keeps a calendar row with dt NULL when
-- the generated array is empty (e.g. start_date after the capped end) -- confirm.
LEFT JOIN UNNEST(GENERATE_DATE_ARRAY(start_date, LEAST(end_date, DATE_ADD(CURRENT_DATE(), INTERVAL 1 YEAR)))) AS dt
)

SELECT *
Expand Down

0 comments on commit ff7def0

Please sign in to comment.