## Setup

In [1]:
from siuba import *
from siuba.sql import LazyTbl
from siuba.dply import vector as vec
from siuba.dply.vector import n

from sqlalchemy import create_engine

%run _setup.ipynb

_TEST_PREFIX = "test_"


def _table_attr_name(full_name):
    """Turn a dotted table name into the attribute name exposed on `tbl`.

    The "test_" prefix is stripped from each dot-separated part and the parts
    are joined with underscores, e.g. "test_gtfs_schedule.stop_times" becomes
    "gtfs_schedule_stop_times".

    Only a leading "test_" is removed. A plain ``str.replace("test_", "")``
    would also eat the middle of names such as "latest_feeds" (-> "lafeeds").
    """
    parts = [
        part[len(_TEST_PREFIX):] if part.startswith(_TEST_PREFIX) else part
        for part in full_name.split(".")
    ]
    return "_".join(parts)


def _is_test_table(full_name):
    """Return True for non-staging tables that carry the "test_" prefix.

    The prefix must start one of the dot-separated parts of the name; a bare
    substring check would also accept names that merely contain "test_"
    (e.g. "latest_...").
    """
    if "__staging" in full_name:
        return False
    return any(part.startswith(_TEST_PREFIX) for part in full_name.split("."))


# NOTE(review): `AutoTable` and `engine` are not defined in this cell;
# presumably they come from `%run _setup.ipynb` above -- confirm.
# Arguments are passed positionally: engine, name-mapping callable, filter callable.
tbl = AutoTable(engine, _table_attr_name, _is_test_table)

In [2]:
# Columns shared by every table below and used as the common join key,
# as plain names (for join `on` lists) and as siuba column expressions.
pk_str = ("calitp_itp_id", "calitp_url_number")
pk_cols = (_.calitp_itp_id, _.calitp_url_number)

# Lazy handles on the three base schedule tables used throughout the notebook.
tbl_agency, tbl_routes, tbl_trips = (
    tbl.gtfs_schedule_agency(),
    tbl.gtfs_schedule_routes(),
    tbl.gtfs_schedule_trips(),
)


In [3]:
#tbl_agency

## Main table previews

In [4]:
#tbl_agency

In [5]:
#tbl_routes

In [6]:
#tbl_trips

## Model feed trips view

### Fill in implicit agency_ids

In [7]:
# Where agency_id is missing, substitute the feed's calitp_itp_id (cast to
# string) so that routes and agency rows still match on agency_id.
expr_fill_id = _.agency_id.fillna(_.calitp_itp_id.astype(str))

routes_with_agency_id = tbl_routes >> mutate(agency_id=expr_fill_id)
agency_with_agency_id = tbl_agency >> mutate(agency_id=expr_fill_id)

# trips -> routes (by route_id) -> agency (by the filled-in agency_id)
tbl_feed_trips = (
    tbl_trips
    >> left_join(_, routes_with_agency_id, [*pk_str, "route_id"])
    >> left_join(_, agency_with_agency_id, [*pk_str, "agency_id"])
)


## Model trip stop times

In [8]:
# Each trip joined to its stop_times rows, and each of those to its stop.
trip_stop_times = tbl_trips >> left_join(
    _, tbl.gtfs_schedule_stop_times(), [*pk_str, "trip_id"]
)

tbl_trip_stops = trip_stop_times >> left_join(
    _, tbl.gtfs_schedule_stops(), [*pk_str, "stop_id"]
)

## Model tidy calendar

### Gather function

In [9]:
from siuba import gather
from siuba.sql import LazyTbl
from siuba.dply.verbs import singledispatch2

@gather.register(LazyTbl)
def _gather_sql(__data, key="key", value="value", *args, drop_na=False, convert=False):
    """Gather (wide-to-long) columns of a LazyTbl using a UNION ALL of selects.

    For every selected column, one SELECT is built from the table's most
    recent query containing all non-selected columns, the selected column's
    name as a literal labelled `key`, and its value labelled `value`. The
    selects are combined with UNION ALL and appended as a new operation.

    Parameters
    ----------
    __data : LazyTbl
        Table to gather.
    key : str
        Name of the output column holding the gathered column names.
    value : str
        Name of the output column holding the gathered values.
    *args
        Column selection (siuba tidy-select syntax) of the columns to gather.
        Required.
    drop_na, convert : bool
        Not supported for SQL tables; must be left falsy.

    Returns
    -------
    LazyTbl
        The gathered table, or `__data` unchanged when the selection matches
        no columns.

    Raises
    ------
    NotImplementedError
        If no columns are given, or if `drop_na` or `convert` is requested.
    """
    from siuba.dply.verbs import var_select, var_create
    from siuba.sql.verbs import lift_inner_cols
    import pandas as pd
    from sqlalchemy import sql

    if not args:
        raise NotImplementedError("must specify columns to gather as *args")

    # Fail loudly rather than silently returning un-dropped / un-converted data.
    if drop_na:
        raise NotImplementedError("drop_na is not implemented for SQL tables")
    if convert:
        raise NotImplementedError("convert is not implemented for SQL tables")

    # most recent select statement and inner columns ----
    sel = __data.last_op
    columns = lift_inner_cols(sel)

    # tidy select variables for gathering ----
    var_list = var_create(*args)
    od = var_select(pd.Series(columns.keys()), *var_list)

    # get sql columns corresponding to variables ----
    value_vars = [columns[k] for k in od]
    id_vars = [columns[k] for k in columns.keys() if k not in od]

    # Nothing selected: there is nothing to union, so leave the table as is.
    if not value_vars:
        return __data

    # union each key variable into long format ----
    # TODO: may require CTE
    queries = [
        sel.with_only_columns(
            [
                *id_vars,
                sql.literal(value_col.name).label(key),
                value_col.label(value),
            ]
        )
        for value_col in value_vars
    ]

    # make union all into a subquery for now, just to be safe, since
    # siuba might not respond well to a CompoundSelect
    return __data.append_op(sql.union_all(*queries).select())



### Query

In [22]:
from siuba.sql import sql_raw

# Shared tail for calendar_dates pipelines: parse the date string, drop the
# bookkeeping columns, and expose the parsed date as service_date.
process_cal_dates = (
    mutate(date=sql_raw('PARSE_DATE("%Y%m%d", date)'))
    >> select(
        -_["exception_type", "calitp_extracted_at"],
        _.service_date == _.date,
    )
)


def _flag_calendar_dates(exception_type, **flag):
    """Return calendar_dates rows of one exception_type, tagged with `flag`.

    `flag` is a single `column_name=True` keyword that is added as a constant
    column before the shared `process_cal_dates` steps are applied.
    """
    return (
        tbl.gtfs_schedule_calendar_dates()
        >> filter(_.exception_type == exception_type)
        >> mutate(**flag)
        >> process_cal_dates
    )


date_include = _flag_calendar_dates("1", service_inclusion=True)
date_exclude = _flag_calendar_dates("2", service_exclusion=True)

# Calendar with parsed dates, reshaped from one column per weekday to one row
# per (service, weekday).
calendar_long = (
    tbl.gtfs_schedule_calendar()
    >> mutate(
        start_date=sql_raw('PARSE_DATE("%Y%m%d", start_date)'),
        end_date=sql_raw('PARSE_DATE("%Y%m%d", end_date)'),
    )
    >> gather("day_name", "service_indicator", _["monday":"sunday"])
    >> mutate(day_name=_.day_name.str.title())
)

dim_dates = tbl.views_dim_date() >> select(_.day_name, _.full_date)


def _same_weekday_within_range(lhs, rhs):
    """Join condition: same weekday, and date within [start_date, end_date]."""
    same_weekday = lhs.day_name == rhs.day_name
    on_or_after_start = lhs.start_date <= rhs.full_date
    on_or_before_end = lhs.end_date >= rhs.full_date  # end date is inclusive
    return same_weekday & on_or_after_start & on_or_before_end


# Expand each (service, weekday) row to every matching calendar date.
# This needs to be an inner join, in case a scheduled interval is e.g. 1 day,
# since gathering will still produce 7 rows (1 per day of week).
calendar_daily = (
    calendar_long
    >> inner_join(_, dim_dates, sql_on=_same_weekday_within_range)
    >> select(-_.startswith("day_name"))
    >> rename(
        service_date="full_date",
        service_cal_start_date="start_date",
        service_cal_end_date="end_date",
    )
)

service_day_keys = (*pk_str, "service_id", "service_date")

# Full joins, since an agency can define a schedule using only calendar dates,
# e.g. every day a service runs is specified using exceptions.
tbl_schedule_daily = (
    calendar_daily
    >> full_join(_, date_include, list(service_day_keys))
    >> full_join(_, date_exclude, list(service_day_keys))
    >> mutate(
        # (scheduled AND NOT excluded) OR explicitly included
        is_in_service=(
            ((_.service_indicator == "1") & ~_.service_exclusion.fillna(False))
            | _.service_inclusion.fillna(False)
        )
    )
)

# sanity check that vals are either 0 or 1
#tbl_schedule_daily >> distinct(_.is_in_service)

In [11]:
#tbl_schedule_daily >> arrange(_.calitp_itp_id, _.calitp_url_number, _.service_id, _.service_date)

In [12]:
#tbl_schedule_daily >> count()

In [13]:
# Daily schedule rows with service_date in April 2021 (upper bound exclusive).
schedule_april_2021 = tbl_schedule_daily >> filter(
    _.service_date >= "2021-04-01",
    _.service_date < "2021-05-01",
)

# Trips of a single feed (calitp_itp_id 3, calitp_url_number 0).
trips_itp3_url0 = (
    tbl_feed_trips
    >> filter(_.calitp_itp_id == 3, _.calitp_url_number == 0)
    >> select(-_.calitp_extracted_at)
)

# One row per trip per matching schedule day.
recent_trips = trips_itp3_url0 >> left_join(
    _,
    schedule_april_2021,
    ["calitp_itp_id", "calitp_url_number", "service_id"],
)

In [14]:
# (        tbl_schedule_daily
#         >> filter(_.service_date >= "2021-04-01", _.service_date < "2021-05-01", _.calitp_itp_id == 3, _.calitp_url_number == 0)
#         >> count(_.service_id)
        
# )

In [15]:
#tbl_feed_trips >> count()

In [16]:
# Number of trips in the feed (n_trips). The recent_trips row count is
# expected to be n_trips * n_calendar_days.
tbl_feed_trips >> filter(_.calitp_itp_id == 3, _.calitp_url_number == 0) >> count()

Unnamed: 0,n
0,5447


In [17]:
# Row count after joining each trip to its daily schedule rows.
recent_trips >> count()

Unnamed: 0,n
0,163410


In [18]:
#tbl_feed_trips >> filter(_.calitp_itp_id == 3, _.calitp_url_number == 0) >> collect()

#tbl_schedule_daily >> filter(_.calitp_itp_id == 3, _.calitp_url_number == 0) >> count()

In [19]:
#tbl_feed_trips >> filter(_.service_id == "202130M-vs20213M-Saturday-04")

## Trip service hours

In [20]:
# First arrival and last departure per trip, over the whole stop_times table.
# NOTE: as recorded in this cell's output, displaying this query failed with
# "Query exceeded limit for bytes billed".
# NOTE(review): if arrival_time / departure_time are stored as text, min/max
# compare them lexicographically -- confirm the column types.
stop_times_by_trip = tbl.gtfs_schedule_stop_times() >> group_by(
    _.calitp_itp_id, _.calitp_url_number, _.trip_id
)

stop_times_by_trip >> summarize(
    first_arrival=_.arrival_time.min(),
    last_departure=_.departure_time.max(),
)

DatabaseError: (google.cloud.bigquery.dbapi.exceptions.DatabaseError) 500 Query exceeded limit for bytes billed: 100000000. 489684992 or higher required.

(job ID: f8e1263b-1915-481c-84fe-f2fd0446424e)

                                                                                                                                                                      -----Query Job SQL Follows-----                                                                                                                                                                       

    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |
   1:SELECT `anon_1`.`calitp_itp_id`, `anon_1`.`calitp_url_number`, `anon_1`.`trip_id`, `anon_1`.`first_arrival`, `anon_1`.`last_departure` 
   2:FROM (SELECT `test_gtfs_schedule.stop_times`.`calitp_itp_id` AS `calitp_itp_id`, `test_gtfs_schedule.stop_times`.`calitp_url_number` AS `calitp_url_number`, `test_gtfs_schedule.stop_times`.`trip_id` AS `trip_id`, min(`test_gtfs_schedule.stop_times`.`arrival_time`) AS `first_arrival`, max(`test_gtfs_schedule.stop_times`.`departure_time`) AS `last_departure` 
   3:FROM `test_gtfs_schedule.stop_times` GROUP BY `test_gtfs_schedule.stop_times`.`calitp_itp_id`, `test_gtfs_schedule.stop_times`.`calitp_url_number`, `test_gtfs_schedule.stop_times`.`trip_id`) AS `anon_1`
   4: LIMIT @`param_1`
    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |
[SQL: SELECT `anon_1`.`calitp_itp_id`, `anon_1`.`calitp_url_number`, `anon_1`.`trip_id`, `anon_1`.`first_arrival`, `anon_1`.`last_departure` 
FROM (SELECT `test_gtfs_schedule.stop_times`.`calitp_itp_id` AS `calitp_itp_id`, `test_gtfs_schedule.stop_times`.`calitp_url_number` AS `calitp_url_number`, `test_gtfs_schedule.stop_times`.`trip_id` AS `trip_id`, min(`test_gtfs_schedule.stop_times`.`arrival_time`) AS `first_arrival`, max(`test_gtfs_schedule.stop_times`.`departure_time`) AS `last_departure` 
FROM `test_gtfs_schedule.stop_times` GROUP BY `test_gtfs_schedule.stop_times`.`calitp_itp_id`, `test_gtfs_schedule.stop_times`.`calitp_url_number`, `test_gtfs_schedule.stop_times`.`trip_id`) AS `anon_1`
 LIMIT %(param_1:INT64)s]
[parameters: {'param_1': 5}]
(Background on this error at: http://sqlalche.me/e/13/4xp6)

DatabaseError: (google.cloud.bigquery.dbapi.exceptions.DatabaseError) 500 Query exceeded limit for bytes billed: 100000000. 489684992 or higher required.

(job ID: 036ee08a-6221-48f1-a3a5-a93c3e7befbe)

                                                                                                                                                                      -----Query Job SQL Follows-----                                                                                                                                                                       

    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |
   1:SELECT `anon_1`.`calitp_itp_id`, `anon_1`.`calitp_url_number`, `anon_1`.`trip_id`, `anon_1`.`first_arrival`, `anon_1`.`last_departure` 
   2:FROM (SELECT `test_gtfs_schedule.stop_times`.`calitp_itp_id` AS `calitp_itp_id`, `test_gtfs_schedule.stop_times`.`calitp_url_number` AS `calitp_url_number`, `test_gtfs_schedule.stop_times`.`trip_id` AS `trip_id`, min(`test_gtfs_schedule.stop_times`.`arrival_time`) AS `first_arrival`, max(`test_gtfs_schedule.stop_times`.`departure_time`) AS `last_departure` 
   3:FROM `test_gtfs_schedule.stop_times` GROUP BY `test_gtfs_schedule.stop_times`.`calitp_itp_id`, `test_gtfs_schedule.stop_times`.`calitp_url_number`, `test_gtfs_schedule.stop_times`.`trip_id`) AS `anon_1`
   4: LIMIT @`param_1`
    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |
[SQL: SELECT `anon_1`.`calitp_itp_id`, `anon_1`.`calitp_url_number`, `anon_1`.`trip_id`, `anon_1`.`first_arrival`, `anon_1`.`last_departure` 
FROM (SELECT `test_gtfs_schedule.stop_times`.`calitp_itp_id` AS `calitp_itp_id`, `test_gtfs_schedule.stop_times`.`calitp_url_number` AS `calitp_url_number`, `test_gtfs_schedule.stop_times`.`trip_id` AS `trip_id`, min(`test_gtfs_schedule.stop_times`.`arrival_time`) AS `first_arrival`, max(`test_gtfs_schedule.stop_times`.`departure_time`) AS `last_departure` 
FROM `test_gtfs_schedule.stop_times` GROUP BY `test_gtfs_schedule.stop_times`.`calitp_itp_id`, `test_gtfs_schedule.stop_times`.`calitp_url_number`, `test_gtfs_schedule.stop_times`.`trip_id`) AS `anon_1`
 LIMIT %(param_1:INT64)s]
[parameters: {'param_1': 5}]
(Background on this error at: http://sqlalche.me/e/13/4xp6)