# Trips query and aggregate

Use `views.gtfs_schedule_fact_daily_trips` to group trips into time-of-day categories based on each trip's first departure.

Then aggregate the `n_stops`, `n_stop_times`, and `service_hours` columns to the shape_id / day level.

In [1]:
import os
# Sets CALITP_BQ_MAX_BYTES to 100_000_000_000.
# NOTE(review): presumably a BigQuery bytes limit read by `calitp`; it is set
# ahead of the `calitp` import below -- confirm whether that ordering is required.
os.environ["CALITP_BQ_MAX_BYTES"] = str(100_000_000_000)

import dask.dataframe as dd

from calitp.tables import tbl
# NOTE(review): star import -- the disabled query cell further down uses names
# such as `_`, `filter`, `select` and `collect`, which presumably come from here.
from siuba import *

from shared_utils import rt_utils

# Base GCS folder for the cached parquet views read by this notebook.
GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/rt_delay/compiled_cached_views/"



In [2]:
# Cached trips table for the analysis date, opened as a dask dataframe.
date = "2022-09-14"
trips_parquet = f"{GCS_FILE_PATH}trips_{date}.parquet"
trips = dd.read_parquet(trips_parquet)

In [3]:
# Disabled one-off download, kept as a string literal so it does not execute
# (the cell only echoes the string back as its output).
# The code inside queries `views.gtfs_schedule_fact_daily_trips` for a single
# operator (ITP_ID) on two service dates and writes `day1.parquet` /
# `day2.parquet`, the files a later cell reads back in.
# NOTE(review): on a fresh checkout those files will not exist until this code
# has been run once -- confirm how they are meant to be produced.
'''
ITP_ID = 183

day1 = (tbl.views.gtfs_schedule_fact_daily_trips()
         >> filter(_.calitp_itp_id == ITP_ID, 
           _.service_date == "2022-09-14",
           _.is_in_service == True
          )
        >> select(_.calitp_itp_id, _.calitp_url_number, _.service_date,
                  _.trip_id, _.trip_first_departure_ts, 
                  _.service_hours, _.n_stops, _.n_stop_times
          )
        >> collect()
)

day2 = (tbl.views.gtfs_schedule_fact_daily_trips()
        >> filter(_.calitp_itp_id == ITP_ID, 
           _.service_date == "2022-10-12", 
           _.is_in_service == True
          )
        >> select(_.calitp_itp_id, _.calitp_url_number, _.service_date,
                  _.trip_id, _.trip_first_departure_ts, 
                  _.service_hours, _.n_stops, _.n_stop_times
          )
        >> collect()
)

day1.to_parquet("day1.parquet")
day2.to_parquet("day2.parquet")
'''

'\nITP_ID = 183\n\nday1 = (tbl.views.gtfs_schedule_fact_daily_trips()\n         >> filter(_.calitp_itp_id == ITP_ID, \n           _.service_date == "2022-09-14",\n           _.is_in_service == True\n          )\n        >> select(_.calitp_itp_id, _.calitp_url_number, _.service_date,\n                  _.trip_id, _.trip_first_departure_ts, \n                  _.service_hours, _.n_stops, _.n_stop_times\n          )\n        >> collect()\n)\n\nday2 = (tbl.views.gtfs_schedule_fact_daily_trips()\n        >> filter(_.calitp_itp_id == ITP_ID, \n           _.service_date == "2022-10-12", \n           _.is_in_service == True\n          )\n        >> select(_.calitp_itp_id, _.calitp_url_number, _.service_date,\n                  _.trip_id, _.trip_first_departure_ts, \n                  _.service_hours, _.n_stops, _.n_stop_times\n          )\n        >> collect()\n)\n\nday1.to_parquet("day1.parquet")\nday2.to_parquet("day2.parquet")\n'

In [4]:
# Inspect which columns the cached trips table provides.
trips.columns

Index(['calitp_itp_id', 'calitp_url_number', 'service_date', 'trip_key',
       'trip_id', 'route_id', 'direction_id', 'shape_id',
       'calitp_extracted_at', 'calitp_deleted_at', 'route_short_name',
       'route_long_name', 'route_desc', 'route_type'],
      dtype='object')

In [5]:
# Load the two locally saved daily-trip extracts and stack them row-wise.
# `dd.concat` is the documented public entry point for concatenation;
# `dd.multi` is an internal module path and should not be relied on.
day1 = dd.read_parquet("day1.parquet")
day2 = dd.read_parquet("day2.parquet")
ddf = dd.concat([day1, day2], axis=0)

In [6]:
##https://stackoverflow.com/questions/39584118/dask-dataframe-how-to-convert-column-to-to-datetime
ddf = ddf.assign(
    departure_hour = dd.to_datetime(ddf.trip_first_departure_ts, 
                                    unit="s").dt.hour
)

In [8]:
# Bucket each trip into a time-of-day category from its departure hour.
# Map over the single `departure_hour` column rather than applying row-wise:
# `rt_utils.categorize_time_of_day` only needs that one value, so there is no
# reason to build a per-row object for every trip.
ddf = ddf.assign(
    time_of_day=ddf["departure_hour"].map(
        rt_utils.categorize_time_of_day,
        meta=("time_of_day", "str"),
    )
)

In [9]:
# Number of trips in each time-of-day bucket; .compute() materialises the dask result.
ddf["time_of_day"].value_counts().compute()

PM Peak     2194
Midday      2136
AM Peak     1366
Early AM     648
Evening      240
Name: time_of_day, dtype: int64

In [17]:
# Attach the per-trip hours / time-of-day columns to the cached trips table.
# how="outer" keeps trips that appear in only one of the two inputs.
# NOTE(review): `trips` is loaded for a single date while `ddf` stacks two
# extracts -- confirm that unmatched rows on either side are intended here.
trip_keys = ["calitp_itp_id", "calitp_url_number", "service_date", "trip_id"]
trips_with_hours = trips.merge(ddf, on=trip_keys, how="outer")

In [20]:
# Aggregate trips to one row per feed / service date / shape / time-of-day.
group_cols = [
    "calitp_itp_id",
    "calitp_url_number",
    "service_date",
    "shape_id",
    "time_of_day",
]

trip_aggs = {
    "service_hours": "sum",
    # Mean, not sum: summing n_stops would just reproduce n_stop_times.
    # Open question: how to count unique stops correctly without bringing in
    # the stops table? Mean should be right, since trips sharing a shape_id
    # travel the same path.
    "n_stops": "mean",
    "n_stop_times": "sum",
    # Number of trips in each group.
    "trip_id": "count",
}

deduped_trips = trips_with_hours.drop_duplicates()
day_time_of_day_df = (
    deduped_trips.groupby(group_cols).agg(trip_aggs).reset_index()
)