In [3]:
import calitp
from calitp.tables import tbl
from siuba import *

import pandas as pd
import numpy as np
import geopandas as gpd
import fiona
import datetime as dt
import os

from utils import *

# Set CALITP_BQ_MAX_BYTES to 100 GB (100_000_000_000 bytes) for this session.
# NOTE(review): presumably the calitp per-query BigQuery byte cap — confirm against the calitp docs.
os.environ["CALITP_BQ_MAX_BYTES"] = str(100_000_000_000)

### Plan

* start from all shapes for a single operator
* for each shape, assess for each hour of day x weekday/sat/sun (with day of week/time of day cols):
    * (can loop over weekday/sat/sun here at trip+stop_times join)
    * existing frequency at midpoint stop (here)
    * current total runtime
* preserve route_id in main table
* (separately) calculate operator/routes/shapes in each Census tract
* can then join tracts to service info; characterize route service target geographically
    * generate additional frequencies/service hours/service miles for service target
* (optional) generate hypothetical trips table
* (optional) assign service hours/miles to tracts (not sure why we'd need this yet)

In [4]:
# get_recent_dates is not defined in this notebook; presumably it arrives via `from utils import *`.
# The recorded output of the next cell shows a dict mapping 'thurs', 'sat' and 'sun' to datetime.date values.
dates = get_recent_dates()

In [5]:
dates

{'thurs': datetime.date(2021, 10, 14),
 'sat': datetime.date(2021, 10, 9),
 'sun': datetime.date(2021, 10, 10)}

In [143]:
# dict(zip(dates.values(), dates.keys()))

In [144]:
# dates.values()

In [8]:
# Earliest and latest analysis dates; used by the queries below to bound
# calitp_extracted_at / calitp_deleted_at.
min_date, max_date = min(dates.values()), max(dates.values())

In [10]:
# Calendar lookup: keep full_date (renamed to `date` via siuba's `new == old` select syntax)
# and day_name, so it can be joined onto the daily trips on 'date'.
date_tbl = tbl.views.dim_date() >> select(_.date == _.full_date, _.day_name)

In [56]:
# In-service trips for operator 300 on the analysis dates, labelled with day_name.
newdays = (tbl.views.gtfs_schedule_fact_daily_trips()
     # keep rows extracted on/before the earliest date and deleted after the latest one
     >> filter(_.calitp_extracted_at <= min_date, _.calitp_deleted_at > max_date)
     # `_.date == _.service_date` renames service_date to date; the filters below rely on that name
     >> select(_.calitp_itp_id, _.date == _.service_date, _.trip_key, _.trip_id, _.is_in_service)
     >> filter(_.calitp_itp_id == 300)
     >> filter(_.date.isin(dates.values()))
     >> filter(_.is_in_service == True)
     # adds day_name from the calendar lookup
     >> inner_join(_, date_tbl, on = 'date')
     # run the query and bring the result into memory
     >> collect()
     )

In [104]:
# Stop times for operator 300 whose extract/delete window spans all analysis dates,
# trimmed to the columns used by the trip join and the middle-stop calculation.
tbl_stop_times = (
    tbl.views.gtfs_schedule_dim_stop_times()
    >> filter(
        _.calitp_extracted_at <= min_date,
        _.calitp_deleted_at > max_date,
        _.calitp_itp_id == 300,
    )
    >> select(
        _.calitp_itp_id,
        _.trip_id,
        _.departure_time,
        _.stop_sequence,
        _.stop_id,
    )
    >> collect()
)

In [106]:
# Attach each trip's stop times to the in-service trips, matching on operator and trip_id.
joined = inner_join(newdays, tbl_stop_times, on=['calitp_itp_id', 'trip_id'])

In [107]:
joined

Unnamed: 0,calitp_itp_id,date,trip_key,trip_id,is_in_service,day_name,departure_time,stop_sequence,stop_id
0,300,2021-10-14,6317166331958172205,860352,True,Thursday,19:38:39,3,349
1,300,2021-10-14,6317166331958172205,860352,True,Thursday,19:34:00,2,34
2,300,2021-10-14,6317166331958172205,860352,True,Thursday,20:05:53,5,1308
3,300,2021-10-14,6317166331958172205,860352,True,Thursday,19:30:00,1,1344
4,300,2021-10-14,6317166331958172205,860352,True,Thursday,20:14:00,7,1301
...,...,...,...,...,...,...,...,...,...
113669,300,2021-10-10,-6288707966910887520,857067,True,Sunday,07:47:00,51,431
113670,300,2021-10-10,-6288707966910887520,857067,True,Sunday,07:16:12,14,779
113671,300,2021-10-10,-6288707966910887520,857067,True,Sunday,07:48:48,53,5
113672,300,2021-10-10,-6288707966910887520,857067,True,Sunday,07:10:23,8,769


In [108]:
joined['day_name'].value_counts()

Thursday    49928
Saturday    33747
Sunday      29999
Name: day_name, dtype: int64

In [109]:
# Alias only — no copy is made; all_days_st and joined refer to the same DataFrame.
all_days_st = joined

In [125]:
# Trip attributes for operator 300: trip_key plus the shape_id and route_id it belongs to.
tbl_trips = (tbl.views.gtfs_schedule_dim_trips()
    # NOTE(review): the extract/delete window filter is disabled here, unlike the daily-trips
    # and stop_times queries — presumably fine because the later join is on trip_key; confirm.
    # >> filter(_.calitp_extracted_at <= min_date, _.calitp_deleted_at > max_date)
    >> filter(_.calitp_itp_id == 300)
    >> select(_.trip_key, _.shape_id, _.route_id)
    >> collect()
)

In [126]:
# Bring shape_id and route_id onto every stop-time row via trip_key.
df = inner_join(all_days_st, tbl_trips, on='trip_key')

In [129]:
df['day_name'].value_counts()

Thursday    49928
Saturday    33747
Sunday      29999
Name: day_name, dtype: int64

## Keep the middle stop for each shape

In [130]:
# Cast stop_id to 64-bit integers.
# NOTE(review): assumes every stop_id for this operator is numeric — confirm before reusing elsewhere.
df['stop_id'] = df['stop_id'].astype('int64')

In [131]:
# Pick one representative "middle" stop_sequence per operator + shape.
# Use the lower nearest-rank median instead of median() followed by an int cast:
# median() may average two values and land on a stop_sequence that no trip has
# (sequences need not be contiguous), and the inner join on stop_sequence would
# then silently drop the whole shape. quantile(..., interpolation='lower') always
# returns a value that is actually present in the data.
middle_stops = (
    df
    >> group_by(_.calitp_itp_id, _.shape_id)
    >> summarize(middle_stop = _.stop_sequence.quantile(0.5, interpolation = 'lower'))
)
# quantile may hand back floats; the join key must match stop_sequence's integer dtype.
middle_stops['middle_stop'] = middle_stops['middle_stop'].astype('int64')

In [132]:
# Keep only the stop-time rows at each shape's middle stop_sequence
# (middle_stop is renamed to stop_sequence so it can serve as a join key).
middle_st = (
    middle_stops
    >> select(_.stop_sequence == _.middle_stop, _.shape_id)
    >> inner_join(_, df, on=['shape_id', 'stop_sequence'])
)

In [133]:
middle_st

Unnamed: 0,stop_sequence,shape_id,calitp_itp_id,date,trip_key,trip_id,is_in_service,day_name,departure_time,stop_id,route_id
0,16,25311,300,2021-10-14,-8040042882390732694,855999,True,Thursday,06:13:00,1637,3328
1,16,25311,300,2021-10-14,-6634353861742551059,856000,True,Thursday,05:28:00,1637,3328
2,21,25313,300,2021-10-10,-3842005387251362243,855880,True,Sunday,17:33:47,375,3328
3,21,25313,300,2021-10-10,6700440735954601053,855863,True,Sunday,23:00:25,375,3328
4,21,25313,300,2021-10-09,-8516077315610626044,855754,True,Saturday,13:25:13,375,3328
...,...,...,...,...,...,...,...,...,...,...,...
3119,4,25402,300,2021-10-14,-9023177547330832211,860356,True,Thursday,07:23:00,621,3352
3120,4,25402,300,2021-10-14,7132077352117529923,860355,True,Thursday,07:02:00,621,3352
3121,4,25402,300,2021-10-14,8793580671061850921,860354,True,Thursday,06:41:00,621,3352
3122,17,25403,300,2021-10-14,-2146068324403298123,857560,True,Thursday,15:14:01,1147,3336


In [134]:
middle_st.departure_time.iloc[0]

'06:13:00'

In [135]:
def fix_gtfs_time(gtfs_timestring):
    '''Reformat a GTFS timestamp to standard 24-hour time.

    GTFS allows the hour to reach or exceed 24 to mark that a trip continues
    past midnight within the same service day. Such hours are wrapped back
    into the 0-23 range and zero-padded. Timestamps whose hour is already
    below 24 are returned unchanged apart from surrounding whitespace.

    Parameters
    ----------
    gtfs_timestring : str
        Colon-separated time with the hour first, e.g. ``'25:10:00'``.

    Returns
    -------
    str
        The timestamp with its hour in 0-23 and surrounding whitespace removed.

    Raises
    ------
    ValueError
        If the hour field cannot be parsed as an integer.
    '''
    stripped = gtfs_timestring.strip()
    hour_text, sep, remainder = stripped.partition(':')
    hour = int(hour_text)
    if hour < 24:
        return stripped
    # Modulo rather than a single subtraction, so hours of 48 or more also end
    # up in range; pad to two digits to match the width of unwrapped timestamps.
    return f'{hour % 24:02d}{sep}{remainder}'

In [136]:
# Wrap past-midnight GTFS hours (24+) back onto the 24-hour clock before the times are parsed.
middle_st['departure_time'] = middle_st['departure_time'].apply(fix_gtfs_time)

In [137]:
# Parse the HH:MM:SS strings (strptime supplies a placeholder date of 1900-01-01),
# then pull out the hour of day for the per-hour trip counts.
middle_st['departure_dt'] = middle_st['departure_time'].apply(
    lambda timestring: dt.datetime.strptime(timestring, '%H:%M:%S')
)
middle_st['departure_hour'] = middle_st['departure_dt'].apply(lambda parsed: parsed.hour)

#### Shape trip count by hour-- TODO expand to day of week, operator levels

In [138]:
# Number of middle-stop departures per shape / route / hour / day name, sorted by count.
# Note: this rebinds `df`, which until now held the stop-time-level table.
df = count(middle_st, _.shape_id, _.route_id, _.departure_hour, _.day_name, sort=True)

In [139]:
df['day_name'].unique()

array(['Thursday', 'Sunday', 'Saturday'], dtype=object)

In [142]:
df >> arrange(-_.n)

Unnamed: 0,shape_id,route_id,departure_hour,day_name,n
0,25315,3328,18,Thursday,7
1,25315,3328,17,Thursday,6
2,25314,3328,16,Thursday,6
3,25315,3328,8,Thursday,6
4,25315,3328,9,Thursday,6
...,...,...,...,...,...
1201,25346,3336,19,Sunday,1
1202,25346,3336,19,Saturday,1
1203,25346,3336,17,Sunday,1
1204,25346,3336,17,Saturday,1


In [140]:
# Look up route 3328 for operator 300 in the routes dimension.
(
    tbl.views.gtfs_schedule_dim_routes()
    >> filter(_.calitp_itp_id == 300, _.route_id == '3328')
)

Unnamed: 0,route_key,calitp_itp_id,calitp_url_number,route_id,route_type,agency_id,route_short_name,route_long_name,route_desc,route_url,...,route_continuous_drop_off,agency_name,agency_url,agency_timezone,agency_lang,agency_phone,agency_fare_url,agency_email,calitp_extracted_at,calitp_deleted_at
0,5298754852876447831,300,0,3328,3,6216179,1,Main St & Santa Monica Blvd/UCLA,,http://bigbluebus.com/Routes-and-Schedules/Rou...,...,,Big Blue Bus,http://www.bigbluebus.com,America/Los_Angeles,en,310-451-5444,,,2021-07-27,2099-01-01


In [74]:
# Inspect the middle-stop rows for shape 25315.
# NOTE(review): the recorded output (In [74]) has different columns from middle_st as built
# earlier in this notebook (stop_time_key present, date/day_name absent) — it looks stale; re-run to refresh.
middle_st >> filter(_.shape_id == '25315')

Unnamed: 0,stop_sequence,shape_id,trip_id,departure_time,stop_time_key,trip_key,route_id,stop_id,departure_dt,departure_hour
90,23,25315,856128,11:48:53,-8326571985010357471,5934560359711113263,3328,325,1900-01-01 11:48:53,11
91,23,25315,856146,08:46:53,-5578405365431413085,4693523290513956173,3328,325,1900-01-01 08:46:53,8
92,23,25315,856116,13:51:53,-2592193845754151739,-1814272882911542131,3328,325,1900-01-01 13:51:53,13
93,23,25315,856091,18:00:39,-7956451856912978944,7473260928576424592,3328,325,1900-01-01 18:00:39,18
94,23,25315,856087,18:38:15,6614307895849206963,5057111300636375037,3328,325,1900-01-01 18:38:15,18
...,...,...,...,...,...,...,...,...,...,...
161,23,25315,856100,16:32:28,4105401700056768686,6710153630310311938,3328,325,1900-01-01 16:32:28,16
162,23,25315,856151,07:55:53,3495629271157421505,8137427846320301694,3328,325,1900-01-01 07:55:53,7
163,23,25315,856095,17:22:28,8518523843135418333,4913408361454154644,3328,325,1900-01-01 17:22:28,17
164,23,25315,856104,15:51:53,-6054036106565546413,-4086913101323362626,3328,325,1900-01-01 15:51:53,15
