In [1]:
import calitp
from calitp.tables import tbl
from siuba import *

import pandas as pd
import numpy as np
import geopandas as gpd
import fiona
import datetime as dt
import os

from utils import *

os.environ["CALITP_BQ_MAX_BYTES"] = str(100_000_000_000)

### Plan

* start from all shapes for a single operator
* for each shape, assess for each hour of day x weekday/sat/sun (with day of week/time of day cols):
    * (can loop over weekday/sat/sun here at trip+stop_times join)
    * existing frequency at midpoint stop (here)
    * current total runtime
* preserve route_id in main table
* (separately) calculate operator/routes/shapes in each Census tract
* can then join tracts to service info; characterize route service target geographically
    * generate additional frequencies/service hours/service miles for service target
* (optional) generate hypothetical trips table
* (optional) assign service hours/miles to tracts (not sure why we'd need this yet)

In [2]:
# get_recent_dates is not defined in this notebook (presumably provided by the
# `from utils import *` star-import — confirm). The result is used below as a
# mapping of day-type label -> datetime.date.
dates = get_recent_dates()

In [3]:
dates

{'thurs': datetime.date(2021, 10, 14),
 'sat': datetime.date(2021, 10, 9),
 'sun': datetime.date(2021, 10, 10)}

In [4]:
# dict(zip(dates.values(), dates.keys()))

In [5]:
# dates.values()

In [6]:
# Earliest and latest analysis dates; only referenced by the (currently
# commented-out) extracted_at/deleted_at filters further down.
min_date, max_date = min(dates.values()), max(dates.values())

In [7]:
# Lazy calendar lookup used to attach a weekday name to each service date.
# `_.date == _.full_date` inside select() renames full_date to date so the
# table can be joined on 'date'.
date_tbl = (
    tbl.views.dim_date()
    >> select(_.date == _.full_date, _.day_name)
)

### All operators shape/frequency table

In [134]:
def single_operator_shape_frequency(itp_id):
    """Count in-service trips per shape, departure hour and day name for one operator.

    The median ``stop_sequence`` of each shape is taken as its "middle stop";
    trips are then counted by the hour in which they depart that stop, for
    each of the analysis dates in the module-level ``dates`` mapping. Feeds
    that carry no ``shape_id`` at all are summarised per route instead.

    Relies on the module-level ``dates`` and ``date_tbl`` objects and on
    ``fix_gtfs_time`` being in scope.

    Parameters
    ----------
    itp_id : int
        ``calitp_itp_id`` of the operator to analyse.

    Returns
    -------
    pandas.DataFrame
        Trip counts (column ``n``) by ``calitp_itp_id``, ``route_id``,
        ``shape_id`` (omitted when no shape ids are available),
        ``departure_hour`` and ``day_name``, sorted by count.

    Raises
    ------
    ValueError
        If the operator has no in-service stop times with a departure time
        on the analysis dates.
    """
    # NOTE: the calitp_extracted_at / calitp_deleted_at filter is deliberately
    # not applied to any of the three warehouse queries below; with it,
    # itp_id 327 returned no stop times at all.
    trips_by_weekday = (tbl.views.gtfs_schedule_fact_daily_trips()
        >> select(_.calitp_itp_id, _.date == _.service_date, _.trip_key, _.trip_id, _.is_in_service)
        >> filter(_.calitp_itp_id == itp_id)
        >> filter(_.date.isin(dates.values()))
        >> filter(_.is_in_service == True)
        >> inner_join(_, date_tbl, on = 'date')
        >> collect()
        )

    tbl_stop_times = (tbl.views.gtfs_schedule_dim_stop_times()
        >> filter(_.calitp_itp_id == itp_id)
        >> select(_.calitp_itp_id, _.trip_id, _.departure_time,
                  _.stop_sequence, _.stop_id)
        >> collect()
        )

    all_days_st = trips_by_weekday >> inner_join(_, tbl_stop_times, on = ['calitp_itp_id', 'trip_id'])

    tbl_trips = (tbl.views.gtfs_schedule_dim_trips()
        >> filter(_.calitp_itp_id == itp_id)
        >> select(_.trip_key, _.shape_id, _.route_id)
        >> collect()
        )

    st_trips_joined = all_days_st >> inner_join(_, tbl_trips, on = 'trip_key')
    # Some feeds have stop_times rows without a departure_time; they cannot be
    # assigned to an hour, so drop them up front.
    st_trips_joined = st_trips_joined.dropna(subset=['departure_time'])

    if st_trips_joined.empty:
        # Fail with an explicit reason instead of an opaque error from the
        # group/summarize step further down.
        raise ValueError(
            f'no in-service stop times with a departure_time for operator {itp_id} '
            'on the analysis dates'
        )

    # Decide the grouping key explicitly rather than by catching whatever the
    # shape-level summarize happens to raise: feeds with no shape_id at all
    # are summarised per route.
    has_shapes = st_trips_joined['shape_id'].notna().any()

    if has_shapes:
        middle_stops = (st_trips_joined
                        >> group_by(_.calitp_itp_id, _.shape_id)
                        >> summarize(middle_stop = _.stop_sequence.median())
                       )
    else:
        middle_stops = (st_trips_joined
                        >> group_by(_.calitp_itp_id, _.route_id)
                        >> summarize(middle_stop = _.stop_sequence.median())
                       )

    # The median can land on x.5; truncate so it can be matched against the
    # integer stop_sequence values in the join below.
    # NOTE(review): a group whose truncated median is not an actual
    # stop_sequence value drops out of the inner join entirely.
    middle_stops.middle_stop = middle_stops.middle_stop.astype('int64')

    if has_shapes:
        middle_st = (middle_stops
                     >> select(_.stop_sequence == _.middle_stop, _.shape_id)
                     >> inner_join(_, st_trips_joined, on=['shape_id', 'stop_sequence'])
                    )
    else:
        middle_st = (middle_stops
                     >> select(_.stop_sequence == _.middle_stop, _.route_id)
                     >> inner_join(_, st_trips_joined, on=['route_id', 'stop_sequence'])
                    )

    # fix_gtfs_time is defined outside this notebook; it must return a string
    # that strptime can parse as %H:%M:%S.
    middle_st.departure_time = middle_st.departure_time.apply(fix_gtfs_time)
    middle_st['departure_dt'] = (middle_st['departure_time']
                                 .apply(lambda x:
                                                dt.datetime.strptime(x, '%H:%M:%S'))
                                )
    middle_st['departure_hour'] = middle_st['departure_dt'].apply(lambda x: x.hour)

    if middle_st['shape_id'].isna().all():
        # No usable shape ids: report frequencies at route level only.
        shape_frequency = (middle_st
                           >> count(_.calitp_itp_id, _.route_id,
                                _.departure_hour, _.day_name, sort = True)
                          )
    else:
        shape_frequency = (middle_st
                           >> count(_.calitp_itp_id, _.route_id,
                                _.shape_id, _.departure_hour, _.day_name, sort = True)
                          )

    return shape_frequency

In [9]:
# Distinct operator ids known to the warehouse, kept as the single
# calitp_itp_id column of the collected result.
itp_ids = (
    tbl.views.gtfs_agency_names()
    >> distinct(_.calitp_itp_id)
    >> collect()
)['calitp_itp_id']

In [10]:
def multiple_operator_shape_frequency():
    """Build the shape-frequency table for every operator in ``itp_ids``.

    Calls ``single_operator_shape_frequency`` once per operator id in the
    module-level ``itp_ids`` and stacks the results. Operators for which the
    calculation raises are reported on stdout (with the reason) and skipped,
    so one bad feed does not abort the whole run.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of the per-operator results, each keeping its
        own index; an empty DataFrame if no operator succeeded.
    """
    frames = []
    # Iterate values directly; the previous `for _, itp_id in ...` shadowed
    # siuba's `_` inside this function.
    for itp_id in itp_ids:
        if int(itp_id) == 200:
            continue ## skip MTC feed to use individual operator feeds
        print(f'attempting for operator {itp_id}...', end='')
        try:
            frames.append(single_operator_shape_frequency(itp_id))
        except Exception as err:
            # Best-effort by design, but keep the reason visible and let
            # KeyboardInterrupt / SystemExit propagate.
            print(f'failed for operator {itp_id}: {err!r}')
        else:
            print('done!')

    # pd.concat raises on an empty list, so handle the all-failed case.
    if not frames:
        return pd.DataFrame()
    # Single concat instead of DataFrame.append in a loop: append was removed
    # in pandas 2.0 and copied the accumulated frame on every iteration.
    return pd.concat(frames)

In [11]:
# all_operators_shape_frequency = multiple_operator_shape_frequency()

attempting for operator 256...failed for operator 256
attempting for operator 257...done!
attempting for operator 259...done!
attempting for operator 4...done!
attempting for operator 260...done!
attempting for operator 261...done!
attempting for operator 6...done!
attempting for operator 263...done!
attempting for operator 264...done!
attempting for operator 265...done!
attempting for operator 10...done!
attempting for operator 11...done!
attempting for operator 269...done!
attempting for operator 14...done!
attempting for operator 270...done!
attempting for operator 15...done!
attempting for operator 271...failed for operator 271
attempting for operator 17...done!
attempting for operator 273...failed for operator 273
attempting for operator 18...done!
attempting for operator 274...done!
attempting for operator 21...done!
attempting for operator 278...done!
attempting for operator 23...done!
attempting for operator 279...done!
attempting for operator 24...done!
attempting for operator

In [13]:
# all_operators_shape_frequency.to_parquet('./shape_frequency.parquet')

In [133]:
## failures

## fatal:

## 474: new feed not fully ingested as of key dates
## 256, 273, 289, 62, 325, 344, 349, 97, 111, 372: no records in fact_daily_trips
## 206, 207, 164, 394, 390, 254, 338: no oct '21 service in fact_daily_trips
## 271, 312, 87: no trips in service on key dates

## correctable (and added):

## 327, 377: loaded/deleted filter excludes valid stop times?
## 323: no shape_id in feed
## 142, 235: some na shape_id in feed
## 228: some na departure_times 

In [143]:
# Load the cached all-operator result from disk instead of re-querying the
# warehouse; the cells that compute and write this file are commented out above.
all_operators_shape_frequency = pd.read_parquet('./shape_frequency.parquet')

### Sandbox

In [22]:
single_operator_shape_frequency(327)

ValueError: cannot insert calitp_itp_id, already exists

In [114]:
itp_id = 235

In [115]:
# Sandbox copy of the first query in single_operator_shape_frequency:
# in-service trips for `itp_id` on the analysis dates, with the weekday name
# attached via date_tbl. service_date is renamed to date for the join.
trips_by_weekday = (tbl.views.gtfs_schedule_fact_daily_trips()
     # >> filter(_.calitp_extracted_at <= min_date, _.calitp_deleted_at > max_date)
     >> select(_.calitp_itp_id, _.date == _.service_date, _.trip_key, _.trip_id, _.is_in_service)
     >> filter(_.calitp_itp_id == itp_id)
     >> filter(_.date.isin(dates.values()))
     >> filter(_.is_in_service == True)
     >> inner_join(_, date_tbl, on = 'date')
     >> collect()
     )

In [116]:
dates.values()

dict_values([datetime.date(2021, 10, 14), datetime.date(2021, 10, 9), datetime.date(2021, 10, 10)])

In [117]:
trips_by_weekday

Unnamed: 0,calitp_itp_id,date,trip_key,trip_id,is_in_service,day_name
0,235,2021-10-09,2632585521552177878,10022838,True,Saturday
1,235,2021-10-09,-1566476480924854812,9987864,True,Saturday
2,235,2021-10-09,-5186531131160499550,9987914,True,Saturday
3,235,2021-10-09,1150232814773095553,9987269,True,Saturday
4,235,2021-10-09,-280968012468317142,10022780,True,Saturday
...,...,...,...,...,...,...
6748,235,2021-10-09,-5753475216208881836,10023499,True,Saturday
6749,235,2021-10-09,6185946634073041352,9988008,True,Saturday
6750,235,2021-10-14,201740844516584543,9993622,True,Thursday
6751,235,2021-10-10,7652558882194761480,9805955,True,Sunday


In [118]:
# All stop_times rows for the operator (no date restriction here; the join
# with trips_by_weekday narrows them to the analysis dates).
tbl_stop_times = (tbl.views.gtfs_schedule_dim_stop_times()
        # >> filter(_.calitp_extracted_at <= min_date, _.calitp_deleted_at > max_date)
        >> filter(_.calitp_itp_id == itp_id)
        >> select(_.calitp_itp_id, _.trip_id, _.departure_time,
                  _.stop_sequence, _.stop_id)
        >> collect()
                 )

In [119]:
# Attach stop times to each in-service trip (one row per trip x date x stop).
all_days_st = trips_by_weekday >> inner_join(_, tbl_stop_times, on = ['calitp_itp_id', 'trip_id'])

In [120]:
# trip_key -> shape_id / route_id lookup for the operator.
tbl_trips = (tbl.views.gtfs_schedule_dim_trips()
    # >> filter(_.calitp_extracted_at <= min_date, _.calitp_deleted_at > max_date)
    >> filter(_.calitp_itp_id == itp_id)
    >> select(_.trip_key, _.shape_id, _.route_id)
    >> collect()
)

In [121]:
# Add shape_id and route_id to every stop-time row via trip_key.
st_trips_joined = all_days_st >> inner_join(_, tbl_trips, on = 'trip_key')

## Keep only the middle stop of each shape (or route)

In [122]:
st_trips_joined

Unnamed: 0,calitp_itp_id,date,trip_key,trip_id,is_in_service,day_name,departure_time,stop_sequence,stop_id,shape_id,route_id
0,235,2021-10-09,2632585521552177878,10022838,True,Saturday,13:33:00,74,1763,3510,35
1,235,2021-10-09,2632585521552177878,10022838,True,Saturday,13:18:00,61,130,3510,35
2,235,2021-10-09,2632585521552177878,10022838,True,Saturday,12:08:00,4,1363,3510,35
3,235,2021-10-09,2632585521552177878,10022838,True,Saturday,13:09:00,53,122,3510,35
4,235,2021-10-09,2632585521552177878,10022838,True,Saturday,13:35:00,76,869,3510,35
...,...,...,...,...,...,...,...,...,...,...,...
532031,235,2021-10-14,4179210631142884579,10033537,True,Thursday,19:55:00,7,1341,8602,86
532032,235,2021-10-14,4179210631142884579,10033537,True,Thursday,20:32:00,39,3137,8602,86
532033,235,2021-10-14,4179210631142884579,10033537,True,Thursday,20:52:00,58,7940,8602,86
532034,235,2021-10-14,4179210631142884579,10033537,True,Thursday,20:27:00,35,3133,8602,86


In [123]:
# Rows without a departure_time cannot be assigned to an hour; drop them.
# Note this rebinds st_trips_joined, so the frame displayed above is stale afterwards.
st_trips_joined = st_trips_joined.dropna(subset=['departure_time'])

In [124]:
# Middle stop = median stop_sequence per shape; feeds with no shape_id at all
# fall back to one middle stop per route. The key is chosen explicitly rather
# than by catching whatever the shape-level summarize raises.
if st_trips_joined['shape_id'].notna().any():
    middle_stops = st_trips_joined >> group_by(_.calitp_itp_id, _.shape_id) >> summarize(middle_stop = _.stop_sequence.median())
else:
    middle_stops = st_trips_joined >> group_by(_.calitp_itp_id, _.route_id) >> summarize(middle_stop = _.stop_sequence.median())

# Cast in BOTH cases, as single_operator_shape_frequency does. Previously the
# cast only ran in the fallback branch, so shape-level medians stayed float
# and any x.5 median could never match an integer stop_sequence in the join.
middle_stops.middle_stop = middle_stops.middle_stop.astype('int64')

In [125]:
# Keep only the stop-time rows at each group's middle stop. middle_stops is
# keyed by shape_id or, for feeds without shapes, by route_id; check which
# column is present instead of relying on a bare except.
if 'shape_id' in middle_stops.columns:
    middle_st = (middle_stops
                 >> select(_.stop_sequence == _.middle_stop, _.shape_id)
                 >> inner_join(_, st_trips_joined, on=['shape_id', 'stop_sequence'])
                )
else:
    middle_st = (middle_stops
                 >> select(_.stop_sequence == _.middle_stop, _.route_id)
                 >> inner_join(_, st_trips_joined, on=['route_id', 'stop_sequence'])
                )



In [126]:
# middle_st

In [127]:
# middle_st.departure_time.iloc[0]

In [128]:
# Pass each departure_time through fix_gtfs_time (defined outside this
# notebook; presumably handles GTFS hours >= 24 — confirm), parse it as
# HH:MM:SS, and keep the hour for bucketing.
middle_st['departure_time'] = middle_st['departure_time'].apply(fix_gtfs_time)
middle_st['departure_dt'] = middle_st['departure_time'].apply(
    lambda clock: dt.datetime.strptime(clock, '%H:%M:%S')
)
middle_st['departure_hour'] = middle_st['departure_dt'].apply(lambda stamp: stamp.hour)

#### Shape trip count by day, hour

In [129]:
middle_st['shape_id'].isna().all()

False

In [130]:
middle_st['shape_id'].value_counts()

5312    176
5700    155
6013    138
5004    135
6012    135
       ... 
8212      1
1232      1
1235      1
4322      1
0531      1
Name: shape_id, Length: 87, dtype: int64

In [131]:
# Trips per route/shape, hour and day name; shape_id is left out of the
# grouping when the feed has no shape ids at all.
df = middle_st >> (
    count(_.calitp_itp_id, _.route_id, _.departure_hour, _.day_name, sort = True)
    if middle_st['shape_id'].isna().all()
    else count(_.calitp_itp_id, _.route_id, _.shape_id, _.departure_hour, _.day_name, sort = True)
)

In [132]:
df

Unnamed: 0,calitp_itp_id,route_id,shape_id,departure_hour,day_name,n
0,235,53,5312,13,Saturday,8
1,235,57,5700,7,Saturday,6
2,235,60,6013,7,Saturday,6
3,235,53,5312,19,Saturday,6
4,235,53,5312,18,Saturday,6
...,...,...,...,...,...,...
2267,235,57,5770,21,Thursday,1
2268,235,33,3300,17,Thursday,1
2269,235,57,5770,22,Thursday,1
2270,235,33,3300,17,Sunday,1


In [96]:
to_append = pd.DataFrame()

In [73]:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported equivalent and keeps each frame's index.
to_append = pd.concat([to_append, df])

In [97]:
to_append

In [76]:
to_append.calitp_itp_id.unique()

array([327, 323])