In [1]:
import os
os.environ["CALITP_BQ_MAX_BYTES"] = str(100_000_000_000)

import pandas as pd
import numpy as np
import geopandas as gpd
import fiona
import datetime as dt

from utils import *

import calitp
from calitp.tables import tbl
from siuba import *

### Plan

* start from all shapes for a single operator
* for each shape, assess for each hour of day x weekday/sat/sun (with day of week/time of day cols):
    * (can loop over weekday/sat/sun here at trip+stop_times join)
    * existing frequency at midpoint stop (here)
    * current total runtime
* preserve route_id in main table
* (seperately) calculate operator/routes/shapes in each Census tract
* can then join tracts to service info; characterize route service target geographically
    * generate additional frequencies/service hours/service miles for service target
* (optional) generate hypothetical trips table
* (optional) assign service hours/miles to tracts (not sure why we'd need this yet)

In [2]:
dates = get_recent_dates()

In [3]:
dates

{'thurs': datetime.date(2021, 10, 14),
 'sat': datetime.date(2021, 10, 9),
 'sun': datetime.date(2021, 10, 10)}

In [4]:
# dict(zip(dates.values(), dates.keys()))

In [5]:
# dates.values()

In [6]:
min_date = min(dates.values())
max_date = max(dates.values())

In [7]:
date_tbl = tbl.views.dim_date() >> select(_.date == _.full_date, _.day_name)

### All operators shape/frequency table

In [74]:
def single_operator_shape_frequency(itp_id):
    global _st_trips_joined
    
    trips_by_weekday = (tbl.views.gtfs_schedule_fact_daily_trips()
        # >> filter(_.calitp_extracted_at <= min_date, _.calitp_deleted_at > max_date)
        >> select(_.calitp_itp_id, _.date == _.service_date, _.trip_key, _.trip_id, _.is_in_service)
        >> filter(_.calitp_itp_id == itp_id)
        >> filter(_.date.isin(dates.values()))
        >> filter(_.is_in_service == True)
        >> inner_join(_, date_tbl, on = 'date')
        >> collect()
        )
    
    tbl_stop_times = (tbl.views.gtfs_schedule_dim_stop_times()
        # >> filter(_.calitp_extracted_at <= min_date, _.calitp_deleted_at > max_date)
        ## itp_id 327 gives no results if filtered
        >> filter(_.calitp_itp_id == itp_id)
        >> select(_.calitp_itp_id, _.trip_id, _.departure_time,
                  _.stop_sequence, _.stop_id)
        >> collect()
        )
    
    all_days_st = trips_by_weekday >> inner_join(_, tbl_stop_times, on = ['calitp_itp_id', 'trip_id'])
    
    tbl_trips = (tbl.views.gtfs_schedule_dim_trips()
        # >> filter(_.calitp_extracted_at <= min_date, _.calitp_deleted_at > max_date)
        >> filter(_.calitp_itp_id == itp_id)
        >> select(_.trip_key, _.shape_id, _.route_id)
        >> collect()
        )
    
    st_trips_joined = all_days_st >> inner_join(_, tbl_trips, on = 'trip_key')
    st_trips_joined = st_trips_joined.dropna(subset=['departure_time'])
    _st_trips_joined = st_trips_joined
    # st_trips_joined.stop_id = st_trips_joined.stop_id.astype('int64')
    
    st_trips_joined.departure_time = st_trips_joined.departure_time.apply(fix_gtfs_time)
    st_trips_joined['departure_dt'] = (st_trips_joined['departure_time']
                                 .apply(lambda x:
                                                dt.datetime.strptime(x, '%H:%M:%S'))
                                )
    st_trips_joined['departure_hour'] = st_trips_joined['departure_dt'].apply(lambda x: x.hour)
    
    ## calculate runtimes for each shape at each hour, if possible
    ## TODO join in day of week too...
    try:
        one_trip = st_trips_joined >> distinct(_.departure_hour, _.shape_id, _keep_all = True)
        one_trip_all_st = st_trips_joined >> filter(_.trip_id.isin(one_trip.trip_id))
        one_trip_all_st = (one_trip_all_st
                           >> group_by(_.trip_key)
                           >> summarize(dt_max = _.departure_dt.max(), dt_min = _.departure_dt.min())
                          )

        one_trip_all_st['runtime'] = one_trip_all_st['dt_max'] - one_trip_all_st['dt_min']
        runtimes = (st_trips_joined
         >> select(_.shape_id, _.route_id, _.trip_key, _.departure_hour)
         >> inner_join(_, one_trip_all_st >> select(_.trip_key, _.runtime), on = 'trip_key')
         >> select(_.shape_id, _.departure_hour, _.route_id, _.runtime)
        )
    except:
        pass
    
    try:
        middle_stops = (st_trips_joined
                        >> group_by(_.calitp_itp_id, _.shape_id)
                        >> summarize(middle_stop = _.stop_sequence.median())
                       )
    except:
        middle_stops = (st_trips_joined
                        >> group_by(_.calitp_itp_id, _.route_id)
                        >> summarize(middle_stop = _.stop_sequence.median())
                       )
        
    middle_stops.middle_stop = middle_stops.middle_stop.astype('int64')
    
    try:
        middle_st = (middle_stops
                     >> select(_.stop_sequence == _.middle_stop, _.shape_id)
                     >> inner_join(_, st_trips_joined, on=['shape_id', 'stop_sequence'])
                    )
    except:
        middle_st = (middle_stops
                 >> select(_.stop_sequence == _.middle_stop, _.route_id)
                 >> inner_join(_, st_trips_joined, on=['route_id', 'stop_sequence'])
                )
    
    if middle_st['shape_id'].isna().all():
        shape_frequency = (middle_st
                           >> count(_.calitp_itp_id, _.route_id,
                                _.departure_hour, _.day_name, sort = True)
                          )
    else:
        shape_frequency = (middle_st
                           >> count(_.calitp_itp_id, _.route_id,
                                _.shape_id, _.departure_hour, _.day_name, sort = True)
                          )
    
    try:
        shape_frequency = shape_frequency >> inner_join(_, runtimes, on = ['shape_id', 'departure_hour', 'route_id']) 
    except:
        print(f'no runtimes for operator {itp_id}')
    return shape_frequency

In [75]:
itp_ids = tbl.views.gtfs_agency_names() >> distinct(_.calitp_itp_id) >> collect()
itp_ids = itp_ids.calitp_itp_id

In [76]:
itp_ids = itp_ids.iloc[:50]
# itp_ids

In [77]:
def multiple_operator_shape_frequency():
    
    shape_frequency = pd.DataFrame()    
    for _, itp_id in itp_ids.items():
        if int(itp_id) == 200:
            continue ## skip MTC feed to use individual operator feeds
        try:
            print(f'attempting for operator {itp_id}...', end='')
            shape_frequency = shape_frequency.append(single_operator_shape_frequency(itp_id))
            print('done!')
        except:
            print(f'failed for operator {itp_id}')
        
    return shape_frequency

In [78]:
all_operators_shape_frequency = multiple_operator_shape_frequency()

attempting for operator 256...failed for operator 256
attempting for operator 257...done!
attempting for operator 259...done!
attempting for operator 4...done!
attempting for operator 260...done!
attempting for operator 261...done!
attempting for operator 6...no runtimes for operator 6
done!
attempting for operator 263...done!
attempting for operator 264...done!
attempting for operator 265...done!
attempting for operator 10...done!
attempting for operator 11...done!
attempting for operator 269...done!
attempting for operator 14...done!
attempting for operator 270...done!
attempting for operator 15...done!
attempting for operator 271...failed for operator 271
attempting for operator 17...done!
attempting for operator 273...failed for operator 273
attempting for operator 18...done!
attempting for operator 274...done!
attempting for operator 21...done!
attempting for operator 278...done!
attempting for operator 23...done!
attempting for operator 279...done!
attempting for operator 24...do

In [88]:
all_operators_shape_frequency['runtime_min'] = all_operators_shape_frequency['runtime'].apply(lambda x: x.seconds / 60)

In [91]:
all_operators_shape_frequency = all_operators_shape_frequency.drop(columns=['runtime'])

In [92]:
all_operators_shape_frequency.to_parquet(f"{GCS_FILE_PATH}shape_frequency.parquet")

In [13]:
## failures

## not correctable:

## 474: new feed not fully ingested as of key dates
## 256, 273, 289, 62, 325, 344, 349, 97, 111, 372: no records in fact_daily_trips
## 206, 207, 164, 394, 390, 254, 338: no oct '21 service in fact_daily_trips
## 271, 312, 87, : no trips in service on key dates

## correctable (and added):

## 327, 377: extracted/deleted filter excludes valid stop times?
## 323: no shape_id in feed
## 142, 235: some na shape_id in feed
## 228: some na departure_times 

In [93]:
all_operators_shape_frequency = pd.read_parquet(f"{GCS_FILE_PATH}shape_frequency.parquet")
all_operators_shape_frequency.head(3)

Unnamed: 0,calitp_itp_id,route_id,shape_id,departure_hour,day_name,n,runtime_min
0,257,66,13737,6,Thursday,4,44.0
1,257,66,13737,6,Thursday,4,44.0
2,257,66,13737,6,Thursday,4,44.0


### Joined to funding source?

In [101]:
with_funding = (tbl.views.transitstacks()
                  >> select(_.calitp_itp_id == _.itp_id, _.ntd_id, _.transit_provider, _._5307_funds, _._5311_funds)
                  >> collect()
                  >> inner_join(_, all_operators_shape_frequency, on = 'calitp_itp_id')
               )

In [107]:
with_funding._5307_funds.unique()

array(['$0', None, '$48,008,198', '$7,962,994', '$10,465,001',
       '$6,304,429', '$16,496,453', '$5,065,621', '$43,522,188',
       '$48,944,753', '$3,395,434', '$10,546,704', '$54,512,163',
       '$33,978,900', '$1,541,302', '$1,500,000', '$1,314,201',
       '$10,218,408', '$3,602,247', '$5,403,469'], dtype=object)

In [108]:
int('$10,546,704'.replace('$', '').replace(',', ''))

10546704

In [110]:
def fix_funds(value):
    if value == None:
        return None
    else:
        return int(value.replace('$', '').replace(',', ''))

In [111]:
with_funding._5307_funds = with_funding._5307_funds.apply(fix_funds)

In [112]:
with_funding._5311_funds = with_funding._5311_funds.apply(fix_funds)

In [119]:
with_funding = with_funding.rename(columns = {'n':'trips_per_hr'})
with_funding

Unnamed: 0,calitp_itp_id,ntd_id,transit_provider,_5307_funds,_5311_funds,route_id,shape_id,departure_hour,day_name,trips_per_hr,runtime_min
0,30,,Banning Pass Transit,0.0,0.0,355,p_1274755,5,Thursday,1,9.0
1,30,,Banning Pass Transit,0.0,0.0,355,p_1274755,5,Thursday,1,9.0
2,30,,Banning Pass Transit,0.0,0.0,356,p_786666,6,Thursday,1,45.0
3,30,,Banning Pass Transit,0.0,0.0,356,p_786666,6,Thursday,1,45.0
4,30,,Banning Pass Transit,0.0,0.0,356,p_786666,6,Thursday,1,45.0
...,...,...,...,...,...,...,...,...,...,...,...
1891802,42,99292,Blue Lake Rancheria,,,822,p_899577,16,Saturday,1,110.0
1891803,42,99292,Blue Lake Rancheria,,,822,p_899577,16,Saturday,1,110.0
1891804,42,99292,Blue Lake Rancheria,,,822,p_899577,16,Saturday,1,110.0
1891805,42,99292,Blue Lake Rancheria,,,822,p_899577,16,Saturday,1,110.0


In [125]:
list(range(5, 23))

[5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22]

In [126]:
(with_funding
 >> filter(_.calitp_itp_id == 300, _.day_name == 'Thursday',
           _.departure_hour.isin(list(range(5, 23))),
           _.trips_per_hr < 4
          )
).drop_duplicates(subset=['calitp_itp_id', 'shape_id', 'departure_hour', 'day_name'])

Unnamed: 0,calitp_itp_id,ntd_id,transit_provider,_5307_funds,_5311_funds,route_id,shape_id,departure_hour,day_name,trips_per_hr,runtime_min
83096,300,90008,Big Blue Bus,10465001.0,0.0,3334,25333,8,Thursday,3,69.000000
86438,300,90008,Big Blue Bus,10465001.0,0.0,3334,25333,14,Thursday,3,70.000000
86866,300,90008,Big Blue Bus,10465001.0,0.0,3339,25358,13,Thursday,2,36.000000
87238,300,90008,Big Blue Bus,10465001.0,0.0,3339,25358,16,Thursday,2,45.000000
87339,300,90008,Big Blue Bus,10465001.0,0.0,3334,25333,17,Thursday,3,70.000000
...,...,...,...,...,...,...,...,...,...,...,...
112735,300,90008,Big Blue Bus,10465001.0,0.0,3337,25348,6,Thursday,1,48.533333
112767,300,90008,Big Blue Bus,10465001.0,0.0,3336,25403,15,Thursday,1,32.000000
112800,300,90008,Big Blue Bus,10465001.0,0.0,3336,25403,7,Thursday,1,24.000000
112819,300,90008,Big Blue Bus,10465001.0,0.0,3336,25346,21,Thursday,1,26.816667


### Current Runtime Estimates

In [58]:
itp_id = 300

In [61]:
bbb = single_operator_shape_frequency(itp_id)

In [79]:
_st_trips_joined.departure_time = _st_trips_joined.departure_time.apply(fix_gtfs_time)
_st_trips_joined['departure_dt'] = (_st_trips_joined['departure_time']
                             .apply(lambda x:
                                            dt.datetime.strptime(x, '%H:%M:%S'))
                            )
_st_trips_joined['departure_hour'] = _st_trips_joined['departure_dt'].apply(lambda x: x.hour)
# _st_trips_joined

In [51]:
one_trip = _st_trips_joined >> distinct(_.departure_hour, _.shape_id, _keep_all = True)

one_trip_all_st = _st_trips_joined >> filter(_.trip_id.isin(one_trip.trip_id))

one_trip_all_st = (one_trip_all_st
                   >> group_by(_.trip_key)
                   >> summarize(dt_max = _.departure_dt.max(), dt_min = _.departure_dt.min())
                  )

one_trip_all_st['runtime'] = one_trip_all_st['dt_max'] - one_trip_all_st['dt_min']

df = (_st_trips_joined
 >> select(_.shape_id, _.route_id, _.trip_key, _.departure_hour)
 >> inner_join(_, one_trip_all_st >> select(_.trip_key, _.runtime), on = 'trip_key')
) >> select(_.shape_id, _.departure_hour, _.route_id, _.runtime)

### Sandbox

In [22]:
single_operator_shape_frequency(327)

ValueError: cannot insert calitp_itp_id, already exists

In [127]:
itp_id = 235

In [128]:
trips_by_weekday = (tbl.views.gtfs_schedule_fact_daily_trips()
     # >> filter(_.calitp_extracted_at <= min_date, _.calitp_deleted_at > max_date)
     >> select(_.calitp_itp_id, _.date == _.service_date, _.trip_key, _.trip_id, _.is_in_service)
     >> filter(_.calitp_itp_id == itp_id)
     >> filter(_.date.isin(dates.values()))
     >> filter(_.is_in_service == True)
     >> inner_join(_, date_tbl, on = 'date')
     >> collect()
     )

In [129]:
dates.values()

dict_values([datetime.date(2021, 10, 14), datetime.date(2021, 10, 9), datetime.date(2021, 10, 10)])

In [131]:
trips_by_weekday

Unnamed: 0,calitp_itp_id,date,trip_key,trip_id,is_in_service,day_name
0,235,2021-10-09,2632585521552177878,10022838,True,Saturday
1,235,2021-10-09,-1566476480924854812,9987864,True,Saturday
2,235,2021-10-09,-5186531131160499550,9987914,True,Saturday
3,235,2021-10-09,1150232814773095553,9987269,True,Saturday
4,235,2021-10-09,-280968012468317142,10022780,True,Saturday
...,...,...,...,...,...,...
6748,235,2021-10-09,-5753475216208881836,10023499,True,Saturday
6749,235,2021-10-09,6185946634073041352,9988008,True,Saturday
6750,235,2021-10-14,201740844516584543,9993622,True,Thursday
6751,235,2021-10-10,7652558882194761480,9805955,True,Sunday


In [132]:
tbl_stop_times = (tbl.views.gtfs_schedule_dim_stop_times()
        # >> filter(_.calitp_extracted_at <= min_date, _.calitp_deleted_at > max_date)
        >> filter(_.calitp_itp_id == itp_id)
        >> select(_.calitp_itp_id, _.trip_id, _.departure_time,
                  _.stop_sequence, _.stop_id)
        >> collect()
                 )

In [133]:
all_days_st = trips_by_weekday >> inner_join(_, tbl_stop_times, on = ['calitp_itp_id', 'trip_id'])

In [134]:
all_days_st

Unnamed: 0,calitp_itp_id,date,trip_key,trip_id,is_in_service,day_name,departure_time,stop_sequence,stop_id
0,235,2021-10-09,2632585521552177878,10022838,True,Saturday,13:33:00,74,1763
1,235,2021-10-09,2632585521552177878,10022838,True,Saturday,13:18:00,61,130
2,235,2021-10-09,2632585521552177878,10022838,True,Saturday,12:08:00,4,1363
3,235,2021-10-09,2632585521552177878,10022838,True,Saturday,13:09:00,53,122
4,235,2021-10-09,2632585521552177878,10022838,True,Saturday,13:35:00,76,869
...,...,...,...,...,...,...,...,...,...
532031,235,2021-10-14,4179210631142884579,10033537,True,Thursday,19:55:00,7,1341
532032,235,2021-10-14,4179210631142884579,10033537,True,Thursday,20:32:00,39,3137
532033,235,2021-10-14,4179210631142884579,10033537,True,Thursday,20:52:00,58,7940
532034,235,2021-10-14,4179210631142884579,10033537,True,Thursday,20:27:00,35,3133


In [135]:
tbl_trips = (tbl.views.gtfs_schedule_dim_trips()
    # >> filter(_.calitp_extracted_at <= min_date, _.calitp_deleted_at > max_date)
    >> filter(_.calitp_itp_id == itp_id)
    >> select(_.trip_key, _.shape_id, _.route_id)
    >> collect()
)

In [136]:
st_trips_joined = all_days_st >> inner_join(_, tbl_trips, on = 'trip_key')

## keep middle for each

In [137]:
st_trips_joined

Unnamed: 0,calitp_itp_id,date,trip_key,trip_id,is_in_service,day_name,departure_time,stop_sequence,stop_id,shape_id,route_id
0,235,2021-10-09,2632585521552177878,10022838,True,Saturday,13:33:00,74,1763,3510,35
1,235,2021-10-09,2632585521552177878,10022838,True,Saturday,13:18:00,61,130,3510,35
2,235,2021-10-09,2632585521552177878,10022838,True,Saturday,12:08:00,4,1363,3510,35
3,235,2021-10-09,2632585521552177878,10022838,True,Saturday,13:09:00,53,122,3510,35
4,235,2021-10-09,2632585521552177878,10022838,True,Saturday,13:35:00,76,869,3510,35
...,...,...,...,...,...,...,...,...,...,...,...
532031,235,2021-10-14,4179210631142884579,10033537,True,Thursday,19:55:00,7,1341,8602,86
532032,235,2021-10-14,4179210631142884579,10033537,True,Thursday,20:32:00,39,3137,8602,86
532033,235,2021-10-14,4179210631142884579,10033537,True,Thursday,20:52:00,58,7940,8602,86
532034,235,2021-10-14,4179210631142884579,10033537,True,Thursday,20:27:00,35,3133,8602,86


In [138]:
st_trips_joined = st_trips_joined.dropna(subset=['departure_time'])

In [139]:
try:
    middle_stops = st_trips_joined >> group_by(_.calitp_itp_id, _.shape_id) >> summarize(middle_stop = _.stop_sequence.median())
except:
    middle_stops = st_trips_joined >> group_by(_.calitp_itp_id, _.route_id) >> summarize(middle_stop = _.stop_sequence.median())
    middle_stops.middle_stop = middle_stops.middle_stop.astype('int64')

In [140]:
try:
    middle_st = (middle_stops
                 >> select(_.stop_sequence == _.middle_stop, _.shape_id)
                 >> inner_join(_, st_trips_joined, on=['shape_id', 'stop_sequence'])
                )
except:
    middle_st = (middle_stops
             >> select(_.stop_sequence == _.middle_stop, _.route_id)
             >> inner_join(_, st_trips_joined, on=['route_id', 'stop_sequence'])
            )



In [141]:
middle_st

Unnamed: 0,stop_sequence,shape_id,calitp_itp_id,date,trip_key,trip_id,is_in_service,day_name,departure_time,stop_id,route_id
0,66.0,0101,235,2021-10-14,6123958717698031528,9852253,True,Thursday,17:30:00,5228,1
1,66.0,0101,235,2021-10-14,-3869944417170861420,9852260,True,Thursday,20:26:00,5228,1
2,66.0,0101,235,2021-10-14,4778973443419298296,9852256,True,Thursday,18:26:00,5228,1
3,66.0,0101,235,2021-10-09,3995767484419956054,9987177,True,Saturday,18:26:00,5228,1
4,66.0,0101,235,2021-10-09,3995767484419956054,9987177,True,Saturday,18:26:00,5228,1
...,...,...,...,...,...,...,...,...,...,...,...
4164,14.0,9601,235,2021-10-14,6787171304683496423,9851263,True,Thursday,13:34:01,7532,560
4165,14.0,9601,235,2021-10-14,927870309732505276,9851252,True,Thursday,09:09:00,7532,560
4166,14.0,9601,235,2021-10-14,6976045128885390637,9851298,True,Thursday,17:37:01,7532,560
4167,14.0,9601,235,2021-10-14,3665296140575888931,9851253,True,Thursday,09:33:00,7532,560


In [127]:
# middle_st.departure_time.iloc[0]

In [142]:
middle_st.departure_time = middle_st.departure_time.apply(fix_gtfs_time)
middle_st['departure_dt'] = middle_st['departure_time'].apply(lambda x:
                                                                dt.datetime.strptime(x, '%H:%M:%S'))
middle_st['departure_hour'] = middle_st['departure_dt'].apply(lambda x: x.hour)

#### Shape trip count by day, hour

In [143]:
middle_st['shape_id'].isna().all()

False

In [144]:
middle_st['shape_id'].value_counts()

5312    176
5700    155
6013    138
5004    135
6012    135
       ... 
8212      1
1232      1
1235      1
4322      1
0531      1
Name: shape_id, Length: 87, dtype: int64

In [145]:
if middle_st['shape_id'].isna().all():
    df = middle_st >> count(_.calitp_itp_id, _.route_id, _.departure_hour, _.day_name, sort = True)
else:
    df = middle_st >> count(_.calitp_itp_id, _.route_id, _.shape_id, _.departure_hour, _.day_name, sort = True)

In [148]:
df.sort_values('n', ascending=False)

Unnamed: 0,calitp_itp_id,route_id,shape_id,departure_hour,day_name,n
0,235,53,5312,13,Saturday,8
13,235,60,6013,8,Saturday,6
23,235,53,5312,8,Saturday,6
22,235,543,9431,13,Saturday,6
21,235,53,5312,7,Saturday,6
...,...,...,...,...,...,...
1604,235,86,8603,11,Thursday,1
1605,235,150,1502,13,Thursday,1
1606,235,150,1502,12,Thursday,1
1607,235,129,1605,12,Sunday,1


In [96]:
to_append = pd.DataFrame()

In [73]:
to_append = to_append.append(df)

In [97]:
to_append

In [76]:
to_append.calitp_itp_id.unique()

array([327, 323])

In [149]:
get_recent_dates()

{'thurs': datetime.date(2021, 10, 14),
 'sat': datetime.date(2021, 10, 9),
 'sun': datetime.date(2021, 10, 10)}