In [1]:
import os
os.environ["CALITP_BQ_MAX_BYTES"] = str(100_000_000_000)

import pandas as pd
import numpy as np
import geopandas as gpd
import fiona
import datetime as dt

from utils import *

import calitp
from calitp.tables import tbl
from siuba import *

### Plan

* start from all shapes for a single operator
* for each shape, assess for each hour of day x weekday/sat/sun (with day of week/time of day cols):
    * (can loop over weekday/sat/sun here at trip+stop_times join)
    * existing frequency at midpoint stop (here)
    * current total runtime
* preserve route_id in main table
* (seperately) calculate operator/routes/shapes in each Census tract
* can then join tracts to service info; characterize route service target geographically
    * generate additional frequencies/service hours/service miles for service target
* (optional) generate hypothetical trips table
* (optional) assign service hours/miles to tracts (not sure why we'd need this yet)

In [2]:
dates = get_recent_dates()

In [3]:
dates

{'thurs': datetime.date(2021, 10, 14),
 'sat': datetime.date(2021, 10, 16),
 'sun': datetime.date(2021, 10, 17)}

In [4]:
# dict(zip(dates.values(), dates.keys()))

In [5]:
# dates.values()

In [6]:
min_date = min(dates.values())
max_date = max(dates.values())

In [7]:
date_tbl = tbl.views.dim_date() >> select(_.date == _.full_date, _.day_name)

### All operators shape/frequency table

In [8]:
def single_operator_shape_frequency(itp_id):
    global _st_trips_joined
    global _debug
    
    trips_by_weekday = (tbl.views.gtfs_schedule_fact_daily_trips()
        >> filter(_.calitp_extracted_at <= min_date, _.calitp_deleted_at > max_date)
        >> select(_.calitp_itp_id, _.date == _.service_date, _.trip_key, _.trip_id, _.is_in_service)
        >> filter(_.calitp_itp_id == itp_id)
        >> filter(_.date.isin(dates.values()))
        >> filter(_.is_in_service == True)
        >> inner_join(_, date_tbl, on = 'date')
        >> collect()
        )
    
    tbl_stop_times = (tbl.views.gtfs_schedule_dim_stop_times()
        >> filter(_.calitp_extracted_at <= min_date, _.calitp_deleted_at > max_date)
        ## itp_id 327 gives no results if filtered
        >> filter(_.calitp_itp_id == itp_id)
        >> select(_.calitp_itp_id, _.trip_id, _.departure_time,
                  _.stop_sequence, _.stop_id)
        >> collect()
        )
    
    all_days_st = trips_by_weekday >> inner_join(_, tbl_stop_times, on = ['calitp_itp_id', 'trip_id'])
    
    tbl_trips = (tbl.views.gtfs_schedule_dim_trips()
        >> filter(_.calitp_extracted_at <= min_date, _.calitp_deleted_at > max_date)
        >> filter(_.calitp_itp_id == itp_id)
        >> select(_.trip_key, _.shape_id, _.route_id)
        >> collect()
        )
    
    st_trips_joined = all_days_st >> inner_join(_, tbl_trips, on = 'trip_key')
    st_trips_joined = st_trips_joined.dropna(subset=['departure_time'])
    _st_trips_joined = st_trips_joined
    # st_trips_joined.stop_id = st_trips_joined.stop_id.astype('int64')
    
    st_trips_joined.departure_time = st_trips_joined.departure_time.apply(fix_gtfs_time)
    st_trips_joined['departure_dt'] = (st_trips_joined['departure_time']
                                 .apply(lambda x:
                                                dt.datetime.strptime(x, '%H:%M:%S'))
                                )
    st_trips_joined['departure_hour'] = st_trips_joined['departure_dt'].apply(lambda x: x.hour)
    
    ## calculate runtimes for each trip, if possible
    try:
        st_with_runtimes = (st_trips_joined
                           >> group_by(_.trip_key)
                           >> summarize(dt_max = _.departure_dt.max(), dt_min = _.departure_dt.min())
                          )

        st_with_runtimes['runtime'] = st_with_runtimes['dt_max'] - st_with_runtimes['dt_min']
        st_with_runtimes = st_with_runtimes >> select(_.trip_key, _.runtime)

    except:
        pass
    
    try:
        middle_stops = (st_trips_joined
                        >> group_by(_.calitp_itp_id, _.shape_id)
                        >> summarize(middle_stop = _.stop_sequence.median())
                       )
    except:
        middle_stops = (st_trips_joined
                        >> group_by(_.calitp_itp_id, _.route_id)
                        >> summarize(middle_stop = _.stop_sequence.median())
                       )
        
    middle_stops.middle_stop = middle_stops.middle_stop.astype('int64')
    
    try:
        middle_st = (middle_stops
                     >> select(_.stop_sequence == _.middle_stop, _.shape_id)
                     >> inner_join(_, st_trips_joined, on=['shape_id', 'stop_sequence'])
                    )
    except:
        middle_st = (middle_stops
                 >> select(_.stop_sequence == _.middle_stop, _.route_id)
                 >> inner_join(_, st_trips_joined, on=['route_id', 'stop_sequence'])
                )
        
    _debug = middle_st
    
    ## if multiple trips within the hour, calculate mean runtime
    middle_st_runtimes = (middle_st
     >> inner_join(_, st_with_runtimes, on='trip_key')
     >> group_by(_.calitp_itp_id, _.route_id, _.shape_id, _.departure_hour, _.day_name)
     >> summarize(mean_runtime = _.runtime.mean())
     )

    middle_st_runtimes['mean_runtime_min'] = (middle_st_runtimes.mean_runtime
                                              .apply(lambda x: int(round(x.seconds) / 60))
                                             )
    middle_st_runtimes.drop(columns=['mean_runtime'], inplace = True)

    if middle_st['shape_id'].isna().all():
        shape_frequency = (middle_st
                           >> count(_.calitp_itp_id, _.route_id,
                                _.departure_hour, _.day_name, sort = True)
                          )
    else:
        shape_frequency = (middle_st
                           >> count(_.calitp_itp_id, _.route_id,
                                _.shape_id, _.departure_hour, _.day_name, sort = True)
                          )
        
    shape_frequency = shape_frequency >> rename(trips_per_hour = 'n')

    try:
        shape_frequency = shape_frequency >> inner_join(_, middle_st_runtimes, on = [
            'calitp_itp_id', 'day_name', 'shape_id', 'departure_hour', 'route_id']) 
    except:
        print(f' no runtimes for operator {itp_id}')
    print(f' df shape for operator {itp_id}: {shape_frequency.shape}')
    return shape_frequency

In [9]:
itp_ids = tbl.views.gtfs_agency_names() >> distinct(_.calitp_itp_id) >> collect()
itp_ids = itp_ids.calitp_itp_id

In [10]:
# itp_ids = itp_ids.iloc[:50]
# itp_ids

In [11]:
def multiple_operator_shape_frequency():
    
    shape_frequency = pd.DataFrame()    
    for _, itp_id in itp_ids.items():
        if int(itp_id) == 200:
            continue ## skip MTC feed to use individual operator feeds
        try:
            print(f'attempting for operator {itp_id}...', end='')
            shape_frequency = shape_frequency.append(single_operator_shape_frequency(itp_id))
            # print('done!')
        except:
            print(f'failed for operator {itp_id}')
        
    return shape_frequency

In [12]:
all_operators_shape_frequency = multiple_operator_shape_frequency()

attempting for operator 256...failed for operator 256
attempting for operator 257... df shape for operator 257: (27, 7)
attempting for operator 259... df shape for operator 259: (441, 7)
attempting for operator 4... df shape for operator 4: (6453, 7)
attempting for operator 260... df shape for operator 260: (161, 7)
attempting for operator 261... df shape for operator 261: (12, 7)
attempting for operator 6...failed for operator 6
attempting for operator 263... df shape for operator 263: (40, 7)
attempting for operator 264... df shape for operator 264: (5, 7)
attempting for operator 265... df shape for operator 265: (4, 7)
attempting for operator 10... df shape for operator 10: (6, 7)
attempting for operator 11... df shape for operator 11: (30, 7)
attempting for operator 269... df shape for operator 269: (2211, 7)
attempting for operator 14... df shape for operator 14: (32, 7)
attempting for operator 270... df shape for operator 270: (57, 7)
attempting for operator 15... df shape for op

In [13]:
all_operators_shape_frequency.to_parquet(f"{GCS_FILE_PATH}shape_frequency.parquet")



In [14]:
## failures

## not correctable:

## 474: new feed not fully ingested as of key dates
## 256, 273, 289, 62, 325, 344, 349, 97, 111, 372: no records in fact_daily_trips
## 206, 207, 164, 394, 390, 254, 338: no oct '21 service in fact_daily_trips
## 271, 312, 87, : no trips in service on key dates

## correctable (and added):

## 327, 377: extracted/deleted filter excludes valid stop times?
## 323: no shape_id in feed
## 142, 235: some na shape_id in feed
## 228: some na departure_times 

In [15]:
all_operators_shape_frequency = pd.read_parquet(f"{GCS_FILE_PATH}shape_frequency.parquet")
all_operators_shape_frequency.head(3)

Unnamed: 0,calitp_itp_id,route_id,shape_id,departure_hour,day_name,trips_per_hour,mean_runtime_min
0,257,66,13737,6,Thursday,4,49
1,257,673,13738,11,Thursday,4,28
2,257,673,13738,9,Thursday,4,28


### Joined to funding source?

In [17]:
with_funding = (tbl.views.transitstacks()
                  >> select(_.calitp_itp_id == _.itp_id, _.ntd_id, _.transit_provider, _._5307_funds, _._5311_funds)
                  >> collect()
                  >> right_join(_, all_operators_shape_frequency, on = 'calitp_itp_id')
               )

In [25]:
# with_funding._5307_funds.unique()

In [19]:
int('$10,546,704'.replace('$', '').replace(',', ''))

10546704

In [26]:
def fix_funds(value):
    if type(value) != str:
        return None
    else:
        return int(value.replace('$', '').replace(',', ''))

In [27]:
with_funding._5307_funds = with_funding._5307_funds.apply(fix_funds)

In [28]:
with_funding._5311_funds = with_funding._5311_funds.apply(fix_funds)

In [30]:
with_funding = with_funding.rename(columns = {'n':'trips_per_hr'})
with_funding.head(5)

Unnamed: 0,calitp_itp_id,ntd_id,transit_provider,_5307_funds,_5311_funds,route_id,shape_id,departure_hour,day_name,trips_per_hour,mean_runtime_min
0,257,,PresidioGo Shuttle,,,66,13737,6,Thursday,4,49
1,257,,PresidioGo Shuttle,,,673,13738,11,Thursday,4,28
2,257,,PresidioGo Shuttle,,,673,13738,9,Thursday,4,28
3,257,,PresidioGo Shuttle,,,673,13738,8,Thursday,4,28
4,257,,PresidioGo Shuttle,,,673,13738,7,Thursday,4,28


In [32]:
(with_funding
 >> filter(_.calitp_itp_id == 300, _.day_name == 'Thursday',
           _.departure_hour.isin(list(range(5, 23))),
           _.trips_per_hour < 4
          )
).drop_duplicates(subset=['calitp_itp_id', 'shape_id', 'departure_hour', 'day_name'])

Unnamed: 0,calitp_itp_id,ntd_id,transit_provider,_5307_funds,_5311_funds,route_id,shape_id,departure_hour,day_name,trips_per_hour,mean_runtime_min
32003,300,90008,Big Blue Bus,,,3335,25337,9,Thursday,3,52
32006,300,90008,Big Blue Bus,,,3345,25379,7,Thursday,3,54
32007,300,90008,Big Blue Bus,,,3335,25337,18,Thursday,3,60
32008,300,90008,Big Blue Bus,,,3335,25337,16,Thursday,3,67
32010,300,90008,Big Blue Bus,,,3338,25351,21,Thursday,3,17
...,...,...,...,...,...,...,...,...,...,...,...
32931,300,90008,Big Blue Bus,,,3337,25348,6,Thursday,1,48
32932,300,90008,Big Blue Bus,,,3336,25403,15,Thursday,1,32
32933,300,90008,Big Blue Bus,,,3336,25403,7,Thursday,1,24
32934,300,90008,Big Blue Bus,,,3336,25346,21,Thursday,1,20


In [47]:
(tbl.views.gtfs_schedule_dim_routes()
 >> filter(_.calitp_itp_id == 182)
 >> filter(_.calitp_extracted_at <= min_date, _.calitp_deleted_at > max_date)
 >> filter(_.route_id == '16-13149')
)


Unnamed: 0,route_key,calitp_itp_id,calitp_url_number,route_id,route_type,agency_id,route_short_name,route_long_name,route_desc,route_url,...,route_continuous_drop_off,agency_name,agency_url,agency_timezone,agency_lang,agency_phone,agency_fare_url,agency_email,calitp_extracted_at,calitp_deleted_at
0,-1999584540119743372,182,0,16-13149,3,,16,Metro Local Line,DOWNTOWN LA - WEST HOLLYWOOD VIA WEST 3RD ST,,...,,,,,,,,,2021-09-11,2099-01-01


In [50]:
with_funding >> filter(_.calitp_itp_id == 282) >> arrange(-_.trips_per_hour)

Unnamed: 0,calitp_itp_id,ntd_id,transit_provider,_5307_funds,_5311_funds,route_id,shape_id,departure_hour,day_name,trips_per_hour,mean_runtime_min
11232,282,90015,MUNI,,,17298,191981,11,Thursday,10,54
11233,282,90015,MUNI,,,17298,191981,17,Thursday,10,55
11234,282,90015,MUNI,,,17298,191981,9,Thursday,10,54
11235,282,90015,MUNI,,,17298,191981,10,Thursday,10,54
11236,282,90015,MUNI,,,17298,191988,17,Thursday,10,54
...,...,...,...,...,...,...,...,...,...,...,...
18183,282,90015,MUNI,,,17306,192040,11,Thursday,1,39
18184,282,90015,MUNI,,,17306,192040,8,Sunday,1,35
18185,282,90015,MUNI,,,17306,192040,8,Saturday,1,35
18186,282,90015,MUNI,,,17306,192038,22,Thursday,1,27


In [41]:
with_funding >> group_by(_.calitp_itp_id) >> summarize(max_trips = _.trips_per_hour.max()) >> arrange(-_.max_trips)

Unnamed: 0,calitp_itp_id,max_trips
115,281,15
74,183,12
73,182,11
116,282,10
0,4,8
...,...,...
134,331,1
135,334,1
137,337,1
148,374,1


### Single Operator Test

In [103]:
itp_id = 300 ##BBB

In [104]:
bbb = single_operator_shape_frequency(itp_id)

 df shape for operator 300: (1206, 7)


In [105]:
bbb

Unnamed: 0,calitp_itp_id,route_id,shape_id,departure_hour,day_name,trips_per_hour,mean_runtime_min
0,300,3328,25315,18,Thursday,7,58
1,300,3328,25314,9,Thursday,6,50
2,300,3328,25315,17,Thursday,6,67
3,300,3328,25315,16,Thursday,6,70
4,300,3328,25315,15,Thursday,6,70
...,...,...,...,...,...,...,...
1201,300,3336,25346,19,Sunday,1,29
1202,300,3336,25346,19,Saturday,1,29
1203,300,3336,25346,17,Sunday,1,31
1204,300,3336,25346,17,Saturday,1,31


### Sandbox

In [67]:
itp_id = 300

In [68]:
trips_by_weekday = (tbl.views.gtfs_schedule_fact_daily_trips()
     # >> filter(_.calitp_extracted_at <= min_date, _.calitp_deleted_at > max_date)
     >> select(_.calitp_itp_id, _.date == _.service_date, _.trip_key, _.trip_id, _.is_in_service)
     >> filter(_.calitp_itp_id == itp_id)
     >> filter(_.date.isin(dates.values()))
     >> filter(_.is_in_service == True)
     >> inner_join(_, date_tbl, on = 'date')
     >> collect()
     )

In [69]:
dates.values()

dict_values([datetime.date(2021, 10, 14), datetime.date(2021, 10, 16), datetime.date(2021, 10, 17)])

In [70]:
trips_by_weekday

Unnamed: 0,calitp_itp_id,date,trip_key,trip_id,is_in_service,day_name
0,300,2021-10-14,-1748735243863240841,860315,True,Thursday
1,300,2021-10-14,-9142399805620392517,860392,True,Thursday
2,300,2021-10-14,7132077352117529923,860355,True,Thursday
3,300,2021-10-14,-872861419075307621,860360,True,Thursday
4,300,2021-10-14,-4410192383831118295,859824,True,Thursday
...,...,...,...,...,...,...
3119,300,2021-10-16,4764506337278467300,856901,True,Saturday
3120,300,2021-10-17,-3768651174800258851,857047,True,Sunday
3121,300,2021-10-16,4562748415292443159,856928,True,Saturday
3122,300,2021-10-17,2998609267394549501,857056,True,Sunday


In [71]:
tbl_stop_times = (tbl.views.gtfs_schedule_dim_stop_times()
        # >> filter(_.calitp_extracted_at <= min_date, _.calitp_deleted_at > max_date)
        >> filter(_.calitp_itp_id == itp_id)
        >> select(_.calitp_itp_id, _.trip_id, _.departure_time,
                  _.stop_sequence, _.stop_id)
        >> collect()
                 )

In [72]:
all_days_st = trips_by_weekday >> inner_join(_, tbl_stop_times, on = ['calitp_itp_id', 'trip_id'])

In [73]:
all_days_st

Unnamed: 0,calitp_itp_id,date,trip_key,trip_id,is_in_service,day_name,departure_time,stop_sequence,stop_id
0,300,2021-10-14,-1748735243863240841,860315,True,Thursday,07:19:03,3,349
1,300,2021-10-14,-1748735243863240841,860315,True,Thursday,07:59:00,7,1301
2,300,2021-10-14,-1748735243863240841,860315,True,Thursday,07:10:00,1,1344
3,300,2021-10-14,-1748735243863240841,860315,True,Thursday,07:27:00,4,762
4,300,2021-10-14,-1748735243863240841,860315,True,Thursday,07:49:58,5,1308
...,...,...,...,...,...,...,...,...,...
113669,300,2021-10-16,-4803176454895321919,856941,True,Saturday,07:15:00,31,621
113670,300,2021-10-16,-4803176454895321919,856941,True,Saturday,07:21:52,40,825
113671,300,2021-10-16,-4803176454895321919,856941,True,Saturday,07:23:12,42,827
113672,300,2021-10-16,-4803176454895321919,856941,True,Saturday,07:30:00,51,431


In [74]:
tbl_trips = (tbl.views.gtfs_schedule_dim_trips()
    # >> filter(_.calitp_extracted_at <= min_date, _.calitp_deleted_at > max_date)
    >> filter(_.calitp_itp_id == itp_id)
    >> select(_.trip_key, _.shape_id, _.route_id)
    >> collect()
)

In [75]:
st_trips_joined = all_days_st >> inner_join(_, tbl_trips, on = 'trip_key')

## keep middle for each

In [76]:
st_trips_joined

Unnamed: 0,calitp_itp_id,date,trip_key,trip_id,is_in_service,day_name,departure_time,stop_sequence,stop_id,shape_id,route_id
0,300,2021-10-14,-1748735243863240841,860315,True,Thursday,07:19:03,3,349,25399,3352
1,300,2021-10-14,-1748735243863240841,860315,True,Thursday,07:59:00,7,1301,25399,3352
2,300,2021-10-14,-1748735243863240841,860315,True,Thursday,07:10:00,1,1344,25399,3352
3,300,2021-10-14,-1748735243863240841,860315,True,Thursday,07:27:00,4,762,25399,3352
4,300,2021-10-14,-1748735243863240841,860315,True,Thursday,07:49:58,5,1308,25399,3352
...,...,...,...,...,...,...,...,...,...,...,...
113669,300,2021-10-16,-4803176454895321919,856941,True,Saturday,07:15:00,31,621,25333,3334
113670,300,2021-10-16,-4803176454895321919,856941,True,Saturday,07:21:52,40,825,25333,3334
113671,300,2021-10-16,-4803176454895321919,856941,True,Saturday,07:23:12,42,827,25333,3334
113672,300,2021-10-16,-4803176454895321919,856941,True,Saturday,07:30:00,51,431,25333,3334


In [77]:
st_trips_joined = st_trips_joined.dropna(subset=['departure_time'])

In [78]:
try:
    middle_stops = st_trips_joined >> group_by(_.calitp_itp_id, _.shape_id) >> summarize(middle_stop = _.stop_sequence.median())
except:
    middle_stops = st_trips_joined >> group_by(_.calitp_itp_id, _.route_id) >> summarize(middle_stop = _.stop_sequence.median())
    middle_stops.middle_stop = middle_stops.middle_stop.astype('int64')

In [79]:
try:
    middle_st = (middle_stops
                 >> select(_.stop_sequence == _.middle_stop, _.shape_id)
                 >> inner_join(_, st_trips_joined, on=['shape_id', 'stop_sequence'])
                )
except:
    middle_st = (middle_stops
             >> select(_.stop_sequence == _.middle_stop, _.route_id)
             >> inner_join(_, st_trips_joined, on=['route_id', 'stop_sequence'])
            )



In [80]:
middle_st

Unnamed: 0,stop_sequence,shape_id,calitp_itp_id,date,trip_key,trip_id,is_in_service,day_name,departure_time,stop_id,route_id
0,16.0,25311,300,2021-10-14,-6634353861742551059,856000,True,Thursday,05:28:00,1637,3328
1,16.0,25311,300,2021-10-14,-8040042882390732694,855999,True,Thursday,06:13:00,1637,3328
2,23.0,25314,300,2021-10-14,681071274871460600,856007,True,Thursday,17:33:38,149,3328
3,23.0,25314,300,2021-10-14,-8346954595234672876,856045,True,Thursday,11:13:54,149,3328
4,23.0,25314,300,2021-10-14,-6469852147244236822,856049,True,Thursday,10:32:07,149,3328
...,...,...,...,...,...,...,...,...,...,...,...
1747,4.0,25402,300,2021-10-14,-1222909712001896713,860358,True,Thursday,08:14:00,621,3352
1748,4.0,25402,300,2021-10-14,3249002311165996258,860357,True,Thursday,07:48:00,621,3352
1749,4.0,25402,300,2021-10-14,-9023177547330832211,860356,True,Thursday,07:23:00,621,3352
1750,17.0,25403,300,2021-10-14,-5970800398583194665,857571,True,Thursday,07:02:35,1147,3336


In [81]:
# middle_st.departure_time.iloc[0]

In [82]:
middle_st.departure_time = middle_st.departure_time.apply(fix_gtfs_time)
middle_st['departure_dt'] = middle_st['departure_time'].apply(lambda x:
                                                                dt.datetime.strptime(x, '%H:%M:%S'))
middle_st['departure_hour'] = middle_st['departure_dt'].apply(lambda x: x.hour)

#### Shape trip count by day, hour

In [83]:
middle_st['shape_id'].isna().all()

False

In [84]:
middle_st['shape_id'].value_counts()

25326    202
25333    180
25352    155
25318    135
25354    115
25315     76
25369     76
25314     74
25324     74
25339     70
25346     68
25343     66
25382     39
25323     36
25399     36
25400     36
25380     31
25376     31
25340     30
25337     29
25363     28
25372     27
25332     24
25385     18
25384     17
25362     13
25328     11
25359      9
25375      8
25402      6
25379      6
25357      5
25371      4
25398      4
25311      2
25403      2
25322      2
25319      2
25316      2
25344      1
25330      1
25353      1
Name: shape_id, dtype: int64

In [85]:
if middle_st['shape_id'].isna().all():
    df = middle_st >> count(_.calitp_itp_id, _.route_id, _.departure_hour, _.day_name, sort = True)
else:
    df = middle_st >> count(_.calitp_itp_id, _.route_id, _.shape_id, _.departure_hour, _.day_name, sort = True)

In [86]:
df.sort_values('n', ascending=False)

Unnamed: 0,calitp_itp_id,route_id,shape_id,departure_hour,day_name,n
0,300,3328,25315,18,Thursday,7
11,300,3328,25314,17,Thursday,6
1,300,3328,25315,9,Thursday,6
19,300,3328,25315,17,Thursday,6
18,300,3328,25314,8,Thursday,6
...,...,...,...,...,...,...
580,300,3334,25333,7,Thursday,1
579,300,3336,25343,11,Sunday,1
578,300,3334,25333,7,Sunday,1
577,300,3334,25332,17,Thursday,1


In [87]:
(tbl.views.gtfs_schedule_dim_routes() 
     >> filter(_.calitp_itp_id == 300)
     >> filter(_.calitp_extracted_at <= min_date, _.calitp_deleted_at > max_date)
     >> filter(_.route_id == '3328')
)


Unnamed: 0,route_key,calitp_itp_id,calitp_url_number,route_id,route_type,agency_id,route_short_name,route_long_name,route_desc,route_url,...,route_continuous_drop_off,agency_name,agency_url,agency_timezone,agency_lang,agency_phone,agency_fare_url,agency_email,calitp_extracted_at,calitp_deleted_at
0,5298754852876447831,300,0,3328,3,6216179,1,Main St & Santa Monica Blvd/UCLA,,http://bigbluebus.com/Routes-and-Schedules/Rou...,...,,Big Blue Bus,http://www.bigbluebus.com,America/Los_Angeles,en,310-451-5444,,,2021-07-27,2099-01-01


In [91]:
(tbl.views.gtfs_schedule_dim_trips() 
     >> filter(_.calitp_itp_id == 300)
     >> filter(_.calitp_extracted_at <= min_date, _.calitp_deleted_at > max_date)
     >> filter(_.trip_id == '856091')
)

Unnamed: 0,calitp_itp_id,calitp_url_number,route_id,service_id,trip_id,shape_id,trip_headsign,trip_short_name,direction_id,block_id,wheelchair_accessible,bikes_allowed,calitp_extracted_at,calitp_hash,trip_key,calitp_deleted_at
0,300,0,3328,10,856091,25315,Venice,,1,103552,0,0,2021-10-11,QrQd48438fK+U3fcDRk/hg==,5470659630283243708,2021-10-19


In [93]:
(tbl.views.gtfs_schedule_dim_stops() 
     >> filter(_.calitp_itp_id == 300)
     >> filter(_.calitp_extracted_at <= min_date, _.calitp_deleted_at > max_date)
     >> filter(_.stop_id == '325')
)

Unnamed: 0,calitp_itp_id,calitp_url_number,stop_id,tts_stop_name,stop_lat,stop_lon,zone_id,parent_station,stop_code,stop_name,...,stop_url,location_type,stop_timezone,wheelchair_boarding,level_id,platform_code,calitp_extracted_at,calitp_hash,stop_key,calitp_deleted_at
0,300,0,325,,34.031217,-118.476979,,,1394,SANTA MONICA WB & CLOVERFIELD NS,...,,,,0,,,2021-05-13,LRMitd/p0SP9DDN8DWg1FQ==,6531600027599400837,2099-01-01


In [90]:
middle_st >> filter(_.shape_id == '25315') >> filter(_.day_name == 'Thursday') >> filter(_.departure_hour == 18)

Unnamed: 0,stop_sequence,shape_id,calitp_itp_id,date,trip_key,trip_id,is_in_service,day_name,departure_time,stop_id,route_id,departure_dt,departure_hour
85,23.0,25315,300,2021-10-14,5470659630283243708,856091,True,Thursday,18:00:39,325,3328,1900-01-01 18:00:39,18
109,23.0,25315,300,2021-10-14,203302734887259271,856089,True,Thursday,18:18:15,325,3328,1900-01-01 18:18:15,18
117,23.0,25315,300,2021-10-14,1507436588030690577,856088,True,Thursday,18:28:15,325,3328,1900-01-01 18:28:15,18
119,23.0,25315,300,2021-10-14,1426302977681668789,856087,True,Thursday,18:38:15,325,3328,1900-01-01 18:38:15,18
125,23.0,25315,300,2021-10-14,6184547606452067601,856086,True,Thursday,18:47:15,325,3328,1900-01-01 18:47:15,18
132,23.0,25315,300,2021-10-14,-7187667218607631543,856090,True,Thursday,18:09:15,325,3328,1900-01-01 18:09:15,18
138,23.0,25315,300,2021-10-14,-6757840535875417683,856085,True,Thursday,18:57:15,325,3328,1900-01-01 18:57:15,18


In [94]:
# st_trips_joined >> filter(_.shape_id == '6413')

In [None]:
def spot_checker(output_df):
    '''
    sample routes/operators/times
    query warehouse for stop_times, stops, routes w/ schedule link
    (manually check those)
    '''
    

In [95]:
## verified OCTA and BBB highest frequency routes!


In [53]:
# to_append = to_append.append(df)