In [1]:
import os
os.environ["CALITP_BQ_MAX_BYTES"] = str(100_000_000_000)

import pandas as pd
import numpy as np
import geopandas as gpd
import fiona
import datetime as dt

from utils import *

import calitp
from calitp.tables import tbl
from siuba import *

### Plan

* start from all shapes for a single operator
* for each shape, assess for each hour of day x weekday/sat/sun (with day of week/time of day cols):
    * (can loop over weekday/sat/sun here at trip+stop_times join)
    * existing frequency at midpoint stop (here)
    * current total runtime
* preserve route_id in main table
* (separately) calculate operator/routes/shapes in each Census tract
* can then join tracts to service info; characterize route service target geographically
    * generate additional frequencies/service hours/service miles for service target
* (optional) generate hypothetical trips table
* (optional) assign service hours/miles to tracts (not sure why we'd need this yet)

In [2]:
# Service dates to analyse, keyed by day label; the earliest and latest of them
# bound the calitp_extracted_at / calitp_deleted_at filters used further down.
dates = get_recent_dates()
min_date, max_date = min(dates.values()), max(dates.values())
dates

{'thurs': datetime.date(2021, 10, 14),
 'sat': datetime.date(2021, 10, 16),
 'sun': datetime.date(2021, 10, 17)}

In [3]:
# Lazy date -> day_name lookup; `_.date == _.full_date` inside select renames
# full_date to date so it can be joined on 'date' later.
date_tbl = (
    tbl.views.dim_date()
    >> select(_.date == _.full_date, _.day_name)
)

### All operators shape/frequency table

In [8]:
def single_operator_shape_frequency(itp_id):
    """Summarise scheduled service per shape, day and hour for one operator.

    For every shape_id the operator runs on the dates in the notebook-level
    ``dates`` dict, count the trips that serve the shape's middle stop in each
    hour of the day and average their first-stop-to-last-stop runtime.

    Relies on notebook-level names: ``tbl``, ``dates``, ``min_date``,
    ``max_date``, ``date_tbl`` and ``fix_gtfs_time``.

    Parameters
    ----------
    itp_id : int
        calitp_itp_id of the operator to process.

    Returns
    -------
    pandas.DataFrame
        One row per shape_id x day_name ('Thursday', 'Saturday', 'Sunday') x
        departure_hour (0-23), with calitp_itp_id, route_id, trips_per_hour
        (0 where nothing runs) and mean_runtime_min (NaN where nothing runs).

    Raises
    ------
    AssertionError
        If any of the source tables is empty for this operator, a trip has no
        shape_id, no stop time matches a shape's middle stop, too many
        shape/day/hour combinations are duplicated across routes, or a shape
        shows 30 or more trips in one hour.

    Side effects
    ------------
    Rebinds the module-level names ``shape_frequency`` and ``multi_ix`` and
    prints a one-line summary (plus a caution when duplicates are aggregated).
    """
    # Kept as globals on purpose: cells further down the notebook inspect
    # `shape_frequency` after a run.
    global shape_frequency
    global multi_ix

    ## get trips for operator on dates of interest, join with day of week table
    trips_by_weekday = (tbl.views.gtfs_schedule_fact_daily_trips()
        >> filter(_.calitp_extracted_at <= min_date, _.calitp_deleted_at > max_date)
        >> select(_.calitp_itp_id, _.date == _.service_date, _.trip_key, _.trip_id, _.is_in_service)
        >> filter(_.calitp_itp_id == itp_id)
        >> filter(_.date.isin(dates.values()))
        >> filter(_.is_in_service == True)
        >> inner_join(_, date_tbl, on = 'date')
        >> collect()
        )
    assert trips_by_weekday.size != 0, "zero records in filtered trips fact table"

    ## get stop times for operator
    tbl_stop_times = (tbl.views.gtfs_schedule_dim_stop_times()
        >> filter(_.calitp_extracted_at <= min_date, _.calitp_deleted_at > max_date)
        ## itp_id 327 gives no results if filtered
        >> filter(_.calitp_itp_id == itp_id)
        >> select(_.calitp_itp_id, _.trip_id, _.departure_time,
                  _.stop_sequence, _.stop_id)
        >> collect()
        )
    assert tbl_stop_times.size != 0, "zero records in filtered stop times dim table"

    ## join stop times to trips by weekday
    all_days_st = trips_by_weekday >> inner_join(_, tbl_stop_times, on = ['calitp_itp_id', 'trip_id'])

    ## get trips dimensional table
    tbl_trips = (tbl.views.gtfs_schedule_dim_trips()
        >> filter(_.calitp_extracted_at <= min_date, _.calitp_deleted_at > max_date)
        >> filter(_.calitp_itp_id == itp_id)
        >> select(_.trip_key, _.shape_id, _.route_id)
        >> collect()
        )
    assert tbl_trips.size != 0, "zero records in filtered trips dim table"
    assert not tbl_trips.shape_id.isnull().any(), "at least 1 trip has no shape_id"

    ## join dim_trips info to stop times
    st_trips_joined = all_days_st >> inner_join(_, tbl_trips, on = 'trip_key')

    ## time calculations
    st_trips_joined = st_trips_joined.dropna(subset=['departure_time'])
    # Fail with an AssertionError (which the multi-operator loop handles) rather
    # than an opaque error further down when the joins leave nothing to process.
    assert st_trips_joined.size != 0, "zero stop times with a departure_time after joining trips"
    st_trips_joined.departure_time = st_trips_joined.departure_time.apply(fix_gtfs_time)
    st_trips_joined['departure_dt'] = (st_trips_joined['departure_time']
                                 .apply(lambda x:
                                        dt.datetime.strptime(x, '%H:%M:%S'))
                                      )
    st_trips_joined['departure_hour'] = st_trips_joined['departure_dt'].apply(lambda x: x.hour)

    ## calculate runtimes for each trip, if possible
    def find_runtime(df):
        """Seconds between the departures at the lowest and highest stop_sequence."""
        first_departure = df[df.stop_sequence == df.stop_sequence.min()].departure_dt.iloc[0]
        last_departure = df[df.stop_sequence == df.stop_sequence.max()].departure_dt.iloc[0]
        # `.seconds` is the non-negative seconds component (days are dropped), so
        # a last stop that parses earlier than the first still gives a positive
        # value. NOTE(review): presumably this covers trips crossing midnight;
        # confirm against what fix_gtfs_time does with hours >= 24.
        return (last_departure - first_departure).seconds

    # One row per (trip_key, day_name). Returning a scalar per group (instead of
    # the whole group frame) keeps the later join from multiplying rows by the
    # number of stops per trip, which skewed the mean towards trips with more
    # stops, and keeps the group keys out of the index on newer pandas.
    st_with_runtimes = (st_trips_joined
                        .groupby(['trip_key', 'day_name'])[['stop_sequence', 'departure_dt']]
                        .apply(find_runtime)
                        .rename('runtime_seconds')
                        .reset_index()
                       )

    ## find middle stop for each trip to calculate frequencies
    middle_stops = (st_trips_joined
                    >> group_by(_.calitp_itp_id, _.shape_id)
                    >> summarize(middle_stop = _.stop_sequence.median())
                   )

    # The median is truncated to an int and matched on exact stop_sequence below,
    # so shapes with non-consecutive stop sequences may find no match. Picking the
    # nearest existing stop_sequence per shape was tried and found too slow.
    middle_stops.middle_stop = middle_stops.middle_stop.astype('int64')

    middle_st = (middle_stops
                 >> select(_.stop_sequence == _.middle_stop, _.shape_id)
                 >> inner_join(_, st_trips_joined, on=['shape_id', 'stop_sequence'])
                )
    assert middle_st.size != 0, "zero stop times for middle stop, may be non-consecutive"

    ## if multiple trips within the hour, calculate mean runtime
    middle_st_runtimes = (middle_st
     >> inner_join(_, st_with_runtimes, on=['trip_key', 'day_name'])
     >> group_by(_.calitp_itp_id, _.route_id, _.shape_id, _.departure_hour, _.day_name)
     >> summarize(mean_runtime_sec = _.runtime_seconds.mean())
     )

    # Seconds are rounded first, then the minutes are truncated by int().
    middle_st_runtimes['mean_runtime_min'] = (middle_st_runtimes.mean_runtime_sec
                                              .apply(lambda x: int(round(x) / 60))
                                             )
    middle_st_runtimes = middle_st_runtimes.drop(columns=['mean_runtime_sec'])

    shape_frequency = (middle_st
                       >> count(_.calitp_itp_id, _.route_id,
                            _.shape_id, _.departure_hour, _.day_name, sort = True)
                      )
    shape_frequency = shape_frequency >> rename(trips_per_hour = 'n')
    shape_frequency = shape_frequency >> inner_join(_, middle_st_runtimes, on = [
        'calitp_itp_id', 'day_name', 'shape_id', 'departure_hour', 'route_id'])

    ## insert nulls or 0 for missing values as appropriate
    # shape_id -> route_id lookup taken before reindexing (last route wins if a
    # shape appears under several routes); used to refill route_id afterwards.
    shapes_routes = shape_frequency[['shape_id', 'route_id']].set_index('shape_id').to_dict()['route_id']
    iterables = [shape_frequency.shape_id.unique(), ['Thursday', 'Saturday', 'Sunday'], range(0, 24)]
    multi_ix = pd.MultiIndex.from_product(iterables, names=['shape_id', 'day_name', 'departure_hour'])
    shape_frequency = shape_frequency.set_index(['shape_id', 'day_name', 'departure_hour'])
    try:
        shape_frequency = shape_frequency.reindex(multi_ix).reset_index()
    except ValueError: ## aggregate rare shape_ids with multiple routes (these may be errors)
        duplicate_indicies = shape_frequency.index[shape_frequency.index.duplicated()]
        print(f'''
        caution, operator has {len(duplicate_indicies)} duplicate shape/day/departure_hour \
out of an ix of {len(shape_frequency.index)}
        ''')
        assert len(duplicate_indicies) < len(shape_frequency.index) / 100, 'too many duplicate shape/day/departure_hour'
        # String aggregation names avoid the deprecation of passing builtins
        # (max/sum/np.mean) to .agg; the computed values are the same.
        shape_frequency = shape_frequency.groupby(['shape_id', 'day_name', 'departure_hour']).agg(
            {'calitp_itp_id': 'max', 'route_id': 'max', 'trips_per_hour': 'sum', 'mean_runtime_min': 'mean'})
        shape_frequency = shape_frequency.reindex(multi_ix).reset_index()

    # Every row belongs to the same operator, so fill the gaps left by reindexing
    # from neighbouring rows (bfill/ffill replace the deprecated fillna(method=...)).
    shape_frequency['calitp_itp_id'] = shape_frequency['calitp_itp_id'].bfill().ffill()
    shape_frequency['trips_per_hour'] = shape_frequency['trips_per_hour'].fillna(0)
    shape_frequency['route_id'] = shape_frequency['shape_id'].map(shapes_routes)

    assert shape_frequency['trips_per_hour'].max() < 30, 'a route claims to run 30 times per hour or more!'

    print(f' df shape for operator {itp_id}: {shape_frequency.shape}')
    return shape_frequency.astype({'calitp_itp_id': 'int64', 'trips_per_hour': 'int64'})

In [9]:
# Distinct calitp_itp_id values from the agency names view, as a pandas Series.
itp_ids = (
    tbl.views.gtfs_agency_names()
    >> distinct(_.calitp_itp_id)
    >> collect()
).calitp_itp_id

In [10]:
def multiple_operator_shape_frequency():
    """Run single_operator_shape_frequency for every id in ``itp_ids`` and stack the results.

    Operator 200 is skipped. Operators whose processing raises AssertionError
    are reported on stdout and left out; any other exception propagates.

    Returns
    -------
    pandas.DataFrame
        The per-operator frames concatenated in iteration order, each keeping
        its own index (so index values repeat across operators). An empty
        DataFrame if no operator succeeded.
    """
    # Collect the frames and concatenate once: DataFrame.append was removed in
    # pandas 2.0 and re-copied the accumulated frame on every iteration.
    operator_frames = []
    for itp_id in itp_ids:
        if int(itp_id) == 200:
            continue ## skip MTC feed to use individual operator feeds
        try:
            print(f'attempting for operator {itp_id}...', end='')
            operator_frames.append(single_operator_shape_frequency(itp_id))
        except AssertionError as err:
            print(f'failed for operator {itp_id}')
            print(err)

    if not operator_frames:
        # pd.concat raises on an empty list; keep returning an empty frame.
        return pd.DataFrame()
    return pd.concat(operator_frames)

In [11]:
# Build the combined table for every operator; per-operator progress and
# assertion failures are printed as the loop runs.
all_operators_shape_frequency = multiple_operator_shape_frequency()

attempting for operator 256...failed for operator 256
zero records in filtered trips fact table
attempting for operator 257... df shape for operator 257: (144, 7)
attempting for operator 259... df shape for operator 259: (2016, 7)
attempting for operator 4... df shape for operator 4: (23760, 7)
attempting for operator 260... df shape for operator 260: (360, 7)
attempting for operator 261... df shape for operator 261: (576, 7)
attempting for operator 6...failed for operator 6
zero stop times for middle stop, may be non-consecutive
attempting for operator 263... df shape for operator 263: (504, 7)
attempting for operator 264... df shape for operator 264: (216, 7)
attempting for operator 265... df shape for operator 265: (72, 7)
attempting for operator 10... df shape for operator 10: (144, 7)
attempting for operator 11... df shape for operator 11: (1368, 7)
attempting for operator 269... df shape for operator 269: (9720, 7)
attempting for operator 14... df shape for operator 14: (216, 7)


In [12]:
# Persist the combined table as parquet.
# GCS_FILE_PATH is not defined in this notebook; presumably it comes from `from utils import *`.
all_operators_shape_frequency.to_parquet(f"{GCS_FILE_PATH}shape_frequency.parquet")



In [2]:
# Reload the saved table from parquet (avoids re-running the per-operator queries)
# and preview the first rows.
all_operators_shape_frequency = pd.read_parquet(f"{GCS_FILE_PATH}shape_frequency.parquet")
all_operators_shape_frequency.head(3)



Unnamed: 0,shape_id,day_name,departure_hour,calitp_itp_id,route_id,trips_per_hour,mean_runtime_min
0,13737,Thursday,0,257,66,0,
1,13737,Thursday,1,257,66,0,
2,13737,Thursday,2,257,66,0,


### Joined to funding source?

In [36]:
# Attach funding columns from transitstacks to every frequency row.
# right_join keeps all rows of all_operators_shape_frequency, matched or not.
# NOTE(review): assumes transitstacks has at most one row per itp_id; otherwise
# this join duplicates frequency rows. Confirm.
with_funding = (tbl.views.transitstacks()
                  # `_.calitp_itp_id == _.itp_id` renames itp_id so the join key matches
                  >> select(_.calitp_itp_id == _.itp_id, _.ntd_id, _.transit_provider, _._5307_funds, _._5311_funds,
                           _.operating_expenses_total_2019)
                  >> collect()
                  >> right_join(_, all_operators_shape_frequency, on = 'calitp_itp_id')
               )

In [37]:
# with_funding._5307_funds.unique()

In [38]:
# Scratch check of the parsing expression used in fix_funds: strip '$' and ',' then cast to int.
int('$10,546,704'.replace('$', '').replace(',', ''))

10546704

In [39]:
def fix_funds(value):
    """Convert a dollar-formatted value such as '$10,546,704' to an int.

    Parameters
    ----------
    value : object
        A string with optional '$' and ',' characters, a number, or a missing
        value (None / NaN).

    Returns
    -------
    int or None
        The parsed amount. None for empty strings, missing values and any other
        non-string, non-numeric input.

    Raises
    ------
    ValueError
        If a non-empty string is not an integer once '$' and ',' are removed.
    """
    # isinstance (not `type(value) != str`) so str subclasses such as numpy.str_
    # are parsed instead of silently becoming None.
    if isinstance(value, str):
        digits = value.replace('$', '').replace(',', '').strip()
        return int(digits) if digits else None
    if isinstance(value, bool):
        return None
    # Pass already-converted numbers through so re-running a conversion cell
    # does not wipe the column to None.
    if isinstance(value, (int, np.integer)):
        return int(value)
    if isinstance(value, (float, np.floating)):
        return int(value) if np.isfinite(value) else None
    return None

In [40]:
# Parse the dollar-formatted 5307 funding strings into integers (None where not a string).
with_funding['_5307_funds'] = with_funding['_5307_funds'].apply(fix_funds)

In [41]:
# Parse the dollar-formatted 5311 funding strings into integers (None where not a string).
with_funding['_5311_funds'] = with_funding['_5311_funds'].apply(fix_funds)

In [45]:
# Parse the dollar-formatted 2019 operating expense strings into integers (None where not a string).
with_funding['operating_expenses_total_2019'] = (
    with_funding['operating_expenses_total_2019'].apply(fix_funds)
)

In [47]:
# Persist the frequency table with its funding columns as parquet.
with_funding.to_parquet(f"{GCS_FILE_PATH}shape_frequency_funding.parquet")

### Spot Checking

In [34]:
# Spot check: look up route_id '3328' for operator 300 among the route records
# valid across the analysis dates.
(
    tbl.views.gtfs_schedule_dim_routes()
    >> filter(
        _.calitp_itp_id == 300,
        _.calitp_extracted_at <= min_date,
        _.calitp_deleted_at > max_date,
        _.route_id == '3328',
    )
)

Unnamed: 0,route_key,calitp_itp_id,calitp_url_number,route_id,route_type,agency_id,route_short_name,route_long_name,route_desc,route_url,...,route_continuous_drop_off,agency_name,agency_url,agency_timezone,agency_lang,agency_phone,agency_fare_url,agency_email,calitp_extracted_at,calitp_deleted_at
0,5298754852876447831,300,0,3328,3,6216179,1,Main St & Santa Monica Blvd/UCLA,,http://bigbluebus.com/Routes-and-Schedules/Rou...,...,,Big Blue Bus,http://www.bigbluebus.com,America/Los_Angeles,en,310-451-5444,,,2021-07-27,2099-01-01


### Single Operator Test

In [10]:
# Operator used for the single-operator test run below.
itp_id = 279

In [52]:
# Run the per-operator pipeline for the test operator; it prints a caution when
# duplicate shape/day/hour rows had to be aggregated.
single = single_operator_shape_frequency(itp_id)


        caution, operator has 1 duplicate shape/day/departure_hour        out of an ix of 494
        
 df shape for operator 279: (2016, 7)


In [53]:
# Rows for one shape that actually have service (dropna removes the hours with no runtime).
single[single.shape_id == '964395_shp'].dropna()

Unnamed: 0,shape_id,day_name,departure_hour,calitp_itp_id,route_id,trips_per_hour,mean_runtime_min
797,964395_shp,Thursday,5,279,4,4,24.5
822,964395_shp,Saturday,6,279,4,1,24.0
848,964395_shp,Sunday,8,279,4,1,24.0


In [34]:
# All shape/day/hour rows with no missing values, i.e. hours that have a mean runtime.
single.dropna()

Unnamed: 0,shape_id,day_name,departure_hour,route_id,calitp_itp_id,trips_per_hour,mean_runtime_min
0,964593_shp,Thursday,0,4,279,1,82.0
10,964593_shp,Thursday,1,4,279,1,83.0
50,964593_shp,Thursday,5,4,279,2,82.0
60,964593_shp,Thursday,6,4,279,4,82.0
70,964593_shp,Thursday,7,4,279,4,82.0
...,...,...,...,...,...,...,...
17833,964511_shp,Sunday,7,3,279,1,57.0
18053,964510_shp,Thursday,5,3,279,1,62.0
19022,981213_shp,Saturday,6,2,279,1,71.0
19032,981213_shp,Saturday,7,2,279,1,71.0


In [21]:
# Shorthand for building MultiIndex selections in the .loc lookup below.
ix = pd.IndexSlice

In [47]:
# Look at the rows behind the duplicate shape/day/hour reported for operator 279.
# `shape_frequency` is the module-level name set via `global` inside
# single_operator_shape_frequency.
# NOTE(review): the recorded output has a MultiIndex, so this was presumably run
# against an intermediate (pre-reindex) state and may not reproduce on a clean
# Restart & Run All. Confirm.
shape_frequency.loc[ix['964395_shp', 'Thursday', 5]]

  shape_frequency.loc[ix['964395_shp', 'Thursday', 5]]


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,calitp_itp_id,route_id,trips_per_hour,mean_runtime_min
shape_id,day_name,departure_hour,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
964395_shp,Thursday,5,279,4,3,25
964395_shp,Thursday,5,279,6,1,24


In [48]:
# List the index entries that occur more than once (same expression the function uses
# to count duplicates).
shape_frequency.index[shape_frequency.index.duplicated()]

MultiIndex([('964395_shp', 'Thursday', 5)],
           names=['shape_id', 'day_name', 'departure_hour'])

In [50]:
# Try the aggregation used in the function's duplicate-handling branch: max of
# ids, summed trips, mean runtime per shape/day/hour.
shape_frequency.groupby(['shape_id', 'day_name', 'departure_hour']).agg(
    {'calitp_itp_id':max, 'route_id':max, 'trips_per_hour':sum, 'mean_runtime_min':np.mean})

calitp_itp_id        279
route_id               6
trips_per_hour         4
mean_runtime_min    24.5
Name: (964395_shp, Thursday, 5), dtype: object

### Sandbox