In [1]:
import calitp
from calitp.tables import tbl
from siuba import *

import pandas as pd
import numpy as np
import geopandas as gpd



Initial thoughts:

* For long trips (LA → San Jose, etc.), 1 bus in the fleet per daily trip could be a reasonable assumption.
* For medium trips (Stockton → Sacramento, LA → San Diego), 0.5 buses in the fleet per daily trip?
* For short trips (SF Transbay), 0.3 buses in the fleet per daily trip?

One validator per bus seems adequate since these are single-door, over-the-road coaches.

### Find stops for routes of interest (this seems to work fine)

In [4]:
# Build the lazy warehouse query first, then materialise it locally.
# calitp_itp_id 13 is the feed this notebook treats as Amtrak.
amtrak_routes_query = tbl.gtfs_schedule.routes() >> filter(_.calitp_itp_id == 13)
amtrak_routes = amtrak_routes_query >> collect()

In [7]:
# Rail corridors of interest; matched against route_long_name in a later cell.
ca_corridors = [
    'Capitol Corridor',
    'Pacific Surfliner',
    'San Joaquins',
]

In [11]:
# Keep only the Amtrak routes whose long name is one of the corridors of interest.
is_ca_corridor = _.route_long_name.isin(ca_corridors)
ca_rail_routes = amtrak_routes >> filter(is_ca_corridor)

In [12]:
# Narrow the corridor routes to the join key (route_id) and the readable name.
# The name is rebound to its own projection, so re-running this cell still works:
# both selected columns survive the select.
ca_rail_routes = (
    ca_rail_routes
    >> select(_.route_id, _.route_long_name)
)

In [14]:
# Step 1: pull all trips for feed 13 out of the warehouse.
amtrak_trips_local = (
    tbl.gtfs_schedule.trips()
    >> filter(_.calitp_itp_id == 13)
    >> collect()
)

# Step 2: keep only trips that belong to one of the corridor routes
# (local join against the already-collected ca_rail_routes frame).
ca_rail_trips = amtrak_trips_local >> inner_join(_, ca_rail_routes, on='route_id')

In [76]:
# trip_ids of the corridor rail trips; used to restrict the stop_times query.
corridor_trip_ids = ca_rail_trips.trip_id

# All stop_time rows in feed 13 that belong to a corridor rail trip.
corridor_stop_times = (
    tbl.gtfs_schedule.stop_times()
    >> filter(_.calitp_itp_id == 13)
    >> filter(_.trip_id.isin(corridor_trip_ids))
    >> collect()
)

# One row per stop_id. _keep_all retains the remaining stop_time columns, so
# trip_id / times on each row come from whichever stop_time row was kept and
# are only incidental here -- stop_id is the column of interest.
ca_corridor_stops = corridor_stop_times >> distinct(_.stop_id, _keep_all = True)

In [78]:
# Peek at the first few de-duplicated corridor stops.
ca_corridor_stops.head(n=3)

Unnamed: 0,calitp_itp_id,calitp_url_number,trip_id,stop_id,stop_sequence,arrival_time,departure_time,stop_headsign,pickup_type,drop_off_type,continuous_pickup,continuous_drop_off,shape_dist_traveled,timepoint,calitp_extracted_at
0,13,0,7122816227,OKJ,1,12:36:00,12:36:00,,0,0,,,,,2021-10-19
1,13,0,5212808834,SAC,1,7:10:00,7:10:00,,0,0,,,,,2021-10-19
2,13,0,5422808684,SJC,1,19:05:00,19:05:00,,0,0,,,,,2021-10-19


In [85]:
# Thruway bus trips that touch at least one stop served by the corridor trains.
#
# route_id and trip_id are only unique *within* a feed. The trips and
# stop_times tables are filtered by calitp_itp_id everywhere else in this
# notebook, so both are scoped to feed 13 here before being joined. Joining
# them unscoped on route_id / trip_id alone could pull in rows from another
# feed that happens to reuse the same identifier.
amtrak_trips_tbl = tbl.gtfs_schedule.trips() >> filter(_.calitp_itp_id == 13)
amtrak_stop_times_tbl = tbl.gtfs_schedule.stop_times() >> filter(_.calitp_itp_id == 13)

ca_thruway_trips = (tbl.gtfs_schedule.routes()
                >> filter(_.calitp_itp_id == 13)
                >> filter(_.route_long_name == 'Amtrak Thruway Connecting Service')
                >> inner_join(_, amtrak_trips_tbl, on='route_id')
                >> inner_join(_, amtrak_stop_times_tbl, on='trip_id')
                >> filter(_.stop_id.isin(ca_corridor_stops.stop_id)) ## filter to trips connecting with CCJPA, LOSSAN and SJRRA
                >> collect()
                # The join yields one row per stop_time; collapse to one row per trip.
                >> distinct(_.trip_id, _keep_all=True)
                >> select(_.calitp_itp_id, _.trip_id, _.route_id, _.service_id))

# NOTE(review): the route filter is on route_long_name only, and the results
# below show almost every trip under a single route_id. Presumably the feed
# groups many distinct bus lines under that one route -- confirm before
# treating route_id as a proxy for a physical route.

In [86]:
# Display the Thruway trips frame (one row per trip_id after the distinct above).
ca_thruway_trips

Unnamed: 0,calitp_itp_id,trip_id,route_id,service_id
0,13,34742814025,37329,2814025
1,13,54102815351,37329,2815351
2,13,54122815340,37329,2815340
3,13,63112818580,37329,2818580
4,13,34742814026,37329,2814026
...,...,...,...,...
858,13,50112817980,37329,2817980
859,13,50052817582,37329,2817582
860,13,60142814221,37329,2814221
861,13,66792814237,37329,2814237


In [87]:
# Service records for feed 13 on a single sample weekday.
# NOTE(review): if this view also lists services that are *not* running on the
# date (e.g. through an in-service flag column), that flag needs filtering too,
# otherwise downstream trip counts will be inflated -- confirm against the schema.
daily_service_view = tbl.views.gtfs_schedule_fact_daily_service()
amtrak_thursday_service = (
    daily_service_view
    >> filter(_.calitp_itp_id == 13)
    >> filter(_.service_date == '2021-10-28')  ## a Thursday
    >> collect()
)

In [88]:
# service_ids present in the sample Thursday's service records.
thursday_service_ids = amtrak_thursday_service.service_id

# Thruway trips whose service_id appears in those records.
thursday_thruway_trips = (
    ca_thruway_trips
    >> filter(_.service_id.isin(thursday_service_ids))
)

In [91]:
# Preview the first ten Thursday Thruway trips.
thursday_thruway_trips.head(n=10)

Unnamed: 0,calitp_itp_id,trip_id,route_id,service_id
7,13,49842815639,37329,2815639
8,13,54102815349,37329,2815349
9,13,66472814258,37329,2814258
10,13,49682815633,37329,2815633
21,13,65132819229,37329,2819229
22,13,65172817654,37329,2817654
26,13,40152820008,42895,2820008
27,13,49672819218,37329,2819218
35,13,63112816851,37329,2816851
38,13,54162815345,37329,2815345


In [92]:

# Number of Thursday Thruway trips per route_id, most frequent first.
thursday_thruway_trips.route_id.value_counts()

37329    380
11317      8
42896      6
42895      4
42917      2
Name: route_id, dtype: int64

#### 380 daily trips for route_id 37329 looks like an error. That single route_id also seems to cover many different routes?

In [103]:
# trip_ids of the Thruway trips running on the sample Thursday.
thursday_trip_ids = thursday_thruway_trips.trip_id

# Every stop_time row in feed 13 belonging to one of those trips.
thursday_stop_times_raw = (
    tbl.gtfs_schedule.stop_times()
    >> filter(_.calitp_itp_id == 13)
    >> filter(_.trip_id.isin(thursday_trip_ids))
    >> collect()
)

# Attach route_id / service_id from the trips frame. Both sides carry
# calitp_itp_id and the join key is trip_id only, so the overlapping column is
# presumably disambiguated with pandas-style suffixes -- verify before using it.
thursday_thruway_stop_times = (
    thursday_stop_times_raw
    >> inner_join(_, thursday_thruway_trips, on='trip_id')
)

In [113]:
# Tally stop_time rows per (route_id, stop_id) pair, largest counts first.
(
    thursday_thruway_stop_times
    >> count(_.route_id, _.stop_id, sort=True)
)

## Seems to be way off

Unnamed: 0,route_id,stop_id,n
0,37329,SJC,274
1,37329,SKN,199
2,37329,FRT,182
3,37329,TRA,182
4,37329,DBP,182
...,...,...,...
134,37329,SIM,1
135,37329,CPN,1
136,37329,CML,1
137,37329,SNC,1
