In [4]:
#!pip install geopandas

In [5]:
import calitp
from calitp.tables import tbl
from siuba import *

import pandas as pd
import numpy as np
import geopandas as gpd

Initial thoughts:

* Long trips (LA → San Jose, etc.): 1 bus in the fleet per daily trip could be a reasonable assumption.
* Medium trips (Stockton → Sacramento, LA → San Diego): 0.5 buses in the fleet per daily trip?
* Short trips (SF Transbay): 0.3 buses in the fleet per daily trip?

One validator per bus seems adequate since these are single-door, over-the-road coaches.

### Find stops for routes of interest (this seems to work fine)

In [6]:
# Every route row published under calitp_itp_id 13, materialised locally
# with collect() so later cells can work on it in memory.
amtrak_routes = (
    tbl.gtfs_schedule.routes()
    >> filter(_.calitp_itp_id == 13)
    >> collect()
)

In [7]:
# route_long_name values of the corridors of interest; matched with .isin()
# against the routes table in the next step.
ca_corridors = [
    'Capitol Corridor',
    'Pacific Surfliner',
    'San Joaquins',
]

In [8]:
# Keep only the routes whose long name appears in ca_corridors.
ca_rail_routes = (
    amtrak_routes
    >> filter(_.route_long_name.isin(ca_corridors))
)

In [9]:
# Trim to route_id (the key used to join trips) and route_long_name.
ca_rail_routes = (
    ca_rail_routes
    >> select(_.route_id, _.route_long_name)
)

In [10]:
# All trips for calitp_itp_id 13, collected locally and then narrowed to the
# corridor routes with an inner join on route_id (the join also attaches the
# columns of ca_rail_routes to each trip row).
ca_rail_trips = (
    tbl.gtfs_schedule.trips()
    >> filter(_.calitp_itp_id == 13)
    >> collect()
    >> inner_join(_, ca_rail_routes, on='route_id')
)

In [11]:
# Stops served by the corridor trips: take the stop_times rows belonging to
# ca_rail_trips, then keep a single row per stop_id.
# NOTE: with _keep_all=True the remaining columns (trip_id, arrival_time, ...)
# come from whichever row is retained for that stop, so only stop_id is
# meaningful downstream.
ca_corridor_stops = (
    tbl.gtfs_schedule.stop_times()
    >> filter(_.calitp_itp_id == 13)
    >> filter(_.trip_id.isin(ca_rail_trips.trip_id))
    >> collect()
    >> distinct(_.stop_id, _keep_all=True)
)

In [12]:
# Peek at the first three rows to sanity-check the de-duplicated stop list.
ca_corridor_stops.head(3)

Unnamed: 0,calitp_itp_id,calitp_url_number,trip_id,stop_id,stop_sequence,arrival_time,departure_time,stop_headsign,pickup_type,drop_off_type,continuous_pickup,continuous_drop_off,shape_dist_traveled,timepoint,calitp_extracted_at
0,13,0,5672809888,SAN,1,11:25:00,11:25:00,,0,0,,,,,2021-10-19
1,13,0,5422816135,SJC,1,19:05:00,19:05:00,,0,0,,,,,2021-10-19
2,13,0,7492819537,SAC,1,22:38:00,22:38:00,,0,0,,,,,2021-10-19


In [13]:
# Trips on the 'Amtrak Thruway Connecting Service' route(s) that have at least
# one stop_time at a stop also served by the corridor trains, reduced to one
# row per trip_id.
#
# routes() is filtered to calitp_itp_id 13, but trips() and stop_times() are
# joined unfiltered. calitp_itp_id is therefore part of each join key: joining
# on route_id / trip_id alone could pair these routes with rows from another
# feed that happens to reuse the same id.
ca_thruway_trips = (
    tbl.gtfs_schedule.routes()
    >> filter(_.calitp_itp_id == 13)
    >> filter(_.route_long_name == 'Amtrak Thruway Connecting Service')
    >> inner_join(_, tbl.gtfs_schedule.trips(), on=['calitp_itp_id', 'route_id'])
    >> inner_join(_, tbl.gtfs_schedule.stop_times(), on=['calitp_itp_id', 'trip_id'])
    >> filter(_.stop_id.isin(ca_corridor_stops.stop_id))  ## filter to trips connecting with CCJPA, LOSSAN and SJRRA
    >> collect()
    # One row per trip is enough: only trip-level columns are selected below.
    >> distinct(_.trip_id, _keep_all=True)
    >> select(_.calitp_itp_id, _.trip_id, _.route_id, _.service_id)
)

In [14]:
# Display the connecting Thruway trips (one row per trip_id).
ca_thruway_trips

Unnamed: 0,calitp_itp_id,trip_id,route_id,service_id
0,13,66112814305,37329,2814305
1,13,37012814288,37329,2814288
2,13,63112816851,37329,2816851
3,13,65152817681,37329,2817681
4,13,33212814252,37329,2814252
...,...,...,...,...
858,13,66792814237,37329,2814237
859,13,33112814232,37329,2814232
860,13,50112817980,37329,2817980
861,13,66312814218,37329,2814218


In [15]:
# Daily-service rows for calitp_itp_id 13 on the sample date; the service_id
# column of this frame is what the trips are matched against next.
amtrak_thursday_service = (
    tbl.views.gtfs_schedule_fact_daily_service()
    >> filter(_.calitp_itp_id == 13)
    >> filter(_.service_date == '2021-10-28')  ## a Thursday
    >> collect()
)

In [16]:
# Thruway trips whose service_id appears in the sample Thursday's service rows.
thursday_thruway_trips = (
    ca_thruway_trips
    >> filter(_.service_id.isin(amtrak_thursday_service.service_id))
)

In [32]:
# (rows, columns): the row count is the number of Thruway trips on the sample Thursday.
thursday_thruway_trips.shape

(400, 4)

In [17]:
# Preview the first ten Thursday Thruway trips.
thursday_thruway_trips.head(10)

Unnamed: 0,calitp_itp_id,trip_id,route_id,service_id
0,13,66112814305,37329,2814305
2,13,63112816851,37329,2816851
3,13,65152817681,37329,2817681
9,13,49682815633,37329,2815633
14,13,40152820008,42895,2820008
20,13,65172817654,37329,2817654
24,13,66472814258,37329,2814258
28,13,49852819216,37329,2819216
29,13,49672819218,37329,2819218
32,13,54122815339,37329,2815339


In [18]:
# Number of Thursday Thruway trips per route_id, most frequent first.
thursday_thruway_trips['route_id'].value_counts()

37329    380
11317      8
42896      6
42895      4
42917      2
Name: route_id, dtype: int64

#### 380 daily trips for route_id 37329 seems like an error. That single route_id also seems to cover many different routes?

In [19]:
# Every stop_time of the Thursday Thruway trips, with the trip-level columns
# (route_id, service_id, ...) attached via an inner join on trip_id.
thursday_thruway_stop_times = (
    tbl.gtfs_schedule.stop_times()
    >> filter(_.calitp_itp_id == 13)
    >> filter(_.trip_id.isin(thursday_thruway_trips.trip_id))
    >> collect()
    >> inner_join(_, thursday_thruway_trips, on='trip_id')
)

In [20]:
# Count stop_time rows per (route_id, stop_id) pair, largest counts first.
thursday_thruway_stop_times >> count(_.route_id, _.stop_id, sort=True)

## Seems to be way off

Unnamed: 0,route_id,stop_id,n
0,37329,SJC,274
1,37329,SKN,199
2,37329,FRT,182
3,37329,TRA,182
4,37329,DBP,182
...,...,...,...
134,37329,SIM,1
135,37329,CPN,1
136,37329,CML,1
137,37329,SNC,1


### Sandbox

In [23]:
# Sandbox: how many trips does each route_id have in the calitp_itp_id 13 feed?
(
    tbl.gtfs_schedule.trips()
    >> filter(_.calitp_itp_id == 13)
    >> count(_.route_id)
)

Unnamed: 0,route_id,n
0,37329,917
1,88,360
2,71,218
3,40751,199
4,84,169


In [27]:
# Route rows named 'Amtrak Thruway Connecting Service'.
# NOTE: the filter is on route_long_name, not route_id, so despite the
# variable name the result is not restricted to route_id 37329.
route_37329 = (
    tbl.gtfs_schedule.routes()
    >> filter(_.calitp_itp_id == 13)
    >> filter(_.route_long_name == 'Amtrak Thruway Connecting Service')
    >> collect()
)

In [29]:
# Sandbox: inspect the shape points available for calitp_itp_id 13.
(
    tbl.gtfs_schedule.shapes()
    >> filter(_.calitp_itp_id == 13)
)

Unnamed: 0,calitp_itp_id,calitp_url_number,shape_id,shape_pt_lat,shape_pt_lon,shape_pt_sequence,shape_dist_traveled,calitp_extracted_at
0,13,0,3,44.693464,-73.444075,211,,2021-10-19
1,13,0,3,42.896576,-73.884067,1374,,2021-10-19
2,13,0,3,42.797724,-73.957436,1432,,2021-10-19
3,13,0,4,42.267104,-73.786927,1051,,2021-10-19
4,13,0,4,42.667379,-73.751665,1184,,2021-10-19


In [37]:
# Sandbox: narrow the Thruway trips to a single hard-coded service_id.
# NOTE: this reassigns thursday_thruway_trips, replacing the full Thursday
# selection built earlier; re-run that earlier cell before reusing the name.
thursday_thruway_trips = (
    ca_thruway_trips
    >> filter(_.service_id.isin(['2814305']))
)

thursday_thruway_trips

Unnamed: 0,calitp_itp_id,trip_id,route_id,service_id
0,13,66112814305,37329,2814305
