In [1]:
%run 0_data_model.ipynb

pk_str = ["calitp_itp_id", "calitp_url_number"]
pk_col = (_.calitp_itp_id, _.calitp_url_number)

DATE_START = "2021-04-01"
DATE_END = "2021-05-01"

## Table overview

* agency_trips
* stops_and_times
* schedule_daily

Tables used for questions:

* **common route types** - agency_trips -> distinct routes (or gtfs_schedule_routes table)
* **routes in service** - agency_trips + schedule_daily
* **stops per route** - agency_trips + stops_and_times
* **stops per route in service** - everything above + schedule_dailly

In [2]:
tbl_agency_trips

Unnamed: 0,calitp_itp_id,calitp_url_number,route_id,service_id,trip_id,shape_id,trip_headsign,trip_short_name,direction_id,block_id,wheelchair_accessible,bikes_allowed,calitp_extracted_at,route_desc,route_sort_order,route_color,continuous_drop_off,route_url,continuous_pickup,route_short_name,route_long_name,agency_id,route_text_color,route_type,agency_lang,agency_name,agency_fare_url,agency_email,agency_phone,agency_url,agency_timezone
0,3,0,574,0,30-Qr1RhyMHGk2G,902,Clockwise,,0,194098,,,2021-04-16,King East Clockwise,,4B63AD,,,,King East,DASH King East Clockwise,30,FFFFFF,3,en,LADOT,https://store.ladottransit.com/,,213-808-2273,https://www.ladottransit.com/,America/Los_Angeles
1,183,0,574,0,30-LoYj7FMNmm3Y,902,Clockwise,,0,194082,,,2021-04-16,King East Clockwise,,4B63AD,,,,King East,DASH King East Clockwise,30,FFFFFF,3,en,LADOT,https://store.ladottransit.com/,,213-808-2273,https://www.ladottransit.com/,America/Los_Angeles
2,3,0,574,0,30-LoYj7FMNmm3Y,902,Clockwise,,0,194082,,,2021-04-16,King East Clockwise,,4B63AD,,,,King East,DASH King East Clockwise,30,FFFFFF,3,en,LADOT,https://store.ladottransit.com/,,213-808-2273,https://www.ladottransit.com/,America/Los_Angeles
3,183,0,574,0,30-IXNJSVmHMskp,902,Clockwise,,0,194098,,,2021-04-16,King East Clockwise,,4B63AD,,,,King East,DASH King East Clockwise,30,FFFFFF,3,en,LADOT,https://store.ladottransit.com/,,213-808-2273,https://www.ladottransit.com/,America/Los_Angeles
4,183,0,574,0,30-oA34l2Db98G2,902,Clockwise,,0,194098,,,2021-04-16,King East Clockwise,,4B63AD,,,,King East,DASH King East Clockwise,30,FFFFFF,3,en,LADOT,https://store.ladottransit.com/,,213-808-2273,https://www.ladottransit.com/,America/Los_Angeles


## Which route type is most common?

In [3]:
top_cases = {
    "0": "Tram, Streetcar, Light rail",
    "1": "Subway, Metro",
    "2": "Rail",
    "3": "Bus",
    "4": "Ferry",
    "5": "Cable tram",
}

main_case_expr = {_.route_type == k: top_cases[k] for k in top_cases}

case_expr = case_when(_, {**main_case_expr, True: "Other"})

# note, could also use gtfs_schedule_routes directly
(
    tbl.gtfs_schedule_routes()
    >> mutate(route_type=case_expr)
    >> count(_.route_type)
)

Unnamed: 0,route_type,n
0,Bus,2789
1,Ferry,29
2,"Tram, Streetcar, Light rail",18
3,"Subway, Metro",16
4,Rail,15


## Routes in service on a specific day

In [4]:
recent_schedule = (
    tbl_schedule_daily
    >> filter(_.service_date.between(DATE_START, DATE_END))
    >> select(_.service_date, _.service_id, *pk_col)
)

In [5]:
recent_trips = (
    tbl_agency_trips
    >> left_join(
        _, recent_schedule, ["calitp_itp_id", "calitp_url_number", "service_id"],
    )
    >> group_by(*pk_col, _.route_id)
    >> summarize(
        last_trip=_.service_date.max(),
        first_trip=_.service_date.min(),
        n_trips=_.trip_id.count(),
    )
    #>> left_join(_, tbl_agency_routes, [*pk_str, "route_id"])
)

In [6]:
recent_trips >> count(_.last_trip)

Unnamed: 0,last_trip,n
0,2021-05-01,2217
1,,429
2,2021-04-30,56
3,2021-04-15,3
4,2021-04-28,1


## Number of stops per route

`stops_and_times` - one row per stop time, with stop information joined in.


In [7]:
tbl_stops_and_times >> count()

Unnamed: 0,n
0,9579025


In [8]:
trip_stop_times = (
    tbl_stops_and_times
    >> inner_join(
        _,
        tbl_agency_trips >> select(_.trip_id, _.service_id, _.route_id, *pk_col),
        [*pk_str, "trip_id"],
    )
)

In [9]:
calc_route_metrics = group_by(*pk_col, _.route_id) >> summarize(
    n_unique_trips=_.trip_id.nunique(),
    n_unique_stops=_.stop_id.nunique(),
    n_stop_times=n(_),
    first_departure=_.departure_time.min(),
    last_arrival=_.arrival_time.max()
)

all_route_metrics = (trip_stop_times >> calc_route_metrics)

In [10]:
active_routes = (
    trip_stop_times
    >> inner_join(
        _,
        recent_schedule >> filter(_.service_date == DATE_END),
        [*pk_str, "service_id"],
    )
)

active_route_metrics = (active_routes
    >> calc_route_metrics
)

In [11]:
all_route_metrics >> count()

Unnamed: 0,n
0,2708


In [12]:
active_route_metrics >> count()

Unnamed: 0,n
0,2217


### Which route has the most stops locations?

In [13]:
most_stops = active_route_metrics >> filter(_.n_unique_stops == _.n_unique_stops.max())

most_stops

Unnamed: 0,calitp_itp_id,calitp_url_number,route_id,n_unique_trips,n_unique_stops,n_stop_times,first_departure,last_arrival
0,182,0,267-13139,165,287,12192,05:30:00,20:07:00


In [14]:
# however, note that this is not necessarily the route with the trip
# that stops the most times
most_stops

Unnamed: 0,calitp_itp_id,calitp_url_number,route_id,n_unique_trips,n_unique_stops,n_stop_times,first_departure,last_arrival
0,182,0,267-13139,165,287,12192,05:30:00,20:07:00


### Which route has the trip with most stop times?

In [15]:
n_trip_stops = (
    trip_stop_times
    >> distinct(*pk_col, _.route_id, _.trip_id, _.stop_id)
    >> count(*pk_col, _.route_id, _.trip_id)
    >> group_by(*pk_col, _.route_id)
    >> summarize(n_max_trip_stops=_.n.max(), n_min_trip_stops=_.n.min())
)

n_trip_stop_times = (
    trip_stop_times
    >> count(*pk_col, _.route_id, _.trip_id)
    >> group_by(*pk_col, _.route_id)
    >> summarize(n_max_trip_stop_times=_.n.max(), n_min_trip_stop_times=_.n.min(),)
)

(
    active_route_metrics
    >> inner_join(_, n_trip_stops, [*pk_str, "route_id"])
    >> inner_join(_, n_trip_stop_times, [*pk_str, "route_id"])
    >> filter(_.n_max_trip_stop_times == _.n_max_trip_stop_times.max())
)

Unnamed: 0,calitp_itp_id,calitp_url_number,route_id,n_unique_trips,n_unique_stops,n_stop_times,first_departure,last_arrival,n_max_trip_stops,n_min_trip_stops,n_max_trip_stop_times,n_min_trip_stop_times
0,182,0,90-13139,177,282,19118,05:06:00,24:24:00,136,68,136,68


## Do agencies use multiple route entries for the same "route"?

The GTFS schedule defines a route as "a group of trips that are displayed to riders as a single service".

* Dead routes. There are two entries, with the same route long name, but 
* Similar route names, different directions. E.g. Green Line Southbound, Green Line Northbound.
* Duplicate active route names. E.g. Several active routes exist, and they all share the same long_name.

### Similar route names, different directions

In [16]:
route_names = tbl.gtfs_schedule_routes() >> select(*pk_col, _.route_id, _.route_short_name, _.route_long_name)

In [17]:
opposite_routes = (
    active_route_metrics
    >> inner_join(_, active_route_metrics, [*pk_str, "n_unique_trips", "n_stop_times"])
    >> filter(_.route_id_x != _.route_id_y)
    >> rename(route_id="route_id_x")
    >> inner_join(
        _, route_names, [*pk_str, "route_id"]
    )
    >> collect()
)

opposite_routes >> filter(_.calitp_itp_id.isin([2, 3]))

opposite_routes.calitp_itp_id.nunique()

29

In [18]:
opposite_routes >> filter(_.route_long_name.str.lower().str.contains("(north)|(south)|(east)|(west)|(clock)"))

  return func(self, *args, **kwargs)


Unnamed: 0,calitp_itp_id,calitp_url_number,route_id,n_unique_trips,n_unique_stops_x,n_stop_times,first_departure_x,last_arrival_x,route_id_y,first_departure_y,last_arrival_y,n_unique_stops_y,route_short_name,route_long_name
0,2,0,BCT109 SB,50,77,3850,10:00:00,9:59:00,BCT109 NB,10:00:00,9:59:00,77,,Beach Cities Transit 109 Southbound
2,2,0,BCT109 NB,50,77,3850,10:00:00,9:59:00,BCT109 SB,10:00:00,9:59:00,77,,Beach Cities Transit 109 Northbound
10,3,0,572,96,29,2880,06:00:00,19:38:00,573,06:00:00,19:35:00,29,Wilshire Center/Koreatown Counterclockwise,DASH Wilshire Center/Koreatown Counterclockwise
11,3,0,573,96,29,2880,06:00:00,19:35:00,572,06:00:00,19:38:00,29,Crenshaw,DASH Crenshaw Clockwise
14,183,0,572,96,29,2880,06:00:00,19:38:00,573,06:00:00,19:35:00,29,Wilshire Center/Koreatown Counterclockwise,DASH Wilshire Center/Koreatown Counterclockwise
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81,8,1,042,16,61,496,08:50:00,20:40:00,045,08:15:00,17:44:00,60,42,East Salinas-Westridge
82,8,1,045,16,60,496,08:15:00,17:44:00,042,08:50:00,20:40:00,61,45,East Market-Creekbridge
83,278,0,201,66,11,792,05:50:00,22:31:00,202,05:45:00,22:23:00,11,201,Super Loop - Counterclockwise
84,278,0,202,66,11,792,05:45:00,22:23:00,201,05:50:00,22:31:00,11,202,Super Loop - Clockwise


### Duplicate active route names

In [19]:
active_route_metrics >> count()

Unnamed: 0,n
0,2217


In [20]:
active_route_names = active_route_metrics >> inner_join(
    _, route_names, [*pk_str, "route_id"]
)

unique_route_names = tbl.gtfs_schedule_routes() >> distinct(_.calitp_itp_id, _.calitp_url_number, _.route_long_name)

duplicate_route_names = (
    active_route_names
    >> inner_join(_, unique_route_names, [*pk_str, "route_long_name"])
)

duplicate_route_names >> count(_.calitp_itp_id, _.calitp_url_number)

Unnamed: 0,calitp_itp_id,calitp_url_number,n
0,182,0,122
1,4,0,113
2,278,0,74
3,3,0,61
4,183,0,61


In [21]:
duplicate_route_names >> filter(_.calitp_itp_id == 182) >> count(_.route_long_name)

Unnamed: 0,route_long_name,n
0,Metro Local Line,101
1,Metro Rapid Line,11
2,Metro Express Line,6
3,Metro Limited Line,1
4,Metro G Line (Orange) 901,1


### Dead routes

In [22]:
full_route_metrics = (
    tbl.gtfs_schedule_routes()
    >> left_join(_, active_route_metrics, [*pk_str, "route_id"])
    >> mutate(n_stop_times=_.n_stop_times.fillna(0))
    >> select(
        _.calitp_itp_id,
        _.calitp_url_number,
        _.route_id,
        _.n_stop_times,
        _.route_long_name,
    )
)

inactive_routes = full_route_metrics >> filter(_.n_stop_times == 0)
active_routes = full_route_metrics >> filter(_.n_stop_times > 0)

dead_routes = inner_join(inactive_routes, active_routes, [*pk_str, "route_long_name"],)

In [23]:
dead_routes >> count()

Unnamed: 0,n
0,41


In [24]:
dead_routes >> count(_.calitp_itp_id)

Unnamed: 0,calitp_itp_id,n
0,232,29
1,142,3
2,235,3
3,314,2
4,70,2


In [25]:
(
    dead_routes
    >> select(_.startswith("calitp"), _.route_long_name, _.contains(""))
    >> arrange(_.calitp_itp_id, _.calitp_url_number)
)

Unnamed: 0,calitp_itp_id,calitp_url_number,route_long_name,route_id_x,n_stop_times_x,route_id_y,n_stop_times_y
0,70,0,"Santa Rosa, Rohnert Park, Cotati, Petaluma",1052,0,1035,1728
1,70,0,"Cloverdale, Healdsburg, Windsor, Santa Rosa",1053,0,1036,2556
2,142,0,Fullerton - Newport Beach,47A,0,47,22886
3,142,0,Tustin - Newport Beach,79A,0,79,4308
4,142,0,Seal Beach - Orange,42A,0,42,13781
