# PMAC presentation

**PO-1: increase total amount of service on the SHN and reliability of that service by 2024**

Typical weekday: 2/8/22

1. Routes on SHN
a. parallel routes (1 mi corridor) - how many routes, agencies, share of all CA transit agency-routes?
b. intersecting routes (everything not parallel) - how many routes, agencies, share of all CA transit agency-routes?
c. intersecting routes (not parallel, but actually travel on SHN (50 ft buffer) for some portion of that route) - subset of above, how many routes, agencies, share of all CA agency-routes?

2. How many routes on SHN, breakdown by district
3. How many service hours are scheduled for a typical weekday for (1)?
4. How many of these agencies that have parallel routes on SHN also have GTFS RT?
Use `isin` and find `itp_id`, not route-specific, because most agencies that provide GTFS RT do it for the majority of their routes.

In [1]:
#https://stackoverflow.com/questions/55162077/how-to-get-the-driving-distance-between-two-geographical-coordinates-using-pytho
import altair as alt
import geopandas as gpd
import os
import pandas as pd

from calitp.tables import tbl
from siuba import *

import D1_pmac_routes
import utils
from shared_utils import geography_utils, gtfs_utils, styleguide
from shared_utils import calitp_color_palette as cp 

TRAFFIC_OPS_GCS = D1_pmac_routes.GCS_FILE_PATH
date_str = D1_pmac_routes.date_str



In [2]:
'''
# Run a query that aggregates service hours
# Note that we're going to be missing some ITP_IDS
# A smattering across various IDs

trip_cols = ["calitp_itp_id", "calitp_url_number", 
             "route_id", "shape_id"]

trips_with_hrs = (tbl.views.gtfs_schedule_fact_daily_trips()
         >> filter(_.service_date == date_str, 
                   _.is_in_service==True, 
                   _.calitp_itp_id != 200)
         >> select(_.trip_key, _.service_date, _.service_hours)
         >> inner_join(_, 
                       tbl.views.gtfs_schedule_dim_trips()
                       >> select(*trip_cols, _.trip_key)
                       >> distinct(), 
                       on = "trip_key")
         >> group_by(_.calitp_itp_id, _.calitp_url_number, _.shape_id, _.route_id)
         >> mutate(total_service_hours = _.service_hours.sum())
         >> select(*trip_cols, _.total_service_hours)
         >> distinct()
         >> collect()
        )

trips_with_hrs.to_parquet(f"{utils.DATA_PATH}trips_with_hrs.parquet")
'''

'\n# Run a query that aggregates service hours\n# Note that we\'re going to be missing some ITP_IDS\n# A smattering across various IDs\n\ntrip_cols = ["calitp_itp_id", "calitp_url_number", \n             "route_id", "shape_id"]\n\ntrips_with_hrs = (tbl.views.gtfs_schedule_fact_daily_trips()\n         >> filter(_.service_date == date_str, \n                   _.is_in_service==True, \n                   _.calitp_itp_id != 200)\n         >> select(_.trip_key, _.service_date, _.service_hours)\n         >> inner_join(_, \n                       tbl.views.gtfs_schedule_dim_trips()\n                       >> select(*trip_cols, _.trip_key)\n                       >> distinct(), \n                       on = "trip_key")\n         >> group_by(_.calitp_itp_id, _.calitp_url_number, _.shape_id, _.route_id)\n         >> mutate(total_service_hours = _.service_hours.sum())\n         >> select(*trip_cols, _.total_service_hours)\n         >> distinct()\n         >> collect()\n        )\n\ntrips_with_hrs

In [3]:
trips_with_hrs = pd.read_parquet(f"{utils.DATA_PATH}trips_with_hrs.parquet")
trips = pd.read_parquet(f"{TRAFFIC_OPS_GCS}trips_{date_str}.parquet")

In [4]:
trips_full_info = pd.merge(
    trips,
    trips_with_hrs,
    on = ["calitp_itp_id", "calitp_url_number", "route_id", "shape_id"], 
    how = "outer",
    validate = "m:1",
    indicator=True
)

In [5]:
trips_full_info._merge.value_counts()

both          103643
right_only       448
left_only          0
Name: _merge, dtype: int64

In [6]:
trips_full_info[trips_full_info._merge=="right_only"].calitp_itp_id.value_counts()

4      126
294     83
314     35
105     34
194     32
282     21
368     21
110     17
310     17
235     15
279     11
246      9
280      7
264      6
218      5
106      4
127      3
10       1
356      1
Name: calitp_itp_id, dtype: int64

In [7]:
df = geography_utils.aggregate_by_geography(
    trips_full_info[trips_full_info._merge=="both"],
    group_cols = ["calitp_itp_id", "route_id"],
    # this is a choice to sum up service hours if they have same route_id
    # but different shape_id
    sum_cols = ["total_service_hours"]
).rename(columns = {"calitp_itp_id": "itp_id"})

In [8]:
def flag_parallel_intersecting_routes(df, date_str):
    # If it is parallel, we want to flag as 1
    parallel = gpd.read_parquet(
        f"{TRAFFIC_OPS_GCS}parallel_or_intersecting_{date_str}.parquet")
    
    # Attach district info for routes, wherever possible
    df = pd.merge(df,
                  (parallel[["itp_id", "route_id", "District"]]
                   .astype({"District": "Int64"})
                  .drop_duplicates(subset=["itp_id", "route_id"])),
                  on = ["itp_id", "route_id"],
                  how = "left",
                  validate = "1:1"
                 )
    
    parallel = parallel[parallel.parallel==1]
    
    # These are routes that have some part on SHN
    # BUT, there is overlap between the parallel
    # Since the requirements here are less stringent than parallel
    # So, remove those that are already parallel
    on_shn = gpd.read_parquet(
        f"{TRAFFIC_OPS_GCS}routes_on_shn_{date_str}.parquet"
    )
    
    # Get it down to unique route (instead of with highway info)
    def get_unique_routes(df):
        # If there are multiple shape_ids for route_id,
        # Keep the one where it's has higher overlap with SHN
        df = (df.sort_values(["itp_id", "route_id", "pct_route", "shape_id"],
                             ascending=[True, True, False, True],
                            )
                    .drop_duplicates(subset=["itp_id", "route_id"])
                    [["itp_id", "route_id", "parallel"]]
                    .reset_index(drop=True)
                   )
        return df
    
    parallel2 = get_unique_routes(parallel)
    on_shn2 = get_unique_routes(on_shn)
    on_shn2 = on_shn2.assign(
        on_shn = 1,
    ).drop(columns = "parallel")
    
    m1 = pd.merge(df, 
                  parallel2,
                  on = ["itp_id", "route_id"],
                  how = "left",
                  validate = "1:1"
    )
    
    m2 = pd.merge(m1,
                  on_shn2,
                  on = ["itp_id", "route_id"],
                  how = "left",
                  validate = "1:1"
    )

    return m2

In [9]:
def mutually_exclusive_groups(df):
    # Now, force mutual exclusivity
    def make_mutually_exclusive(row):
        if row.parallel==1:
            return "parallel"
        elif row.on_shn==1:
            return "on_shn"
        else:
            return "other"
    
    df["category"] = df.apply(lambda x: make_mutually_exclusive(x), axis=1)
    
    df2 = df.assign(
        is_parallel = df.apply(lambda x: 
                               1 if x.category == "parallel" else 0, axis=1),
        is_on_shn = df.apply(lambda x: 
                             1 if x.category == "on_shn" else 0, axis=1),
        is_other = df.apply(lambda x: 
                            1 if x.category == "other" else 0, axis=1),
        # Create a column that can be summed across categories
        # since group_col will include category, and nunique(route_id) double-counts
        unique_route = 1
    ).drop(columns = ["parallel", "on_shn"])
    
    return df2

In [10]:
df = flag_parallel_intersecting_routes(df, date_str)
df = mutually_exclusive_groups(df)

In [11]:
# I think this is the initial table
# Get it summarized to # and %
summary = geography_utils.aggregate_by_geography(
    df, 
    group_cols = ["category"],
    sum_cols = ["total_service_hours", "unique_route"],
)

In [12]:
# Add percents
def add_percent(df, col_list):
    for c in col_list:
        new_col = f"pct_{c}"
        df[new_col] = (df[c] / df[c].sum()).round(3) * 100
        df[c] = df[c].round(0)
        
    return df

# route_id....well, unique route_id is not exactlyy the same as is_parallel,
# is_on_shn, is_other
# Maybe stick with that to be consistent
#add_percent(t1, "route_id")

In [13]:
summary = add_percent(summary, ["total_service_hours", "unique_route"])

col_order = ['category', 'unique_route', 'pct_unique_route',
             'total_service_hours', 'pct_total_service_hours'
            ]

summary[col_order]

Unnamed: 0,category,unique_route,pct_unique_route,total_service_hours,pct_total_service_hours
0,parallel,1595,54.1,2034298.0,49.8
1,on_shn,1116,37.8,1531437.0,37.5
2,other,238,8.1,517520.0,12.7


In [14]:
geography_utils.aggregate_by_geography(
    summary.assign(category="All"),
    group_cols = ["category"],
    sum_cols = ["unique_route", "total_service_hours"]
)

Unnamed: 0,category,total_service_hours,unique_route
0,All,4083255.0,2949


In [15]:
# Where district is missing, it's not parallel routes
# So let's ignore those sections and focus on just parallel and do breakdown
df[(df.District.isna())].category.value_counts()

other     238
on_shn     91
Name: category, dtype: int64

In [16]:
by_district = geography_utils.aggregate_by_geography(
    df[df.category=="parallel"],
    group_cols = ["District"],
    sum_cols = ["total_service_hours", "unique_route"]
)

by_district = (add_percent(by_district, ["total_service_hours", "unique_route"])
               .sort_values("District")
              )

for c in ["pct_total_service_hours", "pct_unique_route"]:
    by_district[c] = by_district[c].round(1)
    
by_district

Unnamed: 0,District,total_service_hours,unique_route,pct_total_service_hours,pct_unique_route
4,1,3319.0,52,0.2,3.3
7,2,1939.0,31,0.1,1.9
3,3,44505.0,103,2.2,6.5
0,4,728703.0,515,35.8,32.3
9,5,18669.0,68,0.9,4.3
6,6,43763.0,90,2.2,5.6
1,7,917716.0,477,45.1,29.9
5,8,2779.0,34,0.1,2.1
8,9,3311.0,17,0.2,1.1
2,10,26002.0,94,1.3,5.9


In [17]:
%%html
<style>
@import url('https://fonts.googleapis.com/css?family=Raleway');
@import url('https://fonts.googleapis.com/css?family=Nunito+Sans');
@import url('https://fonts.googleapis.com/css?family=Bitter');
</style>

In [18]:
def base_bar(df):
    chart = (alt.Chart(df)
             .mark_bar()
             .encode(
                 x=alt.X("District:N", title="District")
             )
            )
    return chart

def make_bar(df, y_col):
    
    y_title = f"{y_col.replace('_', ' ').title()}"
    
    if y_col == "total_service_hours":
        value_format = ",.0f"
    else:
        value_format = ",.1f"
    
    Y_MAX = df[y_col].max() + 30
    
    bar = base_bar(df)
    
    bar = (bar.encode(
        y=alt.Y(f"{y_col}:Q", title=f"{y_title}", 
                scale=alt.Scale(domain=[0, Y_MAX])
               ),
        color=alt.Color("District:N", 
                        scale=alt.Scale(
                            range=cp.CALITP_CATEGORY_BRIGHT_COLORS
                        )
                )
             )
            )
    #https://stackoverflow.com/questions/54015250/altair-setting-constant-label-color-for-bar-chart
    text = (bar
            .mark_text(align="center", baseline="bottom",
                       color="black", dy=-5  
                      )
            .encode(text=alt.Text(y_col, format=value_format), 
                    # Set color here, because encoding for mark_text gets 
                    # superseded by alt.Color
                   color=alt.value("black"))
    )
      
    chart = (bar+text)
    
    chart = (styleguide.preset_chart_config(chart)
             .properties(title= {
                 "text": f"{y_title} by District",
                 "subtitle": "Parallel Routes"
            }).configure_axis(grid=False)
            )
    
    chart.save(f"{utils.IMG_PATH}pmac_{y_col}.png")
    
    display(chart)

In [19]:
by_district = by_district.assign(
    avg_service_hours = by_district.total_service_hours.divide(
        by_district.unique_route).round(1)
)

In [20]:
metrics = [
    "total_service_hours", 
    "avg_service_hours"
]

for m in metrics:
    make_bar(by_district, m)