# Categorize `on_shn`, `parallel` (affected by SHN), and `other`

In [None]:
import geopandas as gpd
import pandas as pd

from update_vars import (ANALYSIS_DATE, 
                         BUS_SERVICE_GCS, COMPILED_CACHED_GCS)

## `on_shn`
* Since `on_shn` is the primary category, and it's drawn with a 50 ft buffer around hwy centerline, no longer need to use `pct_highway` (set `pct_highway > 0`)
* Is 25% too high of a threshold? 
* `pct_route` threshold of 20% and 25% both fall within the top 70%-75% of routes
* Settle on requiring that at least 20% of the route length runs within 50 ft of the hwy (i.e., on the hwy)

In [None]:
# Load the routes-on-SHN table for the analysis date.
shn_path = f"{BUS_SERVICE_GCS}routes_on_shn_{ANALYSIS_DATE}.parquet"
df = gpd.read_parquet(shn_path)

# Report the raw row count next to the number of distinct (itp_id, route_id)
# combinations; the two differ when a route appears on more than one row.
n_shn_rows = len(df)
n_shn_routes = len(df.drop_duplicates(subset=["itp_id", "route_id"]))

print(f"# rows (route_id-Route pairs): {n_shn_rows}")
print(f"# route_id: {n_shn_routes}")

In [None]:
# Keep a single row per (itp_id, route_id). Sorting pct_route in descending
# order within each route means drop_duplicates (which keeps the first row)
# retains the row with the largest pct_route.
sort_cols = ["itp_id", "route_id", "pct_route"]
ranked = df.sort_values(sort_cols, ascending=[True, True, False])
unique_routes = (
    ranked.drop_duplicates(subset=["itp_id", "route_id"])
    .reset_index(drop=True)
)

# Percentiles 0.05, 0.10, ..., 0.95 for the summary table.
ptile = [step / 100 for step in range(5, 100, 5)]

unique_routes.pct_route.describe(percentiles=ptile)

In [None]:
# Count how many routes meet each candidate pct_route cutoff (0.2, 0.25, 0.3).
for pct in (20, 25, 30):
    cutoff = pct / 100
    n_kept = int((unique_routes.pct_route >= cutoff).sum())

    print(f"route threshold: {cutoff} - {n_kept}")

In [None]:
# Candidate route sets at the two cutoffs being compared.
# Every route at >= 0.25 is also at >= 0.20, so the stricter set is
# taken from the looser one.
twenty = unique_routes.loc[unique_routes.pct_route >= 0.20]
twentyfive = twenty.loc[twenty.pct_route >= 0.25]

In [None]:
def make_map(gdf: gpd.GeoDataFrame): 
    """
    Display an interactive map of the routes in `gdf`, colored by `itp_id`.

    Prints the smallest `pct_route` value found in `gdf`, then displays a map
    with one feature per distinct (itp_id, route_id, geometry) row.

    gdf: needs the columns itp_id, route_id, geometry and pct_route.

    Returns None. If `gdf` has no rows, prints a short message and
    skips the map.
    """
    # Nothing to draw for an empty selection (e.g. no routes fall between
    # two thresholds for an operator), so skip the map instead of calling
    # explore() on an empty frame.
    if gdf.empty:
        print("No routes to map")
        return

    cols = ["itp_id", "route_id", "geometry"]

    # The same route geometry can be repeated across rows; draw it once.
    deduped = gdf[cols].drop_duplicates()
    m = deduped.explore("itp_id", categorical=True, tiles="CartoDB Positron")

    print(f"route threshold: {gdf.pct_route.min()}")
    # `display` is not imported in this file; presumably the notebook builtin.
    display(m)

In [None]:
#make_map(twenty)
#make_map(twentyfive)

In [None]:
itp_id = 182

# This operator's routes under each cutoff.
operator_twenty = twenty[twenty.itp_id == itp_id]
operator_twentyfive = twentyfive[twentyfive.itp_id == itp_id]

# route_ids that appear under one cutoff but not the other.
routes_at_25 = set(operator_twentyfive.route_id)
routes_at_20 = set(operator_twenty.route_id)
difference_routes = list(routes_at_25 ^ routes_at_20)

for operator_routes in (operator_twenty, operator_twentyfive):
    make_map(operator_routes)

print("Routes Included if Threshold is 20%")
in_difference = operator_twenty.route_id.isin(difference_routes)
make_map(operator_twenty[in_difference])

In [None]:
itp_id = 4

# This operator's routes under each cutoff.
operator_twenty = twenty[twenty.itp_id == itp_id]
operator_twentyfive = twentyfive[twentyfive.itp_id == itp_id]

# route_ids that appear under one cutoff but not the other.
routes_at_25 = set(operator_twentyfive.route_id)
routes_at_20 = set(operator_twenty.route_id)
difference_routes = list(routes_at_25 ^ routes_at_20)

for operator_routes in (operator_twenty, operator_twentyfive):
    make_map(operator_routes)

print("Routes Included if Threshold is 20%")
in_difference = operator_twenty.route_id.isin(difference_routes)
make_map(operator_twenty[in_difference])

## `parallel`

* These are routes that are affected by SHN, where bottlenecks might occur because bus routes have to pass through where there are on-ramps. 
* Use a 0.5 mile buffer from SHN, and see whether threshold should be 30%? 20%? lower? higher?
* It's much more marginal to add a couple more routes in this category, so go with 20%, which will grab about 60% of the 260 routes that were tagged as being `parallel` and are not `on_shn`

In [None]:
# Load the parallel-or-intersecting table for the analysis date.
parallel_path = f"{BUS_SERVICE_GCS}parallel_or_intersecting_{ANALYSIS_DATE}.parquet"
df2 = gpd.read_parquet(parallel_path)

# Report the raw row count next to the number of distinct (itp_id, route_id)
# combinations; the two differ when a route appears on more than one row.
n_parallel_rows = len(df2)
n_parallel_routes = len(df2.drop_duplicates(subset=["itp_id", "route_id"]))

print(f"# rows (route_id-Route pairs): {n_parallel_rows}")
print(f"# route_id: {n_parallel_routes}")

In [None]:
# Keep a single row per (itp_id, route_id). Sorting pct_route in descending
# order within each route means drop_duplicates (which keeps the first row)
# retains the row with the largest pct_route.
ranked2 = df2.sort_values(
    ["itp_id", "route_id", "pct_route"],
    ascending=[True, True, False],
)
unique_routes2 = (
    ranked2.drop_duplicates(subset=["itp_id", "route_id"])
    .reset_index(drop=True)
)

In [None]:
route_cols = ["itp_id", "route_id"]

# on_shn: routes whose pct_route is at least 0.2.
# pct_route is renamed so it does not clash with the pct_route column of the
# parallel table when the two are merged on route_cols.
# (A `category` column used to be assigned here, but the column selection
# dropped it right away, so that step is removed.)
meets_cutoff = unique_routes.pct_route >= 0.2

on_shn = (unique_routes[meets_cutoff]
          .rename(columns = {"pct_route": "pct_route_on_hwy"})
          [route_cols + ["pct_route_on_hwy"]]
         )

In [None]:
# Outer-join the on_shn routes with the de-duplicated parallel table.
# `_merge` then records where each route came from: "both", "left_only"
# (on_shn only) or "right_only" (parallel table only).
# validate="1:1" raises if either side repeats an (itp_id, route_id) key.
parallel_pct = unique_routes2[route_cols + ["pct_route"]]

unique_routes3 = pd.merge(
    on_shn,
    parallel_pct,
    how = "outer",
    on = route_cols,
    indicator = True,
    validate = "1:1",
)

unique_routes3["_merge"].value_counts()

In [None]:
# Percentiles 0.05, 0.10, ..., 0.95 for the summary table.
ptile = [step / 100 for step in range(5, 100, 5)]

# Distribution of pct_route among routes found only in the parallel table
# (i.e. not already in on_shn).
not_on_shn = unique_routes3[unique_routes3._merge == "right_only"]
not_on_shn.pct_route.describe(percentiles=ptile)

In [None]:
# There's quite a lot more routes we can add
# in this intersects_shn group
# For each candidate cutoff, count the routes that are only in the parallel
# table and meet the cutoff, then map them.
is_parallel_only = unique_routes3._merge == "right_only"

for r in range(20, 35, 5):
    subset = unique_routes3[is_parallel_only &
                            (unique_routes3.pct_route >= r/100)]

    print(f"route threshold: {r/100} - {len(subset)}")

    # Routes are keyed by (itp_id, route_id) throughout this notebook, so
    # select the map rows on both columns. Matching on route_id alone would
    # also pull in another operator's route that happens to share the id
    # but did not pass the threshold.
    key_cols = ["itp_id", "route_id"]
    make_map(df2.merge(subset[key_cols], on=key_cols, how="inner"))

It depends on whether we want to add another 50% of the 1,900 routes (`pct_route >= 0.35`).

Looking at LA Metro, we do want to grab all the routes that span big boulevards, and don't want to be too restrictive. Stick with `pct_route >= 0.35`, since that's close to 1/3 of the route, and gives more options for improvements. Grabbing another 50% of the 1,900 routes is ok.

In [None]:
# Number of parallel-only routes (not in on_shn) per operator.
parallel_only = unique_routes3.loc[unique_routes3["_merge"] == "right_only"]
parallel_only["itp_id"].value_counts()

In [None]:
itp_id = 182

# Rows of the parallel table that belong to this operator.
operator_df2 = df2[df2.itp_id == itp_id]

for r in range(25, 50, 5):
    # Routes (all operators) that are only in the parallel table and meet
    # the cutoff; the printed count is for all operators.
    subset = unique_routes3[(unique_routes3._merge=="right_only") &
                            (unique_routes3.pct_route >= r/100)]

    print(f"route threshold: {r/100} - {len(subset)}")

    # Restrict the qualifying routes to this operator before matching on
    # route_id. Otherwise a route_id that qualifies for a different
    # operator would also select this operator's route of the same id,
    # even if it is below the threshold.
    operator_subset = subset[subset.itp_id == itp_id]

    make_map(operator_df2[operator_df2.route_id.isin(operator_subset.route_id)])