# Grab transit routes near the State Highway Network (SHN)

Find transit routes within 1 mile of the SHN.

Data Sources: 
* [SHN on Geoportal](https://opendata.arcgis.com/datasets/77f2d7ba94e040a78bfbe36feb6279da_0.geojson) > processed in `highway_transit_utils.py` > exported to GCS > saved in catalog.
* Transit routes: all transit routes, both those in `gtfs_schedule.shapes` and those not in it but found in `stops`. Uses the `routes_assembled.parquet` file created by `traffic_ops/export_shapefiles.py` in GCS > saved in catalog.

In [None]:
import geopandas as gpd
import intake
import matplotlib.pyplot as plt
import pandas as pd

import highway_transit_utils
from shared_utils import geography_utils

# Local folders used for figures and for the cached parquet files.
DATA_PATH = "./data/"
IMG_PATH = "./img/"

# Data catalog opened from the YAML file(s) matching "*.yml".
catalog = intake.open_catalog("*.yml")

In [None]:
# Presumably a one-off step, kept commented out: build the overlay and cache
# the inputs and the result as parquet files under DATA_PATH.
# NOTE(review): `highways` and `transit_routes` are not assigned before this
# point in the notebook -- confirm where they come from before uncommenting.
#overlay = highway_transit_utils.overlay_transit_to_highways()

#highways.to_parquet(f"{DATA_PATH}highways.parquet")
#transit_routes.to_parquet(f"{DATA_PATH}transit_routes.parquet")
#overlay.to_parquet(f"{DATA_PATH}overlay.parquet")

In [None]:
# Load the cached parquet files: highways, transit routes, and their overlay.
highways = gpd.read_parquet(f"{DATA_PATH}highways.parquet")
transit_routes = gpd.read_parquet(f"{DATA_PATH}transit_routes.parquet")
gdf = gpd.read_parquet(f"{DATA_PATH}overlay.parquet")

# Length of each overlay geometry as a share of the route length and of the
# highway length stored on the same row.
gdf = gdf.assign(
    pct_route=lambda d: d.geometry.length / d.route_length,
    pct_highway=lambda d: d.geometry.length / d.highway_length,
)

gdf.head(2)

In [None]:
# ITP ID -> operator name for the operators that get mapped.
MAP_ME = {
    182: "LA Metro",
    294: "SJ Valley Transportation Authority",
    279: "BART",
    282: "SF Muni",
    278: "SD Metropolitan Transit System",
}

for itp_id, operator_name in MAP_ME.items():
    operator_overlay = gdf[gdf.itp_id == itp_id]
    operator_routes = transit_routes[transit_routes.itp_id == itp_id]

    # Route counts for this operator: all routes vs. routes in the overlay.
    print(f"# routes originally for {itp_id}: {operator_routes.route_id.nunique()}")
    print(f"# routes for {itp_id}: {operator_overlay.route_id.nunique()}")

    # One map per operator, colored by route_id.
    fig, ax = plt.subplots(figsize=(12, 8))
    operator_overlay.plot(column="route_id", ax=ax)
    ax.set_axis_off()
    plt.title(f"{operator_name} (ITP ID: {itp_id})")
    display(fig)
    plt.close()

In [None]:
# Summary statistics for pct_highway across all overlay rows.
gdf["pct_highway"].describe()

In [None]:
# Summary statistics for pct_route across all overlay rows.
gdf["pct_route"].describe()

In [None]:
# Summary statistics for pct_highway, limited to rows where it exceeds 1.
gdf.loc[gdf["pct_highway"] > 1, "pct_highway"].describe()

In [None]:
# Count the overlay rows with pct_highway above 1, and their share of all rows.
n_rows = len(gdf)
n_over_one = len(gdf[gdf.pct_highway > 1])

print(f"# rows: {n_rows}")
print(f"# rows with pct_highway > 1 (wrong?!): {n_over_one}")
print(f"% rows with pct_highway > 1 (wrong?!): {n_over_one / n_rows}")

There are some rows that have `pct_highway` > 1. `pct_route` falls between 0-1, and that's correct. `pct_highway` > 1 means that the transit route runs along that highway, most likely in both directions. 

LA Metro Line 910 (Silver Line) runs on the 110 freeway in both directions.

This happens because highway length was calculated for only 1 direction (similar to a centerline). We can simply set every `pct_highway` > 1 to 1: the route runs the entirety of the highway segment, and once it hits the max, we can store that as 1. The length was calculated first, then the 1 mile buffer was drawn, and then the dissolve was done (dissolve is computationally expensive).

In [None]:
# Rows for ITP ID 182 (LA Metro) with County == "LA" where pct_highway
# exceeds 1, showing only the columns needed to inspect the lengths.
gdf.loc[
    (gdf.itp_id == 182) & (gdf.County == "LA") & (gdf.pct_highway > 1),
    ["route_id", "route_length", "Route", "highway_length",
     "geometry", "pct_route", "pct_highway"],
]

In [None]:
# Cap pct_highway at 1. Series.clip does this in a single vectorized call
# rather than running a Python lambda on every row; values at or below 1
# (and missing values) are left unchanged.
gdf = gdf.assign(
    pct_highway=gdf.pct_highway.clip(upper=1),
)

gdf.head(2)

In [None]:
def parallel_or_intersecting(df, pct_route_threshold=0.5, pct_highway_threshold=0.1):
    """Flag each row as parallel (1) or not (0).

    A row is flagged 1 when its `pct_route` is at least
    `pct_route_threshold` AND its `pct_highway` is at least
    `pct_highway_threshold`; otherwise it is flagged 0.

    Args:
        df: DataFrame with `pct_route` and `pct_highway` columns.
        pct_route_threshold: minimum `pct_route` for a row to be flagged.
        pct_highway_threshold: minimum `pct_highway` for a row to be flagged.

    Returns:
        A copy of `df` with an integer `parallel` column added. The input
        frame is not modified.
    """
    # Vectorized comparison instead of a row-wise apply; this also works on
    # an empty frame. Missing values compare False, so they are flagged 0.
    is_parallel = (
        (df.pct_route >= pct_route_threshold)
        & (df.pct_highway >= pct_highway_threshold)
    )

    return df.assign(parallel=is_parallel.astype(int))

In [None]:
# State highway network from the catalog, reprojected to
# geography_utils.CA_StatePlane; drawn in gray underneath each map.
orig_highways = (
    catalog.state_highway_network.read()
    .to_crs(geography_utils.CA_StatePlane)
)

# Try several pct_highway thresholds while holding pct_route_threshold at 0.4.
# The two loops use distinct variable names so the operator ID in the inner
# loop never overwrites the threshold of the outer loop.
for threshold in [0, 0.05, 0.1, 0.15, 0.2]:
    gdf2 = parallel_or_intersecting(
        gdf,
        pct_route_threshold=0.4,
        pct_highway_threshold=threshold,
    )
    print(f"highway threshold: {threshold}")
    print("------------------------------------")
    print(gdf2.parallel.value_counts())
    print(f"%: {len(gdf2[gdf2.parallel==1]) / len(gdf2)}")

    for itp_id, name in MAP_ME.items():
        subset_df = gdf2[gdf2.itp_id == itp_id]

        print(f"# routes for {itp_id}: {subset_df.route_id.nunique()}")
        if subset_df.empty:
            # No rows for this operator: skip the share (it would divide by
            # zero) and the map.
            continue

        # NOTE: despite the label, this prints the share of this operator's
        # rows flagged parallel, not a count of routes.
        print(f"# routes parallel: {len(subset_df[subset_df.parallel==1]) / len(subset_df)}")

        fig, ax = plt.subplots(figsize=(12, 8))

        # Highways whose Route and whose County each appear somewhere in the
        # operator's rows (the two columns are matched independently, not as
        # pairs), keeping one row per Route/County combination.
        nearby_highways = orig_highways[
            (orig_highways.Route.isin(subset_df.Route))
            & (orig_highways.County.isin(subset_df.County))
        ].drop_duplicates(subset=["Route", "County"])
        nearby_highways.plot(ax=ax, color="gray")

        subset_df.plot(column="parallel", ax=ax,
                       categorical=True, legend=True)

        ax.set_axis_off()

        plt.title(f"{name} (ITP ID: {itp_id}, parallel vs intersecting)")
        display(fig)
        # Close this figure explicitly so figures do not pile up across loops.
        plt.close(fig)