## Find routes that cross multiple districts

In [1]:
import A1_provider_prep
import A2_other
import A3_analysis
import dask.dataframe as dd
import dask_geopandas as dg
import geopandas as gpd
import numpy as np
import pandas as pd
import shapely.wkt
import shared_utils
from calitp.sql import to_snakecase
from shared_utils import geography_utils, utils



In [2]:
# Notebook-wide pandas display settings: show up to 100 columns, every row,
# untruncated cell text, and floats rounded to two decimals.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [3]:
# Load the three route frames returned by the project helper: routes in a single
# district, routes spanning several districts, and the combined set.
one_dist_routes, multi_dist_routes, all_routes = A2_other.find_multi_district_routes()
# District frame used below for mapping and for subsetting by district number.
districts = A1_provider_prep.get_districts()

### Clip
* How can I keep district info, or record which district each route falls in?
* Are unique routes and districts in the right order (left vs. right)?
* How can some routes add up to less than 90% even after grouping by long route name?
* Must a route fall completely within ONE district?

In [5]:
# Peek at one random single-district route; geometry is dropped to keep the display compact.
one_dist_routes.drop(columns = ["geometry"]).sample()

Unnamed: 0,itp_id,route_id,route_name,agency,original_route_length,long_route_name,clipped_route_length,route_percentage,District
1683,171,13058,,Avocado Heights/Bassett/West Valinda Shuttle,38698.8,None 13058 Avocado Heights/Bassett/West Valinda Shuttle,38698.8,100,D-7


In [6]:
# Sanity check: the single- and multi-district subsets together should account
# for every row in all_routes.
len(one_dist_routes) + len(multi_dist_routes) == len(all_routes)

True

In [7]:
# Distinct route names vs. total rows in the multi-district frame.
# Presumably rows exceed names because a route is repeated for each district it crosses — TODO confirm.
multi_dist_routes.long_route_name.nunique(), len(multi_dist_routes)

(90, 183)

In [8]:
# Base interactive map: district polygons colored by the `district` column.
n = districts.explore(
    "district",
    tiles = "CartoDB positron",
    cmap = "Blues",
    name="districts",
    width=800,
    height=400,
)

In [9]:

# Overlay the multi-district routes, colored by route name, on the existing district map (m=n).
# NOTE(review): `n` is both input and output here, so re-running this cell likely
# adds the route layer again — re-run the base-map cell first.
n = multi_dist_routes.explore("long_route_name",
    m=n, legend = False
)

In [11]:
# n

#### Clip Testing w/ D4

In [None]:
# Try with clip instead of sjoin: cut every route down to the part inside District 4.
# The inputs are defined here so the cell runs on a fresh kernel; `d4` was otherwise
# only created further down the notebook and `unique_routes` was never assigned
# at notebook level.
d4 = districts[districts.district == 4]
# NOTE(review): loader assumed to live in A2_other (the only `*_other` module imported) — confirm.
unique_routes = A2_other.load_unique_routes_df()
d4_clip = gpd.clip(unique_routes, d4)

In [None]:
# Add the length of each clipped geometry, measured after projecting to CA State Plane.
d4_clip2 = d4_clip.assign(
    clip_route_length=lambda clipped: clipped.geometry.to_crs(
        geography_utils.CA_StatePlane
    ).length
)

In [None]:
# Share of the original route length that remains after clipping to D4,
# cast to a whole-number percent.
d4_clip2["route_percentage"] = (
    d4_clip2["clip_route_length"]
    .div(d4_clip2["original_route_length"])
    .mul(100)
    .astype("int64")
)

In [None]:
# (rows, columns) of the clipped D4 frame.
d4_clip2.shape

In [None]:
# d4_clip2.route_percentage.value_counts()

In [None]:
# Keep only routes with less than 85% of their length inside D4, lowest coverage first.
low_coverage_d4 = d4_clip2[d4_clip2["route_percentage"] < 85].sort_values(
    by="route_percentage"
)

In [None]:
# Collect the names of routes whose clipped length is under 85% of the original
# route length (the filter above is strictly `< 85`) into a list.
low_coverage_d4_list = low_coverage_d4.long_route_name.unique().tolist()

In [None]:
# Pull the low-coverage routes back out of unique_routes so they can be mapped
# with their original, unclipped geometries.
low_coverage_d4_og_geometry = unique_routes.loc[
    unique_routes["long_route_name"].isin(low_coverage_d4_list)
].reset_index(drop=True)

In [None]:
# (rows, columns) of the low-coverage routes with original geometry.
low_coverage_d4_og_geometry.shape

In [None]:
# Layering approach from:
# https://stackoverflow.com/questions/73767559/geopandas-explore-how-to-plot-more-than-one-layer
# Certain routes in here are clearly only in D4 yet their percentage is low...

# Layer 1: the D4 polygon as the base map
m = d4.explore(
    color="#F4D837", tiles="CartoDB positron", name="D4", width=800, height=500
)

# Layer 2: low-coverage routes drawn with their original (full-length) geometry
m = low_coverage_d4_og_geometry.explore(m=m)

# Layer 3: the same routes drawn with their clipped geometry, in semi-transparent red
m = low_coverage_d4.explore(
    m=m,
    color="red",
    style_kwds={"opacity": 0.5, "weight": 3},
)

In [None]:
# Render the layered D4 map.
m

In [None]:
# check to see if 2 columns are the same
# d4_clip_routes_set = set(low_coverage_d4_list)
# low_percent_routes_set = set(low_percent_routes_list)

In [None]:
# d4_clip_routes_set - low_percent_routes_set

In [None]:
# Check some of the routes that are found in d4 test versus clip_test
# How is CC Rider 231 now 95% in a certain district?
# clip_test.drop(columns = ["geometry"])[clip_test.long_route_name == 'CC Rider 231  Mendocino Transit Authority']

In [None]:
# d4_clip2.drop(columns = ["geometry"])[d4_clip2.long_route_name == 'CC Rider 231  Mendocino Transit Authority']

In [None]:
# clip_test.drop(columns = ["geometry"])[clip_test.long_route_name == 'South Coast / Santa Rosa 224  Mendocino Transit Authority']

In [None]:
# Check routes with test d4
# d4_clip2.drop(columns = ["geometry"])[d4_clip2.long_route_name == 'South Coast / Santa Rosa 224  Mendocino Transit Authority']

### Sjoin - Test
* Ended up using clip instead.
* Place unique routes on the L because I want the routes that fall within each district.
* Intersects only returns routes that are 100% in a district. Should I use something else (within, contains, etc.)?
* Sjoin does *not* change geometry.

In [32]:
def sjoin_route_district(district_df: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
    """
    Spatially join every unique route to the district polygon(s) it intersects.

    Args:
        district_df: district polygons to join against (for example a
            single-district subset of the districts frame).

    Returns:
        The joined routes with the district columns attached, plus:
        * sjoin_route_length: length of the row's geometry after projecting
          to CA State Plane.
        * route_percentage: sjoin_route_length as a whole-number percent of
          original_route_length.

    Note:
        sjoin keeps the left (route) geometry unchanged, so sjoin_route_length
        measures the full route rather than the portion inside the district.
        The "intersects" predicate also matches routes that only partly
        overlap a district.
    """
    # The loader was previously referenced through `A3_other`, a name this
    # notebook never imports (NameError at call time).
    # NOTE(review): assumed to live in A2_other, the only `*_other` module imported — confirm.
    unique_routes = A2_other.load_unique_routes_df()
    sjoin = gpd.sjoin(
        unique_routes, district_df, how="inner", predicate="intersects"
    ).drop(columns="index_right")

    # Get route length after doing sjoin
    sjoin = sjoin.assign(
        sjoin_route_length=sjoin.geometry.to_crs(geography_utils.CA_StatePlane).length
    )

    # Get %
    sjoin["route_percentage"] = (
        (sjoin["sjoin_route_length"] / sjoin["original_route_length"]) * 100
    ).astype("int64")

    return sjoin

In [33]:
def complete_sjoin_route_district() -> gpd.GeoDataFrame:
    """
    Run the route/district spatial join once per district (1 through 12)
    and stack the per-district results into a single in-memory frame.

    Returns:
        All per-district sjoin results concatenated row-wise. A route that
        intersects several districts appears once per district; no filtering
        for multi-district routes is applied here.
    """
    district_df = A1_provider_prep.get_districts()

    # Collect each district's result and concatenate once, instead of growing
    # a frame inside the loop through dask and computing it at the end.
    per_district_results = [
        sjoin_route_district(district_df[district_df.district == district_number])
        for district_number in range(1, 13)
    ]

    return pd.concat(per_district_results, axis=0)

In [34]:
# sjoin_test =

#### Sjoin Testing w/ D4

In [35]:
# Keep only the District 4 rows of the districts frame.
d4 = districts.loc[districts["district"] == 4]

In [36]:
# Sjoin
# Should it be within? Gives me only routes that fall 100% in d4
# d4_sjoin = gpd.sjoin(unique_routes, d4, how="inner", predicate="intersects").drop(
#     columns="index_right"
# )

In [37]:
# d4_sjoin2.drop(columns = ['geometry']).sort_values("percentage_route_covered")

In [38]:
# d4_sjoin.shape

In [39]:
# d4_sjoin.agency.value_counts()

### Scratch Area

In [None]:
def clip_route_district():
    """
    Clip all unique routes to the district polygons and measure how much
    of each route's original length survives the clip.

    The clip is done once against the full districts frame, not district by
    district.

    Returns:
        The clipped routes plus:
        * clipped_route_length: length of the clipped geometry after
          projecting to CA State Plane.
        * route_percentage: clipped_route_length as a whole-number percent of
          original_route_length.
    """
    # Load unique routes & districts.
    # The loader was previously referenced through `A3_other`, a name this
    # notebook never imports (NameError at call time).
    # NOTE(review): assumed to live in A2_other, the only `*_other` module imported — confirm.
    unique_routes = A2_other.load_unique_routes_df()
    district_df = A1_provider_prep.get_districts()

    # Clip routes against all districts at once
    clipped = gpd.clip(unique_routes, district_df)

    # Get route length after doing clip
    clipped = clipped.assign(
        clipped_route_length=clipped.geometry.to_crs(
            geography_utils.CA_StatePlane
        ).length
    )

    # Get %
    clipped["route_percentage"] = (
        (clipped["clipped_route_length"] / clipped["original_route_length"]) * 100
    ).astype("int64")

    return clipped

In [None]:
# clip_d1_d4 = complete_clip_route_district()