## Find routes that cross multiple districts

In [None]:
import A1_provider_prep
import A2_analysis
import A3_other
import dask.dataframe as dd
import dask_geopandas as dg
import geopandas as gpd
import numpy as np
import pandas as pd
import shapely.wkt
import shared_utils
from calitp.sql import to_snakecase
from shared_utils import geography_utils, utils

In [None]:
# Notebook display settings: show up to 100 columns, every row, untruncated
# cell contents, and floats formatted with two decimals.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [None]:
unique_routes = A3_other.load_unique_routes_df()

In [None]:
len(unique_routes), unique_routes.long_route_name.nunique()

In [None]:
districts = A1_provider_prep.get_districts()

In [None]:
districts.crs == unique_routes.crs

### Sjoin
* Place unique routes on the left (L) because I want the routes that fall within each district.
* `intersects` returns every route that touches a district at all, not only routes that are 100% in a district. Should I use something else (`within`, `contains`, etc.)?

In [None]:
def sjoin_route_district(district_df: gpd.GeoDataFrame):
    """
    Spatially join the unique routes (left) to the district polygons
    passed in (right), keeping every route that intersects a district.

    Two columns are added to the joined rows:
    * sjoin_route_length: length of the row's geometry after projecting
      to geography_utils.CA_StatePlane.
    * route_percentage: sjoin_route_length as a percent of
      original_route_length, cast to a whole number.

    NOTE(review): a spatial join keeps the left (route) geometry as is,
    so the length measured here looks like the full route's length rather
    than the portion inside the district -- confirm before relying on
    route_percentage.
    """
    routes = A3_other.load_unique_routes_df()

    # "intersects" matches any route touching the district, not only
    # routes that lie completely inside it.
    joined = gpd.sjoin(routes, district_df, how="inner", predicate="intersects")
    joined = joined.drop(columns="index_right")

    # Measure each joined row's geometry in the projected CRS.
    projected = joined.geometry.to_crs(geography_utils.CA_StatePlane)
    joined = joined.assign(sjoin_route_length=projected.length)

    # Share of the original route length, as an integer percent.
    ratio = joined["sjoin_route_length"] / joined["original_route_length"]
    joined["route_percentage"] = (ratio * 100).astype("int64")

    return joined

In [None]:
def complete_sjoin_route_district() -> gpd.GeoDataFrame:
    """
    Spatially join the unique routes to each district, one district at a
    time, and stack the per-district results into a single frame.

    A route that intersects several districts appears once per district,
    so repeated long_route_name values mark routes that cross district
    boundaries. No filtering is applied here; the stacked result is
    returned in full.

    Returns:
        The concatenated sjoin_route_district results for district
        numbers 1 through 12, as an in-memory frame.
    """
    district_df = A1_provider_prep.get_districts()

    # Gather every per-district result first and concatenate once. This
    # avoids seeding the result with an empty plain DataFrame and avoids
    # building a dask graph just to stack twelve in-memory frames.
    per_district = [
        sjoin_route_district(district_df[district_df.district == number])
        for number in range(1, 13)
    ]

    return pd.concat(per_district, axis=0)

In [None]:
# sjoin_test =

#### Sjoin Testing w/ D4

In [None]:
# Subset to D4
d4 = districts[districts.district == 4]

In [None]:
# Sjoin
# Should the predicate be "within" instead? That would give me only routes that fall 100% in D4
d4_sjoin = gpd.sjoin(unique_routes, d4, how="inner", predicate="intersects").drop(
    columns="index_right"
)

In [None]:
# d4_sjoin2.drop(columns = ['geometry']).sort_values("percentage_route_covered")

In [None]:
d4_sjoin.shape

In [None]:
d4_sjoin.agency.value_counts()

### Clip
* How can I keep district info? Or incorporate which district the route falls in?
* Are unique routes and districts in the right order, L and R?
* How can there be routes that don't add up to even 90% after grouping by long route name??
* A route must fall under ONE district completely?

In [None]:
def clip_route_district(district_df: gpd.GeoDataFrame = None) -> gpd.GeoDataFrame:
    """
    Clip the unique routes to district boundaries and measure how much of
    each route remains after the clip.

    Args:
        district_df: polygons to clip the routes against, for example the
            rows of a single district. When omitted, every district from
            A1_provider_prep.get_districts() is used.

    Returns:
        The clipped routes with two added columns:
        * clipped_route_length: length of the clipped geometry after
          projecting to geography_utils.CA_StatePlane.
        * route_percentage: clipped_route_length as a percent of
          original_route_length, cast to a whole number.

    NOTE(review): gpd.clip treats the mask as one combined shape, so when
    several districts are passed the result does not say which district a
    route segment falls in. Pass one district at a time to keep that
    information.
    """
    # Load unique routes; fall back to all districts when no mask is given
    unique_routes = A3_other.load_unique_routes_df()
    if district_df is None:
        district_df = A1_provider_prep.get_districts()

    # Clip routes against the district polygons
    clipped = gpd.clip(unique_routes, district_df)

    # Get route length after doing clip
    clipped = clipped.assign(
        clipped_route_length=clipped.geometry.to_crs(
            geography_utils.CA_StatePlane
        ).length
    )

    # Get %
    clipped["route_percentage"] = (
        (clipped["clipped_route_length"] / clipped["original_route_length"]) * 100
    ).astype("int64")

    return clipped

In [None]:
# Started 10:15, ended at 10:16
clip_test = clip_route_district()

In [None]:
# clip_test = complete_clip_route_district()

In [None]:
# How are there more routes now?
# Original unique routes only has 2829 rows, now there are  2914 rows
# Only 2821 unique long route names here but original unique routes has 2829 unique routes
clip_test.shape, clip_test.long_route_name.nunique()

In [None]:
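# Row-count difference between the clipped result and the original routes
# (per the counts above: 2914 - 2829 = 85 extra rows; the 8 "missing" are unique route names, 2829 - 2821)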
len(clip_test) - len(unique_routes)

In [None]:
# Understanding why there are fewer unique routes after clip_test (2821 vs. 2829 above)...
# The routes not saved after the clip are the ones in Arizona!
clip_routes_list = set(clip_test.long_route_name.unique().tolist())
unique_routes_list = set(unique_routes.long_route_name.unique().tolist())

In [None]:
unique_routes_list - clip_routes_list, len(unique_routes_list - clip_routes_list)

In [None]:
# Just checking routes that have less than 85% of their length in a district
low_percent = (
    clip_test.loc[clip_test.route_percentage < 85].sort_values("route_percentage")
).reset_index(drop=True)

In [None]:
# Strange, I would assume there would be duplicate long route names
# since parts of a route would fall in different districts and be
# split into 1+ rows.
len(low_percent),low_percent.long_route_name.nunique()

In [None]:
# Look at the routes with less than 85% in a district, make sure the percentage adds up to 100%
grouped_low_percent = (
    low_percent.groupby(["long_route_name"])
    .agg({"route_percentage": "sum"})
    .reset_index()
)


In [None]:
# Looking at Glenn-Tehama Connect 12874 Tehama Rural Area eXpress, it falls mostly in D2 and a bit in D3.
# grouped_low_percent.sort_values("route_percentage")

In [None]:
# Grab route names with low percentage
low_percent_routes_list = low_percent.long_route_name.unique().tolist()

In [None]:
# Filter the routes above in the original routes df to get original geometry
low_percent_routes_og_geo = (
    unique_routes[unique_routes["long_route_name"].isin(low_percent_routes_list)]
).reset_index(drop=True)

In [None]:
# Base layer: the districts, shaded by district number. It has to be bound
# to `m`, because the route layer below is drawn onto it through `m=m`.
m = districts.explore(
    "district",
    tiles="CartoDB positron",
    cmap="Blues",
    name="districts",
    width=800,
    height=400,
)
# Overlay the original (unclipped) geometry of the low-percentage routes.
m = low_percent_routes_og_geo.explore(
    m=m,
)

In [None]:
m

#### Clip Testing w/ D4

In [None]:
# Try with clip instead
d4_clip = gpd.clip(unique_routes, d4)

In [None]:
d4_clip2 = d4_clip.assign(
    clip_route_length=d4_clip.geometry.to_crs(geography_utils.CA_StatePlane).length
)

In [None]:
# Get %
d4_clip2["route_percentage"] = (
    (d4_clip2["clip_route_length"] / d4_clip2["original_route_length"]) * 100
).astype("int64")

In [None]:
d4_clip.shape

In [None]:
d4_clip2.route_percentage.value_counts()

In [None]:
# Filter for only routes that have less than 85% of their length in D4
low_coverage_d4 = d4_clip2.loc[d4_clip2.route_percentage < 85].sort_values(
    "route_percentage"
)

In [None]:
# Grab the routes with less than 85% of their original route length in D4 into a list
low_coverage_d4_list = low_coverage_d4.long_route_name.unique().tolist()

In [None]:
# Filter the original unique_routes down to the routes above
# so I can map the original route geometries
low_coverage_d4_og_geometry = (
    unique_routes[unique_routes["long_route_name"].isin(low_coverage_d4_list)]
).reset_index(drop=True)

In [None]:
low_coverage_d4_og_geometry.shape

In [None]:
# https://stackoverflow.com/questions/73767559/geopandas-explore-how-to-plot-more-than-one-layer
# Routes plotted with original geometry
# Certain routes in here are clearly only in D4 yet their percentage is low...
m = d4.explore(
    color="#F4D837",
    tiles="CartoDB positron",
    name="D4",
    width=800,
    height=500,
)
m = low_coverage_d4_og_geometry.explore(
    m=m,
)

In [None]:
m

### D4 Clip has routes that are different than the Clip_Test? How? 


In [None]:
# Check whether the two sets of low-percentage route names are the same
d4_clip_routes_set = set(low_coverage_d4_list)
low_percent_routes_set = set(low_percent_routes_list)

In [None]:
d4_clip_routes_set - low_percent_routes_set

In [None]:
def complete_clip_route_district() -> gpd.GeoDataFrame:
    """
    Clip the unique routes against each district separately, then stack
    the per-district results back together.

    Clipping one district at a time keeps track of which district each
    clipped piece belongs to: the district number is stored in a
    `district` column. A route that crosses district boundaries therefore
    appears once per district it passes through.

    Returns:
        The stacked clipped routes for district numbers 1 through 12, as
        an in-memory frame, with these added columns:
        * district: number of the district the row was clipped to.
        * clipped_route_length: length of the clipped geometry after
          projecting to geography_utils.CA_StatePlane.
        * route_percentage: clipped_route_length as a percent of
          original_route_length, cast to a whole number.
    """
    # Load routes and districts once, outside the loop
    unique_routes = A3_other.load_unique_routes_df()
    district_df = A1_provider_prep.get_districts()

    clipped_parts = []
    for district_number in range(1, 13):
        one_district = district_df[district_df.district == district_number]

        # Clip here rather than calling clip_route_district, which takes
        # no district argument.
        clipped = gpd.clip(unique_routes, one_district)

        clipped = clipped.assign(
            district=district_number,
            clipped_route_length=clipped.geometry.to_crs(
                geography_utils.CA_StatePlane
            ).length,
        )

        # Get %
        clipped["route_percentage"] = (
            (clipped["clipped_route_length"] / clipped["original_route_length"])
            * 100
        ).astype("int64")

        clipped_parts.append(clipped)

    # One concat at the end; the pieces are already in memory, so no dask
    # graph or .compute() is needed.
    return pd.concat(clipped_parts, axis=0)