## Find routes that cross multiple districts

In [223]:
import dask.dataframe as dd
import dask_geopandas as dg
import geopandas as gpd
import numpy as np
import pandas as pd
import shapely.wkt
from calitp.sql import to_snakecase
from shared_utils import geography_utils, utils

import A1_provider_prep
import A2_analysis
import A3_other

In [224]:
# Notebook display settings: show up to 100 columns, every row, full cell
# contents, and floats formatted with two decimals.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [225]:
unique_routes = A3_other.load_unique_routes_df()

In [226]:
len(unique_routes), unique_routes.long_route_name.nunique()

(2829, 2829)

In [227]:
districts = A1_provider_prep.get_districts()

In [228]:
districts.crs == unique_routes.crs

True

### Sjoin
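* Place unique routes on the left (L) side of the join because I want the routes that fall within each district.
* With `intersects`, every returned route shows as 100% in a district: the join keeps the whole route geometry for any route that touches the district, so the length ratio cannot drop below 100%. Should I use something else (within, contains, etc.)?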

In [229]:
def sjoin_route_district(district_df: gpd.GeoDataFrame):
    """
    Spatially join all unique routes to the district polygon(s) passed in
    and attach length-based coverage columns.

    Routes are the left frame of an inner join using the "intersects"
    predicate, so a route is kept whenever it intersects `district_df`.

    NOTE(review): sjoin keeps the left (route) geometry unchanged, so
    `sjoin_route_length` looks like the full route length rather than the
    part inside the district -- confirm before relying on
    `route_percentage`.

    Returns the joined frame with `sjoin_route_length` (geometry length
    after reprojecting to geography_utils.CA_StatePlane) and
    `route_percentage` (sjoin length / `original_route_length` * 100,
    cast to int64).
    """
    routes = A3_other.load_unique_routes_df()

    joined = gpd.sjoin(
        routes, district_df, how="inner", predicate="intersects"
    )
    joined = joined.drop(columns="index_right")

    # Measure lengths in the projected CRS rather than the stored one.
    projected = joined.geometry.to_crs(geography_utils.CA_StatePlane)
    joined = joined.assign(sjoin_route_length=projected.length)

    # The int64 cast drops the fractional part of the percentage.
    share = joined["sjoin_route_length"] / joined["original_route_length"]
    joined["route_percentage"] = (share * 100).astype("int64")

    return joined

In [230]:
def complete_sjoin_route_district() -> gpd.GeoDataFrame:
    """
    Run `sjoin_route_district` for districts 1 through 12 and stack the
    per-district results into one frame.

    A route that intersects several districts appears once per district.
    No filtering happens here; callers that only want routes falling in
    multiple districts need to filter the result themselves.

    Returns:
        The row-wise concatenation of every per-district result.
    """
    district_df = A1_provider_prep.get_districts()

    # Collect every district's result first and concatenate once: the frames
    # are already in memory, so there is no need to route them through dask,
    # and growing a frame inside the loop re-copies all prior rows each pass.
    per_district = [
        sjoin_route_district(district_df[district_df.district == district])
        for district in range(1, 13)
    ]

    return pd.concat(per_district, axis=0)

In [231]:
# sjoin_test = 

#### Sjoin Testing w/ D4

In [232]:
# Subset to D4
d4 =  districts[districts.district == 4]

In [233]:
# Spatial join: all unique routes (left) against the D4 polygon only.
# Should the predicate be "within" instead? This gives me only routes that
# fall 100% in D4.
d4_sjoin = gpd.sjoin(
    unique_routes, d4, how="inner", predicate="intersects"
).drop(columns="index_right")

In [234]:
d4_sjoin.shape

(893, 8)

In [235]:
# Length of each joined route, measured after reprojecting to CA_StatePlane.
d4_sjoin2 = d4_sjoin.assign(
    sjoin_route_length=lambda gdf: gdf.geometry.to_crs(
        geography_utils.CA_StatePlane
    ).length
)

In [236]:
# Joined length as a share of the original route length, stored as a whole
# number (the int64 cast drops the fractional part).
d4_sjoin2["percentage_route_covered"] = (
    d4_sjoin2["sjoin_route_length"]
    .div(d4_sjoin2["original_route_length"])
    .mul(100)
    .astype("int64")
)

In [237]:
# d4_sjoin2.drop(columns = ['geometry']).sort_values("percentage_route_covered")

In [238]:
d4_sjoin2.percentage_route_covered.value_counts()

100    893
Name: percentage_route_covered, dtype: int64

In [239]:
d4_sjoin2.agency.value_counts()

AC Transit                                                131
MUNI                                                      112
SamTrans                                                  100
Santa Clara Valley Transportation Authority                86
Commute.org Shuttles                                       48
County Connection                                          44
Marin Transit                                              44
Sonoma County Transit                                      42
Tri-Valley Wheels                                          27
Cloverdale Transit                                         23
Marguerite Shuttle                                         21
WestCAT                                                    20
Golden Gate Bridge Highway and Transportation District     18
SolanoExpress                                              17
SolTrans                                                   17
Tri Delta Transit                                          17
MVGO    

### Clip

In [240]:
def clip_route_district(district_df: gpd.GeoDataFrame):
    """
    Clip all unique routes to the district polygon(s) passed in and measure
    how much of each route remains after the clip.

    Returns the clipped routes with two added columns:
    `clipped_route_length` (length of the clipped geometry after
    reprojecting to geography_utils.CA_StatePlane) and `route_percentage`
    (clipped length / `original_route_length` * 100, cast to int64).
    """
    routes = A3_other.load_unique_routes_df()

    # Keep only the portion of each route that lies inside the district.
    routes_in_district = gpd.clip(routes, district_df)

    # Measure the clipped geometry in the projected CRS.
    projected = routes_in_district.geometry.to_crs(
        geography_utils.CA_StatePlane
    )
    routes_in_district = routes_in_district.assign(
        clipped_route_length=projected.length
    )

    # The int64 cast drops the fractional part of the percentage.
    share = (
        routes_in_district["clipped_route_length"]
        / routes_in_district["original_route_length"]
    )
    routes_in_district["route_percentage"] = (share * 100).astype("int64")

    return routes_in_district

In [241]:
def complete_clip_route_district() -> gpd.GeoDataFrame:
    """
    Run `clip_route_district` for districts 1 through 12 and stack the
    per-district results into one frame.

    Each district is clipped separately, so a route that crosses a district
    boundary can contribute one row per district it reaches. No filtering
    happens here; callers that only want routes falling in multiple
    districts need to filter the result themselves.

    Returns:
        The row-wise concatenation of every per-district clip result.
    """
    district_df = A1_provider_prep.get_districts()

    # Collect every district's result first and concatenate once: the frames
    # are already in memory, so there is no need to route them through dask,
    # and growing a frame inside the loop re-copies all prior rows each pass.
    per_district = [
        clip_route_district(district_df[district_df.district == district])
        for district in range(1, 13)
    ]

    return pd.concat(per_district, axis=0)

In [242]:
clip_test = complete_clip_route_district()

In [243]:
# How are there more routes now? 
# Original unique routes only has 2829 rows, now there are  2914 rows
# Only 2821 unique long route names here but original unique routes has 2829 unique routes
clip_test.shape, clip_test.long_route_name.nunique()

((2914, 9), 2821)

In [262]:
len(clip_test) - len(unique_routes)

85

In [244]:
# Exploring why there are 8 fewer unique routes after clip_test
# (2829 before vs. 2821 after)...
# The routes not saved after the clip are mostly Yuma County Area Transit
# routes in Arizona, plus one Avalon Transit route.
clip_routes_list = set(clip_test.long_route_name.unique())
unique_routes_list = set(unique_routes.long_route_name.unique())

In [245]:
unique_routes_list - clip_routes_list

{'Arizona Western College/Northern Arizona University/University of Arizona to Somerton and San Luis via County 14th Street 17  Yuma County Area Transit',
 'Arizona Western College/Northern Arizona University/University of Arizona to Wellton via Fortuna Foothills 16  Yuma County Area Transit',
 'Clockwise Loop from West Yuma Transfer Hub to Downtown Yuma Transit Center 12  Yuma County Area Transit',
 'Counter Clockwise Loop from West Yuma Transfer Hub to Downtown Yuma Transit Center 28  Yuma County Area Transit',
 'Downtown Yuma Transit Center to Fortuna Foothills via Arizona Western College, University of Arizona and Northern Arizona University and 32nd Street 10  Yuma County Area Transit',
 'Highway 95 South - Downtown Yuma Transit Center to San Luis 11  Yuma County Area Transit',
 'North Cocopah Reservation to West Cocopah Reservation via East Cocopah Reservation and Avenue A 14  Yuma County Area Transit',
 'The Garibaldi 5490  Avalon Transit'}

In [246]:
clip_test.route_percentage.value_counts().head(20)

100    1337
99      601
98      237
97      119
95       67
96       55
94       49
93       34
92       29
91       21
88       21
86       18
67       16
89       15
90       15
83       13
53       12
70       11
87       10
84       10
Name: route_percentage, dtype: int64

In [247]:
# Rows where less than 90% of the route falls in the district, lowest first.
less_than_90 = clip_test[clip_test["route_percentage"] < 90].sort_values(
    by="route_percentage"
)

In [248]:
len(less_than_90)

350

In [249]:
# Base layer: district polygons. Overlay: rows with route_percentage < 90.
# `less_than_100` is not defined in any visible cell (it would raise a
# NameError on a fresh run), so overlay `less_than_90` instead.
m = districts.explore(
    color="#8CBCCB",
    name="districts",
    width=800,
    height=400,
)
m = less_than_90.explore(m=m)

In [250]:
# m

In [251]:
# Map the low-coverage rows, colored by route name.
# `less_than_100` is not defined in any visible cell (it would raise a
# NameError on a fresh run), so map `less_than_90` instead. The variable
# name on the left is kept so any later reference still resolves.
less_than_100_map = less_than_90.explore(
    "long_route_name",
    width=800,
    height=400,
    style_kwds={"weight": 6},
    legend=False,
)

#### Clip Testing w/ D4

In [252]:
# Try with clip instead
d4_clip = gpd.clip( unique_routes, d4)

In [253]:
d4_clip.shape

(893, 7)

In [254]:
# Length of each clipped route, measured after reprojecting to CA_StatePlane.
d4_clip2 = d4_clip.assign(
    clip_route_length=lambda gdf: gdf.geometry.to_crs(
        geography_utils.CA_StatePlane
    ).length
)

In [255]:
# Clipped length as a share of the original route length, stored as a whole
# number (the int64 cast drops the fractional part).
d4_clip2["percentage_route_covered"] = (
    d4_clip2["clip_route_length"]
    .div(d4_clip2["original_route_length"])
    .mul(100)
    .astype("int64")
)

In [256]:
# Keep only routes with less than 85% of their length in D4, lowest first.
low_coverage = d4_clip2[d4_clip2["percentage_route_covered"] < 85].sort_values(
    by="percentage_route_covered"
)

In [257]:
# Layering more than one explore() map:
# https://stackoverflow.com/questions/73767559/geopandas-explore-how-to-plot-more-than-one-layer
# Some routes in here are clearly only in D4, yet their percentage is low...
m = d4.explore(
    color="#8CBCCB",
    name="D4",
    width=400,
    height=250,
)
m = low_coverage.explore(m=m)
# m

### Clip has more rows than Sjoin? 
* Seems like some Yolobus/Mendocino (D3 and D1) routes extend into D4.

In [258]:
# Check whether the clip and the sjoin kept the same set of route names.
clip_routes = set(d4_clip.long_route_name.unique())
sjoin_routes = set(d4_sjoin.long_route_name.unique())

In [259]:
clip_routes - sjoin_routes

set()

In [260]:
 sjoin_routes -clip_routes 

set()