## Find routes that cross multiple districts

In [46]:
import dask.dataframe as dd
import dask_geopandas as dg
import geopandas as gpd
import numpy as np
import pandas as pd
import shapely.wkt
from calitp.sql import to_snakecase
from shared_utils import geography_utils, utils

import A1_provider_prep
import A2_analysis
import A3_other

In [47]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [48]:
unique_routes = A3_other.load_unique_routes_df()

In [49]:
len(unique_routes), unique_routes.long_route_name.nunique()

(2829, 2829)

In [50]:
districts = A1_provider_prep.get_districts()

In [51]:
districts.crs == unique_routes.crs

True

### Sjoin
* Place unique routes on the L because I want the routes that fall within each district.
* Intersects only returns routes that are 100% in a district. Should I used something else (within,contains,etc)?

In [52]:
def sjoin_route_district(district_df: gpd.GeoDataFrame):
    """
    Find which routes fall 100% neatly in an district and 
    which don't. 
    """
    unique_routes = A3_other.load_unique_routes_df()
    sjoin =  gpd.sjoin(
        unique_routes, 
        district_df, 
        how="inner", 
        predicate="intersects"
    ).drop(columns = "index_right")
    
    # Get route length after doing sjoin
    sjoin = sjoin.assign(
        sjoin_route_length= sjoin.geometry.to_crs(geography_utils.CA_StatePlane).length
    )
    
    # Get %
    sjoin["route_percentage"] = ((sjoin["sjoin_route_length"] / sjoin["original_route_length"])* 100).astype('int64')
    
    return sjoin

In [53]:
def complete_sjoin_route_district() -> dg.GeoDataFrame:
    """
    Find which routes fall 100% neatly in an district and 
    which cross district boundaries. Stack results altogether.
    Filter out for routes that fall in mulitple districts.
    """
    district_df = A1_provider_prep.get_districts()
    
    full_gdf = pd.DataFrame()
    
    for i in [*range(1, 13, 1)]:
        result = sjoin_route_district(district_df[district_df.district==i])

        full_gdf = dd.multi.concat([full_gdf, result], axis=0)
    
    full_gdf = full_gdf.compute()
    
    return full_gdf

In [54]:
# sjoin_test = 

#### Sjoin Testing w/ D4

In [55]:
# Subset to D4
d4 =  districts[districts.district == 4]

In [56]:
# Sjoin 
# Should it be within? Gives me only routes that fall 100% in d4
d4_sjoin =  gpd.sjoin(
        unique_routes, 
        d4,
        how="inner", 
        predicate="intersects"
    ).drop(columns = "index_right")

In [58]:
# d4_sjoin2.drop(columns = ['geometry']).sort_values("percentage_route_covered")

In [83]:
d4_sjoin.shape

(893, 8)

In [59]:
d4_sjoin2.agency.value_counts()

AC Transit                                                131
MUNI                                                      112
SamTrans                                                  100
Santa Clara Valley Transportation Authority                86
Commute.org Shuttles                                       48
County Connection                                          44
Marin Transit                                              44
Sonoma County Transit                                      42
Tri-Valley Wheels                                          27
Cloverdale Transit                                         23
Marguerite Shuttle                                         21
WestCAT                                                    20
Golden Gate Bridge Highway and Transportation District     18
SolanoExpress                                              17
SolTrans                                                   17
Tri Delta Transit                                          17
MVGO    

### Clip
* How can I keep district info? Or incorporate which district the route falls in?

In [60]:
def clip_route_district(district_df: gpd.GeoDataFrame):
    """
    Find which routes fall 100% neatly in an district and 
    which don't. 
    """
    # Load unique routes
    unique_routes = A3_other.load_unique_routes_df()
    
    # Clip routes against a district
    clipped = gpd.clip(unique_routes, district_df)
    
    # Get route length after doing clip
    clipped = clipped.assign(
        clipped_route_length= clipped.geometry.to_crs(geography_utils.CA_StatePlane).length
    )
    
    # Get %
    clipped["route_percentage"] = ((clipped["clipped_route_length"] / clipped["original_route_length"])* 100).astype('int64')
    
    return clipped

In [61]:
def complete_clip_route_district() -> dg.GeoDataFrame:
    """
    Find which routes fall 100% neatly in an district and 
    which cross district boundaries. Stack results altogether.
    Filter out for routes that fall in mulitple districts.
    """
    # Load districts
    district_df = A1_provider_prep.get_districts()
    
    full_gdf = pd.DataFrame()
    
    for i in [*range(1, 13, 1)]:
        result = clip_route_district(district_df[district_df.district==i])

        full_gdf = dd.multi.concat([full_gdf, result], axis=0)
    
    full_gdf = full_gdf.compute()
    
    return full_gdf

In [62]:
clip_test = complete_clip_route_district()

In [63]:
# How are there more routes now? 
# Original unique routes only has 2829 rows, now there are  2914 rows
# Only 2821 unique long route names here but original unique routes has 2829 unique routes
clip_test.shape, clip_test.long_route_name.nunique()

((2914, 9), 2821)

In [64]:
len(clip_test) - len(unique_routes)

85

In [65]:
# Exploring why are there 9 less routes after clip_test...
# The routes not saved after clip are ones in Arizona!
clip_routes_list = set(clip_test.long_route_name.unique().tolist())
unique_routes_list = set(unique_routes.long_route_name.unique().tolist())

In [66]:
unique_routes_list - clip_routes_list

{'Arizona Western College/Northern Arizona University/University of Arizona to Somerton and San Luis via County 14th Street 17  Yuma County Area Transit',
 'Arizona Western College/Northern Arizona University/University of Arizona to Wellton via Fortuna Foothills 16  Yuma County Area Transit',
 'Clockwise Loop from West Yuma Transfer Hub to Downtown Yuma Transit Center 12  Yuma County Area Transit',
 'Counter Clockwise Loop from West Yuma Transfer Hub to Downtown Yuma Transit Center 28  Yuma County Area Transit',
 'Downtown Yuma Transit Center to Fortuna Foothills via Arizona Western College, University of Arizona and Northern Arizona University and 32nd Street 10  Yuma County Area Transit',
 'Highway 95 South - Downtown Yuma Transit Center to San Luis 11  Yuma County Area Transit',
 'North Cocopah Reservation to West Cocopah Reservation via East Cocopah Reservation and Avenue A 14  Yuma County Area Transit',
 'The Garibaldi 5490  Avalon Transit'}

In [67]:
clip_test.route_percentage.value_counts().head(20)

100    1337
99      601
98      237
97      119
95       67
96       55
94       49
93       34
92       29
91       21
88       21
86       18
67       16
89       15
90       15
83       13
53       12
70       11
87       10
84       10
Name: route_percentage, dtype: int64

In [68]:
# Just checking routes that are less than 90% in a district
less_than_90 = clip_test.loc[clip_test.route_percentage < 90].sort_values("route_percentage")

In [69]:
len(less_than_90)

350

In [120]:
# Look at the routes with less than 90% in a district, make sure the percetnage adds up to 100%
grouped_low_coverage = less_than_90.groupby(['long_route_name']).agg({'route_percentage':'sum'}).reset_index()

In [122]:
# How can there be routes that don't add up to even 90% after grouping by long route name??
# These routes must fall under ONE district completely? 
# Looking at Glenn-Tehama Connect 12874 Tehama Rural Area eXpress, it falls mostly in D2 and a bit in D3. 
grouped_low_coverage.sort_values('route_percentage')

Unnamed: 0,long_route_name,route_percentage
131,Los Banos-Dos Palos Commuter Service 19 Merced The Bus,0
243,South Coast / Ukiah 225 Mendocino Transit Authority,1
72,DAVIS EXPRESS - 230 PM de912f9c-dd88-4c9f-9193-1d551c662830 Yolobus,1
129,Los Alamitos - Orange via Ball Road/Taft 46 Irvine Shuttle,2
130,Los Alamitos - Orange via Ball Road/Taft 46 Orange County Transportation Authority,2
113,La Habra - Huntington Beach via Beach Blvd 29 Orange County Transportation Authority,3
112,La Habra - Huntington Beach via Beach Blvd 29 Irvine Shuttle,3
99,Goldenwest Transportation Center - Anaheim Canyon Metrolink Station 123 Orange County Transportation Authority,5
4,395 Route South 562 Eastern Sierra Transit Authority,5
5,395 Route South 562 Mammoth Lakes Transit System,5


In [125]:
less_than_90_routes = less_than_90.long_route_name.unique().tolist()

In [127]:
less_than_90_routes_og_geo = (unique_routes[unique_routes["long_route_name"].isin(less_than_90_routes)]).reset_index(drop = True)

In [132]:
m = districts.explore("district", name="districts",  width=800,
    height=400,)
m = less_than_90_routes_og_geo.explore(m=m,)

In [134]:
# m

#### Clip Testing w/ D4

In [74]:
# Try with clip instead
d4_clip = gpd.clip(unique_routes, d4)

In [76]:
d4_clip2 = d4_clip.assign(
        clip_route_length= d4_clip.geometry.to_crs(geography_utils.CA_StatePlane).length
    )

In [112]:
# Get %
d4_clip2["route_percentage"] = ((d4_clip2["clip_route_length"] / d4_clip2["original_route_length"])* 100).astype('int64')

In [113]:
d4_clip.shape

(893, 7)

In [114]:
d4_clip2.route_percentage.value_counts()

100    443
99     241
98      57
97      33
94      16
95      16
96      16
92       9
88       8
93       5
86       5
89       5
90       5
83       5
91       4
84       4
67       2
87       2
76       2
1        2
5        1
9        1
11       1
29       1
28       1
61       1
33       1
34       1
85       1
35       1
78       1
52       1
44       1
Name: route_percentage, dtype: int64

In [102]:
# Filter for only routes that have whatever percent in D4
low_coverage = d4_clip2.loc[d4_clip2.percentage_route_covered < 85].sort_values("route_percentage")

In [103]:
# Grab routes that is 85% or less of original route length into a list 
low_coverage_d4_list = low_coverage.long_route_name.unique().tolist()

In [104]:
# Filter out for the routes above in original unique_routes
# So I can map original route geometries
low_coverage_d4_og_geometry = (unique_routes[unique_routes["long_route_name"].isin(low_coverage_d4_list)]).reset_index(drop = True)

In [105]:
low_coverage_d4_og_geometry.shape

(27, 7)

In [106]:
# https://stackoverflow.com/questions/73767559/geopandas-explore-how-to-plot-more-than-one-layer
# Certain routes in here are clearly only in D4 yet their percentage is low...
m = d4.explore(color="#8CBCCB", name="D4",  width=600,
    height=350,)
m = low_coverage_d4_og_geometry.explore(m=m,)

In [123]:
# m

In [110]:
# https://stackoverflow.com/questions/73767559/geopandas-explore-how-to-plot-more-than-one-layer
# Certain routes in here are clearly only in D4 yet their percentage is low...
m = d4.explore(color="#8CBCCB", name="D4",  width=600,
    height=350,)
a = low_coverage.explore(m=m,)
a

### Clip has more rows than Sjoin? 
* Seems like some Yolobus/Mendicino (d3 and d1) stuff extends to d4.

In [80]:
# check to see if 2 columns are the same
clip_routes = set(d4_clip.long_route_name.unique().tolist())
sjoin_routes = set(d4_sjoin.long_route_name.unique().tolist())

In [81]:
clip_routes - sjoin_routes

set()

In [82]:
 sjoin_routes -clip_routes 

set()