## Clip sjoin districts with original district file

In [None]:
import A1_provider_prep
import A2_other
import A3_analysis
import dask.dataframe as dd
import dask_geopandas as dg
import geopandas as gpd
import numpy as np
import pandas as pd
import shapely.wkt
from calitp.sql import to_snakecase
from shared_utils import geography_utils, utils
from calitp.storage import get_fs
fs = get_fs()

In [None]:
# Notebook display settings: show up to 100 rows and 100 columns,
# and put no limit on the width of a column's contents.
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_colwidth", None)
# Render floats with two decimal places.
pd.set_option("display.float_format", "{:.2f}".format)

In [None]:
# Get routes
one_dist_routes, multi_dist_routes, all_routes = A2_other.find_multi_district_routes()

In [None]:
# Open original district
districts = A1_provider_prep.get_districts()

### Overlay Entire T-Mobile Map as a test
* Grab all the `tmobile_no_coverage...` parquet files for each district.
* Clip each of those no coverage districts against original district shapefiles.
* Stack all the clipped shapefiles above to create a map for all of California.
* Try to concat all intersecting routes, then filter if they have a low intersection % in another way.
* The lower the % of the route covered, the better. That means a smaller portion of the route is in a no-coverage zone.
* Areas that are covered are white, areas w/o coverage are blue.

In [None]:
def find_specific_files(phrase_to_find:str):
    """
    List the files in the project's GCS folder
    (`A1_provider_prep.GCS_FILE_PATH`) whose path contains
    `phrase_to_find`.

    Returns a list of the matching paths, each prefixed with "gs://".
    """
    gcs_prefix = "gs://"

    # Walk the folder listing once, keeping only the paths that contain
    # the phrase and prepending the prefix to each.
    # NOTE(review): presumably fs.ls returns paths without the scheme,
    # which is why the prefix is added back -- confirm.
    return [
        gcs_prefix + path
        for path in fs.ls(A1_provider_prep.GCS_FILE_PATH)
        if phrase_to_find in path
    ]

In [None]:
# t_mobile_files = find_specific_files("tmobile_no_coverage_d")

#### Overlay routes 

In [None]:
def overlay_1_dist_routes(phrase_to_find:str, provider: str):
    """
    Overlay each district-level no-coverage file for a provider against
    the routes that only run in one district, then stack the results.

    The district files are the ones created by the function
    `complete_difference_provider_district_level`. Stacking every
    district gives a California-wide table for the provider.

    Args:
        phrase_to_find: substring identifying the provider's district
            files, e.g. "verizon_no_coverage_d".
        provider: label written to the "Overlay-Provider" column.

    Returns:
        The concatenated outputs of `A3_analysis.comparison` for every
        matching file, with the columns "Overlay-District",
        "Overlay-Provider" and "do_districts_match" added. An empty
        DataFrame is returned when no file matches `phrase_to_find`.

    Raises:
        ValueError: if a matching file name contains no digits to read
            the district number from.
    """
    # Only the single-district routes are needed for this overlay.
    one_dist_routes, _, _ = A2_other.find_multi_district_routes()

    # Collect one result per district and concatenate once at the end,
    # rather than growing a DataFrame inside the loop.
    overlays = []

    for file in find_specific_files(phrase_to_find):
        # Find which district each file contains. Only the file name is
        # scanned so digits elsewhere in the path cannot leak in.
        # https://stackoverflow.com/questions/11339210/how-to-get-integer-values-from-a-string-in-python
        file_name = file.rsplit("/", 1)[-1]
        district = int("".join(i for i in file_name if i.isdigit()))

        # Open file
        district_file = gpd.read_parquet(file)

        # Compare the single-district routes against the no-coverage file
        overlay_routes = A3_analysis.comparison(one_dist_routes, district_file)

        # Add information about which district and which provider
        overlay_routes["Overlay-District"] = f"D-{district}"
        overlay_routes["Overlay-Provider"] = provider
        # Does the route's own district match the district of the file?
        overlay_routes["do_districts_match"] = (
            overlay_routes["District"] == overlay_routes["Overlay-District"]
        )

        overlays.append(overlay_routes)

        print(f"Done with District {district}")

    # pd.concat raises on an empty list; keep returning an empty frame.
    if not overlays:
        return pd.DataFrame()

    return pd.concat(overlays, axis=0)

In [None]:
# tmobile_test = overlay_1_dist_routes("tmobile_no_coverage_d", "tmobile")

In [None]:
verizon_test = overlay_1_dist_routes("verizon_no_coverage_d", "verizon")

In [None]:
att_test = overlay_1_dist_routes("ATT_no_coverage_d", "att")

In [None]:
# Only keep routes where District (original district) matches Overlay-District
# tmobile_test["do_districts_match"] = tmobile_test["District"] == tmobile_test["Overlay-District"]

In [None]:
# Keep only the rows where the route's district matches the overlay district.
# NOTE: `tmobile_test` is only assigned in the cell
# `# tmobile_test = overlay_1_dist_routes("tmobile_no_coverage_d", "tmobile")`,
# which is commented out above; run that call first or this raises NameError.
tmobile_test_filtered = tmobile_test.loc[tmobile_test.do_districts_match == True] 

In [None]:
len(tmobile_test_filtered)

In [None]:
# tmobile_test.loc[tmobile_test.do_districts_match == False].drop(columns = ["geometry"]).sort_values(['agency','long_route_name', 'District', 'Overlay-District','percentage_route_covered']).head(100)

In [None]:
# tmobile_test_filtered.drop(columns = ["geometry"]).sort_values(['agency','long_route_name', 'District', 'Overlay-District','percentage_route_covered'])

In [None]:
tmobile_test_filtered_routes_list = tmobile_test_filtered.long_route_name.tolist()

In [None]:
one_dist_routes_filtered = one_dist_routes[one_dist_routes["long_route_name"].isin(tmobile_test_filtered_routes_list)]

In [None]:
t_d11 = gpd.read_parquet(
    f"{A1_provider_prep.GCS_FILE_PATH}tmobile_no_coverage_d11.parquet"
 )

In [None]:
m = t_d11.explore(

    tiles="CartoDB positron",
    width=800,
    height=500,legend = False
)

In [None]:
m = one_dist_routes_filtered.explore(
    m=m,
)

In [None]:
# m

### D6-D9-D10 T-Mobile Test
* Clipping the `sjoin-difference` files against the original CT shapefiles creates an overly sparse map.
* However, without clipping, the `sjoin-difference` files contain parts of other districts, and these areas are mistakenly regarded as areas without any cellular coverage.

In [None]:
# Read in T-mobile file
#t_d9 = gpd.read_parquet(
#    f"{A1_provider_prep.GCS_FILE_PATH}tmobile_no_coverage_d9.parquet"
# )

In [None]:
#t_d6 = gpd.read_parquet(
#    f"{A1_provider_prep.GCS_FILE_PATH}tmobile_no_coverage_d6.parquet"
# )

In [None]:
# t_d9.plot()

In [None]:
# d9 = (districts.loc[districts.district == 9]).reset_index()

In [None]:
# d9.plot()

In [None]:
# t_d9_clipped = gpd.clip(t_d9, d9, keep_geom_type=False)

In [None]:
# t_d9_clipped.plot()

In [None]:
# routes_no_coverage_d9 = A3_analysis.comparison(one_dist_routes, t_d9)

In [None]:
#routes_no_coverage_d9["district"] = "D9"

In [None]:
#routes_no_coverage_d10 = A3_analysis.comparison(all_routes, t_d10)

In [None]:
#routes_no_coverage_d10["district"] = "D10"

In [None]:
#routes_no_coverage_d6 = A3_analysis.comparison(all_routes, t_d6)

In [None]:
#routes_no_coverage_d6["district"] = "D6"

In [None]:
#t_d9_10 = pd.concat([routes_no_coverage_d6, routes_no_coverage_d9, routes_no_coverage_d10])

In [None]:
#len(t_d9_10)

In [None]:
# t_d9_10.drop(columns = ["geometry"]).sort_values(["percentage_route_covered","long_route_name", ])

In [None]:
#value_counts = A2_other.turn_counts_to_df(t_d9_10, "long_route_name")

### T-Mobile Clip `sjoin-difference` results

In [None]:
def clip_no_coverage_files(phrase_to_find:str, provider:str):
    """
    Clip each district-level no-coverage file against the original
    district shape and stack the clipped pieces.

    The district files are the ones created by the function
    `complete_difference_provider_district_level`. Clipping gets rid of
    the excess edges created by the `sjoin`, and joining all the
    districts gives a full California map for the provider.

    Args:
        phrase_to_find: substring identifying the provider's district
            files, e.g. "tmobile_no_coverage_d".
        provider: provider name. Currently only referenced by the
            commented-out export at the end of the function.

    Returns:
        The concatenation of every clipped district file. An empty
        DataFrame is returned when no file matches `phrase_to_find`.

    Raises:
        ValueError: if a matching file name contains no digits to read
            the district number from.
    """
    # Open original Caltrans districts shapefile
    districts = A1_provider_prep.get_districts()

    # Collect one clipped frame per district and concatenate once at the
    # end, rather than growing a DataFrame inside the loop.
    clipped_files = []

    for file in find_specific_files(phrase_to_find):
        # Find which district each file contains. Only the file name is
        # scanned so digits elsewhere in the path cannot leak in.
        # https://stackoverflow.com/questions/11339210/how-to-get-integer-values-from-a-string-in-python
        file_name = file.rsplit("/", 1)[-1]
        district_number = int("".join(i for i in file_name if i.isdigit()))

        # Filter the districts gdf down to the district of this file
        district_of_interest = (
            districts.loc[districts.district == district_number]
        ).reset_index()

        # Open file
        no_coverage_file = gpd.read_parquet(file)

        # Clip no coverage file against original district shape
        file_clipped = gpd.clip(
            no_coverage_file, district_of_interest, keep_geom_type=False
        )
        clipped_files.append(file_clipped)

        print(f"Done with District {district_number}")

    # pd.concat raises on an empty list; keep returning an empty frame.
    if not clipped_files:
        return pd.DataFrame()

    california_gdf = pd.concat(clipped_files, axis=0)

    # Export
    # utils.geoparquet_gcs_export(california_gdf, A1_provider_prep.GCS_FILE_PATH, f"{provider}_clipped_og_district_sjoin") 

    return california_gdf

In [None]:
# entire_tmobile = clip_no_coverage_files("tmobile_no_coverage_d", "tmobile")

In [None]:
# Plot the stacked, clipped T-Mobile no-coverage areas.
# NOTE: `entire_tmobile` is only assigned in the cell
# `# entire_tmobile = clip_no_coverage_files("tmobile_no_coverage_d", "tmobile")`,
# which is commented out above; run that call first or this raises NameError.
entire_tmobile.plot()

In [None]:
routes_no_coverage = A3_analysis.comparison(all_routes, entire_tmobile)

In [None]:
entire_ATT = clip_no_coverage_files("ATT_no_coverage_d", "ATT")

In [None]:
gpd.read_parquet("ATT.parquet").plot()

In [None]:
gpd.read_parquet("gs://calitp-analytics-data/data-analyses/cellular_coverage/att_all_counties.parquet").plot()

In [None]:
gpd.read_parquet("gs://calitp-analytics-data/data-analyses/cellular_coverage/ATT_no_coverage_complete_CA.parquet").plot()

In [None]:
entire_ATT.plot()

#### Test Area

In [None]:
# Empty gdf
tmobile_gdf = pd.DataFrame()

In [None]:
# Print the district number parsed from each T-Mobile file path
# (all digits found in the path, joined and cast to int).
# NOTE: `t_mobile_files` is only assigned in the cell
# `# t_mobile_files = find_specific_files("tmobile_no_coverage_d")`,
# which is commented out above; run that call first or this raises NameError.
for i in t_mobile_files:
    district_file_path = ''.join(district for district in i if district.isdigit())
    district_file_path = int(district_file_path)
    print(district_file_path)

In [None]:
# Test file path
no_coverage_file_path = 'gs://calitp-analytics-data/data-analyses/cellular_coverage/tmobile_no_coverage_d1.parquet'

In [None]:
# Find which district a file belongs to 
district_file_path = ''.join(district for district in no_coverage_file_path if district.isdigit())

In [None]:
# Turn this into an integer
district_file_path = int(district_file_path)
district_file_path

In [None]:
# Open file path 
no_coverage_file = gpd.read_parquet(no_coverage_file_path)

In [None]:
# Take district_file_path and filter out district gdf
district_of_interest = (districts.loc[districts.district == district_file_path]).reset_index()

In [None]:
# Clip no coverage file against original district shape
file_clipped = gpd.clip(no_coverage_file, district_of_interest, keep_geom_type=False)

In [None]:
file_clipped.plot()

### D4 Verizon Test
* Test district 4 for Verizon

In [None]:
# Read in Verizon D4
v_d4 = gpd.read_parquet(
   f"{A1_provider_prep.GCS_FILE_PATH}Verizon_no_coverage_d4.parquet"
 )

In [None]:
# multi_dist_routes_dissolved.drop(columns = ["geometry"])

In [None]:
v_d4.plot()

In [None]:
# Get only d4
# d4 = (districts.loc[districts.district == 4]).reset_index()

In [None]:
# v_d4_clipped = gpd.clip(v_d4, d4, keep_geom_type=False)

In [None]:
# v_d4_clipped.plot()

In [None]:
"""n = v_d4_clipped.explore(
    color="red",
    width=800,
    height=400,
)"""

In [None]:
# Clipped geometry
# n = d4.explore(m=n, legend=False)

In [None]:
# 10:19 to 10:20
#routes_no_coverage = A2_analysis.comparison(
#        all_routes, v_d4_clipped
#    )

In [None]:
# routes_no_coverage.drop(columns = ["geometry"])

In [None]:
# routes_no_coverage_list = routes_no_coverage.long_route_name.unique().tolist()

In [None]:
# routes_no_coverage_og_geo = all_routes.loc[all_routes.long_route_name.isin(routes_no_coverage_list)]

In [None]:
# routes_no_coverage_og_geo = routes_no_coverage_og_geo.explore(width=800,
#     height=400, color= "red")

In [None]:
# routes_no_coverage_og_geo = v_d4_clipped.explore(m=routes_no_coverage_og_geo,)

In [None]:
# routes_no_coverage_og_geo