# Debug T-Mobile sjoin + overlay

`overlay` is taking a long time, but since there's a dissolve, it's basically 1 row for T-Mobile compared to 1 row for the district boundary. Might be able to treat them as GeoSeries and see if it can go through faster.

`difference` can also take multiple rows for T-Mobile compared to 1 row for district boundary. In this case, the dissolve for where there's no coverage for T-Mobile should take place after the difference is taken.

* `difference`: https://geopandas.org/en/stable/docs/reference/api/geopandas.GeoSeries.difference.html


In [None]:
import dask.dataframe as dd
import dask_geopandas as dg
import geopandas as gpd
import pandas as pd
from shared_utils import geography_utils
from shared_utils import utils

from calitp.sql import to_snakecase
import utilities

In [None]:
# County boundaries, as returned by the project helper.
counties = utilities.get_counties()

# Original Caltrans (CT) district map: read the boundary file, reproject to
# EPSG:4326, run the column names through `to_snakecase`, and keep only the
# district id and the geometry.
ct_all_districts = (
    gpd.read_file(f"{utilities.caltrans_shape}")
    .to_crs(epsg=4326)
    .pipe(to_snakecase)[["district", "geometry"]]
)
    

In [None]:
# Original T-Mobile map, read as a dask-geopandas frame.
# Per the original note, it still includes areas that are not California.
TMOBILE_PARQUET = "gs://calitp-analytics-data/data-analyses/cellular_coverage/tmobile_california.parquet"
tmobile = dg.read_parquet(TMOBILE_PARQUET)

##### Sjoin for T-Mobile by Districts
* Do an sjoin to grab only the rows of T-Mobile's gdf that overlap with Caltrans districts before `find difference and clip`.
* Counties didn't seem to work, that's why I went with districts.
* Done: Results are split between 3 parquets.
    * `gs://calitp-analytics-data/data-analyses/cellular_coverage/tmobile_d1_d4.parquet` 
    * `gs://calitp-analytics-data/data-analyses/cellular_coverage/tmobile_d5_d8.parquet`
    * `gs://calitp-analytics-data/data-analyses/cellular_coverage/tmobile_d9_d12.parquet`
    

* Took about 3 minutes to finish running.

#### iloc function

In [None]:
def iloc_find_difference(
    provider_df: dg.GeoDataFrame,
    district_df: gpd.GeoDataFrame,
    provider_name: str,
) -> gpd.GeoDataFrame:
    """Find the part of one Caltrans district that a provider does not cover.

    The provider polygons are spatially joined to the district, the joined
    rows are stashed to GCS, and the union of those polygons is subtracted
    from the district polygon. The result is exported to GCS as well.

    Args:
        provider_df: dask-geopandas frame of provider coverage polygons.
            Assumed to share the CRS of ``district_df`` -- TODO confirm
            when adding a new provider.
        district_df: frame with ``district`` and ``geometry`` columns. Only
            the first row is used, so pass exactly one district.
        provider_name: prefix used for the exported parquet names.

    Returns:
        A one-row GeoDataFrame with a single ``geometry`` column holding the
        district area that is left once the provider polygons are removed.
    """
    # Keep only the provider rows that intersect this district.
    provider_district = dg.sjoin(
        provider_df,
        district_df,
        how="inner",
        predicate="intersects",
    ).drop(columns="index_right")

    # Compute back to a normal gdf.
    provider_district = provider_district.compute()

    # Read the district id from the input rather than from the join output,
    # so a district with no intersecting provider rows does not raise
    # IndexError here.
    d = district_df.district.iloc[0]

    # Stash intermediate output here.
    utils.geoparquet_gcs_export(provider_district, utilities.GCS_FILE_PATH, f"{provider_name}_d{d}")
    print(f"saved {provider_name}_d{d} parquet")

    # Areas without coverage = district minus provider. Operand order
    # matters: `a.difference(b)` keeps the part of `a` that lies outside `b`,
    # so the district polygon has to be on the left.
    district_geom = district_df.geometry.iloc[0]
    if provider_district.empty:
        # Nothing intersects the district, so all of it is uncovered.
        uncovered = district_geom
    else:
        uncovered = district_geom.difference(
            provider_district.geometry.unary_union
        )

    # Build the one-row gdf directly instead of going through
    # reset_index()/dissolve()/rename() on an unnamed GeoSeries.
    no_coverage = gpd.GeoDataFrame(geometry=[uncovered], crs=district_df.crs)

    utils.geoparquet_gcs_export(no_coverage, utilities.GCS_FILE_PATH, f"{provider_name}_no_coverage_d{d}")

    print(f"{provider_name}_no_coverage_d{d} parquet")

    return no_coverage

In [None]:
# districts = [*range(1, 13, 1)]

In [None]:
# [*range(1, 13, 1)]

In [None]:
def complete_difference_provider(
    provider_df: dg.GeoDataFrame,
    district_df: gpd.GeoDataFrame,
    provider_name: str,
    districts=range(1, 13),
) -> gpd.GeoDataFrame:
    """Run `iloc_find_difference` per district and combine the results.

    Args:
        provider_df: dask-geopandas frame of provider polygons.
        district_df: frame holding every district, with a ``district``
            column used to select one district at a time.
        provider_name: prefix used for the exported parquet names.
        districts: district ids to process. Defaults to 1 through 12,
            which is what was previously hard-coded.

    Returns:
        One GeoDataFrame with the per-district results stacked row-wise.
        The same frame is exported to GCS as
        ``{provider_name}_no_coverage_complete_CA``.
    """
    # Each per-district result is already a computed GeoDataFrame, so a
    # single pandas concat is enough: no dask round trip, and no empty
    # pd.DataFrame seed that could turn the result into a plain DataFrame.
    per_district = [
        iloc_find_difference(
            provider_df,
            district_df[district_df.district == district],
            provider_name,
        )
        for district in districts
    ]

    full_gdf = pd.concat(per_district, ignore_index=True)

    utils.geoparquet_gcs_export(full_gdf, utilities.GCS_FILE_PATH, f"{provider_name}_no_coverage_complete_CA")
    return full_gdf

In [None]:
# Build and export the combined T-Mobile no-coverage layer, passing only the
# geometry column of the provider frame.
test = complete_difference_provider(
    tmobile[["geometry"]],
    ct_all_districts,
    "tmobile",
)

In [None]:
# For files that are split apart by counties/districts
# that need to be combined together to create a full California map
# https://www.geeksforgeeks.org/how-to-read-multiple-data-files-into-pandas/
def concat_all_areas(file_list: list, provider_name: str):
    """Read several geoparquet files and stack them into one GeoDataFrame.

    Args:
        file_list: paths of the parquet files to combine, one per area.
        provider_name: prefix used for the exported parquet name.

    Returns:
        The row-wise concatenation of every file in ``file_list``. The same
        frame is exported to GCS as
        ``{provider_name}_no_coverage_complete_CA``.

    Raises:
        ValueError: if ``file_list`` is empty.
    """
    if not file_list:
        raise ValueError("file_list must contain at least one parquet path")

    # Read with geopandas and stack rows (axis=0) so the geometry column and
    # the GeoDataFrame type survive; axis=1 would join the files side by side.
    full_dataframe = pd.concat(
        [gpd.read_parquet(path) for path in file_list],
        axis=0,
        ignore_index=True,
    )

    utils.geoparquet_gcs_export(full_dataframe, utilities.GCS_FILE_PATH, f"{provider_name}_no_coverage_complete_CA")
    return full_dataframe

In [None]:
test_tmobile = gpd.read_parquet(
    "gs://calitp-analytics-data/data-analyses/cellular_coverage/tmobile_no_coverage_complete_CA.parquet"
)

In [None]:
test_tmobile.plot()

In [None]:
# Blue parts are areas with coverage in D5. 
"""
tmobile_district = dg.sjoin(
    tmobile[["geometry"]], 
    ct_all_districts[ct_all_districts.district==5],
    how="inner", 
    predicate="intersects"
).drop(columns = "index_right")"""

In [None]:
# Materialise the dask sjoin result as a regular GeoDataFrame.
# NOTE(review): the sjoin in the cell above is disabled inside a string
# literal; `tmobile_district` is only assigned in the later "Break apart
# dissolve" section, so this cell depends on notebook execution order.
tmobile_d5 = tmobile_district.compute()

In [None]:
len(tmobile_d5)

In [None]:
tmobile_d5.head(1)

In [None]:
type(tmobile_d5)

In [None]:
# Getting areas without coverage in D5. 
#no_coverage = tmobile_d5.difference(
  #      ct_all_districts[ct_all_districts.district==5].geometry.iloc[0], 
   # ).reset_index()

In [None]:
# This line grabs the actual polygon 
# ct_all_districts[ct_all_districts.district==2].geometry.iloc[0]

In [None]:
# Merge every row of the difference result into one row and keep only the
# geometry column (the unnamed column 0 is renamed to 'geometry').
# NOTE(review): at notebook level `no_coverage` is only assigned in the
# commented-out cell above, so this relies on a value left from an earlier run.
no_coverage_d5 = (no_coverage.reset_index()
                  .dissolve()
                  .rename(columns = {0: 'geometry'})
                  [["geometry"]]
                 )

In [None]:
no_coverage_d5 = no_coverage_d5.set_geometry('geometry')

In [None]:
type(no_coverage_d5), no_coverage_d5.columns

In [None]:
utils.geoparquet_gcs_export(no_coverage_d5, utilities.GCS_FILE_PATH, f"test_d5")

In [None]:
test_d5 = gpd.read_parquet(
    "gs://calitp-analytics-data/data-analyses/cellular_coverage/test_d5.parquet"
)

In [None]:
type(test_d5)

In [None]:
test_d5.plot()

#### Break apart dissolve

In [None]:
# Boundary row(s) for Caltrans District 5 only. The stray opening
# parenthesis that made this cell a SyntaxError has been removed.
ct_d5 = ct_all_districts[ct_all_districts.district == 5]

In [None]:
ct_d5

In [None]:
ct_all_districts[ct_all_districts.district==5]

In [None]:
ct_d5.plot()

In [None]:
# T-Mobile rows that intersect District 5. Unlike the helper functions in
# this notebook, `index_right` is left on the result here.
tmobile_district = dg.sjoin(
    tmobile[["geometry"]],
    ct_d5,
    how="inner",
    predicate="intersects",
)

In [None]:
tmobile_district_gdf = tmobile_district.compute()

In [None]:
len(tmobile_district_gdf)

In [None]:
tmobile_district_gdf.head(1)

In [None]:
"""
tmobile_5 = (tmobile_district.drop(columns = "index_right").dissolve(by="district")
                    .reset_index()
                    .compute()
                   )"""

In [None]:
len(tmobile_5)

In [None]:
district_file = tmobile_5.district.iloc[0]

In [None]:
district_file

In [None]:
tmobile_5

In [None]:
# utils.geoparquet_gcs_export(tmobile_5, utilities.GCS_FILE_PATH,f"tmobile_d{district_file}.parquet")

In [None]:
# Index-aligned (align=True) difference between the dissolved T-Mobile frame
# and the District 5 row, then turned back into a frame whose unnamed
# column 0 is renamed to 'geometry'.
# NOTE(review): `a.difference(b)` keeps the part of `a` outside `b`, so this
# is T-Mobile minus the district rather than the district's uncovered area --
# confirm the operand order. `tmobile_5` is only built in the disabled
# (string-literal) cell above.
tmobile_align_true = (tmobile_5.difference(
            (ct_all_districts[ct_all_districts.district==5]).reset_index(), 
            align=True
        ).reset_index()
        .rename(columns = {0: 'geometry'})
    )   

In [None]:
type(tmobile_align_true)

In [None]:
tmobile_align_true = tmobile_align_true.set_geometry('geometry')

In [None]:
# Plotting emits a warning (not an error), which suggests every geometry in the frame is empty or missing: /opt/conda/lib/python3.9/site-packages/geopandas/array.py:938: RuntimeWarning: All-NaN slice encountered
tmobile_align_true.plot()

In [None]:
def sjoin_to_district_find_difference(
    tmobile: dg.GeoDataFrame,
    district_df: gpd.GeoDataFrame
) -> gpd.GeoDataFrame:
    """Dissolve T-Mobile coverage within one district and find what is left.

    Args:
        tmobile: dask-geopandas frame of T-Mobile polygons. Assumed to share
            the CRS of ``district_df`` -- TODO confirm.
        district_df: frame with ``district`` and ``geometry`` columns. Only
            the first row is used, so pass exactly one district.

    Returns:
        A one-row GeoDataFrame with ``district`` and ``geometry`` columns,
        where the geometry is the district area outside the dissolved
        T-Mobile polygons. Two local parquet files are written as a side
        effect: ``tmobile_d{d}.parquet`` and ``no_coverage_d{d}.parquet``.
    """
    tmobile_district = dg.sjoin(
        tmobile,
        district_df,
        how="inner",
        predicate="intersects",
    ).drop(columns="index_right")

    # Dissolve so that it's just 1 row for T-Mobile coverage in that district.
    tmobile_diss = (
        tmobile_district.dissolve(by="district")
        .reset_index()
        .compute()
    )

    # Stash intermediate output here.
    d = district_df.district.iloc[0]
    tmobile_diss.to_parquet(f"tmobile_d{d}.parquet")

    # Use the `district_df` argument, not the notebook globals
    # `ct_all_districts` / `i`, so the function works for whatever district
    # the caller passes in.
    district_geom = district_df.geometry.iloc[0]

    # No coverage = district minus coverage. `a.difference(b)` keeps the part
    # of `a` outside `b`, so the district polygon goes on the left. Working
    # on the shapely geometries directly avoids index alignment between the
    # two frames.
    if tmobile_diss.empty:
        # Nothing intersects the district, so all of it is uncovered.
        uncovered = district_geom
    else:
        uncovered = district_geom.difference(tmobile_diss.geometry.unary_union)

    no_coverage = gpd.GeoDataFrame(
        {"district": [d]},
        geometry=[uncovered],
        crs=district_df.crs,
    )

    no_coverage.to_parquet(f"no_coverage_d{d}.parquet")

    return no_coverage

In [None]:
# Collect one result per district. Only district 1 is run here as a trial;
# extend the list to process more districts.
results = []

for i in [1]:
    # Pass only the geometry column of the provider frame and the single
    # district row selected by `i`.
    dissolved_result = sjoin_to_district_find_difference(
        tmobile[["geometry"]], 
        ct_all_districts[ct_all_districts.district==i]
    )
    
    results.append(dissolved_result)

In [None]:
# Stack the collected per-district results row-wise (axis=0) with dask.
ddf = dd.multi.concat(results, axis=0)

In [None]:
df = ddf.compute()