### Cleaning up Survey123 Geometry


* Should Monica's GeoJSONs be moved into our GCS bucket?
* It would be great to save a parquet with one row per project element and a common project id.
* https://pypi.org/project/fs-gcsfs/
* Install the dependencies with `pip install fs-gcsfs` and `pip install calitp_data_infra`.

In [None]:
import geopandas as gpd
import pandas as pd
from shared_utils import utils, geography_utils

In [None]:
from calitp_data_analysis import get_fs
fs = get_fs()
import os
import _utils
import fiona

In [None]:
# Notebook display settings: show up to 100 columns, render floats with two
# decimals, and never truncate rows or cell contents.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

#### Geometry Intake geojson
* Same content as layer 0 of the geodatabase opened in the "Geometry Scores" section below.

In [None]:
def to_snakecase(df):
    """
    Lower-case every column name of `df` and swap spaces for underscores.

    The columns are renamed in place on the frame that was passed in, and
    that same frame is returned so the call can be chained.
    """
    lowercase_names = df.columns.str.lower()
    df.columns = lowercase_names.str.replace(' ', '_')
    return df

In [None]:
# File name of the zipped geometry-intake GeoJSON export. Only the
# commented-out read in the next cell refers to it.
geo_path = "TCEP_SCCP_GeometryIntake_geojson_20230801.zip"

In [None]:
#with get_fs().open(f"{TCEP_SCCP_GCS}{geo_path}") as f:
#     tcep_sccp_geo = to_snakecase(gpd.read_file(f))


In [None]:
# tcep_sccp_geo.sample()

In [None]:
#tcep_sccp_geo.shape

In [None]:
#tcep_sccp_geo.project_name.nunique()

In [None]:
#tcep_sccp_geo.drop(columns = ['geometry'])

In [None]:
# tcep_sccp_geo.sort_values(by = ['project_name'])[['project_name','which_type_of_infrastructure_does_the_geometry_above_correspond_to?','geometry']]

In [None]:
# tcep_sccp_geo.explore(style_kwds = {'weight':5}, height = 400, width = 1000)

#### Geometry Scores
* https://stackoverflow.com/questions/64277987/python-geopandas-failing-to-read-misread-750mb-zip-esri-gdb-file-but-not-200mb
* https://fiona.readthedocs.io/en/latest/README.html
* https://fiona.readthedocs.io/en/stable/manual.html

In [None]:
def open_survey123(save_to_gsc:bool = False):
    """
    Download the Survey123 score-geometry geodatabase from GCS, stack all of
    its layers into one GeoDataFrame, report problem rows, and return the
    cleaned result.

    Steps:
      1. Copy the zipped .gdb from GCS into a temporary directory and read
         every layer, snake-casing the column names.
      2. Display the rows whose geometry is not valid.
      3. Display the rows that share the same geometry, parentglobalid,
         projname and creator more than once.
      4. Keep only valid geometries, drop the `creationdate` / `editdate`
         columns, and fill missing values (0.0 for float64 columns,
         the string 'None' for object columns).
      5. Optionally export the result to GCS as a geoparquet.

    Args:
        save_to_gsc: when truthy, write the cleaned GeoDataFrame to GCS via
            `utils.geoparquet_gcs_export` under the name
            "cleaned_survey123_sample13".

    Returns:
        The cleaned GeoDataFrame with a fresh 0..n-1 index.
    """
    import os
    import tempfile

    # https://gis.stackexchange.com/questions/255138/reading-the-names-of-geodatabase-file-layers-in-python
    file = "TCEP_SCCP_Score_Geometry_20230801.gdb.zip"
    GCS_PATH = f"{_utils.GCS_FILE_PATH}Survey123_Geo/"

    # Download into a throwaway directory so no tmp.gdb.zip is left behind
    # in the working directory once the layers have been read.
    with tempfile.TemporaryDirectory() as tmp_dir:
        local_path = os.path.join(tmp_dir, 'tmp.gdb.zip')
        fs.get(f'{GCS_PATH}{file}', local_path)
        geo_layers = fiona.listlayers(local_path)

        print(f"layers = {geo_layers}")

        # Read each layer once; concatenating a single time afterwards avoids
        # re-copying the growing frame on every iteration.
        layer_gdfs = [
            to_snakecase(gpd.read_file(local_path, layer = layer))
            for layer in geo_layers
        ]

    gdf = pd.concat(layer_gdfs, axis=0)

    # Validity is needed twice (report + filter), so compute it only once.
    valid_mask = gdf.geometry.is_valid

    print("invalid geo rows:")
    display(gdf[~valid_mask][['lyr','projname','geometry','geopoint_comments']])

    print("repeated geos rows:")

    repeated_cols = ['geometry','parentglobalid','projname','creator']
    # Count rows per (geometry, parent, project, creator); `editor` is only
    # used as the column to count on.
    repeated_geo = (gdf
                .groupby(repeated_cols)
                .agg({'editor':'count'})
                .reset_index()
               )

    repeated_geo = repeated_geo.loc[repeated_geo.editor > 1]
    display(repeated_geo)

    # Keep only valid geometries
    gdf = gdf[valid_mask].reset_index(drop = True)
    gdf = gdf.drop(columns = ['creationdate', 'editdate'])

    # NOTE(review): de-duplication is currently disabled; repeated rows are
    # only reported above.
    #gdf = gdf.drop_duplicates(subset = repeated_cols)

    # Fill missing values column by column. Only float64 and object columns
    # get a fill value; every other dtype (geometry, ints, ...) is left alone
    # instead of being offered its own dtype object as the fill value.
    fill_values = {col: 0.0 for col in gdf.select_dtypes(include = 'float64').columns}
    fill_values.update(
        {col: 'None' for col in gdf.select_dtypes(include = 'object').columns}
    )
    gdf = gdf.fillna(fill_values)

    # Save to GCS
    if save_to_gsc:
        utils.geoparquet_gcs_export(gdf, GCS_PATH, "cleaned_survey123_sample13")

    return gdf

In [None]:
# Load and clean every layer without exporting to GCS (save_to_gsc=False).
all_results = open_survey123(False)

In [None]:
# Number of distinct project names in the cleaned results.
all_results.projname.nunique()

In [None]:
# Row count for each geopoint_type value.
all_results.geopoint_type.value_counts()

In [None]:
# Distinct project names in sorted order.
# NOTE(review): not referenced by any other visible cell.
projects_list = all_results.projname.sort_values().unique()

In [None]:
# Rows per project (non-null `lyr` values), sorted from fewest to most.
all_results.groupby(['projname']).agg({'lyr':'count'}).sort_values('lyr')

In [None]:
# all_results[cols].explore('projname', cmap='tab10', style_kwds = {'weight':5}, height = 400, width = 1000, legend = True)

In [None]:
def preview_one_project(project_name:str):
    """
    Show an interactive map and an attribute table for one project.

    Reads the notebook-level `all_results` frame, keeps the rows whose
    `projname` equals `project_name`, maps them colored by `geopoint_type`,
    then prints the row count and displays the remaining attributes.

    Args:
        project_name: exact `projname` value to preview.
    """
    is_project = all_results.projname == project_name
    project_rows = all_results.loc[is_project]

    # Map: only the columns worth showing in the tooltip, plus the geometry.
    columns_for_map = [
        'geometry',
        'parentglobalid',
        'geopoint_type',
        'geopoint_type_existing',
        'geopoint_comments',
    ]
    project_map = project_rows[columns_for_map].explore(
        'geopoint_type',
        cmap='tab10',
        style_kwds = {'weight':6},
        height = 400,
        width = 1000,
        legend = True,
    )
    display(project_map)

    # Table: hide the id, editor, length and geometry columns.
    columns_to_hide = ['parentglobalid','lyr_globalid','editor','shape_length','geometry']
    project_table = project_rows.sort_values(by = ['projname'])
    project_table = project_table.drop(columns = columns_to_hide)

    print(f"{len(project_table)} geometries")
    display(project_table)

In [None]:
# preview_one_project("National Highway Freight Network Improvement Program - State Route 47-Seaside Avenue & Navy Way Interchange Improvement Project")

In [None]:
# preview_one_project('Watsonville-Santa Cruz Multimodal Corridor Program')

In [None]:
# Columns shown when inspecting individual records in the cells below.
cols = ['parentglobalid','projname','geopoint_type','geopoint_type_existing','geopoint_comments','geometry']

In [None]:
def preview_one_geotype_route(project_name:str, geopoint_type:str):
    """
    Map the geometries of one project restricted to a single geopoint type.

    Reads the notebook-level `all_results` frame and colors each geometry by
    its `lyr_globalid`.

    Args:
        project_name: exact `projname` value to keep.
        geopoint_type: exact `geopoint_type` value to keep.
    """
    same_project = all_results.projname == project_name
    same_type = all_results.geopoint_type == geopoint_type
    selected_rows = all_results.loc[same_project & same_type]

    columns_for_map = [
        'lyr',
        'lyr_globalid',
        'geopoint_type',
        'geopoint_type_existing',
        'geopoint_comments',
        'geometry',
    ]
    route_map = selected_rows[columns_for_map].explore(
        'lyr_globalid',
        cmap='tab10',
        style_kwds = {'weight':6},
        height = 400,
        width = 1000,
        legend = True,
    )
    display(route_map)
    

In [None]:
# Show the `cols` columns for every row attached to this one parentglobalid.
all_results[cols].loc[all_results.parentglobalid== "{4D60FABF-CDFB-4C4A-870E-DC8F29664447}"]

In [None]:
# Map only the 'ITS' geometries of the 'Fix 5 Cascade Gateway' project.
preview_one_geotype_route('Fix 5 Cascade Gateway','ITS')

In [None]:
# Map and table for every geometry of the 'Fix 5 Cascade Gateway' project.
preview_one_project('Fix 5 Cascade Gateway')

In [None]:
# Map and table for every geometry of the U.S. 101 corridor project.
preview_one_project('U.S. 101 Connected Communities Corridor Rail and Active Transportation Improvements')

In [None]:
# NOTE(review): same call as an earlier cell for 'Fix 5 Cascade Gateway';
# confirm whether this repeat is still needed.
preview_one_project('Fix 5 Cascade Gateway')

### GCS

In [None]:
# Read the exported geoparquet back from GCS to check the export.
# NOTE(review): the bucket path is hard-coded here, while open_survey123
# builds its export path from _utils.GCS_FILE_PATH; confirm both point at
# the same location.
test_geoparquet = gpd.read_parquet("gs://calitp-analytics-data/data-analyses/project_prioritization/Survey123_Geo/cleaned_survey123_sample13.parquet")

In [None]:
# (rows, columns) of the file read back from GCS.
test_geoparquet.shape

In [None]:
# Number of distinct project names in the file read back from GCS.
test_geoparquet.projname.nunique()

In [None]:
# Interactive map of the exported geometries, colored by project name.
test_geoparquet.explore('projname')