# Manually geocode some locations

* Some are valid, some require scouring Google Maps
* [Open data: SHN postmiles](https://gisdata-caltrans.opendata.arcgis.com/datasets/c22341fec9c74c6b9488ee4da23dd967_0/explore?location=37.180926%2C-119.281571%2C6.41)

In [1]:
import geopandas as gpd
import pandas as pd

import utils
import shared_utils



In [2]:
# Load the table of locations flagged for manual geocoding
df = pd.read_csv(f"{utils.GCS_FILE_PATH}manual_geocoding.csv")
# Report how many rows were loaded
print(f"# to manually geocode: {len(df)}")

# to manually geocode: 38


In [3]:
def clean_postmiles():
    """
    Load the SHN postmiles table and tidy it up for merging.

    Reads `shn_postmiles.parquet` from `utils.DATA_PATH`, rounds the `PM`
    column to 2 decimal places, and renames `County` to `county`.

    Returns the cleaned postmiles table.
    """
    # Alternative source, kept for reference: download the file from GCS with
    #   shared_utils.utils.download_geoparquet(
    #       GCS_FILE_PATH = f"{utils.GCS_FILE_PATH}",
    #       FILE_NAME = "shn_postmiles"
    #   )
    postmiles_raw = gpd.read_parquet(f"{utils.DATA_PATH}shn_postmiles.parquet")

    # Unrounded floats were giving trouble, so pin PM to 2 decimal places
    rounded_pm = postmiles_raw.PM.round(2)
    cleaned = postmiles_raw.assign(PM=rounded_pm)

    return cleaned.rename(columns={"County": "county"})


# Build the cleaned postmiles table once; a later cell merges against it
postmiles = clean_postmiles()

In [4]:
def subset_manual_geocoding(df):
    """
    Keep only the rows whose address contains a postmile reference.

    These rows should be found in postmiles, with the "Hwy X PM Y" pattern.
    Rows with extra notes are allowed to stay, like
    "Hwy X PM Y ON NORTH WEED BLVD".

    Parameters
    ----------
    df : DataFrame with at least `sheet_uuid`, `address`, `county` columns.

    Returns
    -------
    DataFrame restricted to the matching rows and to the columns
    `sheet_uuid`, `address`, `county` (original index preserved).
    Also prints how many rows were kept.
    """
    # na=False makes missing addresses an explicit "no match", so the mask
    # is always a clean boolean Series instead of an object Series with NaN
    has_postmile = df.address.str.contains("PM", na=False)

    # Select rows and columns in a single .loc call rather than chained indexing
    df2 = df.loc[has_postmile, ["sheet_uuid", "address", "county"]]

    print(f"have postmiles: {len(df2)}")

    return df2

In [5]:
def parse_postmiles(df):
    """
    Parse "HWY X PM Y" addresses into numeric `Route` and `PM` columns.

    Parameters
    ----------
    df : DataFrame with an `address` column following the
         "HWY <route> PM <postmile>[ optional notes]" pattern.

    Returns
    -------
    Copy of `df` with `address` corrected where a known fix exists, plus
    `Route` (int) and `PM` (float, rounded to 2 decimal places).

    Raises
    ------
    ValueError if a route or postmile cannot be converted to a number.
    """
    # Postmile adjustment:
    # sometimes rounding isn't the same, so it doesn't merge.
    # Correct these and they should merge on.
    PM_dict = {
        "HWY 88 PM 66.5": "HWY 88 PM 66.54",
        "HWY 49 PM 8.107": "HWY 49 PM 8.1",
        "HWY 395 PM 1152": "HWY 395 PM 11.54",
        "HWY 44 PM 339": "HWY 44 PM 33.9",
        "HWY 88 PM 134": "HWY 88 PM 13.4",
        "HWY 70 PM 707": "HWY 70 PM 70.7",
        "HWY 70 PM 552": "HWY 70 PM 55.2",
    }

    # Per-value lookup on the address column (no row-wise apply needed);
    # addresses without a correction pass through unchanged
    df = df.assign(
        address = df.address.map(lambda addr: PM_dict.get(addr, addr))
    )

    # Split once on " PM ": piece 0 is "HWY X", piece 1 starts with the postmile
    pieces = df.address.str.split(" PM ", expand=True)

    df = df.assign(
        # Drop the literal "HWY" prefix and surrounding whitespace before casting
        Route = (pieces[0]
                 .str.replace("HWY", "", regex=False)
                 .str.strip()
                 .astype(int)
                ),
        # Keep only the first token so trailing notes
        # (e.g. "ON NORTH WEED BLVD") are discarded
        PM = (pieces[1]
              .str.split(" ", expand=True)[0]
              .astype(float)
              .round(2)
             ),
    )

    return df

In [6]:
# Keep only the rows with a postmile in the address, then parse out
# Route / PM so they can be merged against the postmiles table
df2 = subset_manual_geocoding(df)
df3 = parse_postmiles(df2)

have postmiles: 31


In [8]:
# Left-merge the parsed addresses onto the postmiles table.
# One address may match several postmile rows (hence 1:m);
# `_merge` records whether each address found a match.
df4 = df3.merge(
    postmiles,
    how="left",
    on=["Route", "PM", "county"],
    validate="1:m",
    indicator=True,
)

In [9]:
# Tally the merge indicator; any "left_only" rows are addresses with no postmile match
df4["_merge"].value_counts()

both          64
left_only      0
right_only     0
Name: _merge, dtype: int64

In [10]:
# There are duplicates because postmiles has a lat/lon for each direction (N/S, E/W)
# Options: take the centroid of the duplicates, or sort explicitly and keep one of the obs
# i.e., either create a new geometry or keep a lat/lon that already appears in the postmiles df
def find_centroid(df):
    """
    Collapse duplicate rows to a single centroid geometry per `sheet_uuid`.

    Parameters
    ----------
    df : table with `sheet_uuid`, `address`, `county` and a `geometry`
         column, possibly holding several rows per `sheet_uuid`.

    Returns
    -------
    One row per `sheet_uuid` with columns `sheet_uuid`, `geometry`
    (centroid of the dissolved geometries), `address`, `county`.
    """
    # The upstream merge was a left merge onto a plain df, so the result
    # is a df rather than a gdf; set the geometry column first
    with_geometry = df.set_geometry("geometry")

    # Dissolve by sheet_uuid, then calculate the centroid of each group
    dissolved = with_geometry.dissolve(by="sheet_uuid")
    centroids = dissolved.centroid.reset_index()
    # The unnamed centroid series comes back as column 0
    centroids = centroids.rename(columns={0: "geometry"})

    # Bring the descriptive columns back in, one row per sheet_uuid
    attributes = df[["sheet_uuid", "address", "county"]].drop_duplicates()

    return pd.merge(
        centroids,
        attributes,
        on="sheet_uuid",
        how="left",
        validate="1:1",
    )

In [11]:
# Collapse the duplicated merge rows to one centroid per sheet_uuid
df5 = find_centroid(df4)




In [14]:
# Sanity check: show any rows that ended up without a geometry
df5[df5.geometry.isna()]

Unnamed: 0,sheet_uuid,geometry,address,county


In [13]:
# Number of rows in the centroid table
len(df5)

31