# Manually geocode some locations

* Some are valid, some require scouring Google Maps
* [Open data: SHN postmiles](https://gisdata-caltrans.opendata.arcgis.com/datasets/c22341fec9c74c6b9488ee4da23dd967_0/explore?location=37.180926%2C-119.281571%2C6.41)

In [1]:
import geopandas as gpd
import pandas as pd

import utils
import shared_utils



In [2]:
# Load the list of locations that still need to be geocoded by hand
# and report how many there are.
manual_geocoding_csv = f"{utils.GCS_FILE_PATH}manual_geocoding.csv"
df = pd.read_csv(manual_geocoding_csv)
print(f"# to manually geocode: {len(df)}")

# to manually geocode: 38


In [3]:
def clean_postmiles():
    """Load the SHN postmiles layer and prepare it for merging.

    Reads ``shn_postmiles.parquet`` from ``utils.DATA_PATH``, rounds the
    ``PM`` column to 2 decimal places and renames ``County`` to ``county``.

    Returns:
        The postmiles table as read by ``gpd.read_parquet``, with the
        rounded ``PM`` column and the renamed ``county`` column.
    """
    # Alternative source kept from the original notebook (not executed):
    #   shared_utils.utils.download_geoparquet(
    #       GCS_FILE_PATH = f"{utils.GCS_FILE_PATH}",
    #       FILE_NAME = "shn_postmiles"
    #   )
    postmiles_gdf = gpd.read_parquet(f"{utils.DATA_PATH}shn_postmiles.parquet")

    # Unrounded floats give trouble when PM is used as a merge key,
    # so normalize to 2 decimal places.
    rounded_pm = postmiles_gdf.PM.round(2)
    cleaned = postmiles_gdf.assign(PM=rounded_pm)

    return cleaned.rename(columns={"County": "county"})


# Load the cleaned postmiles once; later cells merge against it and filter it.
postmiles = clean_postmiles()

In [4]:
def subset_manual_geocoding(df):
    """Keep only the rows whose address mentions a postmile ("PM").

    These rows should be findable in the postmiles table because they follow
    the "Hwy X PM Y" pattern. Addresses with trailing notes, such as
    "Hwy X PM Y ON NORTH WEED BLVD", are kept as well.

    Args:
        df: table with at least ``sheet_uuid``, ``address`` and ``county``.

    Returns:
        The matching rows, restricted to ``sheet_uuid``, ``address`` and
        ``county``. The number of rows kept is printed as a side effect.
    """
    keep_cols = ["sheet_uuid", "address", "county"]

    # Missing addresses count as "no postmile" rather than propagating NaN
    # into the boolean mask.
    has_address = df.address.notna()
    mentions_pm = df.address.str.contains("PM", na=False)

    with_postmiles = df.loc[has_address & mentions_pm, keep_cols]

    print(f"have postmiles: {len(with_postmiles)}")

    return with_postmiles

In [5]:
def parse_postmiles(df):
    """Split "HWY X PM Y ..." addresses into numeric Route and PM columns.

    Args:
        df: table with an ``address`` column following the
            "HWY <route> PM <postmile>" pattern, optionally followed by
            free-text notes after the postmile.

    Returns:
        A copy of ``df`` with two added columns: ``Route`` (int) and
        ``PM`` (float, rounded to 2 decimal places).
    """
    # Everything before " PM " is the highway, everything after starts
    # with the postmile.
    address_parts = df.address.str.split(" PM ", expand=True)

    route_number = address_parts[0].str.replace("HWY", "").astype(int)

    # Only the first space-delimited token after " PM " is the postmile;
    # anything after it is a free-text note.
    postmile_token = address_parts[1].str.split(" ", expand=True)[0]
    postmile = postmile_token.astype(float).round(2)

    return df.assign(Route=route_number, PM=postmile)

In [6]:
# Keep only the addresses that mention a postmile, then split each one
# into numeric Route / PM columns.
df2 = subset_manual_geocoding(df)
df3 = parse_postmiles(df2)

have postmiles: 31


In [7]:
# Preview the parsed Route and PM columns.
df3.head()

Unnamed: 0,sheet_uuid,address,county,Route,PM
3,cdcfaf3a-6937-45c3-8eeb-56bbb739e098,HWY 89 PM 11.3,SIS,89,11.3
4,c437e971-0e17-4acd-b734-2731c1b1cfa6,HWY 299 PM 80.2,SHA,299,80.2
5,a954dadd-b40e-4314-ad41-cbcdb26c5c69,HWY 299 PM 20.3,MOD,299,20.3
6,c56b5067-7d22-4286-a554-dc9adb8628e2,HWY 89 PM 29.3,PLU,89,29.3
8,d5997658-8854-4959-a556-3a85d08b1806,HWY 299 PM 50.2,MOD,299,50.2


In [8]:
# Attach postmile records to each parsed address.
# - how="left": keep every address, even when no postmile matches.
# - validate="1:m": each address key is unique, but one key may match
#   several postmile rows.
# - indicator=True: adds a `_merge` column showing which rows matched.
df4 = df3.merge(
    postmiles,
    how="left",
    on=["Route", "PM", "county"],
    validate="1:m",
    indicator=True,
)

In [9]:
# There are duplicates because there's a lat/lon for each direction (N/S, E/W)
# Take centroid vs keep one of the obs after explicitly sorting
# Either create new geometry or have a lat/lon that appears in postmiles df
def find_centroid(df):
    """Collapse the merged rows to a single centroid point per sheet_uuid.

    Args:
        df: merged table with ``sheet_uuid``, ``address``, ``county`` and a
            ``geometry`` column. It may hold several rows per ``sheet_uuid``.

    Returns:
        One row per ``sheet_uuid`` with columns ``sheet_uuid``,
        ``geometry`` (centroid of the dissolved geometries), ``address``
        and ``county``.
    """
    # The merge output is presumably a plain DataFrame rather than a
    # GeoDataFrame (left side was not geospatial), so register the
    # geometry column before doing any spatial operations.
    with_geometry = df.set_geometry("geometry")

    # Combine all geometries sharing a sheet_uuid, then take the centroid
    # of each combined shape.
    dissolved = with_geometry.dissolve(by="sheet_uuid")
    centroids = dissolved.centroid.reset_index()
    # The centroid series is unnamed, so its column comes back as 0.
    centroids = centroids.rename(columns={0: "geometry"})

    # One row of descriptive attributes per sheet_uuid to merge back in.
    attributes = df[["sheet_uuid", "address", "county"]].drop_duplicates()

    return pd.merge(
        centroids,
        attributes,
        on="sheet_uuid",
        how="left",
        validate="1:1",
    )

In [10]:
# Collapse the merged rows to one centroid point per sheet_uuid.
df5 = find_centroid(df4)




In [11]:
# Rows that ended up without a geometry — presumably addresses whose
# Route / PM / county did not match any postmile in the merge.
df5[df5.geometry.isna()]

Unnamed: 0,sheet_uuid,geometry,address,county
0,0233ab46-6158-426f-b0b9-ac6813277518,,HWY 88 PM 66.5,AMA
1,03f4cd85-b96d-44f8-8897-2e62ff09e8f2,,HWY 49 PM 8.107,MPA
2,18888079-f8bd-4b21-9f6f-455f2898cadc,,HWY 395 PM 1152,LAS
13,73745590-7323-44e7-850d-e4fd35ee7660,,HWY 44 PM 339,SHA
28,e9e1096a-fbad-4c23-9b65-7aef00f42fd3,,HWY 88 PM 134,ALP
29,f41a7f09-a727-4d1c-94ab-ddd467bdc8f0,,HWY 70 PM 707,PLU
30,fc149f97-cff4-4692-915c-5c1d559b0db9,,HWY 70 PM 552,PLU


In [12]:
# Postmile adjustment.
# Some addresses do not merge because the postmile as typed does not match
# the rounded PM in the postmiles table (different rounding, or apparently a
# missing decimal point). Map each such address to the PM value that should
# be used instead, so that it merges.
# NOTE(review): values are strings, while parse_postmiles produces a float PM;
#   cast before merging (how this dict is applied is not shown here).
# NOTE(review): "HWY 395 PM 1152" -> "11.54" does not follow the digit pattern
#   of the other entries; confirm this value is intentional.
PM_dict = {
    "HWY 88 PM 66.5": "66.54",
    "HWY 49 PM 8.107": "8.1",
    "HWY 395 PM 1152": "11.54",
    "HWY 44 PM 339": "33.9",
    "HWY 88 PM 134": "13.4",
    "HWY 70 PM 707": "70.7",
    "HWY 70 PM 552": "55.2",
}

In [13]:
#postmiles[postmiles.Route==70].PM.value_counts()

In [14]:
# Inspect the available postmiles on Route 70 in county PLU strictly between
# PM 55 and PM 56, to pick the closest valid PM for a mistyped address.
on_route_70_plu = (postmiles.Route == 70) & (postmiles.county == "PLU")
pm_between_55_and_56 = (postmiles.PM > 55) & (postmiles.PM < 56)
postmiles[on_route_70_plu & pm_between_55_and_56]

Unnamed: 0,OBJECTID,Route,RteSuffix,RouteS,PMRouteID,county,District,PMPrefix,PM,PMSuffix,PMc,Odometer,PMInterval,HwySegment,AlignCode,RouteType,Direction,PMoffset,geometry
130014,130015,70,,70,PLU070...L,PLU,2,,55.8,,55.8,136.453003,0.1,Mid Segment,Left,State,WB,55.8,POINT (-120.75152 39.87629)
130015,130016,70,,70,PLU070...L,PLU,2,,55.1,,55.1,135.753006,0.1,Mid Segment,Left,State,WB,55.1,POINT (-120.76372 39.87891)
130120,130121,70,,70,PLU070...L,PLU,2,,55.5,,55.5,136.153,0.1,Mid Segment,Left,State,WB,55.5,POINT (-120.75624 39.87851)
131388,131389,70,,70,PLU070...L,PLU,2,,55.2,,55.2,135.852997,0.1,Mid Segment,Left,State,WB,55.2,POINT (-120.76184 39.87884)
131670,131671,70,,70,PLU070...L,PLU,2,,55.3,,55.3,135.953003,0.1,Mid Segment,Left,State,WB,55.3,POINT (-120.75997 39.87877)
132518,132519,70,,70,PLU070...L,PLU,2,,55.4,,55.4,136.052994,0.1,Mid Segment,Left,State,WB,55.4,POINT (-120.75810 39.87870)
132738,132739,70,,70,PLU070...L,PLU,2,,55.7,,55.7,136.352997,0.1,Mid Segment,Left,State,WB,55.7,POINT (-120.75290 39.87726)
133264,133265,70,,70,PLU070...L,PLU,2,,55.6,,55.6,136.253006,0.1,Mid Segment,Left,State,WB,55.6,POINT (-120.75449 39.87801)
134307,134308,70,,70,PLU070...L,PLU,2,,55.9,,55.9,136.552994,0.1,Mid Segment,Left,State,WB,55.9,POINT (-120.75018 39.87528)
138317,138318,70,,70,PLU070...R,PLU,2,,55.8,,55.8,136.431,0.1,Mid Segment,Right,State,EB,55.8,POINT (-120.75152 39.87629)
