# Geocode addresses

In [None]:
#!pip install geocoder

In [1]:
import geocoder
import intake
import pandas as pd
import re

import utils

import json
from calitp.storage import get_fs
# Filesystem handle from calitp.storage; used further down (fs.put / fs.get)
# to copy JSON files between DATA_PATH and GCS_FILE_PATH.
fs = get_fs()

# Local folder where JSON files are written before upload / after download
DATA_PATH = "./data/"
# Remote path prefix, defined in the project's utils module
GCS_FILE_PATH = utils.GCS_FILE_PATH

# Data catalog opened from the .yml files in the current directory
catalog = intake.open_catalog("./*.yml")

In [2]:
# Read the tier1_facilities_addresses catalog entry.
# The cells below use these columns from it: address, city, zip_code,
# zip_code2, category and sheet_uuid.
df = catalog.tier1_facilities_addresses.read()

In [3]:
def clean_up_addresses(row):
    """Return a tidied copy of ``row.address``, or None when it is None.

    Text in parentheses and a few known location prefixes are removed,
    then leading/trailing whitespace is stripped.
    """
    # Guard clause: nothing to clean
    if row.address is None:
        return None

    # Drop anything enclosed in parentheses (parentheses included)
    cleaned = re.sub(r'\([^)]*\)', '', row.address)

    # Labels that some locations are prefixed with
    for prefix in ("TMC, ", "Lab, ", "Space adjustment: "):
        cleaned = cleaned.replace(prefix, "")

    # Trim the whitespace left behind by the removals
    return cleaned.strip()

In [4]:
def assemble_full_address(row):
    """Build the address string that is sent to the geocoder for one row.

    Parameters
    ----------
    row : object with ``address_cleaned``, ``city``, ``zip_code``,
        ``zip_code2`` and ``category`` attributes (e.g. a DataFrame row).

    Returns
    -------
    str
        - ``"<address> <city>, CA"`` for category ``"office"``
        - ``"<address> <city>, CA <zip_code>"`` for ``"equipment"``,
          ``"labs"`` and for ``"maintenance"`` when ``zip_code2`` is None
        - ``"<address> <city>, CA <zip_code>-<zip_code2>"`` for
          ``"maintenance"`` when ``zip_code2`` is set
        - ``"Incomplete"`` when ``address_cleaned`` or ``city`` is None, or
          when the category is not one of the four above
    """
    if (row.address_cleaned is None) or (row.city is None):
        return "Incomplete"

    address_city = row.address_cleaned + " " + row.city
    address_with_zip = address_city + ", CA " + str(row.zip_code)

    if row.category == "office":
        return address_city + ", CA"

    if row.category == "maintenance":
        if row.zip_code2 is None:
            return address_with_zip
        # str() mirrors the handling of zip_code, so a numeric zip_code2
        # does not raise TypeError on concatenation
        return address_with_zip + "-" + str(row.zip_code2)

    if row.category in ("equipment", "labs"):
        return address_with_zip

    # Unrecognised category: previously this fell through with full_address
    # unassigned (UnboundLocalError). "Incomplete" contains no digits, so
    # these rows end up with the ones that are geocoded manually.
    return "Incomplete"

In [5]:
def prep_for_geocoding(df):
    """Return ``df`` with ``address_cleaned`` and ``full_address`` added.

    Both columns are computed row by row. ``full_address`` is built second
    because it reads the ``address_cleaned`` column.
    """
    # Row-wise cleanup of the raw address
    cleaned = df.apply(clean_up_addresses, axis=1)
    with_cleaned = df.assign(address_cleaned=cleaned)

    # Row-wise assembly, using the column added just above
    assembled = with_cleaned.apply(assemble_full_address, axis=1)
    return with_cleaned.assign(full_address=assembled)

In [6]:
# Add the address_cleaned and full_address columns
df = prep_for_geocoding(df)

In [7]:
# Split the rows on whether the assembled address contains any digit
has_digit = df.full_address.str.contains(r"[0-9]")

# No digits at all (in addition to those we know are incomplete):
# do these manually
manual_geocoding = df[~has_digit]

# Everything else can go into a geocoder
for_geocoding = df[has_digit]

In [8]:
# With the text in parentheses removed, the same location shows up several
# times. Send each distinct location to the geocoder once and merge the
# result back onto the full df later.
# sheet_uuid is kept as an identifier for naming the cached json results,
# but it is not part of the duplicate check and is not used to merge back.
dedupe_cols = ["full_address", "city", "zip_code"]
keep_cols = dedupe_cols + ["sheet_uuid"]
for_geocoding2 = for_geocoding[keep_cols].drop_duplicates(subset=dedupe_cols)

In [9]:
# Preview the first rows of the de-duplicated addresses
for_geocoding2.head()

Unnamed: 0,full_address,city,zip_code,sheet_uuid
0,"1656 Union Street Eureka, CA",Eureka,,8505301e-b065-4f1f-90f5-bf4e90a3db32
3,"1835 6th Street Eureka, CA",Eureka,,bbcdcce6-83d1-4cae-af70-739dcfedec3b
4,"1657 Riverside Drive Redding, CA",Redding,,2d6a6032-69aa-41e5-8ab5-d2c1fe95be25
7,"1031 Butte Street Redding, CA",Redding,,31342fd9-c25a-4b62-a164-9c3574329bad
8,"703 B Street Marysville, CA",Marysville,,310e148a-0e75-4512-b4a9-f35d5bb12cd8


In [10]:
def save_request_json(my_dict, name,
                      DATA_PATH = DATA_PATH,
                      GCS_FILE_PATH = GCS_FILE_PATH):
    """Write ``my_dict`` to ``{DATA_PATH}{name}.json`` and copy it to GCS.

    Parameters
    ----------
    my_dict : object to serialize; values json cannot encode natively are
        converted with ``str``.
    name : file name without the ``.json`` extension.
    DATA_PATH : local folder prefix the file is written to.
    GCS_FILE_PATH : remote prefix the file is uploaded to with ``fs.put``.
    """
    local_path = f"{DATA_PATH}{name}.json"

    # Convert to json
    #https://gist.github.com/romgapuz/c7a4cedb85f090ac1b55383a58fa572c
    json_obj = json.loads(json.dumps(my_dict, default=str))

    # Save json locally. The with-block closes (and flushes) the file
    # before it is uploaded below.
    with open(local_path, "w", encoding='utf-8') as f:
        json.dump(json_obj, f)

    # Put the json object in GCS.
    fs.put(local_path, f"{GCS_FILE_PATH}{name}.json")
    print(f"Saved {name}")
    
    
def open_request_json(name, DATA_PATH = DATA_PATH,
                       GCS_FILE_PATH = GCS_FILE_PATH):
    """Download ``{GCS_FILE_PATH}{name}.json`` and return its parsed content.

    Parameters
    ----------
    name : file name without the ``.json`` extension.
    DATA_PATH : local folder prefix the file is downloaded to.
    GCS_FILE_PATH : remote prefix the file is fetched from with ``fs.get``.

    Returns
    -------
    The object produced by ``json.load`` on the downloaded file.
    """
    local_path = f"{DATA_PATH}{name}.json"

    # Download object from GCS bucket
    fs.get(f"{GCS_FILE_PATH}{name}.json", local_path)

    # Read with the same encoding save_request_json writes with, and close
    # the file once it has been parsed
    with open(local_path, encoding="utf-8") as f:
        return json.load(f)

In [11]:
def geocode_address(row):
    """Geocode ``row.full_address`` with geocoder's OSM provider and cache it.

    The raw result dict is saved under the row's ``sheet_uuid`` (locally and
    in the ``geocode_cache/`` folder of ``utils.GCS_FILE_PATH``) before the
    individual fields are pulled out of it.

    Parameters
    ----------
    row : object with ``full_address`` and ``sheet_uuid`` attributes
        (e.g. a DataFrame row).

    Returns
    -------
    pandas.Series or None
        Series indexed by longitude, latitude, house_number, street, city,
        state, country, postal. None when the result is missing one of the
        expected keys or cannot be indexed; the raw result is still cached
        in that case.
    """
    input_address = row.full_address

    g = geocoder.osm(input_address)
    # results are a dict with x, y, address components
    # keep it all, since we don't always have zip_code
    # also use this as sanity check
    results = g.osm

    save_request_json(results, row.sheet_uuid, DATA_PATH = DATA_PATH,
                      GCS_FILE_PATH = f"{utils.GCS_FILE_PATH}geocode_cache/")
    print(f"Cached {row.sheet_uuid}")

    # (output column, key in the geocoder result)
    # city reads "addr:city"; it previously read "addr:state" by mistake
    fields = [
        ("longitude", "x"),
        ("latitude", "y"),
        ("house_number", "addr:housenumber"),
        ("street", "addr:street"),
        ("city", "addr:city"),
        ("state", "addr:state"),
        ("country", "addr:country"),
        ("postal", "addr:postal"),
    ]

    try:
        values = [results[key] for _, key in fields]
    except (KeyError, TypeError):
        # KeyError: a component is absent from the result
        # TypeError: results is not subscriptable (e.g. None)
        print(f"Unable to compile {row.sheet_uuid}")
        return None

    geocoded_results = pd.Series(values, index=[column for column, _ in fields])
    print(f"Finished compiling {row.sheet_uuid}")
    return geocoded_results

In [12]:
# Run the geocoder on every row of the de-duplicated addresses
geocoded_columns = for_geocoding2.apply(geocode_address, axis=1)

Saved 8505301e-b065-4f1f-90f5-bf4e90a3db32
Cached 8505301e-b065-4f1f-90f5-bf4e90a3db32
Finished compiling 8505301e-b065-4f1f-90f5-bf4e90a3db32
Saved bbcdcce6-83d1-4cae-af70-739dcfedec3b
Cached bbcdcce6-83d1-4cae-af70-739dcfedec3b
Finished compiling bbcdcce6-83d1-4cae-af70-739dcfedec3b
Saved 2d6a6032-69aa-41e5-8ab5-d2c1fe95be25
Cached 2d6a6032-69aa-41e5-8ab5-d2c1fe95be25
Finished compiling 2d6a6032-69aa-41e5-8ab5-d2c1fe95be25
Saved 31342fd9-c25a-4b62-a164-9c3574329bad
Cached 31342fd9-c25a-4b62-a164-9c3574329bad
Finished compiling 31342fd9-c25a-4b62-a164-9c3574329bad
Saved 310e148a-0e75-4512-b4a9-f35d5bb12cd8
Cached 310e148a-0e75-4512-b4a9-f35d5bb12cd8
Finished compiling 310e148a-0e75-4512-b4a9-f35d5bb12cd8
Saved d19d0008-a974-4358-8cc2-9ed31b0de9db
Cached d19d0008-a974-4358-8cc2-9ed31b0de9db
Finished compiling d19d0008-a974-4358-8cc2-9ed31b0de9db
Saved 44a9f43f-e895-4c11-b1b2-8be8b8a7f575
Cached 44a9f43f-e895-4c11-b1b2-8be8b8a7f575
Finished compiling 44a9f43f-e895-4c11-b1b2-8be8b8a7f575

In [None]:
# Put the geocoded columns next to the addresses they were computed from
# (column-wise concat)
geocoded_results = pd.concat([for_geocoding2, geocoded_columns], axis=1)

In [None]:
# TODO: unable to compile the results and put them into another df to merge.
# Take the cached results, download them, and see if they can be read in as a df,
# then do pd.concat

In [None]:
# Scratch check: take the first de-duplicated address as a test input
test_address = for_geocoding2.full_address.iloc[0]
print(test_address)

In [None]:
# Geocode the single test address with the OSM provider
g = geocoder.osm(test_address)

In [None]:
# g.osm is a dictionary holding x, y and the addr:* components
# (a sample of the printed output is kept in the string below)

print(g.osm)
'''
{'x': -124.17505328571428, 
'y': 40.79185285714286, 
'addr:housenumber': '1656', 
'addr:street': 'Union Street', 
'addr:city': 'Eureka', 
'addr:state': 'California', 
'addr:country': 'United States', 
'addr:postal': '95501'}

results = g.osm
'''

In [None]:
'''
pd.concat([
    test_df, 
    pd.DataFrame.from_dict(results, orient="index").T], axis=1
    )
'''

In [None]:
# apply to row, then return columns for each address row
# concat 2 dfs together
#geocoded_results = pd.concat([test_df, res1], axis=1)

In [None]:
# NOTE(review): test_df is not assigned in any cell visible here — confirm
# it exists before running this cell
test_df.sheet_uuid[0]

In [None]:

# Manual test of the JSON cache.
# NOTE(review): results and test_df are not assigned in any cell visible
# here (results only appears inside a string literal) — confirm before running
save_request_json(results, test_df.sheet_uuid[0], DATA_PATH, GCS_FILE_PATH)