In [1]:
import os
from os import listdir
from os.path import isfile, join, splitext
import sys
import csv
import re

import numpy as np
import pandas as pd
import geopandas as gpd

import usaddress

import matplotlib.pyplot as plt

# Parsing with `usaddress` module

## Edge Cases

In [3]:
# Edge case: Queens, NY house numbers contain a dash (here "89-22").
# usaddress.tag returns a (tagged components, address type) pair; the recorded
# output below shows "89-22" kept intact as the AddressNumber component.
queens_ny_address = "89-22 197th St Unit 2, Hollis, NY 11423"
usaddress.tag(queens_ny_address)

(OrderedDict([('AddressNumber', '89-22'),
              ('StreetName', '197th'),
              ('StreetNamePostType', 'St'),
              ('OccupancyType', 'Unit'),
              ('OccupancyIdentifier', '2'),
              ('PlaceName', 'Hollis'),
              ('StateName', 'NY'),
              ('ZipCode', '11423')]),
 'Street Address')

In [4]:
# Edge case: Southwest address whose street type is a Spanish word placed
# before the street name ("Camino Tres"). The recorded output below shows
# "Camino" tagged as StreetNamePreType and "SW" as StreetNamePostDirectional.
NM_address = "220 Camino Tres SW, Albuquerque, NM 87105"
usaddress.tag(NM_address)

(OrderedDict([('AddressNumber', '220'),
              ('StreetNamePreType', 'Camino'),
              ('StreetName', 'Tres'),
              ('StreetNamePostDirectional', 'SW'),
              ('PlaceName', 'Albuquerque'),
              ('StateName', 'NM'),
              ('ZipCode', '87105')]),
 'Street Address')

In [5]:
# Edge case: grid-system address with a numeric street name and no street type
# ("921 3385 S"). The recorded output below shows "921" tagged as AddressNumber,
# "3385" as StreetName and "S" as StreetNamePostDirectional.
UT_address = "921 3385 S, Millcreek, UT 84106"
usaddress.tag(UT_address)

(OrderedDict([('AddressNumber', '921'),
              ('StreetName', '3385'),
              ('StreetNamePostDirectional', 'S'),
              ('PlaceName', 'Millcreek'),
              ('StateName', 'UT'),
              ('ZipCode', '84106')]),
 'Street Address')

# Simulated Residential History Data

In [2]:
# Move the working directory up one level (presumably to the repository root --
# confirm against where this notebook lives) and remember it in `abs_path`.
# Guarded so the cell is idempotent: without the guard every re-run would climb
# one directory higher and silently change `abs_path` for all later cells.
if 'abs_path' not in globals():
    os.chdir('..')
    abs_path = os.getcwd()
print(abs_path)

C:\Users\bchan\OneDrive - UW\CLAD\CLAD_Geospatial


In [3]:
# Path of the simulated residential-history CSV: <abs_path>/data/test_address_20k_v1.csv
data_dir = os.path.join(abs_path, 'data')
link = os.path.join(data_dir, 'test_address_20k_v1.csv')

# Read the file into a DataFrame and display it
temp = pd.read_csv(filepath_or_buffer=link)
temp

Unnamed: 0,Full_Addre,source_lon,source_lat,geometry,user_id,start_date,end_duration,end_date
0,"2499 SOUTH WILMINGTON AVENUE, COMPTON, CA 90220",-118.236303,33.868428,POINT (-118.23630253521881 33.86842825057474),1,2004-12-09,847 days,2007-04-05
1,"420 South Pierre Street, Pierre, South Dakota ...",-98.695554,43.254257,POINT (-98.6955539999999 43.2542570000001),1,2006-02-04,527 days,2007-07-16
2,"12150 30 MILE ROAD, WASHINGTON, MI 48095",-83.013787,42.771167,POINT (-83.01378666886954 42.77116730707756),1,2018-07-12,949 days,2021-02-15
3,"65 TIFTON ELDORADO RD, TIFTON, GA 31794",-83.496325,31.445218,POINT (-83.49632463429873 31.445218156335972),2,2001-05-11,824 days,2003-08-13
4,"12512 WALTERS RD, HOUSTON, TX 77014",-95.468105,29.973519,POINT (-95.46810520662352 29.973518593722815),2,2004-01-09,924 days,2006-07-21
...,...,...,...,...,...,...,...,...
19995,"730 Reems Creek Road, Weaverville, NC 28787",-82.509238,35.694465,POINT (-82.50923810689038 35.6944646159304),4999,2006-04-17,628 days,2008-01-05
19996,"5793 South 700 East, Whitestown, IN 46075",-86.335915,39.955682,POINT (-86.33591469213522 39.95568248243973),4999,2010-05-21,463 days,2011-08-27
19997,"7150 E ARCH RD, STOCKTON, CA 95205",-121.187052,37.902070,POINT (-121.1870524708539 37.90207040341747),4999,2012-09-17,930 days,2015-04-05
19998,"1075 GARRISONVILLE ROAD, STAFFORD, VA 22556",-77.469689,38.474291,POINT (-77.46968895783967 38.474291425596675),4999,2016-07-19,727 days,2018-07-16


In [4]:
# usaddress component labels grouped by the output field they are collapsed into.
# The groups are listed in the order the flattened mapping should have.
_pub28_field_groups = [
    ('recipient', [
        'Recipient',
    ]),
    ('address1', [
        'AddressNumber',
        'AddressNumberPrefix',
        'AddressNumberSuffix',
        'StreetName',
        'StreetNamePreDirectional',
        'StreetNamePreModifier',
        'StreetNamePreType',
        'StreetNamePostDirectional',
        'StreetNamePostModifier',
        'StreetNamePostType',
        'CornerOf',
        'IntersectionSeparator',
        'LandmarkName',
        'USPSBoxGroupID',
        'USPSBoxGroupType',
        'USPSBoxID',
        'USPSBoxType',
    ]),
    ('address2', [
        'BuildingName',
        'OccupancyType',
        'OccupancyIdentifier',
        'SubaddressIdentifier',
        'SubaddressType',
    ]),
    ('city', [
        'PlaceName',
    ]),
    ('state', [
        'StateName',
    ]),
    ('zip_code', [
        'ZipCode',
    ]),
]

# Flatten to the {usaddress label: output field} mapping passed as `tag_mapping`.
Pub28_usaddress_template = {
    label: field
    for field, labels in _pub28_field_groups
    for label in labels
}

In [19]:
# 1) keep a single row per distinct address string; these become the location records
temp_drop = temp.drop_duplicates(subset=['Full_Addre'])

# 2) empty placeholder table with the OMOP location columns
omop_location_columns = [
    'Location_id',
    'address_1',
    'address_2',
    'city',
    'state',
    'zip',
    'county',
    'location_source_value',
    'latitude',
    'longitude',
]
OMOP_location = pd.DataFrame(columns=omop_location_columns)

# TODO: the datatype of latitude and longitude still needs to be changed
OMOP_location

Unnamed: 0,Location_id,address_1,address_2,city,state,zip,county,location_source_value,latitude,longitude


In [20]:
# 3) copy the source address strings into the OMOP table. Assigning into the
#    empty frame first is what gives OMOP_location the row index of temp_drop.
OMOP_location['location_source_value'] = temp_drop['Full_Addre']

# Location_id is derived from the row index label (label + 1)
OMOP_location['Location_id'] = OMOP_location.index + 1

# coordinates are aligned on the shared index
OMOP_location['latitude'] = temp_drop['source_lat']
OMOP_location['longitude'] = temp_drop['source_lon']
OMOP_location

Unnamed: 0,Location_id,address_1,address_2,city,state,zip,county,location_source_value,latitude,longitude
0,1,,,,,,,"2499 SOUTH WILMINGTON AVENUE, COMPTON, CA 90220",33.868428,-118.236303
1,2,,,,,,,"420 South Pierre Street, Pierre, South Dakota ...",43.254257,-98.695554
2,3,,,,,,,"12150 30 MILE ROAD, WASHINGTON, MI 48095",42.771167,-83.013787
3,4,,,,,,,"65 TIFTON ELDORADO RD, TIFTON, GA 31794",31.445218,-83.496325
4,5,,,,,,,"12512 WALTERS RD, HOUSTON, TX 77014",29.973519,-95.468105
...,...,...,...,...,...,...,...,...,...,...
19994,19995,,,,,,,"W1828 GROS CAP ROAD, SAINT IGNACE, MI 49781",45.870412,-84.823893
19995,19996,,,,,,,"730 Reems Creek Road, Weaverville, NC 28787",35.694465,-82.509238
19996,19997,,,,,,,"5793 South 700 East, Whitestown, IN 46075",39.955682,-86.335915
19997,19998,,,,,,,"7150 E ARCH RD, STOCKTON, CA 95205",37.902070,-121.187052


In [24]:
%%time
# 4) process address string through usaddress parser with Pub28 template
repo = pd.DataFrame()

# iter
for ind, each in OMOP_location.loc[:,['location_source_value']].drop_duplicates().iterrows():

    # try Pub28 parsing
    try:
        obj = usaddress.tag(each.location_source_value, tag_mapping=Pub28_usaddress_template)
        
        # staging
        tmp = pd.DataFrame(obj[0], columns=obj[0].keys(), index=[ind])
        tmp['Address_type'] = obj[1]
        
        # development
        OMOP_location.loc[ind, 'address_1'] = tmp['address1'].values[0]
        OMOP_location.loc[ind, 'city'] = tmp['city'].values[0]
        OMOP_location.loc[ind, 'state'] = tmp['state'].values[0]
        OMOP_location.loc[ind, 'zip'] = tmp['zip_code'].values[0]
        OMOP_location.loc[ind, 'address_type']=tmp['Address_type'].values[0]

        address_2 = tmp['address2'].values[0]
        if len(address_2) >= 3:
            OMOP_location.loc[ind, 'address_2'] = address_2
        else:
            OMOP_location.loc[ind, 'address_2'] = np.NaN

        repo = repo.append(tmp)
    
    except:
        # print(ind, each.location_source_value)
        pass
   
    # if ind==15:
    #     break

CPU times: total: 22.3 s
Wall time: 31.3 s


In [25]:
# Display the table after parsing: address_1, city, state, zip and address_type are now filled in.
OMOP_location

Unnamed: 0,Location_id,address_1,address_2,city,state,zip,county,location_source_value,latitude,longitude,address_type
0,1,2499 SOUTH WILMINGTON AVENUE,,COMPTON,CA,90220,,"2499 SOUTH WILMINGTON AVENUE, COMPTON, CA 90220",33.868428,-118.236303,Street Address
1,2,420 South Pierre Street,,"Pierre, South",Dakota,57501,,"420 South Pierre Street, Pierre, South Dakota ...",43.254257,-98.695554,Street Address
2,3,12150 30 MILE ROAD,,WASHINGTON,MI,48095,,"12150 30 MILE ROAD, WASHINGTON, MI 48095",42.771167,-83.013787,Street Address
3,4,65 TIFTON ELDORADO RD,,TIFTON,GA,31794,,"65 TIFTON ELDORADO RD, TIFTON, GA 31794",31.445218,-83.496325,Street Address
4,5,12512 WALTERS RD,,HOUSTON,TX,77014,,"12512 WALTERS RD, HOUSTON, TX 77014",29.973519,-95.468105,Street Address
...,...,...,...,...,...,...,...,...,...,...,...
19994,19995,W1828 GROS CAP ROAD,,SAINT IGNACE,MI,49781,,"W1828 GROS CAP ROAD, SAINT IGNACE, MI 49781",45.870412,-84.823893,Street Address
19995,19996,730 Reems Creek Road,,Weaverville,NC,28787,,"730 Reems Creek Road, Weaverville, NC 28787",35.694465,-82.509238,Street Address
19996,19997,5793 South 700 East,,Whitestown,IN,46075,,"5793 South 700 East, Whitestown, IN 46075",39.955682,-86.335915,Street Address
19997,19998,7150 E ARCH RD,,STOCKTON,CA,95205,,"7150 E ARCH RD, STOCKTON, CA 95205",37.902070,-121.187052,Street Address


In [26]:
# Rows for which the parser produced a secondary address line (address_2)
has_address_2 = OMOP_location['address_2'].notna()
OMOP_location[has_address_2]

Unnamed: 0,Location_id,address_1,address_2,city,state,zip,county,location_source_value,latitude,longitude,address_type
15,16,4143 43RD ST,APT E8,SUNNYSIDE,NY,11104,,"4143 43RD ST APT E8, SUNNYSIDE, NY 11104",40.746160,-73.920907,Street Address
88,89,"1475 West Peachtree Street, Ne",Suite 200,Atlanta,Georgia,30309,,"1475 West Peachtree Street, Ne, Suite 200, Atl...",33.794396,-84.387514,Street Address
283,284,7800 East Orchard Road,Suite 200,Greenwood Village,Colorado,80111,,"7800 East Orchard Road Suite 200, Greenwood Vi...",38.618711,-94.681366,Street Address
373,374,26 DUBE LN,# 1,WATERBURY,CT,6705,,"26 DUBE LN # 1, WATERBURY, CT 6705",41.550205,-73.016571,Street Address
380,381,12700 Park Central Drive,Suite 1700,Dallas,Texas,75251,,"12700 Park Central Drive, Suite 1700, Dallas, ...",38.468992,-101.752463,Street Address
...,...,...,...,...,...,...,...,...,...,...,...
19572,19573,12 SAINT ANDREWS CIR,UNIT 4,WALLINGFORD,CT,6492,,"12 SAINT ANDREWS CIR UNIT 4, WALLINGFORD, CT 6492",41.421025,-72.820354,Street Address
19593,19594,1320 CENTRE ST,STE 306,NEWTON,MA,2459,,"1320 CENTRE ST STE 306, NEWTON, MA 2459",42.328529,-71.195926,Street Address
19717,19718,125 PRIVATE RD,4303,HONDO,TX,78861,,"125 PRIVATE RD 4303, HONDO, TX 78861",29.351890,-99.196148,Street Address
19959,19960,"STATE ROUTE 264, MILEPOST",396,KEAMS CANYON,AZ,86034,,"STATE ROUTE 264, MILEPOST 396, KEAMS CANYON, A...",35.812889,-110.197767,Ambiguous


# Post-Hoc Processing

In [27]:
# Mapping from full state / territory name to its two-letter abbreviation,
# used to replace full state names with abbreviations.
# Covers the 50 states, the District of Columbia and the listed territories.
# Note: some names are substrings of others ("Virginia" in "West Virginia",
# "Kansas" in "Arkansas"), so any substring-based replacement over this
# mapping has to account for that.
us_state_to_abbrev = {
    "Alabama": "AL",
    "Alaska": "AK",
    "Arizona": "AZ",
    "Arkansas": "AR",
    "California": "CA",
    "Colorado": "CO",
    "Connecticut": "CT",
    "Delaware": "DE",
    "Florida": "FL",
    "Georgia": "GA",
    "Hawaii": "HI",
    "Idaho": "ID",
    "Illinois": "IL",
    "Indiana": "IN",
    "Iowa": "IA",
    "Kansas": "KS",
    "Kentucky": "KY",
    "Louisiana": "LA",
    "Maine": "ME",
    "Maryland": "MD",
    "Massachusetts": "MA",
    "Michigan": "MI",
    "Minnesota": "MN",
    "Mississippi": "MS",
    "Missouri": "MO",
    "Montana": "MT",
    "Nebraska": "NE",
    "Nevada": "NV",
    "New Hampshire": "NH",
    "New Jersey": "NJ",
    "New Mexico": "NM",
    "New York": "NY",
    "North Carolina": "NC",
    "North Dakota": "ND",
    "Ohio": "OH",
    "Oklahoma": "OK",
    "Oregon": "OR",
    "Pennsylvania": "PA",
    "Rhode Island": "RI",
    "South Carolina": "SC",
    "South Dakota": "SD",
    "Tennessee": "TN",
    "Texas": "TX",
    "Utah": "UT",
    "Vermont": "VT",
    "Virginia": "VA",
    "Washington": "WA",
    "West Virginia": "WV",
    "Wisconsin": "WI",
    "Wyoming": "WY",
    "District of Columbia": "DC",
    "American Samoa": "AS",
    "Guam": "GU",
    "Northern Mariana Islands": "MP",
    "Puerto Rico": "PR",
    "United States Minor Outlying Islands": "UM",
    "U.S. Virgin Islands": "VI",
}

def multipleReplace(text, wordDict):
    """Replace every occurrence of each key of ``wordDict`` in ``text`` with its value.

    Keys are applied longest first, so a key that is a substring of another key
    cannot pre-empt it: with both "Virginia" and "West Virginia" present,
    "West Virginia" becomes "WV" rather than "West VA". Keys of equal length
    keep the mapping's own order.

    Parameters
    ----------
    text : str
        The string to transform.
    wordDict : dict of str -> str
        Substrings to look for and their replacements.

    Returns
    -------
    str
        ``text`` with all replacements applied; unchanged if no key occurs in it.
    """
    for key in sorted(wordDict, key=len, reverse=True):
        text = text.replace(key, wordDict[key])
    return text

In [30]:
# replace full state names to abbreviations
# Missing states are passed through unchanged; converting them with str() would
# store the literal text "nan" in state_abbr and hide the fact that they are missing.
OMOP_location['state_abbr'] = OMOP_location.state.apply(
    lambda x: x if pd.isna(x) else multipleReplace(str(x).strip(), us_state_to_abbrev)
)
OMOP_location

Unnamed: 0,Location_id,address_1,address_2,city,state,zip,county,location_source_value,latitude,longitude,address_type,state_abbr
0,1,2499 SOUTH WILMINGTON AVENUE,,COMPTON,CA,90220,,"2499 SOUTH WILMINGTON AVENUE, COMPTON, CA 90220",33.868428,-118.236303,Street Address,CA
1,2,420 South Pierre Street,,"Pierre, South",Dakota,57501,,"420 South Pierre Street, Pierre, South Dakota ...",43.254257,-98.695554,Street Address,Dakota
2,3,12150 30 MILE ROAD,,WASHINGTON,MI,48095,,"12150 30 MILE ROAD, WASHINGTON, MI 48095",42.771167,-83.013787,Street Address,MI
3,4,65 TIFTON ELDORADO RD,,TIFTON,GA,31794,,"65 TIFTON ELDORADO RD, TIFTON, GA 31794",31.445218,-83.496325,Street Address,GA
4,5,12512 WALTERS RD,,HOUSTON,TX,77014,,"12512 WALTERS RD, HOUSTON, TX 77014",29.973519,-95.468105,Street Address,TX
...,...,...,...,...,...,...,...,...,...,...,...,...
19994,19995,W1828 GROS CAP ROAD,,SAINT IGNACE,MI,49781,,"W1828 GROS CAP ROAD, SAINT IGNACE, MI 49781",45.870412,-84.823893,Street Address,MI
19995,19996,730 Reems Creek Road,,Weaverville,NC,28787,,"730 Reems Creek Road, Weaverville, NC 28787",35.694465,-82.509238,Street Address,NC
19996,19997,5793 South 700 East,,Whitestown,IN,46075,,"5793 South 700 East, Whitestown, IN 46075",39.955682,-86.335915,Street Address,IN
19997,19998,7150 E ARCH RD,,STOCKTON,CA,95205,,"7150 E ARCH RD, STOCKTON, CA 95205",37.902070,-121.187052,Street Address,CA


In [37]:
# capitalize only first letter
def _title_or_missing(value):
    """Return ``value`` stripped and title-cased, or ``value`` unchanged if it is missing.

    Missing values are passed through so they stay missing instead of becoming
    the text "Nan".
    """
    if pd.isna(value):
        return value
    return str(value).strip().title()


# The same rule is applied to all three text columns. address_2 previously used
# the test `if not np.NaN`, which is always False, so it was never title-cased.
for column in ('address_1', 'address_2', 'city'):
    OMOP_location[column] = OMOP_location[column].apply(_title_or_missing)

In [38]:
# Display the table after the post-hoc clean-up (state_abbr column added, text columns title-cased).
OMOP_location

Unnamed: 0,Location_id,address_1,address_2,city,state,zip,county,location_source_value,latitude,longitude,address_type,state_abbr
0,1,2499 South Wilmington Avenue,,Compton,CA,90220,,"2499 SOUTH WILMINGTON AVENUE, COMPTON, CA 90220",33.868428,-118.236303,Street Address,CA
1,2,420 South Pierre Street,,"Pierre, South",Dakota,57501,,"420 South Pierre Street, Pierre, South Dakota ...",43.254257,-98.695554,Street Address,Dakota
2,3,12150 30 Mile Road,,Washington,MI,48095,,"12150 30 MILE ROAD, WASHINGTON, MI 48095",42.771167,-83.013787,Street Address,MI
3,4,65 Tifton Eldorado Rd,,Tifton,GA,31794,,"65 TIFTON ELDORADO RD, TIFTON, GA 31794",31.445218,-83.496325,Street Address,GA
4,5,12512 Walters Rd,,Houston,TX,77014,,"12512 WALTERS RD, HOUSTON, TX 77014",29.973519,-95.468105,Street Address,TX
...,...,...,...,...,...,...,...,...,...,...,...,...
19994,19995,W1828 Gros Cap Road,,Saint Ignace,MI,49781,,"W1828 GROS CAP ROAD, SAINT IGNACE, MI 49781",45.870412,-84.823893,Street Address,MI
19995,19996,730 Reems Creek Road,,Weaverville,NC,28787,,"730 Reems Creek Road, Weaverville, NC 28787",35.694465,-82.509238,Street Address,NC
19996,19997,5793 South 700 East,,Whitestown,IN,46075,,"5793 South 700 East, Whitestown, IN 46075",39.955682,-86.335915,Street Address,IN
19997,19998,7150 E Arch Rd,,Stockton,CA,95205,,"7150 E ARCH RD, STOCKTON, CA 95205",37.902070,-121.187052,Street Address,CA
