In [1]:
import os
from os import listdir
from os.path import isfile, join, splitext
import sys
import csv
import re

import numpy as np
import pandas as pd
import geopandas as gpd
import spacy

import usaddress

# Parsing with `usaddress` module

## Edge Cases

In [2]:
# Queens, NY address with dashes in street address
queens_ny_address = "89-22 197th St Unit 2, Hollis, NY 11423"
usaddress.tag(queens_ny_address)

(OrderedDict([('AddressNumber', '89-22'),
              ('StreetName', '197th'),
              ('StreetNamePostType', 'St'),
              ('OccupancyType', 'Unit'),
              ('OccupancyIdentifier', '2'),
              ('PlaceName', 'Hollis'),
              ('StateName', 'NY'),
              ('ZipCode', '11423')]),
 'Street Address')

In [3]:
# Southwest address with Spanish names
NM_address = "220 Camino Tres SW, Albuquerque, NM 87105"
usaddress.tag(NM_address)

(OrderedDict([('AddressNumber', '220'),
              ('StreetNamePreType', 'Camino'),
              ('StreetName', 'Tres'),
              ('StreetNamePostDirectional', 'SW'),
              ('PlaceName', 'Albuquerque'),
              ('StateName', 'NM'),
              ('ZipCode', '87105')]),
 'Street Address')

In [4]:
# grid system with no street type
UT_address = "921 3385 S, Millcreek, UT 84106"
usaddress.tag(UT_address)

(OrderedDict([('AddressNumber', '921'),
              ('StreetName', '3385'),
              ('StreetNamePostDirectional', 'S'),
              ('PlaceName', 'Millcreek'),
              ('StateName', 'UT'),
              ('ZipCode', '84106')]),
 'Street Address')

# Simulated Residential History Data
* USPS Publication 28 Standard

In [5]:
os.chdir('..')
abs_path = os.getcwd()
print(abs_path)

C:\Users\bchan\OneDrive - UW\CLAD\CLAD_Geospatial


In [6]:
# import cleaned HIFLD addresses
link = os.path.join(abs_path, 'output', 'sample_spatial_join_OMOP_clean.csv')

temp = pd.read_csv(link)
temp

  temp = pd.read_csv(link)


Unnamed: 0,Full_Addre,Place_type,source_lon,source_lat,geometry,index_right,OBJECTID,GEOID_1,NAME_1,NAMELSAD_1,...,Shape_Leng,Shape_Le_1,Shape_Area,Tribal,address_1,address_2,city,state,zip,STATE
0,"523 E BROADWAY, SOUTH BOSTON, MA 02127",AllPlacesOfWorship,-71.043522,42.335472,POINT (-71.04352199999346 42.33547200002667),,,,,,...,,,,0,523 E BROADWAY,,SOUTH BOSTON,MA,2127,MA
1,"454 ESSEX ST, LAWRENCE, MA 01840",AllPlacesOfWorship,-71.164940,42.706213,POINT (-71.16493999969576 42.70621300000711),,,,,,...,,,,0,454 ESSEX ST,,LAWRENCE,MA,1840,MA
2,"569 BROADWAY, NEWARK, NJ 07104",AllPlacesOfWorship,-74.162821,40.769935,POINT (-74.16282099988595 40.76993499987205),,,,,,...,,,,0,569 BROADWAY,,NEWARK,NJ,7104,NJ
3,"3210 SOUTHWESTERN BLVD, ORCHARD PARK, NY 14127",AllPlacesOfWorship,-78.747816,42.798535,POINT (-78.74781599997843 42.79853499984358),,,,,,...,,,,0,3210 SOUTHWESTERN BLVD,,ORCHARD PARK,NY,14127,NY
4,"431 CAMPGROUND RD, LIVERMORE FLS, ME 04254",AllPlacesOfWorship,-70.113777,44.428610,POINT (-70.11377699970232 44.42860999977688),,,,,,...,,,,0,431 CAMPGROUND RD,,LIVERMORE FLS,ME,4254,ME
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
104097,"1300 PEACHTREE INDUSTRIAL BOULEVARD, SUWANEE, ...",UrgentCareFacs,-84.095174,34.041727,POINT (-84.09517421962246 34.04172694900986),,,,,,...,,,,0,1300 PEACHTREE INDUSTRIAL BOULEVARD,,SUWANEE,GA,30024,GA
104098,"2660 SATELLITE BOULEVARD NORTHWEST, DULUTH, GA...",UrgentCareFacs,-84.101318,33.966797,POINT (-84.10131848955332 33.96679709299809),,,,,,...,,,,0,2660 SATELLITE BOULEVARD NORTHWEST,,DULUTH,GA,30096,GA
104099,"3685 BRASELTON HIGHWAY, DACULA, GA 30019",UrgentCareFacs,-83.902215,34.068832,POINT (-83.90221543184028 34.06883234729177),,,,,,...,,,,0,3685 BRASELTON HIGHWAY,,DACULA,GA,30019,GA
104100,"1055 DOVE RUN ROAD, LEXINGTON, KY 40502",UrgentCareFacs,-84.494106,37.996508,POINT (-84.49410603994914 37.99650802456039),,,,,,...,,,,0,1055 DOVE RUN ROAD,,LEXINGTON,KY,40502,KY


In [7]:
# USPS Publication 28 Standard dictionary for `usaddress`
Pub28_usaddress_template = {
   'Recipient': 'recipient',
   'AddressNumber': 'address1',
   'AddressNumberPrefix': 'address1',
   'AddressNumberSuffix': 'address1',
   'StreetName': 'address1',
   'StreetNamePreDirectional': 'address1',
   'StreetNamePreModifier': 'address1',
   'StreetNamePreType': 'address1',
   'StreetNamePostDirectional': 'address1',
   'StreetNamePostModifier': 'address1',
   'StreetNamePostType': 'address1',
   'CornerOf': 'address1',
   'IntersectionSeparator': 'address1',
   'LandmarkName': 'address1',
   'USPSBoxGroupID': 'address1',
   'USPSBoxGroupType': 'address1',
   'USPSBoxID': 'address1',
   'USPSBoxType': 'address1',
   'BuildingName': 'address2',
   'OccupancyType': 'address2',
   'OccupancyIdentifier': 'address2',
   'SubaddressIdentifier': 'address2',
   'SubaddressType': 'address2',
   'PlaceName': 'city',
   'StateName': 'state',
   'ZipCode': 'zip_code',
}

In [8]:
# 1) identify unique address strings for location records
temp_drop = temp.drop_duplicates(subset='Full_Addre')

# 2) set up OMOP placeholder table
OMOP_location = pd.DataFrame(columns=['Location_id','address_1','address_2','city','state','zip','county',
                                      'location_source_value','latitude','longitude'])
                             
OMOP_location

Unnamed: 0,Location_id,address_1,address_2,city,state,zip,county,location_source_value,latitude,longitude


In [9]:
# 3) assign address_strings to OMOP_location table with location_source_value series
OMOP_location['location_source_value'] = temp_drop.Full_Addre

# 4) set unique location ID for each address
OMOP_location['Location_id'] = OMOP_location.index+1

# 5) assign the source latitutde and longitude for each address
OMOP_location.latitude = temp_drop.source_lat
OMOP_location.longitude = temp_drop.source_lon
OMOP_location

Unnamed: 0,Location_id,address_1,address_2,city,state,zip,county,location_source_value,latitude,longitude
0,1,,,,,,,"523 E BROADWAY, SOUTH BOSTON, MA 02127",42.335472,-71.043522
1,2,,,,,,,"454 ESSEX ST, LAWRENCE, MA 01840",42.706213,-71.164940
2,3,,,,,,,"569 BROADWAY, NEWARK, NJ 07104",40.769935,-74.162821
3,4,,,,,,,"3210 SOUTHWESTERN BLVD, ORCHARD PARK, NY 14127",42.798535,-78.747816
4,5,,,,,,,"431 CAMPGROUND RD, LIVERMORE FLS, ME 04254",44.428610,-70.113777
...,...,...,...,...,...,...,...,...,...,...
104097,104098,,,,,,,"1300 PEACHTREE INDUSTRIAL BOULEVARD, SUWANEE, ...",34.041727,-84.095174
104098,104099,,,,,,,"2660 SATELLITE BOULEVARD NORTHWEST, DULUTH, GA...",33.966797,-84.101318
104099,104100,,,,,,,"3685 BRASELTON HIGHWAY, DACULA, GA 30019",34.068832,-83.902215
104100,104101,,,,,,,"1055 DOVE RUN ROAD, LEXINGTON, KY 40502",37.996508,-84.494106


## Parse OMOP Components

**USPS Publication 28 Standard**
  * address_1
  * address_2
  * city
  * state
  * zip
  * county
  * location_source_value
  * latitude
  * longitude

In [11]:
%%time
# 6) process address string through usaddress parser with USPS Pub28 template
repo = pd.DataFrame()

# iterate over OMOP_location
for ind, each in OMOP_location.loc[:,['location_source_value']].drop_duplicates().iterrows():

    # try parsing with usaddress parser using the USPS Pub28 template
    try:
        obj = usaddress.tag(each.location_source_value, tag_mapping=Pub28_usaddress_template)
        
        # staging
        tmp = pd.DataFrame(obj[0], columns=obj[0].keys(), index=[ind])
        tmp['Address_type'] = obj[1]
        
        # development
        OMOP_location.loc[ind, 'address_1'] = tmp['address1'].values[0]
        OMOP_location.loc[ind, 'city'] = tmp['city'].values[0]
        OMOP_location.loc[ind, 'state'] = tmp['state'].values[0]
        OMOP_location.loc[ind, 'zip'] = tmp['zip_code'].values[0]
        OMOP_location.loc[ind, 'address_type']=tmp['Address_type'].values[0]

        address_2 = tmp['address2'].values[0]
        if len(address_2) >= 3:
            OMOP_location.loc[ind, 'address_2'] = address_2
        else:
            OMOP_location.loc[ind, 'address_2'] = np.NaN

        repo = repo.append(tmp)
    
    except:
        # print(ind, each.location_source_value)
        pass


CPU times: total: 9min 5s
Wall time: 13min 14s


In [12]:
OMOP_location

Unnamed: 0,Location_id,address_1,address_2,city,state,zip,county,location_source_value,latitude,longitude,address_type
0,1,523 E BROADWAY,,SOUTH BOSTON,MA,02127,,"523 E BROADWAY, SOUTH BOSTON, MA 02127",42.335472,-71.043522,Street Address
1,2,454 ESSEX ST,,LAWRENCE,MA,01840,,"454 ESSEX ST, LAWRENCE, MA 01840",42.706213,-71.164940,Street Address
2,3,569 BROADWAY,,NEWARK,NJ,07104,,"569 BROADWAY, NEWARK, NJ 07104",40.769935,-74.162821,Street Address
3,4,3210 SOUTHWESTERN BLVD,,ORCHARD PARK,NY,14127,,"3210 SOUTHWESTERN BLVD, ORCHARD PARK, NY 14127",42.798535,-78.747816,Street Address
4,5,431 CAMPGROUND RD,,LIVERMORE FLS,ME,04254,,"431 CAMPGROUND RD, LIVERMORE FLS, ME 04254",44.428610,-70.113777,Street Address
...,...,...,...,...,...,...,...,...,...,...,...
104097,104098,1300 PEACHTREE INDUSTRIAL BOULEVARD,,SUWANEE,GA,30024,,"1300 PEACHTREE INDUSTRIAL BOULEVARD, SUWANEE, ...",34.041727,-84.095174,Street Address
104098,104099,2660 SATELLITE BOULEVARD NORTHWEST,,DULUTH,GA,30096,,"2660 SATELLITE BOULEVARD NORTHWEST, DULUTH, GA...",33.966797,-84.101318,Street Address
104099,104100,3685 BRASELTON HIGHWAY,,DACULA,GA,30019,,"3685 BRASELTON HIGHWAY, DACULA, GA 30019",34.068832,-83.902215,Street Address
104100,104101,1055 DOVE RUN ROAD,,LEXINGTON,KY,40502,,"1055 DOVE RUN ROAD, LEXINGTON, KY 40502",37.996508,-84.494106,Street Address


In [13]:
# check that address_2 parsed correctly
OMOP_location.loc[OMOP_location.address_2.notna()]

Unnamed: 0,Location_id,address_1,address_2,city,state,zip,county,location_source_value,latitude,longitude,address_type
6,7,337 STATE STREET,SUITE 3,AUGUSTA,ME,04330,,"337 STATE STREET SUITE 3, AUGUSTA, ME 04330",44.297293,-69.783541,Street Address
79,80,75 MORTON VILLAGE DR,APT 408,MATTAPAN,MA,02126,,"75 MORTON VILLAGE DR APT 408, MATTAPAN, MA 02126",42.280395,-71.085149,Street Address
81,82,200 HANCOCK ST,APT 904,BANGOR,ME,04401,,"200 HANCOCK ST APT 904, BANGOR, ME 04401",44.802406,-68.762050,Street Address
130,131,430 GRANDVIEW AVE,APT 17,BANGOR,ME,04401,,"430 GRANDVIEW AVE APT 17, BANGOR, ME 04401",44.827785,-68.781941,Street Address
187,188,33 GLENWOOD DR,APT 2,BANGOR,ME,04401,,"33 GLENWOOD DR APT 2, BANGOR, ME 04401",44.823394,-68.795611,Street Address
...,...,...,...,...,...,...,...,...,...,...,...
100989,100990,1998 UNITED STATES HIGHWAY,62 412,HIGHLAND,AR,72542,,"1998 UNITED STATES HIGHWAY 62 412, HIGHLAND, A...",36.261774,-91.526763,Street Address
101221,101222,19422 UNITED STATES HIGHWAY,281,"NORTH, SAN ANTONIO",TX,78258,,"19422 UNITED STATES HIGHWAY 281 NORTH, SAN ANT...",29.621960,-98.463141,Street Address
102819,102820,9511 UNITED STATES HIGHWAY,431,ALBERTVILLE,AL,35950,,"9511 UNITED STATES HIGHWAY 431, ALBERTVILLE, A...",34.288373,-86.241246,Street Address
103263,103264,10935 SOUTH UNITED STATES HIGHWAY,15 501,SOUTHERN PINES,NC,28387,,"10935 SOUTH UNITED STATES HIGHWAY 15 501, SOUT...",35.162689,-79.420724,Street Address


# Post-Hoc Processing
1) Replace full name states to abbreviation
2) Capitalize only the first letter of each parsed component

In [14]:
# dictionary for state full name to abbreviation
us_state_to_abbrev = {
    "Alabama": "AL",
    "Alaska": "AK",
    "Arizona": "AZ",
    "Arkansas": "AR",
    "California": "CA",
    "Colorado": "CO",
    "Connecticut": "CT",
    "Delaware": "DE",
    "Florida": "FL",
    "Georgia": "GA",
    "Hawaii": "HI",
    "Idaho": "ID",
    "Illinois": "IL",
    "Indiana": "IN",
    "Iowa": "IA",
    "Kansas": "KS",
    "Kentucky": "KY",
    "Louisiana": "LA",
    "Maine": "ME",
    "Maryland": "MD",
    "Massachusetts": "MA",
    "Michigan": "MI",
    "Minnesota": "MN",
    "Mississippi": "MS",
    "Missouri": "MO",
    "Montana": "MT",
    "Nebraska": "NE",
    "Nevada": "NV",
    "New Hampshire": "NH",
    "New Jersey": "NJ",
    "New Mexico": "NM",
    "New York": "NY",
    "North Carolina": "NC",
    "North Dakota": "ND",
    "Ohio": "OH",
    "Oklahoma": "OK",
    "Oregon": "OR",
    "Pennsylvania": "PA",
    "Rhode Island": "RI",
    "South Carolina": "SC",
    "South Dakota": "SD",
    "Tennessee": "TN",
    "Texas": "TX",
    "Utah": "UT",
    "Vermont": "VT",
    "Virginia": "VA",
    "Washington": "WA",
    "West Virginia": "WV",
    "Wisconsin": "WI",
    "Wyoming": "WY",
    "District of Columbia": "DC",
    "American Samoa": "AS",
    "Guam": "GU",
    "Northern Mariana Islands": "MP",
    "Puerto Rico": "PR",
    "United States Minor Outlying Islands": "UM",
    "U.S. Virgin Islands": "VI",
}

def multipleReplace(text, wordDict):
    for key in wordDict:
        text = text.replace(key, wordDict[key])
    return text

In [17]:
# eliminate whitespaces from end of `state`
OMOP_location['state'] = OMOP_location.state.apply(lambda x: str(x).rstrip())

# replace full state names to abbreviations
OMOP_location['state_abbr'] = OMOP_location.state.apply(lambda x: multipleReplace(str(x).strip(), us_state_to_abbrev))
OMOP_location

Unnamed: 0,Location_id,address_1,address_2,city,state,zip,county,location_source_value,latitude,longitude,address_type,state_abbr
0,1,523 E BROADWAY,,SOUTH BOSTON,MA,02127,,"523 E BROADWAY, SOUTH BOSTON, MA 02127",42.335472,-71.043522,Street Address,MA
1,2,454 ESSEX ST,,LAWRENCE,MA,01840,,"454 ESSEX ST, LAWRENCE, MA 01840",42.706213,-71.164940,Street Address,MA
2,3,569 BROADWAY,,NEWARK,NJ,07104,,"569 BROADWAY, NEWARK, NJ 07104",40.769935,-74.162821,Street Address,NJ
3,4,3210 SOUTHWESTERN BLVD,,ORCHARD PARK,NY,14127,,"3210 SOUTHWESTERN BLVD, ORCHARD PARK, NY 14127",42.798535,-78.747816,Street Address,NY
4,5,431 CAMPGROUND RD,,LIVERMORE FLS,ME,04254,,"431 CAMPGROUND RD, LIVERMORE FLS, ME 04254",44.428610,-70.113777,Street Address,ME
...,...,...,...,...,...,...,...,...,...,...,...,...
104097,104098,1300 PEACHTREE INDUSTRIAL BOULEVARD,,SUWANEE,GA,30024,,"1300 PEACHTREE INDUSTRIAL BOULEVARD, SUWANEE, ...",34.041727,-84.095174,Street Address,GA
104098,104099,2660 SATELLITE BOULEVARD NORTHWEST,,DULUTH,GA,30096,,"2660 SATELLITE BOULEVARD NORTHWEST, DULUTH, GA...",33.966797,-84.101318,Street Address,GA
104099,104100,3685 BRASELTON HIGHWAY,,DACULA,GA,30019,,"3685 BRASELTON HIGHWAY, DACULA, GA 30019",34.068832,-83.902215,Street Address,GA
104100,104101,1055 DOVE RUN ROAD,,LEXINGTON,KY,40502,,"1055 DOVE RUN ROAD, LEXINGTON, KY 40502",37.996508,-84.494106,Street Address,KY


In [18]:
# capitalize only first letter
OMOP_location['address_1'] = OMOP_location.address_1.apply(lambda x: str(x).strip().title())
OMOP_location['address_2'] = OMOP_location.address_2.apply(lambda x: str(x).strip().title() if not np.NaN else x)
OMOP_location['city'] = OMOP_location.city.apply(lambda x: str(x).strip().title())

OMOP_location

Unnamed: 0,Location_id,address_1,address_2,city,state,zip,county,location_source_value,latitude,longitude,address_type,state_abbr
0,1,523 E Broadway,,South Boston,MA,02127,,"523 E BROADWAY, SOUTH BOSTON, MA 02127",42.335472,-71.043522,Street Address,MA
1,2,454 Essex St,,Lawrence,MA,01840,,"454 ESSEX ST, LAWRENCE, MA 01840",42.706213,-71.164940,Street Address,MA
2,3,569 Broadway,,Newark,NJ,07104,,"569 BROADWAY, NEWARK, NJ 07104",40.769935,-74.162821,Street Address,NJ
3,4,3210 Southwestern Blvd,,Orchard Park,NY,14127,,"3210 SOUTHWESTERN BLVD, ORCHARD PARK, NY 14127",42.798535,-78.747816,Street Address,NY
4,5,431 Campground Rd,,Livermore Fls,ME,04254,,"431 CAMPGROUND RD, LIVERMORE FLS, ME 04254",44.428610,-70.113777,Street Address,ME
...,...,...,...,...,...,...,...,...,...,...,...,...
104097,104098,1300 Peachtree Industrial Boulevard,,Suwanee,GA,30024,,"1300 PEACHTREE INDUSTRIAL BOULEVARD, SUWANEE, ...",34.041727,-84.095174,Street Address,GA
104098,104099,2660 Satellite Boulevard Northwest,,Duluth,GA,30096,,"2660 SATELLITE BOULEVARD NORTHWEST, DULUTH, GA...",33.966797,-84.101318,Street Address,GA
104099,104100,3685 Braselton Highway,,Dacula,GA,30019,,"3685 BRASELTON HIGHWAY, DACULA, GA 30019",34.068832,-83.902215,Street Address,GA
104100,104101,1055 Dove Run Road,,Lexington,KY,40502,,"1055 DOVE RUN ROAD, LEXINGTON, KY 40502",37.996508,-84.494106,Street Address,KY


In [19]:
OMOP_location_path = os.path.join(abs_path, 'output', 'OMOP_location.csv')

OMOP_location.to_csv(OMOP_location_path, index=False)

# Data Quality Check
* address_1 should not be null and must be alphanumeric combination row-wise, may contain special characters
* address_2 can be null, otherwise alphanumeric combination
* city cannot have non-alphanumeric characters
* state must be 2-character abbreviation
* filter out non-street addresses by `address_type`

1) Parse with `usaddress` library
2) Data quality check above
3) Run custom parser function

* TO DO
  * Fix `West Virginia` state abbreviation

## Parse Failed Addresses with Custom Parser

In [20]:
# OMOP_location_path = pd.read_csv(os.path.join(abs_path, 'output', 'OMOP_location.csv'))
OMOP_location = pd.read_csv(os.path.join(abs_path, 'output', 'OMOP_location.csv'))

  OMOP_location = pd.read_csv(os.path.join(abs_path, 'output', 'OMOP_location.csv'))


In [22]:
def parse_address(df,
                  address_col,
                  state_full_pattern,
                  state_abbr_pattern,
                  zip_code_pattern=r"[0-9]{5}(?:-[0-9]{4})?"
                  # state_code_pattern=r"[A-Z][a-z]+(?: +[A-Z][a-z]+)*)"
                 ):
    """
    Parse full address string to OMOP components by Regex search

    Parameters
    ----------
    df (DataFrame): Pandas DataFrame of source centroids with dates and user IDs

    Returns
    -------
    parse_df (DataFrame): DataFrame with parsed OMOP address components
    """
    tmp = []
    for i,row in df.iterrows():
        addr_components = row[address_col].split(',')

        # parse address if no RegEx match for 'APT'
        if len(re.findall(r'APT', row[address_col], flags=re.IGNORECASE)) == 0:
            state_zip = addr_components[2].split(' ')
            if len(re.findall(state_abbr_pattern, addr_components[-1])) > 0 and len(re.findall(zip_code_pattern, addr_components[-1])) > 0:
                row['address_1'] = addr_components[0]
                row['address_2'] = np.NaN
                row['city'] = addr_components[1]
                row['state'] = re.findall(state_abbr_pattern, addr_components[-1])[0]
                row['zip'] = re.findall(zip_code_pattern, addr_components[-1])[0]
    
                tmp.append(row)   
            elif len(re.findall(state_abbr_pattern, addr_components[-1])) > 0 and len(re.findall(zip_code_pattern, addr_components[-1])) == 0:
                row['address_1'] = addr_components[0]
                row['address_2'] = np.NaN
                row['city'] = addr_components[1]
                row['state'] = re.findall(state_abbr_pattern, addr_components[-1])[0]
                row['zip'] = np.NaN
    
                tmp.append(row)
    
            elif len(re.findall(state_abbr_pattern, addr_components[-1])) == 0 and len(re.findall(zip_code_pattern, addr_components[-1])) > 0 and len(re.findall(state_full_pattern, addr_components[-1].title())) > 0:
                row['address_1'] = addr_components[0]
                row['address_2'] = np.NaN
                row['city'] = addr_components[1]
                row['state'] = re.findall(state_full_pattern, addr_components[-1].title())[0]
                row['zip'] = re.findall(zip_code_pattern, addr_components[-1])[0]
    
                tmp.append(row)
    
            elif len(re.findall(state_abbr_pattern, addr_components[-1])) == 0 and len(re.findall(zip_code_pattern, addr_components[-1])) == 0:
                row['address_1'] = addr_components[0]
                row['address_2'] = np.NaN
                row['city'] = addr_components[1]
                row['state'] = re.findall(state_full_pattern, addr_components[-1].title())
                row['zip'] = np.NaN
    
                tmp.append(row)
                
        # # parse address if RegEx match for 'APT' to address_1 & address_2
        elif len(re.findall(r'APT', row[address_col], flags=re.IGNORECASE)) > 0:
            if len(re.findall(state_abbr_pattern, addr_components[-1])) > 0 and len(re.findall(zip_code_pattern, addr_components[-1])) > 0:
                base_address = addr_components[0]
                apt_string = re.findall(r'APT', row.Full_Address, flags=re.IGNORECASE)[0]
                row['address_1'] = base_address.partition(apt_string)[0]
                row['address_2'] = base_address.partition(apt_string)[1] + base_address.partition(apt_string)[2]
                row['city'] = addr_components[1]
                row['state'] = re.findall(state_abbr_pattern, addr_components[-1])[0]
                row['zip'] = re.findall(zip_code_pattern, addr_components[-1])[0]
    
                tmp.append(row)
    
            elif len(re.findall(state_abbr_pattern, addr_components[-1])) > 0 and len(re.findall(zip_code_pattern, addr_components[-1])) == 0:
                base_address = addr_components[0]
                apt_string = re.findall(r'APT', base_address, flags=re.IGNORECASE)[0]
                row['address_1'] = base_address.partition(apt_string)[0]
                row['address_2'] = base_address.partition(apt_string)[1] + base_address.partition(apt_string)[2]
                row['city'] = addr_components[1]
                row['state'] = re.findall(state_abbr_pattern, addr_components[-1])[0]
                row['zip'] = np.NaN
    
                tmp.append(row)
    
            elif len(re.findall(state_abbr_pattern, addr_components[-1])) == 0 and len(re.findall(zip_code_pattern, addr_components[-1])) > 0 and len(re.findall(state_full_pattern, addr_components[-1].title())) > 0:
                base_address = addr_components[0]
                apt_string = re.findall(r'APT', base_address, flags=re.IGNORECASE)[0]
                row['address_1'] = base_address.partition(apt_string)[0]
                row['address_2'] = base_address.partition(apt_string)[1] + base_address.partition(apt_string)[2]
                row['city'] = addr_components[1]
                row['state'] = re.findall(state_full_pattern, addr_components[-1].title())[0]
                row['zip'] = re.findall(zip_code_pattern, addr_components[-1])[0]
    
                tmp.append(row)
    
            elif len(re.findall(state_abbr_pattern, addr_components[-1])) == 0 and len(re.findall(zip_code_pattern, addr_components[-1])) == 0:
                base_address = addr_components[0]
                apt_string = re.findall(r'APT', base_address, flags=re.IGNORECASE)[0]
                row['address_1'] = base_address.partition(apt_string)[0]
                row['address_2'] = base_address.partition(apt_string)[1] + base_address.partition(apt_string)[2]
                row['city'] = addr_components[1]
                row['state'] = re.findall(state_full_pattern, addr_components[-1].title())
                row['zip'] = np.NaN
    
                tmp.append(row)
                    
        # # parse address if RegEx match for 'Suite' to to address_1 & address_2
        elif len(re.findall(r'SUITE', row[address_col], flags=re.IGNORECASE)) > 0:
            if len(re.findall(state_abbr_pattern, addr_components[-1])) > 0 and len(re.findall(zip_code_pattern, addr_components[-1])) > 0:
                base_address = addr_components[0]
                suite_str = re.findall(r'SUITE', base_address, flags=re.IGNORECASE)[0]
                row['address_1'] = base_address.partition(suite_str)[0]
                row['address_2'] = base_address.partition(suite_str)[1] + base_address.partition(suite_str)[2]
                row['city'] = addr_components[1]
                row['state'] = re.findall(state_abbr_pattern, addr_components[-1])[0]
                row['zip'] = re.findall(zip_code_pattern, addr_components[-1])[0]

                tmp.append(row)

            elif len(re.findall(state_abbr_pattern, addr_components[-1])) > 0 and len(re.findall(zip_code_pattern, addr_components[-1])) == 0:
                base_address = addr_components[0]
                suite_str = re.findall(r'SUITE', base_address, flags=re.IGNORECASE)[0]
                row['address_1'] = base_address.partition(suite_str)[0]
                row['address_2'] = base_address.partition(suite_str)[1] + base_address.partition(suite_str)[2]
                row['city'] = addr_components[1]
                row['state'] = re.findall(state_abbr_pattern, addr_components[-1])[0]
                row['zip'] = np.NaN
    
                tmp.append(row)
    
            elif len(re.findall(state_abbr_pattern, addr_components[-1])) == 0 and len(re.findall(zip_code_pattern, addr_components[-1])) > 0 and len(re.findall(state_full_pattern, addr_components[-1].title())) > 0:
                base_address = addr_components[0]
                suite_str = re.findall(r'SUITE', base_address, flags=re.IGNORECASE)[0]
                row['address_1'] = base_address.partition(suite_str)[0]
                row['address_2'] = base_address.partition(suite_str)[1] + base_address.partition(suite_str)[2]
                row['city'] = addr_components[1]
                row['state'] = re.findall(state_full_pattern, addr_components[-1].title())[0]
                row['zip'] = re.findall(zip_code_pattern, addr_components[-1])[0]
    
                tmp.append(row)
    
            elif len(re.findall(state_abbr_pattern, addr_components[-1])) == 0 and len(re.findall(zip_code_pattern, addr_components[-1])) == 0:
                base_address = addr_components[0]
                suite_str = re.findall(r'SUITE', base_address, flags=re.IGNORECASE)[0]
                row['address_1'] = base_address.partition(suite_str)[0]
                row['address_2'] = base_address.partition(suite_str)[1] + base_address.partition(suite_str)[2]
                row['city'] = addr_components[1]
                row['state'] = re.findall(state_full_pattern, addr_components[-1].title())
                row['zip'] = np.NaN
    
                tmp.append(row)        
        
    return pd.DataFrame(tmp)

In [23]:
# get failed addresses that do not have correct state abbreviation
OMOP_location['state_abbr'] = OMOP_location['state_abbr'].astype(str)
OMOP_state_failed = OMOP_location.loc[OMOP_location.state_abbr.str.len() > 2]
# state_condition = np.where(OMOP_location['state_abbr'].str.len() > 2)
# OMOP_state_failed = OMOP_location.loc[OMOP_location.state_abbr.map(lambda x: ]
OMOP_state_failed

Unnamed: 0,Location_id,address_1,address_2,city,state,zip,county,location_source_value,latitude,longitude,address_type,state_abbr
220,222,Nan,,Nan,,,,"LOCAL, KINGMAN, ME 04451",45.639023,-68.263641,,
552,562,Nan,,Nan,,,,"LOCAL, KINGFIELD, ME 04947",44.960341,-70.159052,,
751,763,Nan,,Nan,,,,"LOCAL, BROOKLINE, NH 03033",42.736111,-71.663431,,
1057,1074,Nan,,Nan,,,,"LOCAL, WONALANCET, NH 03897",43.908447,-71.344152,,
1212,1232,Nan,,Nan,,,,"GENERAL DELIVERY, N HARTLAND, VT 05052",43.591020,-72.356611,,
...,...,...,...,...,...,...,...,...,...,...,...,...
87378,95865,Nan,,Nan,,,,"25 MI N OF GALLUP NM HWY 491, TOHATCHI, NM 87325",35.848832,-108.745451,,
88117,96831,Nan,,Nan,,,,"BLDG 25000TH KAFB EAST, ALBUQUERQUE, NM 87116",35.058916,-106.555920,,
89081,98095,Nan,,Nan,,,,"DEPT 3374, LARAMIE, WY 82071",41.313964,-105.584531,,
89500,98668,Nan,,Nan,,,,COLLEGE OF SOUTHERN IDAHO EVERGREEN BLDG. ROOM...,42.579849,-114.473937,,


In [24]:
%%time
# run custom parser on failed addresses above

state_full_pattern = r"/AL|Alabama|AK|Alaska|AZ|Arizona|AR|Arkansas|CA|California|CO|Colorado|CT|Connecticut|DE|Delaware|FL|Florida|GA|Georgia|HI|Hawaii|ID|Idaho|IL|Illinois|IN|Indiana|IA|Iowa|KS|Kansas|KY|Kentucky|LA|Louisiana|ME|Maine|MD|Maryland|MA|Massachusetts|MI|Michigan|MN|Minnesota|MS|Mississippi|MO|Missouri|MT|Montana|NE|Nebraska|NV|Nevada|NH|New Hampshire|NJ|New Jersey|NM|New Mexico|NY|New York|NC|North Carolina|ND|North Dakota|OH|Ohio|OK|Oklahoma|OR|Oregon|PA|Pennsylvania|RI|Rhode Island|SC|South Carolina|SD|South Dakota|TN|Tennessee|TX|Texas|UT|Utah|VT|Vermont|VA|Virginia|WA|Washington|WV|West Virginia|WI|Wisconsin|WY|Wyoming/"
state_abbr_pattern = re.compile(r'\b(AZ|CA|...|NJ|N\.J\.|NM|N\.M\.|...)\b')
state_abbr_case = r"^([Aa][LKSZRAEPlkszraep]|[Cc][AOTaot]|[Dd][ECec]|[Ff][LMlm]|[Gg][AUau]|[Hh][Ii]|[Ii][ADLNadln]|[Kk][SYsy]|[Ll][Aa]|[Mm][ADEHINOPSTadehinopst]|[Nn][CDEHJMVYcdehjmvy]|[Oo][HKRhkr]|[Pp][ARWarw]|[Rr][Ii]|[Ss][CDcd]|[Tt][NXnx]|[Uu][Tt]|[Vv][AITait]|[Ww][AIVYaivy])$"
zip_code_pattern=r"[0-9]{5}(?:-[0-9]{4})?"

new_df = parse_address(df=OMOP_state_failed,
                       address_col = 'location_source_value',
                       state_full_pattern=state_full_pattern, 
                       state_abbr_pattern=state_abbr_pattern)

new_df

CPU times: total: 875 ms
Wall time: 1.25 s


Unnamed: 0,Location_id,address_1,address_2,city,state,zip,county,location_source_value,latitude,longitude,address_type,state_abbr
220,222,LOCAL,,KINGMAN,ME,04451,,"LOCAL, KINGMAN, ME 04451",45.639023,-68.263641,,
552,562,LOCAL,,KINGFIELD,ME,04947,,"LOCAL, KINGFIELD, ME 04947",44.960341,-70.159052,,
751,763,LOCAL,,BROOKLINE,NH,03033,,"LOCAL, BROOKLINE, NH 03033",42.736111,-71.663431,,
1057,1074,LOCAL,,WONALANCET,NH,03897,,"LOCAL, WONALANCET, NH 03897",43.908447,-71.344152,,
1212,1232,GENERAL DELIVERY,,N HARTLAND,VT,05052,,"GENERAL DELIVERY, N HARTLAND, VT 05052",43.591020,-72.356611,,
...,...,...,...,...,...,...,...,...,...,...,...,...
87378,95865,25 MI N OF GALLUP NM HWY 491,,TOHATCHI,NM,87325,,"25 MI N OF GALLUP NM HWY 491, TOHATCHI, NM 87325",35.848832,-108.745451,,
88117,96831,BLDG 25000TH KAFB EAST,,ALBUQUERQUE,NM,87116,,"BLDG 25000TH KAFB EAST, ALBUQUERQUE, NM 87116",35.058916,-106.555920,,
89081,98095,DEPT 3374,,LARAMIE,WY,82071,,"DEPT 3374, LARAMIE, WY 82071",41.313964,-105.584531,,
89500,98668,COLLEGE OF SOUTHERN IDAHO EVERGREEN BLDG. ROOM 38,,TWIN FALLS,ID,83301,,COLLEGE OF SOUTHERN IDAHO EVERGREEN BLDG. ROOM...,42.579849,-114.473937,,


In [25]:
# replace full state name to abbreviations for above addresses
new_df['state_abbr'] = new_df.state.apply(lambda x: multipleReplace(str(x).strip(), us_state_to_abbrev))
new_df

Unnamed: 0,Location_id,address_1,address_2,city,state,zip,county,location_source_value,latitude,longitude,address_type,state_abbr
220,222,LOCAL,,KINGMAN,ME,04451,,"LOCAL, KINGMAN, ME 04451",45.639023,-68.263641,,ME
552,562,LOCAL,,KINGFIELD,ME,04947,,"LOCAL, KINGFIELD, ME 04947",44.960341,-70.159052,,ME
751,763,LOCAL,,BROOKLINE,NH,03033,,"LOCAL, BROOKLINE, NH 03033",42.736111,-71.663431,,NH
1057,1074,LOCAL,,WONALANCET,NH,03897,,"LOCAL, WONALANCET, NH 03897",43.908447,-71.344152,,NH
1212,1232,GENERAL DELIVERY,,N HARTLAND,VT,05052,,"GENERAL DELIVERY, N HARTLAND, VT 05052",43.591020,-72.356611,,VT
...,...,...,...,...,...,...,...,...,...,...,...,...
87378,95865,25 MI N OF GALLUP NM HWY 491,,TOHATCHI,NM,87325,,"25 MI N OF GALLUP NM HWY 491, TOHATCHI, NM 87325",35.848832,-108.745451,,NM
88117,96831,BLDG 25000TH KAFB EAST,,ALBUQUERQUE,NM,87116,,"BLDG 25000TH KAFB EAST, ALBUQUERQUE, NM 87116",35.058916,-106.555920,,NM
89081,98095,DEPT 3374,,LARAMIE,WY,82071,,"DEPT 3374, LARAMIE, WY 82071",41.313964,-105.584531,,WY
89500,98668,COLLEGE OF SOUTHERN IDAHO EVERGREEN BLDG. ROOM 38,,TWIN FALLS,ID,83301,,COLLEGE OF SOUTHERN IDAHO EVERGREEN BLDG. ROOM...,42.579849,-114.473937,,ID


In [55]:
# update OMOP_location with addresses parsed with custom parser
OMOP_location_updated = new_df.combine_first(OMOP_location)
OMOP_location_updated

Unnamed: 0,Location_id,address_1,address_2,city,state,zip,county,location_source_value,latitude,longitude,address_type,state_abbr
0,1,523 E Broadway,,South Boston,MA,02127,,"523 E BROADWAY, SOUTH BOSTON, MA 02127",42.335472,-71.043522,Street Address,MA
1,2,454 Essex St,,Lawrence,MA,01840,,"454 ESSEX ST, LAWRENCE, MA 01840",42.706213,-71.164940,Street Address,MA
2,3,569 Broadway,,Newark,NJ,07104,,"569 BROADWAY, NEWARK, NJ 07104",40.769935,-74.162821,Street Address,NJ
3,4,3210 Southwestern Blvd,,Orchard Park,NY,14127,,"3210 SOUTHWESTERN BLVD, ORCHARD PARK, NY 14127",42.798535,-78.747816,Street Address,NY
4,5,431 Campground Rd,,Livermore Fls,ME,04254,,"431 CAMPGROUND RD, LIVERMORE FLS, ME 04254",44.428610,-70.113777,Street Address,ME
...,...,...,...,...,...,...,...,...,...,...,...,...
94671,104098,1300 Peachtree Industrial Boulevard,,Suwanee,GA,30024.0,,"1300 PEACHTREE INDUSTRIAL BOULEVARD, SUWANEE, ...",34.041727,-84.095174,Street Address,GA
94672,104099,2660 Satellite Boulevard Northwest,,Duluth,GA,30096.0,,"2660 SATELLITE BOULEVARD NORTHWEST, DULUTH, GA...",33.966797,-84.101318,Street Address,GA
94673,104100,3685 Braselton Highway,,Dacula,GA,30019.0,,"3685 BRASELTON HIGHWAY, DACULA, GA 30019",34.068832,-83.902215,Street Address,GA
94674,104101,1055 Dove Run Road,,Lexington,KY,40502.0,,"1055 DOVE RUN ROAD, LEXINGTON, KY 40502",37.996508,-84.494106,Street Address,KY


In [56]:
def zip_leadingZeros(x):
    x.rjust(5, '0')

In [57]:
# pad leading zero to zipcode if not 5-digits
OMOP_location_updated['zip'] = OMOP_location_updated.zip.apply(lambda x: str(x).zfill(5))

# remove trailing zeros and decimal point
OMOP_location_updated['zip'] = OMOP_location_updated.zip.apply(lambda x: x.rstrip(".0") if ".0" in x else x)

OMOP_location_updated

Unnamed: 0,Location_id,address_1,address_2,city,state,zip,county,location_source_value,latitude,longitude,address_type,state_abbr
0,1,523 E Broadway,,South Boston,MA,02127,,"523 E BROADWAY, SOUTH BOSTON, MA 02127",42.335472,-71.043522,Street Address,MA
1,2,454 Essex St,,Lawrence,MA,01840,,"454 ESSEX ST, LAWRENCE, MA 01840",42.706213,-71.164940,Street Address,MA
2,3,569 Broadway,,Newark,NJ,07104,,"569 BROADWAY, NEWARK, NJ 07104",40.769935,-74.162821,Street Address,NJ
3,4,3210 Southwestern Blvd,,Orchard Park,NY,14127,,"3210 SOUTHWESTERN BLVD, ORCHARD PARK, NY 14127",42.798535,-78.747816,Street Address,NY
4,5,431 Campground Rd,,Livermore Fls,ME,04254,,"431 CAMPGROUND RD, LIVERMORE FLS, ME 04254",44.428610,-70.113777,Street Address,ME
...,...,...,...,...,...,...,...,...,...,...,...,...
94671,104098,1300 Peachtree Industrial Boulevard,,Suwanee,GA,30024,,"1300 PEACHTREE INDUSTRIAL BOULEVARD, SUWANEE, ...",34.041727,-84.095174,Street Address,GA
94672,104099,2660 Satellite Boulevard Northwest,,Duluth,GA,30096,,"2660 SATELLITE BOULEVARD NORTHWEST, DULUTH, GA...",33.966797,-84.101318,Street Address,GA
94673,104100,3685 Braselton Highway,,Dacula,GA,30019,,"3685 BRASELTON HIGHWAY, DACULA, GA 30019",34.068832,-83.902215,Street Address,GA
94674,104101,1055 Dove Run Road,,Lexington,KY,40502,,"1055 DOVE RUN ROAD, LEXINGTON, KY 40502",37.996508,-84.494106,Street Address,KY


## Classify Failures
* 0: SUCCESSFUL GEOCODE
* 1: FAILED GEOCODE (UNSPECIFIED)
* 2:  FAILED DUE TO PO BOX ADDRESS (CANNOT BE RECOVERED FROM FAILURE)
* 3:  FAILED GEOCODE (UNSPECIFIED) AND EXCLUDED DUE TO NON-WASHINGTON STATE AND NON-WASHINGTON ZIP CODE ENCODING
* 4:  FAILED DUE TO RETURNED / UNDELIVERABLE / HOMELESS ANNOTATION [CANNOT BE RECOVERED FROM FAILURE]
* 5:  FAILED DUE TO INCOMPLETE ADDRESS (ADDRESS LINES WERE EITHER ALL NUMBERS OR ALL LETTERS)  [CANNOT BE RECOVERED FROM FAILURE]
* 6:  FAILED BECAUSE STREET ADDRESS IN LINE1 IS FLIPPED WITH LINE 2 (e.g., LINE 1 is the apartment number and LINE 2 has street address) 
* 7:  FAILED DUE TO PRESENCE OF SPECIAL CHARACTERS
* 8:  FAILED DUE TO ADDRESS LINES CONTAINING NAMES or MAILBOX NUMBERS

In [18]:
# IGNORE THIS CELL
def custom_flag(x):
    """
    Post-hoc data quality check of parsed addresses
    """  
    # if 'PO' in str(x.address_1) or 'P.O.' in str(x.address_1):
    # if 'PO' in str(x.address_1).strip() or 'P.O.' in str(x.address_1):
    if re.match('PO', x.address_1, re.IGNORECASE):
        return 'FAILED DUE TO PO BOX ADDRESS'
    elif re.match('APT', x.address_1, re.IGNORECASE) or re.match('SUITE', x.address_1, re.IGNORECASE):
        return 'FAILED DUE TO STREET ADDRESS IN LINE_1 IS FLIPPED WITH LINE_2'
    elif not x.address_1[0].isdigit():
        return 'FAILED DUE TO STREET ADDRESS STARTS WITH LETTER'
    # check address_1 only contains alphanumeric characters (spaces are ok)
    elif any(not c.isalnum() and not c.isspace() for c in x.address_1):
        return 'FAILED DUE TO PRESENCE OF SPECIAL CHARACTERS'
    elif len(x.state_abbr) > 2:
        return 'FAILED DUE TO INCORRECT STATE FORMAT'
    elif x[['address_1', 'city', 'state', 'zip']].isnull().any():
        return 'FAILED DUE TO INCOMPLETE PARSING'
    else:
        return 'SUCCESSFUL ADDRESS'

In [54]:
%%time

# IGNORE THIS CELL
OMOP_location_updated['flag'] = OMOP_location_updated.apply(lambda x: custom_flag(x), axis=1)

OMOP_location_updated

CPU times: total: 33.7 s
Wall time: 40.3 s


Unnamed: 0,Location_id,address_1,address_2,city,state,zip,county,location_source_value,latitude,longitude,address_type,state_abbr,flag
0,1,523 E Broadway,523 E Broadway,South Boston,MA,02127,,"523 E BROADWAY, SOUTH BOSTON, MA 02127",42.335472,-71.043522,Street Address,MA,SUCCESSFUL ADDRESS
1,2,454 Essex St,454 Essex St,Lawrence,MA,01840,,"454 ESSEX ST, LAWRENCE, MA 01840",42.706213,-71.164940,Street Address,MA,SUCCESSFUL ADDRESS
2,3,569 Broadway,569 Broadway,Newark,NJ,07104,,"569 BROADWAY, NEWARK, NJ 07104",40.769935,-74.162821,Street Address,NJ,SUCCESSFUL ADDRESS
3,4,3210 Southwestern Blvd,3210 Southwestern Blvd,Orchard Park,NY,14127,,"3210 SOUTHWESTERN BLVD, ORCHARD PARK, NY 14127",42.798535,-78.747816,Street Address,NY,SUCCESSFUL ADDRESS
4,5,431 Campground Rd,431 Campground Rd,Livermore Fls,ME,04254,,"431 CAMPGROUND RD, LIVERMORE FLS, ME 04254",44.428610,-70.113777,Street Address,ME,SUCCESSFUL ADDRESS
...,...,...,...,...,...,...,...,...,...,...,...,...,...
94671,104098,1300 Peachtree Industrial Boulevard,1300 Peachtree Industrial Boulevard,Suwanee,GA,30024,,"1300 PEACHTREE INDUSTRIAL BOULEVARD, SUWANEE, ...",34.041727,-84.095174,Street Address,GA,SUCCESSFUL ADDRESS
94672,104099,2660 Satellite Boulevard Northwest,2660 Satellite Boulevard Northwest,Duluth,GA,30096,,"2660 SATELLITE BOULEVARD NORTHWEST, DULUTH, GA...",33.966797,-84.101318,Street Address,GA,SUCCESSFUL ADDRESS
94673,104100,3685 Braselton Highway,3685 Braselton Highway,Dacula,GA,30019,,"3685 BRASELTON HIGHWAY, DACULA, GA 30019",34.068832,-83.902215,Street Address,GA,SUCCESSFUL ADDRESS
94674,104101,1055 Dove Run Road,1055 Dove Run Road,Lexington,KY,40502,,"1055 DOVE RUN ROAD, LEXINGTON, KY 40502",37.996508,-84.494106,Street Address,KY,SUCCESSFUL ADDRESS


In [150]:
# IGNORE THIS CELL
OMOP_location_updated.flag.value_counts()

flag
SUCCESSFUL ADDRESS                                 12879
FAILED DUE TO STREET ADDRESS STARTS WITH LETTER      821
FAILED DUE TO PRESENCE OF SPECIAL CHARACTERS         588
FAILED DUE TO INCORRECT STATE FORMAT                  11
FAILED DUE TO PO BOX ADDRESS                          11
FAILED DUE TO INCOMPLETE PARSING                       1
Name: count, dtype: int64

In [152]:
OMOP_location_updated_path = os.path.join(abs_path, 'output', 'OMOP_location_flagged.csv')

OMOP_location_updated.to_csv(OMOP_location_updated_path, index=False)

In [58]:
# USE THIS TO FLAG ADDRESSES
def custom_flag(x):
    """
    Post-hoc data quality check of parsed addresses
    """  
    # check if PO box address
    if 'PO' in str(x.address_1) or 'PO' in str(x.address_2)\
        or 'P.O' in str(x.address_1) or 'P.O' in str(x.address_2)\
        or 'P O' in str(x.address_1) or 'P O' in str(x.address_2)\
        or 'PSC' in str(x.address_1) or 'PSC' in str(x.address_2)\
        or 'PNB' in str(x.address_1) or 'PNB' in str(x.address_2)\
        or 'PMB' in str(x.address_1) or 'PMB' in str(x.address_2):
        return 'FAILED DUE TO PO BOX ADDRESS'
    # check if address_2 and address_1 are flipped
    elif re.match('APT', x.address_1, re.IGNORECASE) or re.match('SUITE', x.address_1, re.IGNORECASE):
        return 'FAILED DUE TO STREET ADDRESS IN LINE_1 IS FLIPPED WITH LINE_2'
    # check if street address starts with a non-digit character
    elif not x.address_1[0].isdigit():
        return 'FAILED DUE TO STREET ADDRESS STARTS WITH LETTER'
    # check address_1 only contains alphanumeric characters (spaces are ok)
    elif any(not c.isalnum() and not c.isspace() for c in x.address_1):
        return 'FAILED DUE TO PRESENCE OF SPECIAL CHARACTERS'
    # check if parsed 'state' component matches a US state or territory abbreviation
    elif len(x.state_abbr) > 2:
        return 'FAILED DUE TO INCORRECT STATE FORMAT'
    # check if any of the required address components did not parse from the full address
    elif x[['address_1', 'city', 'state', 'zip']].isnull().any():
        return 'FAILED DUE TO INCOMPLETE PARSING'
    elif len(x.zip) != 5:
        return 'FAILED DUE TO NON 5-DIGIT ZIPCODE'
    else:
        return 'SUCCESSFUL ADDRESS'

In [59]:
OMOP_location_updated['address_1'] = OMOP_location_updated['address_1'].astype(str)
OMOP_location_updated['address_2'] = OMOP_location_updated['address_2'].astype(str)

In [60]:
%%time
OMOP_location_updated['flag'] = OMOP_location_updated.apply(lambda x: custom_flag(x), axis=1)

OMOP_location_updated

CPU times: total: 31.5 s
Wall time: 40.6 s


Unnamed: 0,Location_id,address_1,address_2,city,state,zip,county,location_source_value,latitude,longitude,address_type,state_abbr,flag
0,1,523 E Broadway,,South Boston,MA,02127,,"523 E BROADWAY, SOUTH BOSTON, MA 02127",42.335472,-71.043522,Street Address,MA,SUCCESSFUL ADDRESS
1,2,454 Essex St,,Lawrence,MA,01840,,"454 ESSEX ST, LAWRENCE, MA 01840",42.706213,-71.164940,Street Address,MA,SUCCESSFUL ADDRESS
2,3,569 Broadway,,Newark,NJ,07104,,"569 BROADWAY, NEWARK, NJ 07104",40.769935,-74.162821,Street Address,NJ,SUCCESSFUL ADDRESS
3,4,3210 Southwestern Blvd,,Orchard Park,NY,14127,,"3210 SOUTHWESTERN BLVD, ORCHARD PARK, NY 14127",42.798535,-78.747816,Street Address,NY,SUCCESSFUL ADDRESS
4,5,431 Campground Rd,,Livermore Fls,ME,04254,,"431 CAMPGROUND RD, LIVERMORE FLS, ME 04254",44.428610,-70.113777,Street Address,ME,SUCCESSFUL ADDRESS
...,...,...,...,...,...,...,...,...,...,...,...,...,...
94671,104098,1300 Peachtree Industrial Boulevard,,Suwanee,GA,30024,,"1300 PEACHTREE INDUSTRIAL BOULEVARD, SUWANEE, ...",34.041727,-84.095174,Street Address,GA,SUCCESSFUL ADDRESS
94672,104099,2660 Satellite Boulevard Northwest,,Duluth,GA,30096,,"2660 SATELLITE BOULEVARD NORTHWEST, DULUTH, GA...",33.966797,-84.101318,Street Address,GA,SUCCESSFUL ADDRESS
94673,104100,3685 Braselton Highway,,Dacula,GA,30019,,"3685 BRASELTON HIGHWAY, DACULA, GA 30019",34.068832,-83.902215,Street Address,GA,SUCCESSFUL ADDRESS
94674,104101,1055 Dove Run Road,,Lexington,KY,40502,,"1055 DOVE RUN ROAD, LEXINGTON, KY 40502",37.996508,-84.494106,Street Address,KY,SUCCESSFUL ADDRESS


In [61]:
OMOP_location_updated.flag.value_counts()

flag
SUCCESSFUL ADDRESS                                               78800
FAILED DUE TO STREET ADDRESS STARTS WITH LETTER                   9007
FAILED DUE TO NON 5-DIGIT ZIPCODE                                 3570
FAILED DUE TO PRESENCE OF SPECIAL CHARACTERS                      3039
FAILED DUE TO PO BOX ADDRESS                                       226
FAILED DUE TO INCORRECT STATE FORMAT                                33
FAILED DUE TO STREET ADDRESS IN LINE_1 IS FLIPPED WITH LINE_2        1
Name: count, dtype: int64

In [62]:
# filter only 'SUCCESSFUL ADDRESS' flag
OMOP_location_updated_success = OMOP_location_updated.loc[OMOP_location_updated.flag == 'SUCCESSFUL ADDRESS']
OMOP_location_updated_success

Unnamed: 0,Location_id,address_1,address_2,city,state,zip,county,location_source_value,latitude,longitude,address_type,state_abbr,flag
0,1,523 E Broadway,,South Boston,MA,02127,,"523 E BROADWAY, SOUTH BOSTON, MA 02127",42.335472,-71.043522,Street Address,MA,SUCCESSFUL ADDRESS
1,2,454 Essex St,,Lawrence,MA,01840,,"454 ESSEX ST, LAWRENCE, MA 01840",42.706213,-71.164940,Street Address,MA,SUCCESSFUL ADDRESS
2,3,569 Broadway,,Newark,NJ,07104,,"569 BROADWAY, NEWARK, NJ 07104",40.769935,-74.162821,Street Address,NJ,SUCCESSFUL ADDRESS
3,4,3210 Southwestern Blvd,,Orchard Park,NY,14127,,"3210 SOUTHWESTERN BLVD, ORCHARD PARK, NY 14127",42.798535,-78.747816,Street Address,NY,SUCCESSFUL ADDRESS
4,5,431 Campground Rd,,Livermore Fls,ME,04254,,"431 CAMPGROUND RD, LIVERMORE FLS, ME 04254",44.428610,-70.113777,Street Address,ME,SUCCESSFUL ADDRESS
...,...,...,...,...,...,...,...,...,...,...,...,...,...
94671,104098,1300 Peachtree Industrial Boulevard,,Suwanee,GA,30024,,"1300 PEACHTREE INDUSTRIAL BOULEVARD, SUWANEE, ...",34.041727,-84.095174,Street Address,GA,SUCCESSFUL ADDRESS
94672,104099,2660 Satellite Boulevard Northwest,,Duluth,GA,30096,,"2660 SATELLITE BOULEVARD NORTHWEST, DULUTH, GA...",33.966797,-84.101318,Street Address,GA,SUCCESSFUL ADDRESS
94673,104100,3685 Braselton Highway,,Dacula,GA,30019,,"3685 BRASELTON HIGHWAY, DACULA, GA 30019",34.068832,-83.902215,Street Address,GA,SUCCESSFUL ADDRESS
94674,104101,1055 Dove Run Road,,Lexington,KY,40502,,"1055 DOVE RUN ROAD, LEXINGTON, KY 40502",37.996508,-84.494106,Street Address,KY,SUCCESSFUL ADDRESS


In [63]:
OMOP_location_updated_path = os.path.join(abs_path, 'output', 'OMOP_location_flagged_successful.csv')

OMOP_location_updated_success.to_csv(OMOP_location_updated_path, index=False)

In [64]:
OMOP_location_updated_success = pd.read_csv(os.path.join(abs_path, 'output', 'OMOP_location_flagged_successful.csv'))
OMOP_location_updated_success

  OMOP_location_updated_success = pd.read_csv(os.path.join(abs_path, 'output', 'OMOP_location_flagged_successful.csv'))


Unnamed: 0,Location_id,address_1,address_2,city,state,zip,county,location_source_value,latitude,longitude,address_type,state_abbr,flag
0,1,523 E Broadway,,South Boston,MA,02127,,"523 E BROADWAY, SOUTH BOSTON, MA 02127",42.335472,-71.043522,Street Address,MA,SUCCESSFUL ADDRESS
1,2,454 Essex St,,Lawrence,MA,01840,,"454 ESSEX ST, LAWRENCE, MA 01840",42.706213,-71.164940,Street Address,MA,SUCCESSFUL ADDRESS
2,3,569 Broadway,,Newark,NJ,07104,,"569 BROADWAY, NEWARK, NJ 07104",40.769935,-74.162821,Street Address,NJ,SUCCESSFUL ADDRESS
3,4,3210 Southwestern Blvd,,Orchard Park,NY,14127,,"3210 SOUTHWESTERN BLVD, ORCHARD PARK, NY 14127",42.798535,-78.747816,Street Address,NY,SUCCESSFUL ADDRESS
4,5,431 Campground Rd,,Livermore Fls,ME,04254,,"431 CAMPGROUND RD, LIVERMORE FLS, ME 04254",44.428610,-70.113777,Street Address,ME,SUCCESSFUL ADDRESS
...,...,...,...,...,...,...,...,...,...,...,...,...,...
78795,104098,1300 Peachtree Industrial Boulevard,,Suwanee,GA,30024,,"1300 PEACHTREE INDUSTRIAL BOULEVARD, SUWANEE, ...",34.041727,-84.095174,Street Address,GA,SUCCESSFUL ADDRESS
78796,104099,2660 Satellite Boulevard Northwest,,Duluth,GA,30096,,"2660 SATELLITE BOULEVARD NORTHWEST, DULUTH, GA...",33.966797,-84.101318,Street Address,GA,SUCCESSFUL ADDRESS
78797,104100,3685 Braselton Highway,,Dacula,GA,30019,,"3685 BRASELTON HIGHWAY, DACULA, GA 30019",34.068832,-83.902215,Street Address,GA,SUCCESSFUL ADDRESS
78798,104101,1055 Dove Run Road,,Lexington,KY,40502,,"1055 DOVE RUN ROAD, LEXINGTON, KY 40502",37.996508,-84.494106,Street Address,KY,SUCCESSFUL ADDRESS


In [31]:
pd.DataFrame(OMOP_location_updated_success.state_abbr.sort_values().value_counts())

Unnamed: 0_level_0,count
state_abbr,Unnamed: 1_level_1
SD,8401
CA,6526
CO,6029
AZ,5495
OR,4289
NE,3384
UT,2758
NM,2396
FL,2363
NV,2307


# Random Sample by US State and Territories

In [27]:
# drop US territories that don't have at least 10 addresses
territories_drop = ['MP', 'VI', 'PW', 'Of', 'PR', 'GU']

dropped_territories = OMOP_location_updated_success.loc[OMOP_location_updated_success.state_abbr.isin(territories_drop)]
OMOP_location_updated_success_drop = OMOP_location_updated_success.loc[~OMOP_location_updated_success.state_abbr.isin(territories_drop)]

print(OMOP_location_updated_success_drop.shape)

(12859, 13)


In [28]:
# groupby state and randomly sample 10 addresses from each
df_sample = OMOP_location_updated_success_drop.groupby('state_abbr').apply(lambda x: x.sample(n=10)).reset_index(drop = True)
df_sample

Unnamed: 0,Location_id,address_1,address_2,city,state,zip,county,location_source_value,latitude,longitude,address_type,state_abbr,flag
0,9981,339 East Dogwood Avenue,,Palmer,AK,99645,,"339 EAST DOGWOOD AVENUE, PALMER, AK 99645",61.602250,-149.110053,Street Address,AK,SUCCESSFUL ADDRESS
1,940,114 Illinois St,,Anaktuvuk Pass,AK,99721,,"114 ILLINOIS ST, ANAKTUVUK PASS, AK 99721",68.139836,-151.738557,Street Address,AK,SUCCESSFUL ADDRESS
2,18472,7801 E 32Nd Ave,,Anchorage,AK,99504,,"7801 E 32ND AVE, ANCHORAGE, AK 99504",61.192277,-149.735520,Street Address,AK,SUCCESSFUL ADDRESS
3,12642,950 East Bogard Road,,Wasilla,AK,99654,,"950 EAST BOGARD ROAD, WASILLA, AK 99654",61.587019,-149.424305,Street Address,AK,SUCCESSFUL ADDRESS
4,15961,12350 Industry Way,,Anchorage,AK,99515,,"12350 INDUSTRY WAY, ANCHORAGE, AK 99515",61.109120,-149.862291,Street Address,AK,SUCCESSFUL ADDRESS
...,...,...,...,...,...,...,...,...,...,...,...,...,...
505,13819,13794 Prairie Center Circle,,Cheyenne,WY,82009,,"13794 PRAIRIE CENTER CIRCLE, CHEYENNE, WY 82009",41.149766,-104.646764,Street Address,WY,SUCCESSFUL ADDRESS
506,14191,1550 United States Highway 20 South,,Worland,WY,82401,,"1550 UNITED STATES HIGHWAY 20 SOUTH, WORLAND, ...",43.956469,-108.020552,Street Address,WY,SUCCESSFUL ADDRESS
507,18197,445 South Main St,,Lusk,Wyoming,82225,,"445 South Main St, Lusk, Wyoming 82225",42.760784,-104.452038,Street Address,WY,SUCCESSFUL ADDRESS
508,5281,525 East Birch Street,,Glenrock,WY,82637,,"525 EAST BIRCH STREET, GLENROCK, WY 82637",42.861038,-105.857974,Street Address,WY,SUCCESSFUL ADDRESS


In [29]:
# concatenate dropped US territories to the other random samples
OMOP_sample = pd.concat([dropped_territories, df_sample])
OMOP_sample_path = os.path.join(abs_path, 'output', 'OMOP_sample.csv')

OMOP_sample.to_csv(OMOP_sample_path, index=False)

# Text Similarity Score
* Cosine similarity

In [3]:
nlp = spacy.load("en_core_web_lg")

# address strings
location_source_address = "525 EAST BIRCH STREET, GLENROCK, WY 82637"
parsed_address = "525 East Birch Street, Glenrock, WY 82637"

doc1 = nlp(location_source_address.title())
doc2 = nlp(parsed_address)

print(f"The similarity between \033[1m{location_source_address} \033[0mand \033[1m{parsed_address} is \033[0m{doc1.similarity(doc2)}.")

The similarity between [1m525 EAST BIRCH STREET, GLENROCK, WY 82637 [0mand [1m525 East Birch Street, Glenrock, WY 82637 is [0m0.9866509809411516.


In [4]:
nlp = spacy.load("en_core_web_lg")

# address strings
location_source_address = "525 EAST BIRCH STREET, GLENROCK, WY 82637"
parsed_address = "525 East Birch Street, Glenrock, WY 82637"

doc1 = nlp(location_source_address)
doc2 = nlp(parsed_address)

print(f"The similarity between \033[1m{location_source_address} \033[0mand \033[1m{parsed_address} is \033[0m{doc1.similarity(doc2)}.")

The similarity between [1m525 EAST BIRCH STREET, GLENROCK, WY 82637 [0mand [1m525 East Birch Street, Glenrock, WY 82637 is [0m0.8022178579015921.


In [None]:
nlp = spacy.load("en_core_web_lg")

def text_similarity(source_address, parsed_address):
    """
    Calculate cosine similarity between two strings

    Parameters
    ----------
    source_address (str): original address string input to a geocoder
    parsed_addres (str): returned address string from a geocoder

    Returns
    -------
    score (float)
    """
    # capitalize first letter only of each word
    doc1 = nlp(source_address.title())
    doc2 = nlp(parsed_address.title())

    print(f"The similarity between \033[1m{source_address} \033[0mand \033[1m{parsed_address} is \033[0m{doc1.similarity(doc2)}.")
    return doc1.similarity(doc2)