In [2]:
import os
from os import listdir
from os.path import isfile, join, splitext
import sys
import csv
import re

import numpy as np
import pandas as pd
import geopandas as gpd

import usaddress

import matplotlib.pyplot as plt

# Parsing with `usaddress` module

## Edge Cases

In [3]:
# Queens, NY address with dashes in street address
queens_ny_address = "89-22 197th St Unit 2, Hollis, NY 11423"
usaddress.tag(queens_ny_address)

(OrderedDict([('AddressNumber', '89-22'),
              ('StreetName', '197th'),
              ('StreetNamePostType', 'St'),
              ('OccupancyType', 'Unit'),
              ('OccupancyIdentifier', '2'),
              ('PlaceName', 'Hollis'),
              ('StateName', 'NY'),
              ('ZipCode', '11423')]),
 'Street Address')

In [4]:
# Southwest address with Spanish names
NM_address = "220 Camino Tres SW, Albuquerque, NM 87105"
usaddress.tag(NM_address)

(OrderedDict([('AddressNumber', '220'),
              ('StreetNamePreType', 'Camino'),
              ('StreetName', 'Tres'),
              ('StreetNamePostDirectional', 'SW'),
              ('PlaceName', 'Albuquerque'),
              ('StateName', 'NM'),
              ('ZipCode', '87105')]),
 'Street Address')

In [5]:
# grid system with no street type
UT_address = "921 3385 S, Millcreek, UT 84106"
usaddress.tag(UT_address)

(OrderedDict([('AddressNumber', '921'),
              ('StreetName', '3385'),
              ('StreetNamePostDirectional', 'S'),
              ('PlaceName', 'Millcreek'),
              ('StateName', 'UT'),
              ('ZipCode', '84106')]),
 'Street Address')

In [8]:
link = os.path.join(os.getcwd(),'..','..','test_address_20000_v2.csv')

temp = pd.read_csv(link)
temp

Unnamed: 0,Full_Addre,source_lon,source_lat,geometry,user_id,start_date,end_duration,end_date
0,"18248 HILLSIDE LN, DOLAND, SD 57436",-98.048963,44.737366,POINT (-98.04896294189227 44.73736569324581),1,2004-12-09,604 days,2006-08-05
1,"244 ELM ST, HILL CITY, SD 57745",-103.574269,43.932597,POINT (-103.57426900030956 43.93259699973685),1,2006-02-04,946 days,2008-09-07
2,"PO BOX 9823, BEND, OR 97708",-121.344782,43.978130,POINT (-121.34478200035224 43.97813000011848),1,2018-07-12,456 days,2019-10-11
3,"10208 Bluegrass Road, Knoxville, TN 37922",-84.106168,35.860066,POINT (-84.10616838597446 35.86006566995218),2,2001-05-11,790 days,2003-07-10
4,"7338 SOUTH ALTON WAY, CENTENNIAL, CO 80112",-104.882937,39.583518,POINT (-104.88293699961378 39.58351800019911),2,2004-01-09,876 days,2006-06-03
...,...,...,...,...,...,...,...,...
19995,"1057 LINCOLN ST SW, ALBANY, OR 97321",-123.122892,44.627669,POINT (-123.12289199968363 44.62766900001488),4999,2006-04-17,599 days,2007-12-07
19996,"384 COURT ST, PLYMOUTH, MA 02360",-70.691150,41.978494,POINT (-70.69115000008155 41.97849399980115),4999,2010-05-21,662 days,2012-03-13
19997,"5037 32ND AVE, HUDSONVILLE, MI 49426",-85.862753,42.861220,POINT (-85.86275334972652 42.86122021823103),4999,2012-09-17,652 days,2014-07-01
19998,"2525 GLENN HENDREN DR, LIBERTY, MO 64068",-94.424104,39.274506,POINT (-94.42410418 39.27450552),4999,2016-07-19,553 days,2018-01-23


In [7]:
Pub28_usaddre_template = {
   'Recipient': 'recipient',
   'AddressNumber': 'address1',
   'AddressNumberPrefix': 'address1',
   'AddressNumberSuffix': 'address1',
   'StreetName': 'address1',
   'StreetNamePreDirectional': 'address1',
   'StreetNamePreModifier': 'address1',
   'StreetNamePreType': 'address1',
   'StreetNamePostDirectional': 'address1',
   'StreetNamePostModifier': 'address1',
   'StreetNamePostType': 'address1',
   'CornerOf': 'address1',
   'IntersectionSeparator': 'address1',
   'LandmarkName': 'address1',
   'USPSBoxGroupID': 'address1',
   'USPSBoxGroupType': 'address1',
   'USPSBoxID': 'address1',
   'USPSBoxType': 'address1',
   'BuildingName': 'address2',
   'OccupancyType': 'address2',
   'OccupancyIdentifier': 'address2',
   'SubaddressIdentifier': 'address2',
   'SubaddressType': 'address2',
   'PlaceName': 'city',
   'StateName': 'state',
   'ZipCode': 'zip_code',
}

In [22]:
# 1) identify unique address strings for location records
temp.loc[:,['Full_Addre']].drop_duplicates()
#temp.loc[:,['Full_Addre','source_lon','source_lat']].drop_duplicates()

#


Unnamed: 0,Full_Addre
0,"18248 HILLSIDE LN, DOLAND, SD 57436"
1,"244 ELM ST, HILL CITY, SD 57745"
2,"PO BOX 9823, BEND, OR 97708"
3,"10208 Bluegrass Road, Knoxville, TN 37922"
4,"7338 SOUTH ALTON WAY, CENTENNIAL, CO 80112"
...,...
19995,"1057 LINCOLN ST SW, ALBANY, OR 97321"
19996,"384 COURT ST, PLYMOUTH, MA 02360"
19997,"5037 32ND AVE, HUDSONVILLE, MI 49426"
19998,"2525 GLENN HENDREN DR, LIBERTY, MO 64068"


In [76]:
# 2) set up OMOP placeholder table
# OMOP_location = pd.DataFrame(data=None,
#                              columns=['Location_id','address_1','address_2','city','state','zip','county',
#                                       'location_source_value','latitude','longitude'],
                             
#                              dtype = {'Location_id':int,
#                                       'address_1':'object',
#                                       'address_2':str,
#                                       'city':str,
#                                       'state':str,
#                                       'zip':str,
#                                       'county':str,
#                                       'location_source_value':str,
#                                       'latitude':float,
#                                       'longitude':float})


OMOP_location = pd.DataFrame(columns=['Location_id','address_1','address_2','city','state','zip','county',
                                      'location_source_value','latitude','longitude'])
                             
# need to change datatype for latitude and longitude
OMOP_location

Unnamed: 0,Location_id,address_1,address_2,city,state,zip,county,location_source_value,latitude,longitude


In [87]:
# 3) assign address_strings to OMOP_location table location_source_value series
OMOP_location['location_source_value'] = temp.loc[:,['Full_Addre']].drop_duplicates()['Full_Addre']

OMOP_location['Location_id'] = OMOP_location.re+1
OMOP_location

Unnamed: 0,Location_id,address_1,address_2,city,state,zip,county,location_source_value,latitude,longitude
0,1,,,,,,,"18248 HILLSIDE LN, DOLAND, SD 57436",,
1,2,,,,,,,"244 ELM ST, HILL CITY, SD 57745",,
2,3,,,,,,,"PO BOX 9823, BEND, OR 97708",,
3,4,,,,,,,"10208 Bluegrass Road, Knoxville, TN 37922",,
4,5,,,,,,,"7338 SOUTH ALTON WAY, CENTENNIAL, CO 80112",,
...,...,...,...,...,...,...,...,...,...,...
19995,19996,,,,,,,"1057 LINCOLN ST SW, ALBANY, OR 97321",,
19996,19997,,,,,,,"384 COURT ST, PLYMOUTH, MA 02360",,
19997,19998,,,,,,,"5037 32ND AVE, HUDSONVILLE, MI 49426",,
19998,19999,,,,,,,"2525 GLENN HENDREN DR, LIBERTY, MO 64068",,


In [80]:


pd.concat(OMOP_location,temp.loc[:,['Full_Addre']].drop_duplicates(), how='right', left_on='location_source_value', right_on='Full_Addre', ignore_index=True)

TypeError: concat() got an unexpected keyword argument 'how'

In [37]:
for ind, each in temp.loc[:,['Full_Addre']].drop_duplicates().iterrows():
    print(each.Full_Addre)
    
    if ind==10:
        break

18248 HILLSIDE LN, DOLAND, SD 57436
244 ELM ST, HILL CITY, SD 57745
PO BOX 9823, BEND, OR 97708
10208 Bluegrass Road, Knoxville, TN 37922
7338 SOUTH ALTON WAY, CENTENNIAL, CO 80112
102 East Main Street, Camden, Tennessee 38320
312 CANTRIL STREET, CASTLE ROCK, CO 80104
400 HARVARD AVE, ROSEBURG, OR 97470
25 MILES E OF TUBA CITY HWY 160, TONALEA, AZ 86044
9333 IMPERIAL HWY, DOWNEY, CA 90242
49 MIDDLE HWY, BARRINGTON, RI 02806


In [102]:
obj[0]

OrderedDict([('address1', '1400 SOUTH SIXTH'),
             ('city', 'LOVINGTON'),
             ('state', 'NM'),
             ('zip_code', '88260')])

In [109]:
# 4) process address string through usaddress parser with Pub28 template

repo = pd.DataFrame()

# iter
for ind, each in OMOP_location.loc[:,['location_source_value']].drop_duplicates().iterrows():

    # try Pub28 parsing
    try:
        obj = usaddress.tag(each.location_source_value, tag_mapping=Pub28_usaddre_template)
        
        # staging
        tmp = pd.DataFrame(obj[0], columns=obj[0].keys(), index=[ind])
        tmp['Address_type'] = obj[1]
        
        # development
        OMOP_location.loc[ind, 'address_1'] = tmp['address1']
        OMOP_location.loc[ind, 'city'] = tmp['city']
        OMOP_location.loc[ind, 'state'] = tmp['state']
        OMOP_location.loc[ind, 'zip'] = tmp['zip_code']
        OMOP_location.loc[ind, 'address_type']=tmp['Address_type']

        repo = repo.append(tmp)
    
    except:
        print(ind, each.location_source_value)

    
    #print(obj[0])
    
    if ind==15:
        break
        
repo

0 18248 HILLSIDE LN, DOLAND, SD 57436
1 244 ELM ST, HILL CITY, SD 57745
2 PO BOX 9823, BEND, OR 97708
3 10208 Bluegrass Road, Knoxville, TN 37922
4 7338 SOUTH ALTON WAY, CENTENNIAL, CO 80112
5 102 East Main Street, Camden, Tennessee 38320
6 312 CANTRIL STREET, CASTLE ROCK, CO 80104
7 400 HARVARD AVE, ROSEBURG, OR 97470
8 25 MILES E OF TUBA CITY HWY 160, TONALEA, AZ 86044
9 9333 IMPERIAL HWY, DOWNEY, CA 90242
10 49 MIDDLE HWY, BARRINGTON, RI 02806
11 131 S. MAIN STREET, PIMA, AZ 85543
12 2828 BLANFORD AVE SW, ALBUQUERQUE, NM 87121
13 PO BOX 2745, GLOBE, AZ 85502
14 2500 CORONA DR NW, ALBUQUERQUE, NM 87120
15 1400 SOUTH SIXTH, LOVINGTON, NM 88260


In [111]:
OMOP_location.loc[0]

Location_id                                                1
address_1                                                NaN
address_2                                                NaN
city                                                     NaN
state                                                    NaN
zip                                                      NaN
county                                                   NaN
location_source_value    18248 HILLSIDE LN, DOLAND, SD 57436
latitude                                                 NaN
longitude                                                NaN
address_type                                             NaN
Name: 0, dtype: object