In [1]:
import spacy
from spacy.matcher import Matcher
import pandas as pd
import numpy as np
import re
from spacy import displacy

In [2]:
def massage_data(address):
    """Normalise the punctuation of a raw address string.

    Steps, in order: add a space after every comma that is not already
    followed by whitespace, replace each literal backslash-n sequence with
    ", ", pad every hyphen that is not followed by whitespace to " - ", and
    remove periods.

    Args:
        address: The raw address. Anything that is not a ``str`` (for example
            the NaN pandas uses for a missing cell) is not processed.

    Returns:
        The cleaned string, or ``None`` when ``address`` is not a string.
    """
    # isinstance (instead of an exact type comparison) also accepts str
    # subclasses such as numpy.str_.
    if not isinstance(address, str):
        return None
    cleansed = re.sub(r',(?!\s)', ', ', address) # add space after comma if not followed by space
    # Matches the two-character sequence backslash + "n", not a real newline character.
    cleansed = re.sub(r'\\n', ', ', cleansed)
    # Pad hyphens that are not followed by whitespace. The previous pattern also
    # had a (?!\s) lookahead in front of the hyphen; it could never fail because
    # the character it inspected was the hyphen itself, so it is dropped.
    cleansed = re.sub(r'-(?!\s)', ' - ', cleansed)
    return cleansed.replace('.', '') # remove period

In [3]:
# Load spaCy's small English pipeline; the matchers in this notebook are built on its vocabulary.
nlp = spacy.load("en_core_web_sm")

In [4]:
# Stack the train and test CSVs into one frame; ignore_index=True renumbers the rows 0..n-1.
data = pd.concat([pd.read_csv("data/us-train-dataset.csv"), pd.read_csv("data/us-test-dataset.csv")],axis=0,ignore_index=True)

In [5]:
# Preview the first rows of the combined dataset.
data.head()

Unnamed: 0,Address,Building_Name,Building_Number,City,Recipient,Street_Name,Zip_Code,State,Country
0,"19 ST ANDREW ST, BULRINGTON, VT, 05401,, Unite...",,19.0,BULRINGTON,,ST ANDREW ST,5401,VT,United States
1,"2574 EAST 23RD STREE, CHATTANOOGA, TN 37404, U...",,2574.0,CHATTANOOGA,,EAST 23RD STREE,37404,TN,United States
2,"5931 W ANGELA RD, MEMPHIS, TN 38120, United St...",,5931.0,MEMPHIS,,W ANGELA RD,38120,TN,United States
3,"3812 MYERS STREET, GREENEVILLE, TN 37743, Unit...",,3812.0,GREENEVILLE,,MYERS STREET,37743,TN,United States
4,"HWY 33 BY-PASS BOX, DYERSBURG, TN 38024, Unite...",,,DYERSBURG,,HWY 33 BY-PASS,38024,TN,United States


In [6]:
# Normalise every raw address; cells that are not strings come back as None.
addresses = data.Address.apply(massage_data)

In [7]:
# Handling country names

In [8]:
data.Country.unique() # Distinct spellings of the country in the labelled data

array(['United States', 'USA', 'US'], dtype=object)

In [9]:
# Token patterns for the country. LOWER compares the lower-cased token text,
# so every pattern is case-insensitive; the single-token "us" pattern therefore
# also matches the ordinary word "us".
country_matcher = Matcher(nlp.vocab) # Creating a matcher object
country_patterns = [
    [{"LOWER": "united"}, {"LOWER": "states"}], # United States
    [{"LOWER": "united"}, {"LOWER": "states"},{"LOWER": "of"}, {"LOWER": "america"}], # United States of America
    [{"LOWER": "us"}], # US
    [{"LOWER": "usa"}], # USA
]
country_matcher.add("COUNTRY", country_patterns) # Adding the patterns to the matcher

In [10]:
countries = [] # One entry per address: the matched country text, or None

In [11]:
# Run the country matcher over every address and record the first match.
# NOTE: this appends to `countries`; re-running this cell without re-creating
# the list first makes it longer than `data`.
for i in range(len(addresses)):
    address = addresses[i] # Getting the address (label lookup on the 0..n-1 index)
    doc = nlp(address) # Creating a doc object
    matches = country_matcher(doc) # Getting the matches as (match_id, start, end) token indices
    if matches: # If there are matches
        country = doc[matches[0][1]:matches[0][2]].text # Text of the first match
        countries.append(country) # Appending the country to the list
    else: # If there are no matches
        countries.append(None) # Appending None to the list

In [12]:
(data.Country==countries).all() # True only if every extracted country equals the labelled one

True

In [13]:
# Handling state names

In [14]:
states = pd.read_csv("data/state-abbrevs.csv") # Lookup table of state names and their abbreviations

In [15]:
# Preview the state lookup table.
states.head() 

Unnamed: 0,state,abbreviation
0,Alabama,AL
1,Alaska,AK
2,Arizona,AZ
3,Arkansas,AR
4,California,CA


In [16]:
# Full state names are matched case-insensitively, one LOWER token per word of the name.
# Abbreviations are matched on the exact token text (TEXT), so they are case-sensitive.
state_matcher = Matcher(nlp.vocab) # Creating a matcher object
state_full_patterns = [[{"LOWER": part.lower()} for part in state.split()] for state in states.state] # Creating a list of patterns for the state names
state_abbrev_patterns = [[{"TEXT": state}] for state in states.abbreviation] # Creating a list of patterns for the state abbreviations
state_patterns = state_full_patterns + state_abbrev_patterns # Combining the two lists
state_matcher.add("STATE", state_patterns) # Adding the patterns to the matcher

In [17]:
# Dry run: find the country, then the state match "closest" to it. Nothing is stored.
for i in range(len(addresses)):
    address = addresses[i]
    doc = nlp(address)
    matches = country_matcher(doc)
    index = len(address) # Anchor used when no country is found: the end of the string
    if matches:
        country = doc[matches[0][1]:matches[0][2]].text
        index = address.find(country) # Character offset of the country text

    matches = state_matcher(doc) # Getting the matches
    if matches: # If there are matches
        intervals = [[match[1],match[2]] for match in matches] # (start, end) token indices of each match
        # NOTE(review): index is a character offset while x[0] is a token index, so
        # this is not a true distance; whenever index exceeds every token index it
        # simply puts the latest-starting match first.
        intervals.sort(key=lambda x: abs(index-x[0])) # Sorting the intervals based on the distance from the country
        state = doc[intervals[0][0]:intervals[0][1]].text # Getting the state which is closest to the country
        index = address.find(state) # Getting the index at which state is found

In [18]:
# Handling zip codes

In [19]:
# ZIP code patterns. The regexes are raw strings so that "\d" reaches the regex
# engine as written instead of being an invalid string escape (which newer
# Python versions warn about); the patterns themselves are unchanged.
zipcode_matcher = Matcher(nlp.vocab) # Creating a matcher object
zipcode_patterns = [
    [{'TEXT':{'REGEX':r'^\d{5}$'}}], # Zip code with 5 digits
    [{'TEXT':{'REGEX':r'^\d{9}$'}}], # Zip code with 9 digits (if dash is omitted)
    [{"TEXT":{'REGEX':r'^\d{5}$'}},{'TEXT':'-'},{'TEXT':{'REGEX':r'^\d{4}$'}}], # Zip code with 5 digits and 4 digits after the dash
]
zipcode_matcher.add("ZIPCODE", zipcode_patterns) # Adding the patterns to the matcher

In [20]:
# Dry run: country -> state -> zip code. Nothing is stored.
for i in range(len(addresses)):
    address = addresses[i]
    doc = nlp(address)
    
    matches = country_matcher(doc)
    index = len(address) # Anchor used when no country is found: the end of the string
    if matches:
        country = doc[matches[0][1]:matches[0][2]].text
        index = address.find(country) # Character offset of the country text

    matches = state_matcher(doc)
    if matches:
        intervals = [[match[1],match[2]] for match in matches]
        # NOTE(review): character offset minus token index; see the zip code sort below.
        intervals.sort(key=lambda x: abs(index-x[0]))
        state = doc[intervals[0][0]:intervals[0][1]].text
        index = address.find(state) # Character offset of the state text
    
    matches = zipcode_matcher(doc) # Getting the matches
    if matches: # If there are matches
        intervals = [[match[1],match[2]] for match in matches] # Getting the intervals
        # Two stable sorts: the second key (distance) is primary, the first
        # (x[0]-x[1], i.e. longest match first) only breaks ties.
        intervals.sort(key=lambda x: x[0]-x[1]) # Sorting the intervals based on the length of the zip code
        # NOTE(review): index is a character offset while x[0] is a token index.
        intervals.sort(key=lambda x: abs(index-x[0])) # Sorting the intervals based on the distance from the state
        zipcode = doc[intervals[0][0]:intervals[0][1]].text # Getting the longest zip code which is closest to the state

In [21]:
# Handling city names

In [22]:
city_names = pd.read_csv("data/city_names.csv") # City list; the 'State Name' and 'City Name' columns are used below

In [23]:
# One Matcher per state, keyed by the lower-cased state name, holding a
# case-insensitive pattern (one LOWER token per word) for each city of that state.
# NOTE: the loop variable `state` stays defined in the notebook namespace afterwards.
city_matchers = {} # Creating a dictionary to store the matchers
for state in city_names['State Name'].unique(): # Iterating over the states
    city_matchers[state.lower()] = Matcher(nlp.vocab) # Creating a matcher object for each state
    city_patterns = [[{"LOWER": part.lower()} for part in city.split()] for city in city_names[city_names['State Name'].str.lower()==state.lower()]['City Name']] # Creating a list of patterns for the city names
    city_matchers[state.lower()].add("CITY", city_patterns) # Adding the patterns to the matcher

In [24]:
# Dry run: country -> state -> zip code -> city. Nothing is stored.
for i in range(len(addresses)):
    address = addresses[i]
    doc = nlp(address)
    
    matches = country_matcher(doc)
    index = len(address) # Anchor used when no country is found: the end of the string
    if matches:
        country = doc[matches[0][1]:matches[0][2]].text
        index = address.find(country)

    # Reset for every address: without this, an address with no state match
    # would silently reuse the state left over from the previous row.
    state = None
    matches = state_matcher(doc)
    if matches:
        intervals = [[match[1],match[2]] for match in matches]
        intervals.sort(key=lambda x: abs(index-x[0]))
        state = doc[intervals[0][0]:intervals[0][1]].text
        index = address.find(state)
    
    matches = zipcode_matcher(doc)
    if matches:
        intervals = [[match[1],match[2]] for match in matches]
        intervals.sort(key=lambda x: x[0]-x[1])
        intervals.sort(key=lambda x: abs(index-x[0]))
        zipcode = doc[intervals[0][0]:intervals[0][1]].text
    
    if state is None: # The city matchers are per state, so no state means no city lookup
        continue
    if len(state)==2: # If the state is 2 letters (abbreviation)
        state = states[states.abbreviation==state].state.values[0] # Getting the full state name
    matches = city_matchers[state.lower()](doc) # Getting the matches
    if matches: # If there are matches
        intervals = [[match[1],match[2]] for match in matches] # Getting the intervals
        # Two stable sorts: the second key (length, longest first) is primary,
        # the first (distance) only breaks ties.
        intervals.sort(key=lambda x: abs(index-x[0])) # Sorting the intervals based on the distance from the state
        intervals.sort(key=lambda x: x[0]-x[1]) # Sorting the intervals based on the length of the city name
        city = doc[intervals[0][0]:intervals[0][1]].text # Getting the longest city name; distance breaks ties

In [25]:
# Handling recipient names

In [26]:
# Recipient pattern: the three tokens "c", "/", "o" followed by one or more
# tokens whose tag matches the regex below.
# NOTE(review): "|" binds looser than the anchors, so the regex reads as
# "^NNP" OR "NN" anywhere OR "CC$" and therefore also accepts tags such as NNS
# and NNPS. Tightening it to ^(NNP|NN|CC)$ would change which names are captured.
recipent_matcher = Matcher(nlp.vocab) # Creating a matcher object
recipent_patterns = [
    [{"LOWER":"c"},{"LOWER":"/"},{"LOWER":"o"},{'TAG':{'REGEX':'^NNP|NN|CC$'},'OP':'+'}] # Recipient name is preceded by "c/o"
]
recipent_matcher.add("RECIPENT", recipent_patterns) # Adding the patterns to the matcher

In [27]:
# Dry run: country -> state -> zip code -> city -> recipient. Nothing is stored.
for i in range(len(addresses)):
    address = addresses[i]
    doc = nlp(address)
    
    matches = country_matcher(doc)
    index = len(address) # Anchor used when no country is found: the end of the string
    if matches:
        country = doc[matches[0][1]:matches[0][2]].text
        index = address.find(country)

    # Reset for every address: without this, an address with no state match
    # would silently reuse the state left over from the previous row.
    state = None
    matches = state_matcher(doc)
    if matches:
        intervals = [[match[1],match[2]] for match in matches]
        intervals.sort(key=lambda x: abs(index-x[0]))
        state = doc[intervals[0][0]:intervals[0][1]].text
        index = address.find(state)
    
    matches = zipcode_matcher(doc)
    if matches:
        intervals = [[match[1],match[2]] for match in matches]
        intervals.sort(key=lambda x: x[0]-x[1])
        intervals.sort(key=lambda x: abs(index-x[0]))
        zipcode = doc[intervals[0][0]:intervals[0][1]].text
    
    if state is not None: # The city matchers are per state, so no state means no city lookup
        if len(state)==2:
            state = states[states.abbreviation==state].state.values[0]
        matches = city_matchers[state.lower()](doc)
        if matches:
            intervals = [[match[1],match[2]] for match in matches]
            intervals.sort(key=lambda x: abs(index-x[0]))
            intervals.sort(key=lambda x: x[0]-x[1])
            city = doc[intervals[0][0]:intervals[0][1]].text
            address = address.replace(city,"")
    
    matches = recipent_matcher(doc) # Getting the matches
    if matches: # If there are matches
        intervals = [[match[1],match[2]] for match in matches] # Getting the intervals
        intervals.sort(key=lambda x: x[0]-x[1]) # Sorting the intervals based on the length of the recipent name
        recipent = doc[intervals[0][0]:intervals[0][1]].text # Getting the longest recipent name
        address = address.replace(recipent,"") # Replacing the recipent name with an empty string

In [28]:
# Handling street names

In [29]:
street_suffices = pd.read_csv("data/street_suffix.csv",header=None) # Reading the street suffixes (the file has no header row)
street_suffices = street_suffices[0].str.lower().to_list()  # First column, lower-cased, as a plain list

In [30]:
def get_suffix(street_name): # Function to get the suffix of the street name
    """Return the last word of a tokenised street name.

    Args:
        street_name: The list produced by splitting a street name on
            whitespace. Any other value (for example NaN for a missing
            street) is not processed.

    Returns:
        The final list element, or ``None`` when ``street_name`` is not a
        list or is an empty list (a blank street name splits to ``[]``).
    """
    if isinstance(street_name, list) and street_name:
        return street_name[-1] # Getting the last element of the list
    return None

In [31]:
# Last word of every labelled street name, lower-cased, minus the known suffixes.
set(data.Street_Name.str.split().apply(get_suffix).str.lower().unique())-set(street_suffices) # Getting the unique street suffixes not in the street_suffices list

{'11-e',
 None,
 'broadway',
 'by-pass',
 'e',
 'hwy',
 'n',
 's',
 'stre',
 'stree',
 'sw',
 'varina',
 'w',
 'west'}

In [32]:
# Adding the required street suffixes to the street_suffices list. Entries that
# are already present are skipped, so re-running this cell does not add duplicates.
street_suffices += [suffix for suffix in ['stre','stree','by-pass','hwy'] if suffix not in street_suffices]

In [33]:
# Street patterns: six pattern shapes are generated for every suffix in
# street_suffices, plus one suffix-independent "direction + word" pattern.
# The two regexes containing backslashes are raw strings so that "\." and "\d"
# are not invalid string escapes; the regexes themselves are unchanged.
# NOTE(review): in '^NNP|IN$' the "|" binds looser than the anchors, so the tag
# regex reads as "starts with NNP" OR "ends with IN".
street_matcher = Matcher(nlp.vocab) # Creating a matcher object
street_suffix_patterns = [
    [{'TAG':{'REGEX':'^NNP|IN$'},'OP':'*'}, {"LOWER": suffix}, {'LIKE_NUM':True,'OP':'?'}, {"LOWER":{"REGEX":"^(north|south|east|west|n|s|e|w|ne|nw|se|sw)?$"},'OP':'?'}] for suffix in street_suffices] + [ # Matches street name optionally followed by a number and by a direction
    [{'TAG':{'REGEX':'^NNP|IN$'},'OP':'*'}, {"LOWER": suffix}, {'LIKE_NUM':True,'OP':'?'}, {'LOWER':'-'}, {"LOWER":{"REGEX":"^(north|south|east|west|n|s|e|w|ne|nw|se|sw)$"}}] for suffix in street_suffices] + [ # Matches street name optionally followed by a number, then a dash and a direction
    [{"LOWER":{"REGEX":"^(north|south|east|west|n|s|e|w|ne|nw|se|sw)?$"},'OP':'?'}, {'TAG':{'REGEX':'^NNP|IN$'},'OP':'*'}, {"LOWER": suffix}] for suffix in street_suffices] + [ # Matches street name optionally preceded by a direction
    [{'LIKE_NUM':False},{'LOWER':'state','OP':'?'}, {"LOWER": suffix}, {"LOWER":{"REGEX":"^(north|south|east|west|n|s|e|w|ne|nw|se|sw)?$"},'OP':'?'}] for suffix in street_suffices] + [ # Matches a non-numeric token, optional "state", the suffix and an optional direction
    [{'TAG':{'REGEX':'^NNP|IN$'},'OP':'*'}, {"LOWER": suffix}, {'LIKE_NUM':True,'OP':'?'}, {'LOWER':'by'},{'LOWER':'-'},{'LOWER':'pass'}] for suffix in street_suffices] + [ # Matches street name optionally followed by a number, then "by - pass"
    [{"LOWER":{"REGEX":r"^(north|south|east|west|n|s|e|w|ne|nw|se|sw|w\.|n\.|s\.|e\.)?$"},'OP':'?'}, {"LOWER":{"REGEX":r"^\d+(([tT][hH])|([rRnD][dD])|([sS][tT]))$"},'OP':'?'}, {"LOWER": suffix}] for suffix in street_suffices] + [ # Matches an optional direction, an optional ordinal (e.g. 23rd) and the suffix
    [{"LOWER":{"REGEX":r"^(north|south|east|west|n|s|e|w|ne|nw|se|sw|w\.|n\.|s\.|e\.)$"}}, {'LIKE_NUM':False,'IS_PUNCT':False}]] # Matches a direction followed by one token that is neither number-like nor punctuation
street_matcher.add("STREET", street_suffix_patterns) # Adding the street patterns to the matcher object

In [34]:
#########################################################
# This code is explained in the last section of this file
#########################################################
# Dry run of the whole pipeline up to the street. Each component that is found
# is deleted from `address` with str.replace, which removes every occurrence of
# that text, including occurrences inside other words. Nothing is stored.
for i in range(len(addresses)):
# for i in range(1):
    address = addresses[i]
    # address = massage_data("SUITE 87, 3312 JEFFERSON BLVD,HOUSTON,TX,77024,US")
    doc = nlp(address)
    
    country = ''
    matches = country_matcher(doc)
    index = len(address) # Anchor used when no country is found: the end of the string
    if matches:
        country = doc[matches[0][1]:matches[0][2]].text # First country match
        index = address.find(country) # Character offset of the country text
        address = address.replace(country,"")

    state = ''
    matches = state_matcher(doc)
    if matches:
        intervals = [[match[1],match[2]] for match in matches]
        # NOTE(review): index is a character offset while x[0] is a token index.
        intervals.sort(key=lambda x: abs(index-x[0]))
        state = doc[intervals[0][0]:intervals[0][1]].text
        index = address.find(state) # Character offset of the state in the already-edited address
        address = address.replace(state,"")
    
    zipcode = ''
    matches = zipcode_matcher(doc)
    if matches:
        intervals = [[match[1],match[2]] for match in matches]
        # Two stable sorts: distance is the primary key, longest match breaks ties.
        intervals.sort(key=lambda x: x[0]-x[1])
        intervals.sort(key=lambda x: abs(index-x[0]))
        zipcode = doc[intervals[0][0]:intervals[0][1]].text
        address = address.replace(zipcode,"")
    
    city = ''
    if len(state)==2: # Two-letter state: translate the abbreviation to the full name
        state = states[states.abbreviation==state].state.values[0]
    
    if state!='':
        matches = city_matchers[state.lower()](doc) # Only the cities of the detected state are considered
    else:
        matches = []
    if matches:
        intervals = [[match[1],match[2]] for match in matches]
        # Two stable sorts: distance is the primary key, longest match breaks ties.
        intervals.sort(key=lambda x: x[0]-x[1])
        intervals.sort(key=lambda x: abs(index-x[0]))
        city = doc[intervals[0][0]:intervals[0][1]].text
        address = address.replace(city,"")
    
    recipent = ''
    matches = recipent_matcher(doc)
    if matches:
        intervals = [[match[1],match[2]] for match in matches]
        intervals.sort(key=lambda x: x[0]-x[1]) # Longest match first
        recipent = doc[intervals[0][0]:intervals[0][1]].text
        address = address.replace(recipent,"")
    
    street = ''
    doc = nlp(address) # Re-tokenise what is left after the removals above
    matches = street_matcher(doc)
    if matches:
        intervals = [[match[1],match[2]] for match in matches]
        # NOTE(review): index still refers to the earlier string, x[0] to tokens of the new doc.
        intervals.sort(key=lambda x: abs(index-x[0]))
        streets = [doc[intervals[i][0]:intervals[i][1]].text for i in range(len(intervals))] # NB: this comprehension uses its own i
        l = len(streets)
        i_ind = 0
        # Drop every candidate that is a proper substring of another candidate.
        while i_ind < l:
            for j_ind in range(l):
                st=streets[j_ind]
                street=streets[i_ind]
                if (st.find(street)>-1)&(st!=street):
                    streets.remove(street)
                    i_ind-=1 # Stay on the same position: the list shifted left
                    l-=1
                    break
            i_ind+=1
        i_ind = 0
        # Drop candidates containing "center"/"centre" while more than one candidate is left.
        while i_ind < l:
            if ((streets[i_ind].lower().find("center")>-1)|(streets[i_ind].lower().find("centre")>-1))&(l>1):
                streets.remove(streets[i_ind])
                i_ind-=1
                l-=1
            i_ind+=1
        street = max(streets,key=len) # Longest remaining candidate
        address = address.replace(street,"")
        # if ((street!=massage_data(data.Street_Name[i]))&(type(street)==str)&(type(data.Street_Name[i])==str)):
        #     print(country)
        #     # print(data.Country[i])
        #     print(state)
        #     # print(data.State[i])
        #     print(city)
        #     # print(data.City[i])
        #     print(zipcode)
        #     # print(data.Zip_Code[i])
        #     print(recipent)
        #     # print(data.Recipient[i])
        #     print(street)
        #     # print(massage_data(data.Street_Name[i]))
        #     print(address)
        #     # print(data.Address[i])
        #     print("\n")
    
    # print(country)
    # print(state)
    # print(city)
    # print(zipcode)
    # print(recipent)
    # print(street)
    # print(address)
    # print(data.Building_Number[i])
    # print()

In [35]:
# Handling building numbers

In [36]:
# A building number is any token made up of digits only. The regex is a raw
# string so that "\d" is not an invalid string escape; the pattern is unchanged.
building_number_matcher = Matcher(nlp.vocab) # Initializing the matcher object
building_number_patterns = [
    [{"LOWER":{"REGEX":r"^\d+$"}},] # Matches numbers
]
building_number_matcher.add("BUILDING_NUMBER", building_number_patterns) # Adding the building number patterns to the matcher object

In [37]:
# Building names: "building"/"bldg" plus the token that follows it, or one or
# more tokens tagged exactly NN followed by "building"/"bldg".
building_name_matcher = Matcher(nlp.vocab) # Initializing the matcher object
building_name_patterns = [ 
    [{"LOWER":"building"},{},], # Matches "building" followed by any single token
    [{"LOWER":"bldg"},{},], # Matches "bldg" followed by any single token
    [{'TAG':{'REGEX':'^NN$'},'OP':'+'}, {"LOWER":"building"},], # Matches one or more NN tokens followed by "building"
    [{'TAG':{'REGEX':'^NN$'},'OP':'+'}, {"LOWER":"bldg"},], # Matches one or more NN tokens followed by "bldg"
]
building_name_matcher.add("BUILDING_NAME", building_name_patterns) # Adding the building name patterns to the matcher object

In [38]:
# Rebuild the normalised addresses from the raw column before running the final extractor.
addresses = data.Address.apply(massage_data) 

In [39]:
# "po box <digits>", matched case-insensitively.
_PO_BOX_RE = re.compile(r"po box \d+", re.IGNORECASE)

# Unit designators stripped before tokenising, applied in this order:
# "suite <n>", "ste <n>", "#<n>" / "# <n>" and "fl <n>".
# The floor pattern accepts at most three digits and refuses to stop in the
# middle of a longer number, so "FL 33613" (Florida followed by a ZIP code) is
# left alone instead of being deleted as a floor number.
_UNIT_DESIGNATOR_RES = tuple(
    re.compile(pattern, re.IGNORECASE)
    for pattern in (r"suite \d+", r"\bste \d+", r"# ?\d+", r"\bfl \d{1,3}(?!\d)")
)


def _match_span(doc, match):
    """Return the spaCy span covered by one Matcher result (match_id, start, end)."""
    return doc[match[1]:match[2]]


def _cut_spans(text, spans):
    """Return text with the given (start_char, end_char) ranges removed; ranges may overlap."""
    pieces = []
    cursor = 0
    for start, end in sorted(spans):
        if start > cursor:
            pieces.append(text[cursor:start])
        cursor = max(cursor, end)
    pieces.append(text[cursor:])
    return "".join(pieces)


def get_address_details(address):
    """Split one address string into its components.

    Components are searched in a fixed order: country, state, zip code, city
    and recipient on the first tokenisation; street (and a "center"/"centre"
    building name) after those have been cut out; then building name and
    building number, each on what is left. A component is removed by cutting
    out exactly the characters of its match, so text that merely contains the
    same letters elsewhere (state "LA" inside "ANGELA") is left intact.

    Args:
        address: The address text (a string).

    Returns:
        ``[country, state, city, zipcode, recipent, street, building_number,
        building_name]``; each entry is the matched text or ``np.nan`` when
        nothing was found.
    """
    address = _PO_BOX_RE.sub("", address) # Removing "po box" followed by a number
    for unit_re in _UNIT_DESIGNATOR_RES: # Removing suite / ste / # / fl followed by a number
        address = unit_re.sub("", address)

    doc = nlp(address) # Tokenizing the address
    consumed = [] # Character ranges of the components found in this doc

    # All distances below are measured in tokens, between match start indices.
    country = np.nan # Initializing the country
    anchor = len(doc) # With no country, the anchor is the end of the address
    matches = country_matcher(doc) # Getting the country
    if matches: # If the address contains a country
        span = _match_span(doc, matches[0]) # The first match is used
        country = span.text
        anchor = span.start
        consumed.append((span.start_char, span.end_char))

    state = np.nan # Initializing the state
    matches = state_matcher(doc) # Getting the state
    if matches: # If the address contains a state
        # State closest to the country; min() keeps the first match on ties
        span = _match_span(doc, min(matches, key=lambda m: abs(anchor - m[1])))
        state = span.text
        anchor = span.start # Later components are located relative to the state
        consumed.append((span.start_char, span.end_char))

    zipcode = np.nan # Initializing the zipcode
    matches = zipcode_matcher(doc) # Getting the zipcode
    if matches: # If the address contains a zipcode
        # Closest to the state; among equally close matches the longest wins
        span = _match_span(doc, min(matches, key=lambda m: (abs(anchor - m[1]), m[1] - m[2])))
        zipcode = span.text
        consumed.append((span.start_char, span.end_char))

    city = np.nan # Initializing the city
    matches = [] # Without a state no city is looked up
    if isinstance(state, str): # If the address contains a state
        state_name = state
        if len(state_name) == 2: # If the state is a two letter abbreviation
            state_name = states[states.abbreviation == state_name].state.values[0] # Getting the state name
        city_matcher = city_matchers.get(state_name.lower()) # None when the state has no city list
        if city_matcher is not None:
            matches = city_matcher(doc) # Getting the city matches of the state
    if matches: # If the address contains a city
        # Closest to the state; among equally close matches the longest wins
        span = _match_span(doc, min(matches, key=lambda m: (abs(anchor - m[1]), m[1] - m[2])))
        city = span.text
        consumed.append((span.start_char, span.end_char))

    recipent = np.nan # Initializing the recipent
    matches = recipent_matcher(doc) # Getting the recipent
    if matches: # If the address contains a recipent
        span = _match_span(doc, min(matches, key=lambda m: m[1] - m[2])) # Longest match
        recipent = span.text
        consumed.append((span.start_char, span.end_char))

    address = _cut_spans(address, consumed) # Removing everything found so far

    street = np.nan # Initializing the street
    building_name = np.nan # Initializing the building name
    doc = nlp(address) # Tokenizing the remaining address
    matches = street_matcher(doc) # Getting the street matches
    if matches: # If the address contains a street
        # Latest-starting candidates first (the state used to sit after the
        # street); the order only decides ties between equally long candidates.
        candidates = [_match_span(doc, m) for m in sorted(matches, key=lambda m: -m[1])]

        # Removing any candidate whose text is a proper substring of another candidate
        texts = [candidate.text for candidate in candidates]
        candidates = [
            candidate for candidate in candidates
            if not any(candidate.text != other and candidate.text in other for other in texts)
        ]

        # Setting aside candidates containing "center" or "centre", as long as
        # at least one candidate is left over
        kept = []
        centres = []
        remaining = len(candidates)
        for candidate in candidates:
            lowered = candidate.text.lower()
            if remaining > 1 and ("center" in lowered or "centre" in lowered):
                centres.append(candidate)
                remaining -= 1
            else:
                kept.append(candidate)

        best = max(kept, key=lambda candidate: len(candidate.text)) # Getting the longest street
        street = best.text
        removed = [(best.start_char, best.end_char)]
        lowered = street.lower()
        # If the street contains neither "center" nor "centre" and there are centres
        if centres and "center" not in lowered and "centre" not in lowered:
            centre = max(centres, key=lambda candidate: len(candidate.text)) # Longest centre becomes the building name
            building_name = centre.text
            removed.append((centre.start_char, centre.end_char))
        address = _cut_spans(address, removed) # Removing the street (and centre) from the address

    doc = nlp(address) # Tokenizing the address
    matches = building_name_matcher(doc) # Getting the building name matches
    if matches: # If the address contains a building name
        span = _match_span(doc, matches[0]) # The first match replaces any centre found above
        building_name = span.text
        address = _cut_spans(address, [(span.start_char, span.end_char)]) # Removing the building name

    building_number = np.nan # Initializing the building number
    doc = nlp(address) # Tokenizing the address
    matches = building_number_matcher(doc) # Getting the building number matches
    if matches: # If the address contains a building number
        building_number = _match_span(doc, matches[0]).text # Getting the first match of the building number

    return [country,state,city,zipcode,recipent,street,building_number,building_name] # Returning the address components

In [40]:
# Build one row per address: the normalised address followed by its eight components.
new_data = [] # List to store the new data
for i in range(len(addresses)): # Iterating over the addresses
    address = addresses[i] # Getting the address
    components = get_address_details(address) # Getting the address components
    new_data.append([address]+components) # Appending the data to the new data

In [41]:
# Column order follows the list returned by get_address_details, with the address first.
pd.DataFrame(new_data,columns=["address","country","state","city","zipcode","recipent","street","building_number","building_name"]) # Converting the new data to a dataframe

Unnamed: 0,address,country,state,city,zipcode,recipent,street,building_number,building_name
0,"19 ST ANDREW ST, BULRINGTON, VT, 05401, , Unit...",United States,VT,,05401,,ST ANDREW ST,19,
1,"2574 EAST 23RD STREE, CHATTANOOGA, TN 37404, U...",United States,TN,CHATTANOOGA,37404,,EAST 23RD STREE,2574,
2,"5931 W ANGELA RD, MEMPHIS, TN 38120, United St...",United States,TN,MEMPHIS,38120,,W ANGELA RD,5931,
3,"3812 MYERS STREET, GREENEVILLE, TN 37743, Unit...",United States,TN,GREENEVILLE,37743,,MYERS STREET,3812,
4,"HWY 33 BY - PASS BOX, DYERSBURG, TN 38024, Uni...",United States,TN,DYERSBURG,38024,,HWY 33 BY - PASS,,
...,...,...,...,...,...,...,...,...,...
139,"155 Epps Bridge Pkwy Bldg 100 Ste 201, ATHENS,...",US,GA,ATHENS,30606 - 3347,,Epps Bridge Pkwy,155,Bldg 100
140,"Haliday Bldg 3 Golly Ln Dept RB08, MIDDLE ISLA...",US,NY,MIDDLE ISLAND,11953 - 0102,,Golly Ln,,Bldg 3
141,"6319 Sta Point Ct # J, WINTER PARK, FL, 32792 ...",US,FL,WINTER PARK,32792 - 8214,,Sta Point Ct,6319,
142,"2376 Park View Ct Ste 240, OXNARD, CA, 93036 -...",US,CA,OXNARD,93036 - 5458,,Park View Ct,2376,


In [42]:
# Visualization

In [43]:
def get_address_span(address=None,address_component=None,label=None):
    """Locate one extracted component inside an address string.

    The component gets the same period and hyphen normalisation that is
    applied to addresses, and is then searched for literally (it is
    regex-escaped) as a whole word.

    Args:
        address: The text to search in.
        address_component: The component text, or NaN/None when the
            component was not extracted.
        label: The entity label to attach to the result.

    Returns:
        ``(start, end, label)`` with the character offsets of the first
        occurrence, or ``None`` when the address or the component is missing
        or the component does not occur in the address.
    """
    if address is None or pd.isna(address_component) or str(address_component)=='nan':
        # Nothing to look for
        return None
    component = address_component.replace('.', '') # remove period
    component = re.sub(r'-(?!\s)', ' - ', component) # pad hyphens not followed by whitespace
    # Lookarounds instead of \b: they also work when the component starts or
    # ends with punctuation, where \b would demand an adjacent word character.
    pattern = r'(?<!\w)' + re.escape(component) + r'(?!\w)'
    found = re.search(pattern, address)
    if found is None: # Component not present in this text
        return None
    return (found.start(), found.end(), label)

In [44]:
def extend_list(entity_list,entity):
    """Append ``entity`` to ``entity_list`` unless it is missing.

    The list is modified in place and also returned. ``None`` and NaN count
    as missing (per ``pd.isna``) and are skipped.
    """
    if not pd.isna(entity):
        entity_list.append(entity)
    return entity_list

In [45]:
def create_entity_spans(address,components):
    """Turn extracted components into labelled character spans.

    ``address`` is normalised with ``massage_data`` and each entry of
    ``components`` (in the order returned by ``get_address_details``) is
    looked up in it. Components that are missing are left out.

    Returns:
        ``(normalised_address, spans)`` where ``spans`` lists the
        ``(start, end, label)`` tuples in component order: country, state,
        city, zip code, recipient, street, building number, building name.
    """
    address = massage_data(address)
    # (position in components, entity label), listed in the order the lookups run
    lookups = [
        (7, 'BUILDING_NAME'),
        (6, 'BUILDING_NO'),
        (4, 'RECIPENT'),
        (5, 'STREET_NAME'),
        (3, 'ZIPCODE'),
        (2, 'CITY'),
        (1, 'STATE'),
        (0, 'COUNTRY'),
    ]
    tag_at = {}
    for position, entity_label in lookups:
        tag_at[position] = get_address_span(address,components[position],label=entity_label)

    spans = []
    for position in range(len(lookups)): # Emit in component order 0..7
        spans = extend_list(spans,tag_at[position])
    return (address,spans)

In [46]:
def visualize_components(address,components):
    """Render ``address`` with every address component highlighted.

    Args:
        address: The text that the character offsets refer to.
        components: Iterable of ``(start_char, end_char, label)`` tuples.
    """
    doc = nlp(address) #Construct a Doc object
    ents = []
    for start, end, label in components:
        span = doc.char_span(start, end, label=label)
        # char_span returns None when the offsets do not line up with token
        # boundaries; such a component is skipped because doc.ents cannot hold None.
        if span is not None:
            ents.append(span)
    doc.ents = ents
    # 'RECIPENT' and 'ZIPCODE' are the spellings create_entity_spans emits;
    # the 'RECIPIENT' and 'ZIP_CODE' keys are kept for callers that use those labels.
    colors = {'BUILDING_NAME': 'Aquamarine', 'BUILDING_NO': 'lavender', 'RECIPIENT': 'Coral', 'RECIPENT': 'Coral', 'STREET_NAME': 'yellow', 'ZIP_CODE': 'DarkSeaGreen', 'ZIPCODE': 'DarkSeaGreen', 'CITY': 'orange', 'COUNTRY': 'MistyRose', 'STATE': 'pink'}
    options = {"colors": colors}
    displacy.render(doc, style="ent", options=options)

In [47]:
# Hand-written sample addresses for the visualisation: a suite ("STE"), a
# ZIP+4 code, a PO box and two "C/O" recipients.
address_list = ["130 W BOSE ST STE 100, PARK RIDGE, IL, 60068, USA",
              "8311 MCDONALD RD, HOUSTON, TX, 77053-4821, USA",
              "PO Box 317, 4100 Hwy 20 E Ste 403, NICEVILLE, FL, 32578-5037, USA",
              "C/O Elon Musk Innovations Inc, 1548 E Florida Avenue, Suite 209, TAMPA, FL, 33613, USA",
              "Seven Edgeway Plaza, C/O Mac Dermott Inc, OAKBROOK TERRACE, IL, 60181, USA"]

In [48]:
# Extract, locate and render the components of every sample address.
for address in address_list:
    components = get_address_details(address) # Component texts, taken from the raw address
    address,components = create_entity_spans(address,components) # Normalised address plus (start, end, label) spans
    visualize_components(address,components)

In [49]:
def create_address_span(components, max_gap=3):
    """Merge component spans into contiguous ``ADDRESS`` spans.

    Components are processed in sorted order. A component joins the current
    address span when it starts fewer than ``max_gap`` characters after that
    span ends; otherwise it opens a new span. Every component ends up inside
    exactly one returned span, including a lone component and one that
    stands apart at the end.

    Args:
        components: Iterable of ``(start, end, label)`` tuples. It is not
            modified.
        max_gap: Smallest gap, in characters, that separates two spans.

    Returns:
        A list of ``(start, end, 'ADDRESS')`` tuples in ascending order;
        empty when ``components`` is empty.
    """
    merged = []
    for start, end, _ in sorted(components):
        if merged and start - merged[-1][1] < max_gap:
            # max() keeps the span from shrinking when a component is nested in it
            merged[-1][1] = max(merged[-1][1], end)
        else:
            merged.append([start, end])
    return [(start, end, 'ADDRESS') for start, end in merged]

In [50]:
def visualize_address(address,address_spans):
    """Render ``address`` with the given address spans highlighted.

    Args:
        address: The text that the character offsets refer to.
        address_spans: Iterable of ``(start_char, end_char, label)`` tuples.
    """
    doc = nlp(address) #Construct a Doc object
    ents = []
    for start, end, label in address_spans:
        span = doc.char_span(start, end, label=label)
        # char_span returns None when the offsets do not line up with token
        # boundaries; such a span is skipped because doc.ents cannot hold None.
        if span is not None:
            ents.append(span)
    doc.ents = ents
    colors = {'ADDRESS': 'SKYBLUE'}
    options = {"colors": colors}
    displacy.render(doc, style="ent", options=options)

In [51]:
# End-to-end demo on a sentence that embeds an address in ordinary text.
address = "My name is Lalitha, I live in 2574 EAST 23RD STREE, CHATTANOOGA, TN 37404, United States"
components = get_address_details(address) # Component texts
address,components = create_entity_spans(address,components) # Normalised text plus (start, end, label) spans
address_spans = create_address_span(components) # Neighbouring component spans merged into ADDRESS spans
visualize_address(address,address_spans)