# Identify Business Parcels

In [1]:
import numpy as np
import pandas as pd
import re
import usaddress
from collections import OrderedDict

# Common DGLIM utilities
import os, sys
sys.path.insert(1, os.path.join(sys.path[0], '..'))
import dglim
dglim.setProjectPath('../../')

%matplotlib inline

## Read and Format Data

In [2]:
def formatUnit(unit):
    unit = str(unit).upper();

    # Remove anthing after an & sign - e.g. for STE A & B, the parcel is most likely associated with just STE A
    unit = unit.split('&')[0]

    # Remove special characters
    unit = re.sub(r'[^\w ]', '', unit)

    # Remove white space
    unit = re.sub(r'[\s]', '', unit).strip()

    # Shorten whole words to their abbreviations
    translation_map = OrderedDict([
        ('SUITE', 'STE'),
        ('SUIT', 'STE'),
        ('STES', 'STE'),
        ('FLOOR', 'FL'),
        ('STREET', 'ST'),
        ('COURT', 'CT')
    ])
    for x in translation_map:
        unit = unit.replace(x, translation_map[x])
    
    # Since addresses usually won't have mixed unit types, let's get rid of it
    unit_labels = [
        'APT',
        'BLDG',
        'LOT',
        'RM',
        'STE',
        'UNIT',
    ]
    for x in unit_labels:
        unit = unit.replace(x, '')
    
    # Remove excess white space
    unit = ' '.join(unit.split())

    return unit

In [3]:
# Alachua County Fire Rescue data that maps addresses to parcel numbers
acfr_df = pd.read_csv(dglim.datasets_path + 'ACFR_Addressess.csv')
acfr_df['UNIT'] = acfr_df['UNIT'].apply(formatUnit)

# Extract addresses and parcels for addresses in Gainesville
address_parcels_df = acfr_df[acfr_df['CITY'] == 'GAINESVILLE'][[
    'ADDRNUM',
    'ROADPREDIR',
    'ROADNAME',
    'ROADTYPE',
    'UNIT',
    'PARCEL'
]].rename(columns={
    'PARCEL': 'Parcel'
}).astype(str)

# Remove trailing spaces
# * This is important because empty values are ' ' rather than ''
address_parcels_df = address_parcels_df.apply(lambda x:
    x.apply(lambda y: y.rstrip()))

# Remove rows with missing data
bad_parcels = (address_parcels_df['Parcel'].isin([' ', '<New parcel>', np.nan]))
address_parcels_df = address_parcels_df[~bad_parcels]
address_parcels_df['Parcel'] = address_parcels_df['Parcel'].apply(lambda x : x.replace('-',''))

address_parcels_df.drop_duplicates(inplace=True)
address_parcels_df[:1]

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,ADDRNUM,ROADPREDIR,ROADNAME,ROADTYPE,UNIT,Parcel
0,4104,NW,13TH,PL,,6397057000


In [4]:
def getAddressTags(address):
    columns = ['Address', 'ADDRNUM', 'ROADPREDIR', 'ROADNAME', 'ROADTYPE', 'UNIT']
    tag_dict = dict.fromkeys(columns, '')

    try:
        tags = usaddress.tag(address)[0]
#         tags = usaddress.tag(address.split(',')[0])[0]
    except:
        if ',' in address:
            return getAddressTags(address.split(',')[:-1])
        else:
            return tag_dict

    tag_dict['Address'] = address
    tag_dict['ADDRNUM'] = tags['AddressNumber'].lstrip('0') if 'AddressNumber' in tags else ''
    tag_dict['ROADPREDIR'] = tags['StreetNamePreDirectional'] if 'StreetNamePreDirectional' in tags else ''
    tag_dict['ROADNAME'] = tags['StreetName'] if 'StreetName' in tags else ''
    tag_dict['ROADTYPE'] = formatUnit(tags['StreetNamePostType']) if 'StreetNamePostType' in tags else ''

    if 'OccupancyType' in tags and 'OccupancyIdentifier' in tags:
        tag_dict['UNIT'] = formatUnit(tags['OccupancyType'] + ' ' + tags['OccupancyIdentifier']) # e.g. APT 15
    else:
        tag_dict['UNIT'] = ''

    return tag_dict;

In [5]:
# Read business data
business_locations_df = dglim.loadData('Active Business Locations')

# Extract address components
merged_locations_df = business_locations_df[['DGLIM ID', 'Address']].drop_duplicates(subset=['Address']).set_index(['DGLIM ID'])
address_parts_df = (merged_locations_df['Address'].apply(getAddressTags)).apply(pd.Series).reset_index()

# Match addresses with parcels
merged_df = pd.merge(left=address_parts_df, right=address_parcels_df,
                     how='left', on=['ADDRNUM', 'ROADPREDIR', 'ROADNAME', 'ROADTYPE', 'UNIT'],
                     ).drop_duplicates(subset=['DGLIM ID']).set_index('DGLIM ID')

business_parcels_df = merged_df[['Parcel']].dropna()

# Save data
dglim.saveData(business_parcels_df, 'Active Business Parcels')

In [11]:
temp_df = business_parcels_df[business_parcels_df['Parcel'] != '']

total = len(merged_locations_df)
matched = len(business_parcels_df)
duplicates = matched - len(business_parcels_df['Parcel'].drop_duplicates())
print matched, '\tmatched'
print total - matched, '\tstill missing'
print duplicates, '\tduplicates'

counts = business_parcels_df['Parcel'].value_counts()
print len(counts[counts > 1]), '\tparcels are shared'
print sum(counts[counts > 1]), '\tbusinesses share a parcel with at least one other business'

3447 	matched
887 	still missing
983 	duplicates
313 	parcels are shared
1296 	businesses share a parcel with at least one other business
