# Fix Active Business Addresses

In [1]:
# Data processing
import numpy as np
import pandas as pd
import re # Regular Expressions
import math
import googlemaps
from datetime import datetime

# Common DGLIM utilities
import os, sys
# Put the parent of sys.path[0] on the module search path (at position 1) so
# the `dglim` import below can resolve.
# NOTE(review): presumably `dglim` lives one directory up -- confirm.
sys.path.insert(1, os.path.join(sys.path[0], '..'))
import dglim
# NOTE(review): presumably sets the root that dglim resolves data paths
# (e.g. dglim.datasets_path) against -- confirm in the dglim module.
dglim.setProjectPath('../../')

from dglim import City

*** Connect to Google Maps API ***

In [2]:
# Google Maps client used further down to geocode street addresses;
# the API key is supplied by the dglim helper rather than hard-coded here.
gmaps = googlemaps.Client(key=dglim.getGoogleAPIKey())

*** Load business locations ***

In [3]:
# Read the active-businesses spreadsheet and keep only the id, the street
# address and the combined 'Location' text (address plus coordinates).
businesses_xlsx_path = dglim.datasets_path + 'DGLIM Survey Data/city of Gainesville active businesses dataset with uniqueid (fixed coords).xlsx'
raw_locations_df = pd.read_excel(businesses_xlsx_path)[
    ['uniqueid', 'Physical Address', 'Location']
]

# Preview the first five rows
raw_locations_df.head(5)

Unnamed: 0,uniqueid,Physical Address,Location
0,1,1215 NW 12TH AVE,"1215 NW 12TH AVE\nGAINESVILLE, FL\n(29.663009,..."
1,2,1311 NW 5TH AVE,"1311 NW 5TH AVE\nGAINESVILLE, FL\n(29.655734, ..."
2,3,2224 NW 13TH ST,"2224 NW 13TH ST\nGAINESVILLE, FL\n(29.673707, ..."
3,4,"4101 NW 37TH PL, SUITE B","4101 NW 37TH PL\nSUITE B GAINESVILLE, FL\n(29...."
4,5,7605 NW 13TH ST,"7605 NW 13TH ST\nGAINESVILLE, FL\n(29.726447, ..."


## Add GPS Coordinates

In [4]:
# Working table: one row per business, indexed by its DGLIM ID (the id is
# also kept as a regular column). Copied so later edits never touch the raw frame.
renamed_columns = {'uniqueid': 'DGLIM ID', 'Physical Address': 'Address'}
locations_df = (
    raw_locations_df[['uniqueid', 'Physical Address']]
    .rename(columns=renamed_columns)
    .set_index('DGLIM ID', drop=False)
    .copy()
)

In [5]:
def extractCoordinates(string):
    """Extract the "(lat, lng)" pair embedded in a location entry.

    The entry is expected to look like
    "1215 NW 12TH AVE\\nGAINESVILLE, FL\\n(29.663009, -82.337822)".

    Parameters
    ----------
    string : object
        The location entry. It is converted with ``str()``, so a missing
        value (NaN) is handled like any other text without coordinates.

    Returns
    -------
    list
        ``[latitude, longitude]`` as floats, or ``[np.nan, np.nan]`` when the
        entry holds no parenthesised pair of numbers.
    """
    string = str(string) # Convert unicode to str if necessary

    # Check the parenthesised groups from last to first: the coordinates
    # normally come last, but text after the closing parenthesis or another
    # parenthetical such as "(REAR)" must not make the parse fail.
    for group in reversed(re.findall(r'\(([^()]*)\)', string)):
        coords = group.split(',')
        if len(coords) < 2:
            continue
        try:
            return [float(coords[0]), float(coords[1])]
        except ValueError:
            # Not numeric -- try the previous parenthesised group, if any
            continue

    return [np.nan, np.nan]

In [6]:
# Parse a [latitude, longitude] pair out of every raw 'Location' entry
coordinate_pairs = [
    extractCoordinates(location)
    for location in raw_locations_df['Location'].tolist()
]

# Create both columns first, then fill them row by row (positional order
# matches raw_locations_df, which locations_df was derived from)
locations_df['Latitude'] = np.nan
locations_df['Longitude'] = np.nan
locations_df[['Latitude', 'Longitude']] = coordinate_pairs

locations_df.head(1)

Unnamed: 0_level_0,DGLIM ID,Address,Latitude,Longitude
DGLIM ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1,1215 NW 12TH AVE,29.663009,-82.337822


*** Look up missing coordinates ***

In [7]:
# Count the rows where latitude or longitude is still missing
rows_without_coords = locations_df[['Latitude', 'Longitude']].isnull().any(axis=1)
n = int(rows_without_coords.sum())
print("%d missing addresses" % n)

737 missing addresses


In [8]:
# Look up coordinates for businesses whose 'Location' entry had none, by
# geocoding their street address with the Google Maps client.
PROGRESS_DOTS = 50  # width of the progress bar, in dots

# Get businesses that don't have coordinates listed (NaN or a literal 0)
missing_df = locations_df[locations_df['Latitude'].fillna(0) == 0]
generic_addresses = {
    'MOBILE ONLY',
    'VARIOUS',
    'OUT OF STATE',
    'out OF STATE'
}

# Filter out businesses without address data
missing_df = missing_df[~missing_df['Address'].isin(generic_addresses)]

total = len(missing_df)
print("Looking up coordinates for %d businesses..." % total)

# Sample progress bar to gauge actual progress
print("[" + " ." * PROGRESS_DOTS + " ]")

# Look up coordinates by address
unresolved_ids = []  # DGLIM IDs for which the geocoder returned no result
dots_printed = 0

sys.stdout.write("[")
for done, (dglim_id, row) in enumerate(missing_df.iterrows(), 1):
    address = row['Address'] + ', Gainesville FL'
    results = gmaps.geocode(address)

    if results:
        # Read lat/lng by key: the order of dict.values() is not a contract,
        # so unpacking it could silently swap latitude and longitude.
        location = results[0]['geometry']['location']
        locations_df.at[dglim_id, 'Latitude'] = location['lat']
        locations_df.at[dglim_id, 'Longitude'] = location['lng']
    else:
        # No match: leave the coordinates missing instead of aborting the run
        unresolved_ids.append(dglim_id)

    # Integer arithmetic guarantees exactly PROGRESS_DOTS dots at the end
    # (float accumulation could drop the last one) and tolerates total == 0,
    # since the loop body never runs in that case.
    dots_due = (done * PROGRESS_DOTS) // total
    if dots_due > dots_printed:
        sys.stdout.write(" ." * (dots_due - dots_printed))
        sys.stdout.flush()
        dots_printed = dots_due
print(" ]")
print("Done!")

if unresolved_ids:
    print("No geocoding result for %d businesses (DGLIM IDs): %s"
          % (len(unresolved_ids), unresolved_ids))

Looking up coordinates for 203 businesses...
[ . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . ]
[ . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . ]
Done!


In [9]:
# Nullify generic address coordinates
missing_addresses = (locations_df['Address'].isin(generic_addresses))
locations_df.loc[missing_addresses, ['Latitude','Longitude']] = np.nan

*** How many are missing coordinates? ***

In [10]:
# Re-count the rows where latitude or longitude is missing
rows_without_coords = locations_df[['Latitude', 'Longitude']].isnull().any(axis=1)
n = int(rows_without_coords.sum())
print("%d missing addresses" % n)

568 missing addresses


In [11]:
# Count the rows whose address is one of the generic placeholders
has_generic_address = locations_df['Address'].isin(generic_addresses)
n = len(locations_df[has_generic_address])
print("%d don't have an address" % n)

568 don't have an address


*** How many addresses are outside Gainesville? ***

In [12]:
# A location counts as outside the city when either coordinate is more than
# 0.2 degrees from the reference point held in City. Rows without coordinates
# compare as False and are therefore not counted.
latitude_offset = (locations_df['Latitude'] - City.latitude).abs()
longitude_offset = (locations_df['Longitude'] - City.longitude).abs()
is_outside = (latitude_offset > .2) | (longitude_offset > .2)

n = len(locations_df[is_outside])
print("%d locations are outside Gainesville" % n)

0 locations are outside Gainesville


*** Save data ***

In [14]:
# Hand the finished table to the project helper under the name
# 'Active Business Locations'.
# NOTE(review): storage location and file format are decided inside
# dglim.saveData -- confirm there.
dglim.saveData(locations_df, 'Active Business Locations')

In [15]:
# Show the first rows of the table as a final sanity check
locations_df.head()

Unnamed: 0_level_0,DGLIM ID,Address,Latitude,Longitude
DGLIM ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1,1215 NW 12TH AVE,29.663009,-82.337822
2,2,1311 NW 5TH AVE,29.655734,-82.33997
3,3,2224 NW 13TH ST,29.673707,-82.339143
4,4,"4101 NW 37TH PL, SUITE B",29.687533,-82.387161
5,5,7605 NW 13TH ST,29.726447,-82.370726
