In [1]:
import pandas as pd
import os
import glob
from time import sleep
from tqdm import tqdm
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import re
import numpy as np

## First, concatenate all CSVs & export

In [12]:
# collect all CSVs and combine into one DF
extension = 'csv'
result = glob.glob('2023/*.{}'.format(extension))
joined_df = pd.concat((pd.read_csv(f) for f in result)).drop_duplicates().reset_index(drop=True)

# filter out bad transactions
joined_df['salePrice_numeric'] = joined_df['Sale Price'].replace('[\$,]', '', regex=True).astype(float)
joined_df = joined_df[joined_df['salePrice_numeric'] >= 10000]
joined_df = joined_df[joined_df['Square Ft '] > 75]

# from this, create the price per SF column
joined_df['price_SF'] = joined_df['salePrice_numeric'] / joined_df['Square Ft '] 

# pare down the columns
joined_df = joined_df[[
    'Address',
    'Sale Date',
    'Sale Price',
    'salePrice_numeric',
    'price_SF',
    'Square Ft ',
    'Year  Built '
]]

joined_df

Unnamed: 0,Address,Sale Date,Sale Price,salePrice_numeric,price_SF,Square Ft,Year Built
3,3530 PIEDMONT RD NE # F 11,2/27/2023,"$375,000.00",375000.0,245.579568,1527,1976.0
11,968 WELCH ST SE,2/28/2023,"$127,000.00",127000.0,127.000000,1000,1952.0
14,845 SPRING ST NW 229,2/28/2023,"$588,500.00",588500.0,349.258160,1685,2002.0
21,606 WILLARD AVE SW,2/28/2023,"$199,000.00",199000.0,52.107882,3819,1930.0
22,839 ROCHELLE DR SW,2/28/2023,"$425,000.00",425000.0,204.130644,2082,1946.0
...,...,...,...,...,...,...,...
2286,2516 FORREST WAY NE,3/1/2023,"$650,000.00",650000.0,417.201540,1558,1935.0
2288,484 TARA TRL NW,3/1/2023,"$1,320,000.00",1320000.0,407.407407,3240,1958.0
2289,727 HOLMES ST NW,3/1/2023,"$450,000.00",450000.0,390.625000,1152,1940.0
2291,1390 EAST FORREST AVE,3/1/2023,"$211,000.00",211000.0,257.631258,819,1940.0


## Baby...preparrrre to geocode!

In [13]:
# create the full address
joined_df['full_address'] = joined_df['Address'] + ' Fulton County, GA'

# set the 'space filler' that Google Maps uses on a browser, then read in address data
space_filler = '%20'
joined_df['Address_URL'] = joined_df['full_address'].str.replace(' ', space_filler)

# creates the url which will be used to return the response below
joined_df['url'] = ['https://www.google.com/maps/search/' + i for i in joined_df['Address_URL']]

# # only split up the batch if you have a big chunk to geocode
# df1, df2 = np.array_split(joined_df, 2)
# df1.to_csv('fultonGeocode_batch1.csv')
# df2.to_csv('fultonGeocode_batch2.csv')

joined_df

Unnamed: 0,Address,Sale Date,Sale Price,salePrice_numeric,price_SF,Square Ft,Year Built,full_address,Address_URL,url
3,3530 PIEDMONT RD NE # F 11,2/27/2023,"$375,000.00",375000.0,245.579568,1527,1976.0,"3530 PIEDMONT RD NE # F 11 Fulton County, GA",3530%20PIEDMONT%20RD%20NE%20#%20F%2011%20Fulto...,https://www.google.com/maps/search/3530%20PIED...
11,968 WELCH ST SE,2/28/2023,"$127,000.00",127000.0,127.000000,1000,1952.0,"968 WELCH ST SE Fulton County, GA","968%20WELCH%20ST%20SE%20Fulton%20County,%20GA",https://www.google.com/maps/search/968%20WELCH...
14,845 SPRING ST NW 229,2/28/2023,"$588,500.00",588500.0,349.258160,1685,2002.0,"845 SPRING ST NW 229 Fulton County, GA",845%20SPRING%20ST%20NW%20229%20Fulton%20County...,https://www.google.com/maps/search/845%20SPRIN...
21,606 WILLARD AVE SW,2/28/2023,"$199,000.00",199000.0,52.107882,3819,1930.0,"606 WILLARD AVE SW Fulton County, GA","606%20WILLARD%20AVE%20SW%20Fulton%20County,%20GA",https://www.google.com/maps/search/606%20WILLA...
22,839 ROCHELLE DR SW,2/28/2023,"$425,000.00",425000.0,204.130644,2082,1946.0,"839 ROCHELLE DR SW Fulton County, GA","839%20ROCHELLE%20DR%20SW%20Fulton%20County,%20GA",https://www.google.com/maps/search/839%20ROCHE...
...,...,...,...,...,...,...,...,...,...,...
2286,2516 FORREST WAY NE,3/1/2023,"$650,000.00",650000.0,417.201540,1558,1935.0,"2516 FORREST WAY NE Fulton County, GA","2516%20FORREST%20WAY%20NE%20Fulton%20County,%20GA",https://www.google.com/maps/search/2516%20FORR...
2288,484 TARA TRL NW,3/1/2023,"$1,320,000.00",1320000.0,407.407407,3240,1958.0,"484 TARA TRL NW Fulton County, GA","484%20TARA%20TRL%20NW%20Fulton%20County,%20GA",https://www.google.com/maps/search/484%20TARA%...
2289,727 HOLMES ST NW,3/1/2023,"$450,000.00",450000.0,390.625000,1152,1940.0,"727 HOLMES ST NW Fulton County, GA","727%20HOLMES%20ST%20NW%20Fulton%20County,%20GA",https://www.google.com/maps/search/727%20HOLME...
2291,1390 EAST FORREST AVE,3/1/2023,"$211,000.00",211000.0,257.631258,819,1940.0,"1390 EAST FORREST AVE Fulton County, GA","1390%20EAST%20FORREST%20AVE%20Fulton%20County,...",https://www.google.com/maps/search/1390%20EAST...


In [14]:
options = Options()
options.add_argument("--headless=new")
driver = webdriver.Chrome(options=options)

results = []

for ind in tqdm(joined_df.index, colour='#00FFFF', desc='Geocoding Progress'):
    try:
        driver.get(joined_df['url'][ind])
        sleep(3.7)
        url = driver.current_url
        results.append(url)
    except:
        results.append('error')

print('Geocoding complete!')

Geocoding Progress: 100%|[38;2;0;255;255m█████████████████[0m| 1367/1367 [1:47:13<00:00,  4.71s/it][0m

Geocoding complete!





## Post-geocoding processing pt. 1

In [32]:
# extract lat / longs & put into new dataframe
lats = []
longs = []

# parse & split the 'results' list 
for item in range(len(results)):
    try:
        found = re.search('/@(.+?),17z', results[item]).group(1)
        lats.append(found.split(',')[0])
        longs.append(found.split(',')[1])
    except:
        lats.append('error')
        longs.append('error')
        
# now add the parsed & cleaned lat/longs as additional columns 
df = joined_df.copy()
df['lat'] = lats
df['long'] = longs

## Post-geocoding processing pt. 2

In [33]:
# how many addresses will need to be manually geocoded?
manuals = df[df['lat'] == 'error'].shape[0]
total_addresses = df.shape[0]
perc = (manuals / total_addresses) * 100
print(f'Addresses to manually geocode: {manuals} out of {total_addresses}, or only {perc:.1f}% of the total!')

# export to CSV
df.to_csv('GeocodedAddresses.csv')
print("fee-neeshed!")

Addresses to manually geocode: 15758 out of 15764, or only 100.0% of the total!
fee-neeshed!
