Skip to content

Commit

Permalink
Flush file after each write & catch errored addresses to another output file
Browse files Browse the repository at this point in the history
  • Loading branch information
SirNarsh committed Jan 13, 2020
1 parent 63dfbf3 commit a308fb5
Show file tree
Hide file tree
Showing 2 changed files with 97 additions and 63 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@
.Trashes
ehthumbs.db
Thumbs.db
.idea

input.csv
found*.csv
failed*.csv
output*.csv
157 changes: 94 additions & 63 deletions address_transformation.py
Original file line number Diff line number Diff line change
@@ -1,75 +1,106 @@
import requests
import json
import csv
import time
from tqdm import *

def addresses_from_csv(path=None, idColumn=None, addrColumn=None):

addresses = []
# Settings
google_maps_api_key = '__REPLACE_GOOGLE_API_KEY_HERE__'
input_path = 'input.csv'
id_column = 0
addr_column = 1
reader_delimiter = ';'
reader_quoting = csv.QUOTE_NONE

with open(path, 'r') as f:
reader = csv.reader(f, delimiter=';', quoting=csv.QUOTE_NONE)
first_row = next(reader)
for row in reader:
addresses.append([row[idColumn],row[addrColumn]])
# print(row[idColumn], row[addrColumn])

return addresses

# Get addresses from CSV
addresses = addresses_from_csv(path='input.csv', idColumn=0, addrColumn=1)
#print(addresses)
# Get all addresses from CSV
addresses = []
with open(input_path, 'r') as f:
reader = csv.reader(f, delimiter=reader_delimiter, quoting=reader_quoting)

# Set Google Maps API key
api_key = 'YOUR_API_KEY'
# Check and ignore first line
first_row = next(reader)
if first_row[0] != 'id':
print('Warning: First line is ignored, it should have value of "id;address"')

# Initialize array for transformed addresses
transformed = []
transformed.append(['ID', 'Street', 'Street (Short)', 'Number', 'Post code', 'City', 'State', 'State (Short)', 'Country'])
for row in reader:
addresses.append([row[id_column], row[addr_column]])


# Create the two result files for this run. The timestamp is computed once,
# so the "found" and "failed" files of one run share the same suffix.
time_str = time.strftime('%Y%m%d-%H%M%S')
found_path = 'found_' + time_str + '.csv'
failed_path = 'failed_' + time_str + '.csv'
file_found = open(found_path, 'w', newline='\n', encoding='utf-8')
file_failed = open(failed_path, 'w', newline='\n', encoding='utf-8')

# Both writers share one set of formatting options: ';' as delimiter,
# every field quoted with '"', and a backslash as the escape character.
writer_options = {
    'delimiter': ';',
    'quoting': csv.QUOTE_ALL,
    'quotechar': '"',
    'escapechar': '\\',
}
writer_found = csv.writer(file_found, **writer_options)
writer_failed = csv.writer(file_failed, **writer_options)

# Header row of the file that receives successfully resolved addresses
writer_found.writerow([
    'ID', 'Street', 'Street (Short)', 'Number', 'Post code',
    'City', 'State', 'State (Short)', 'Country',
])
# Header row of the file that receives addresses that could not be resolved
writer_failed.writerow(['ID', 'Failure Reason', 'Address'])

# Loop through addresses, create request and write to files
fails_count = 0
success_count = 0
total_addresses = len(addresses)

for query in tqdm(addresses):

# API call, storing information as JSON
url = 'https://maps.googleapis.com/maps/api/geocode/json?address=' + query[1] + '&lang=en&key=' + api_key
r = requests.get(url)
data = r.json()
#print(data)

# clear all values to avoid appending values from previous iterations a second time
number = street_short = street = country = postal_code = city = state = state_short = ''

if data['status'] == 'ZERO_RESULTS':
transformed.append([query[0], 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A'])
print(url)
print(data)
else:
# looping over address components in JSON
for component in data['results'][0]['address_components']:
if 'street_number' in component['types']:
number = component['long_name']
elif 'route' in component['types']:
street = component['long_name']
street_short = component['short_name']
elif 'country' in component['types']:
country = component['long_name']
elif 'administrative_area_level_1' in component['types']:
state = component['long_name']
state_short = component['short_name']
elif 'postal_code' in component['types']:
postal_code = component['long_name']
elif 'locality' in component['types']:
city = component['long_name']
elif 'postal_town' in component['types']:
city = component['long_name']
else:
continue
if len(query) != 2:
reason = 'Expected two columns but found ' + str(len(query))
writer_failed.writerow([query[0], reason, ''])
file_failed.flush()
print('Skipping ' + query[0] + ' ' + reason)
fails_count += 1

try:
# API call, storing information as JSON
url = 'https://maps.googleapis.com/maps/api/geocode/json?address=' + query[1] + '&lang=en&key=' + google_maps_api_key
r = requests.get(url, timeout=15)
data = r.json()

# clear all values to avoid appending values from previous iterations a second time
number = street = country = postal_code = city = street_short = state = state_short = ''

transformed.append([query[0], street, street_short, number, postal_code, city, state, state_short, country])

with open('output_' + time.strftime('%Y%m%d-%H%M%S') + '.csv', 'w', newline='', encoding='utf-8') as f:
writer = csv.writer(f, delimiter=';', quoting=csv.QUOTE_ALL, quotechar='"')
for row in transformed:
writer.writerow(row)
if data['status'] == 'ZERO_RESULTS':
writer_failed.writerow([query[0], 'ZERO_RESULT', query[1]])
print('No result found for #' + query[0])
file_failed.flush()
print(url)
print(data)
fails_count += 1
else:
# looping over address components in JSON
for component in data['results'][0]['address_components']:
if 'street_number' in component['types']:
number = component['long_name']
elif 'route' in component['types']:
street = component['long_name']
street_short = component['short_name']
elif 'country' in component['types']:
country = component['long_name']
elif 'administrative_area_level_1' in component['types']:
state = component['long_name']
state_short = component['short_name']
elif 'postal_code' in component['types']:
postal_code = component['long_name']
elif 'locality' in component['types']:
city = component['long_name']
elif 'postal_town' in component['types']:
city = component['long_name']
else:
continue
writer_found.writerow([query[0], street, street_short, number, postal_code, city, state, state_short, country])
file_found.flush()
success_count += 1
except:
writer_failed.writerow([query[0], 'Exception', query[1]])
print('Exception while trying to get #' + query[0])
file_failed.flush()
fails_count += 1

print('Done')
print('############')
print('# Found: ' + str(success_count))
print('# Failed: ' + str(fails_count))
print('# Total: ' + str(total_addresses))
print('############')
print('Done')

0 comments on commit a308fb5

Please sign in to comment.