From a308fb5797a3348280378930598fc0629038fc45 Mon Sep 17 00:00:00 2001 From: Nawwar Elnarsh Date: Mon, 13 Jan 2020 23:19:33 +0100 Subject: [PATCH] Flush file after each write & catch errored addresses to another output file --- .gitignore | 3 + address_transformation.py | 157 +++++++++++++++++++++++--------------- 2 files changed, 97 insertions(+), 63 deletions(-) diff --git a/.gitignore b/.gitignore index 0b4563a..fff6a28 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,9 @@ .Trashes ehthumbs.db Thumbs.db +.idea input.csv +found*.csv +failed*.csv output*.csv \ No newline at end of file diff --git a/address_transformation.py b/address_transformation.py index d936e43..b1368a7 100644 --- a/address_transformation.py +++ b/address_transformation.py @@ -1,75 +1,106 @@ import requests -import json import csv import time from tqdm import * -def addresses_from_csv(path=None, idColumn=None, addrColumn=None): - - addresses = [] +# Settings +google_maps_api_key = '__REPLACE_GOOGLE_API_KEY_HERE__' +input_path = 'input.csv' +id_column = 0 +addr_column = 1 +reader_delimiter = ';' +reader_quoting = csv.QUOTE_NONE - with open(path, 'r') as f: - reader = csv.reader(f, delimiter=';', quoting=csv.QUOTE_NONE) - first_row = next(reader) - for row in reader: - addresses.append([row[idColumn],row[addrColumn]]) - # print(row[idColumn], row[addrColumn]) - - return addresses - -# Get addresses from CSV -addresses = addresses_from_csv(path='input.csv', idColumn=0, addrColumn=1) -#print(addresses) +# Get all addresses from CSV +addresses = [] +with open(input_path, 'r') as f: + reader = csv.reader(f, delimiter=reader_delimiter, quoting=reader_quoting) -# Set Google Maps API key -api_key = 'YOUR_API_KEY' + # Check and ignore first line + first_row = next(reader) + if first_row[0] != 'id': + print('Warning: First line is ignored, it should have value of "id;address"') -# Initialize array for transformed addresses -transformed = [] -transformed.append(['ID', 'Street', 'Street (Short)', 
'Number', 'Post code', 'City', 'State', 'State (Short)', 'Country']) + for row in reader: + addresses.append([row[id_column], row[addr_column]]) + + +# Open files for writing +time_str = time.strftime('%Y%m%d-%H%M%S') +file_found = open('found_' + time_str + '.csv', 'w', newline='\n', encoding='utf-8') +file_failed = open('failed_' + time_str + '.csv', 'w', newline='\n', encoding='utf-8') + +# Initiate csv writer +writer_found = csv.writer(file_found, delimiter=';', quoting=csv.QUOTE_ALL, quotechar='"', escapechar='\\') +writer_failed = csv.writer(file_failed, delimiter=';', quoting=csv.QUOTE_ALL, quotechar='"', escapechar='\\') + +# Set headers +writer_found.writerow(['ID', 'Street', 'Street (Short)', 'Number', 'Post code', 'City', 'State', 'State (Short)', 'Country']) +writer_failed.writerow(['ID', 'Failure Reason', 'Address']) + + +# Loop through addresses, create request and write to files +fails_count = 0 +success_count = 0 +total_addresses = len(addresses) for query in tqdm(addresses): - - # API call, storing information as JSON - url = 'https://maps.googleapis.com/maps/api/geocode/json?address=' + query[1] + '&lang=en&key=' + api_key - r = requests.get(url) - data = r.json() - #print(data) - - # clear all values to avoid appending values from previous iterations a second time - number = street_short = street = country = postal_code = city = state = state_short = '' - - if data['status'] == 'ZERO_RESULTS': - transformed.append([query[0], 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A']) - print(url) - print(data) - else: - # looping over address components in JSON - for component in data['results'][0]['address_components']: - if 'street_number' in component['types']: - number = component['long_name'] - elif 'route' in component['types']: - street = component['long_name'] - street_short = component['short_name'] - elif 'country' in component['types']: - country = component['long_name'] - elif 'administrative_area_level_1' in component['types']: - state = 
component['long_name'] - state_short = component['short_name'] - elif 'postal_code' in component['types']: - postal_code = component['long_name'] - elif 'locality' in component['types']: - city = component['long_name'] - elif 'postal_town' in component['types']: - city = component['long_name'] - else: - continue + if len(query) != 2: + reason = 'Expected two columns but found ' + str(len(query)) + writer_failed.writerow([query[0], reason, '']) + file_failed.flush() + print('Skipping ' + query[0] + ' ' + reason) + fails_count += 1 + + try: + # API call, storing information as JSON + url = 'https://maps.googleapis.com/maps/api/geocode/json?address=' + query[1] + '&lang=en&key=' + google_maps_api_key + r = requests.get(url, timeout=15) + data = r.json() + + # clear all values to avoid appending values from previous iterations a second time + number = street = country = postal_code = city = street_short = state = state_short = '' - transformed.append([query[0], street, street_short, number, postal_code, city, state, state_short, country]) - -with open('output_' + time.strftime('%Y%m%d-%H%M%S') + '.csv', 'w', newline='', encoding='utf-8') as f: - writer = csv.writer(f, delimiter=';', quoting=csv.QUOTE_ALL, quotechar='"') - for row in transformed: - writer.writerow(row) + if data['status'] == 'ZERO_RESULTS': + writer_failed.writerow([query[0], 'ZERO_RESULT', query[1]]) + print('No result found for #' + query[0]) + file_failed.flush() + print(url) + print(data) + fails_count += 1 + else: + # looping over address components in JSON + for component in data['results'][0]['address_components']: + if 'street_number' in component['types']: + number = component['long_name'] + elif 'route' in component['types']: + street = component['long_name'] + street_short = component['short_name'] + elif 'country' in component['types']: + country = component['long_name'] + elif 'administrative_area_level_1' in component['types']: + state = component['long_name'] + state_short = 
component['short_name'] + elif 'postal_code' in component['types']: + postal_code = component['long_name'] + elif 'locality' in component['types']: + city = component['long_name'] + elif 'postal_town' in component['types']: + city = component['long_name'] + else: + continue + writer_found.writerow([query[0], street, street_short, number, postal_code, city, state, state_short, country]) + file_found.flush() + success_count += 1 + except: + writer_failed.writerow([query[0], 'Exception', query[1]]) + print('Exception while trying to get #' + query[0]) + file_failed.flush() + fails_count += 1 -print('Done') \ No newline at end of file +print('############') +print('# Found: ' + str(success_count)) +print('# Failed: ' + str(fails_count)) +print('# Total: ' + str(total_addresses)) +print('############') +print('Done')