### This notebook performs the following tasks:
##### 1) read Historical-Settlement-Reports excel file from DATA.GOV.AU
##### 2) ingest a selected report(sheet) into pandas dataframe
##### 3) Geocode each LGA (latitude, longitude) using the Google Maps Geocoding API. Note: you will need your own Google API key; how fast the code runs depends on whether you use a free test API key or a commercial one.
##### 4) Visualise the results

In [582]:
import os
import re

import pandas as pd
import requests

In [583]:
# Google Maps Geocoding API key.
# Request one from https://developers.google.com/maps/documentation/geocoding/start
# and expose it to the kernel as the GOOGLE_MAPS_API_KEY environment variable.
# It is read from the environment so the secret is never stored in the notebook.

api_key = os.environ.get('GOOGLE_MAPS_API_KEY', '')
if not api_key:
    # Keep the notebook importable, but make the missing credential obvious up front.
    print('WARNING: GOOGLE_MAPS_API_KEY is not set; geocoding requests will not be authorised.')

In [584]:
# Settlement reports workbook (January 2017 to 30 September 2017) hosted on data.gov.au.
# Adjacent string literals are concatenated by Python into a single URL.
file = (
    'https://data.gov.au/dataset/8d1b90a9-a4d7-4b10-ad6a-8273722c8628'
    '/resource/bea8a0d4-4d63-42bf-9325-ac1cd0d50009'
    '/download/settlement-data-reports-january-2017-to-30-september-2017.xlsx'
)

In [585]:
# Open the Excel workbook as a pandas ExcelFile (no sheet is turned into a DataFrame yet).
report = pd.ExcelFile(file)

In [586]:
# Helper that prints each item of a list next to its position.
def list_all_elements_of_a_list(list_name):
    """Print every element of `list_name` as '<index> : <element>', one per line.

    Indices start at 0. Nothing is returned.
    """
    for position, element in enumerate(list_name):
        print(position, ":", element)

In [587]:
# Show every worksheet name of the workbook together with its positional index.
list_all_elements_of_a_list(list_name=report.sheet_names)

0 : Current States
1 : Age Group
2 : Country of Birth
3 : Ethnicity
4 : Gender
5 : Local Government Area
6 : Main Language
7 : Religion
8 : Caveats
9 : Settlement Date


In [588]:
# Read a specific sheet into a DataFrame. We are interested in the "Local Government Area" sheet,
# which is at position 5 in report.sheet_names.
sheet_id = 5

# skip the first 11 rows in the Excel sheet
skip_rows = 11 

# zero-based positions of the spreadsheet columns to read (the first six columns)
read_columns = [0,1,2,3,4,5]

In [589]:
# Ingest the selected sheet into a DataFrame, ignoring the leading rows and the last row of the sheet.
# `skipfooter` is the supported keyword; the older `skip_footer` spelling was removed from pandas.
lga = report.parse(report.sheet_names[sheet_id], skiprows=skip_rows, usecols=read_columns, skipfooter=1)

In [590]:
# Replace each run of whitespace in the column headers with "_".
# regex=True is passed explicitly: newer pandas treats the pattern as a literal string by default,
# which would leave the headers unchanged. The raw string avoids an invalid "\s" escape.
lga.columns = lga.columns.str.replace(r'\s+', '_', regex=True)

In [591]:
# Extract the bare LGA name. Also convert 'Unincorporated <state>' into <state>.

def get_state(lga_name):
    """Return the place name to geocode for a raw 'Local Government Area' value.

    Parameters
    ----------
    lga_name : str
        Raw value such as 'Brisbane (C)', 'Unincorporated ACT' or 'Not Recorded'.
        Non-string values (e.g. NaN from an empty cell) are returned unchanged.

    Returns
    -------
    object
        * the text before the last ' (' when the value has a bracketed suffix,
          e.g. 'Brisbane (C)' -> 'Brisbane';
        * the text after the first word when the value contains 'Unincorporated',
          e.g. 'Unincorporated ACT' -> 'ACT';
        * otherwise the original value.
    """
    # Regex searches only accept strings; hand anything else back untouched
    # instead of raising TypeError.
    if not isinstance(lga_name, str):
        return lga_name

    # Greedy match, so everything up to the LAST whitespace + '(' is kept.
    name_match = re.search(r'^.*(?=(\s\())', lga_name)
    if name_match:
        return name_match.group()

    if re.search(r'Unincorporated', lga_name):
        # Drop the first word and keep the remainder of the string.
        remainder = re.search(r'[^ ]* (.*)', lga_name)
        # A value with no space has no remainder; fall through to the original value.
        if remainder:
            return remainder.group(1)

    # In all other cases return the original value.
    return lga_name

In [592]:
# Derive the cleaned 'LGA' column from the raw 'Local_Government_Area' values.
lga['LGA'] = lga['Local_Government_Area'].map(get_state)

In [593]:
# Preview the first ten rows rather than rendering the whole table.
lga.head(10)

Unnamed: 0,Local_Government_Area,Humanitarian,Family,Skilled,Grand_Total,%_Total,LGA
0,Not Recorded,174,9082,13219,22475,0.113593,Not Recorded
1,Brisbane (C),469,3372,7256,11098,0.056092,Brisbane
2,Melbourne (C),53,784,4058,4895,0.024740,Melbourne
3,Parramatta (C),288,1097,3094,4479,0.022638,Parramatta
4,Sydney (C),15,1142,3311,4468,0.022582,Sydney
5,Fairfield (C),2425,1224,270,3919,0.019807,Fairfield
6,Wyndham (C),291,1037,2242,3570,0.018044,Wyndham
7,Unincorporated ACT,164,933,2135,3232,0.016335,ACT
8,Blacktown (C),353,1110,1739,3202,0.016184,Blacktown
9,Gold Coast (C),64,1319,1713,3096,0.015648,Gold Coast


In [594]:
# Geocode an LGA name using the Google Maps Geocoding API.
# The "status" field of the JSON reply (not the reply length) decides whether the lookup succeeded.

def get_coord(lga_name):
    """Look up latitude, longitude and formatted address for an LGA name.

    The query sent to the API is '<lga_name> Australia'. No state is included,
    so a name shared by places in several states may resolve to any of them.

    Parameters
    ----------
    lga_name : str
        Place name to geocode.

    Returns
    -------
    list or str
        [lat, lng, formatted_address] taken from the first result, or the
        string 'Bad_address' when the API reports no usable match.

    Raises
    ------
    requests.RequestException
        On transport errors, timeouts or a non-2xx HTTP status.
    RuntimeError
        When the API rejects the request for a reason unrelated to the address
        (for example REQUEST_DENIED or OVER_QUERY_LIMIT).
    """
    response = requests.get(
        'https://maps.googleapis.com/maps/api/geocode/json',
        # Passing params lets requests encode the values, so names containing
        # spaces, '&' or non-ASCII characters cannot corrupt the query string.
        params={'address': '{} Australia'.format(lga_name), 'key': api_key},
        timeout=10,
    )
    response.raise_for_status()
    payload = response.json()

    status = payload.get('status')
    results = payload.get('results') or []

    if status == 'OK' and results:
        best_match = results[0]
        location = best_match['geometry']['location']
        return [location['lat'], location['lng'], best_match['formatted_address']]

    # The address itself could not be resolved: keep the original sentinel value.
    if status in ('OK', 'ZERO_RESULTS', 'INVALID_REQUEST'):
        return 'Bad_address'

    # Anything else (bad key, quota, server error) would silently poison every
    # row if reported as a bad address, so surface it with the API's own message.
    raise RuntimeError(
        'Geocoding request for {!r} failed with status {}: {}'.format(
            lga_name, status, payload.get('error_message', 'no error message')
        )
    )

In [595]:
# Geocode every LGA and store Lat, Lng and the formatted address next to it.
# The answers are collected first and written as whole columns afterwards, so the
# result does not depend on the DataFrame's index labels or on which row happens
# to be geocoded first (mixing floats and 'Bad_address' cell by cell is order-sensitive).

geocode_columns = ['Lat', 'Lng', 'Formatted_address']
geocode_rows = []

for lga_name in lga['LGA']:
    geocode_api_resp = get_coord(lga_name)
    if geocode_api_resp != 'Bad_address':
        geocode_rows.append(list(geocode_api_resp[:len(geocode_columns)]))
    else:
        # Failed lookups carry the sentinel in all three columns so they stay visible.
        geocode_rows.append([geocode_api_resp] * len(geocode_columns))

# Assigning plain lists is positional: row i of `lga` receives geocode_rows[i].
for position, column in enumerate(geocode_columns):
    lga[column] = [values[position] for values in geocode_rows]

In [596]:
# Preview the first ten geocoded rows rather than rendering the whole table.
lga.head(10)

Unnamed: 0,Local_Government_Area,Humanitarian,Family,Skilled,Grand_Total,%_Total,LGA,Lat,Lng,Formatted_address
0,Not Recorded,174,9082,13219,22475,0.113593,Not Recorded,Bad_address,Bad_address,Bad_address
1,Brisbane (C),469,3372,7256,11098,0.056092,Brisbane,-27.4698,153.025,"Brisbane QLD, Australia"
2,Melbourne (C),53,784,4058,4895,0.024740,Melbourne,-37.8136,144.963,"Melbourne VIC, Australia"
3,Parramatta (C),288,1097,3094,4479,0.022638,Parramatta,-33.815,151.001,"Parramatta NSW 2150, Australia"
4,Sydney (C),15,1142,3311,4468,0.022582,Sydney,-33.8688,151.209,"Sydney NSW, Australia"
5,Fairfield (C),2425,1224,270,3919,0.019807,Fairfield,-33.8703,150.956,"Fairfield NSW 2165, Australia"
6,Wyndham (C),291,1037,2242,3570,0.018044,Wyndham,-15.4825,128.123,"Wyndham WA 6740, Australia"
7,Unincorporated ACT,164,933,2135,3232,0.016335,ACT,-35.4735,149.012,"Australian Capital Territory, Australia"
8,Blacktown (C),353,1110,1739,3202,0.016184,Blacktown,-33.771,150.906,"Blacktown NSW 2148, Australia"
9,Gold Coast (C),64,1319,1713,3096,0.015648,Gold Coast,-28.0167,153.4,"Gold Coast QLD, Australia"


In [597]:
# Persist the geocoded table, index included, to lga.csv in the working directory.
lga.to_csv(path_or_buf='lga.csv', index=True)