# Australian General Practice location scraper

5 October 2021

---

**Description**

It is difficult to find existing datasets for general practice locations in Australia (e.g., from data.gov.au or data.vic.gov.au). Healthdirect appears to be the best official source of health provider locations. 

This scraper contains some basic code to scrape healthdirect.gov.au to get the names and locations of all General Practice sites in Victoria. It can be modified to obtain all those in any other given state, the whole country or just a subset of postcodes.


**Instructions**
- Make sure that pandas, requests and lxml are installed
- Ensure that the reference postcode data is specified correctly
- Run each of the cells in order. 

In [1]:
import requests
import pandas as pd
from lxml import html
from time import sleep

import json

from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

## 1. Define paths and utility functions

- Specify the required data elements from the website
- Create utility function to extract data elements from web page

In [2]:
# XPath expressions for the result links on a search-results page: each link's
# href (stored under 'Name'), address, latitude and longitude attributes
path_name, path_address1, path_lat, path_long = (
    './/div[@class="veyron-hsf-page "]/a/@href',
    './/div[@class="veyron-hsf-page "]/a/@data-address',
    './/div[@class="veyron-hsf-page "]/a/@data-lat',
    './/div[@class="veyron-hsf-page "]/a/@data-long',
)

In [3]:
# XPath expressions for the pages of the individual practices:
# full address, opening-hours weekdays and times, billing text and description
(
    path_address2,
    path_opening_hours_day,
    path_opening_hours_time,
    path_fees,
    path_details,
) = (
    './/p[@class="hsf-service_details-data-address veyron-hsf-full-address"]/@data-full-address',
    './/div[@class="hsf-service_details-data-hours-group"]/ul/li/span[@class="hsf-service_details-data-hours-weekday"]/text()',
    './/div[@class="hsf-service_details-data-hours-group"]/ul/li/span[@class="hsf-service_details-data-hours-times"]/text()',
    './/p[@class="hsf-service_details-data-billing"]/text()',
    './/p[@class="hsf-service_details-data-description"]/span/text()',
)

# XPath for the text content of the page's javascript blocks
path_javascript = './/script[@type="text/javascript"]/text()'

In [4]:
# output column name -> XPath expression that supplies the column's values
data_scheme = dict(
    Name=path_name,
    Address=path_address1,
    Latitude=path_lat,
    Longitude=path_long,
)

In [5]:
def extract_js(js_text, start_marker='[{"', end_marker='}],\n', offset=2):
    """
    Extract the JSON embedded in some javascript text and parse it.

    Args
    ----
    js_text (str): the javascript source containing the embedded JSON
    start_marker (str): the first characters of the JSON
    end_marker (str): the text that ends the JSON; it is only searched for
    after start_marker, so an earlier occurrence is not picked up
    offset (int): how many leading characters of end_marker belong to the
    JSON itself (2 keeps the closing '}]' of the default marker)

    Returns
    -------
    the parsed JSON (a list of dicts when the default markers are used)

    Raises
    ------
    ValueError if a marker is missing or the enclosed text is not valid JSON
    """
    start_index = js_text.find(start_marker)
    if start_index == -1:
        raise ValueError(f'start marker {start_marker!r} not found in javascript text')

    # look for the end marker from the start of the JSON onwards only
    end_index = js_text.find(end_marker, start_index)
    if end_index == -1:
        raise ValueError(f'end marker {end_marker!r} not found after the start marker')

    return json.loads(js_text[start_index:end_index + offset])

def get_page_info(url):
    """
    Scrape one practice page and return its address, fees, details and coordinates

    NOTE: a second function with the same name is defined further down this
    file; once both definitions have been executed the later one is used.

    Args
    ----
    url (str): the URL of the page to scrape

    Returns
    -------
    pd.DataFrame with the columns url, address, fees, details, latitude
    and longitude
    """
    page = requests.get(url)
    tree = html.fromstring(page.text)

    # NOTE(review): path_address1 is defined for the listing pages; path_address2
    # may be the intended path for an individual practice page - confirm
    address = tree.xpath(path_address1)
    fees = ' '.join(tree.xpath(path_fees)).strip()
    details = ' '.join(tree.xpath(path_details)).strip()

    # the coordinates are read from the javascript block at index 9
    js_data = extract_js(tree.xpath(path_javascript)[9])
    geocode = js_data[0]['location']['physicalLocation']['geocode']
    latitude = float(geocode['latitude'])
    longitude = float(geocode['longitude'])

    # create output dataframe
    df_temp = pd.DataFrame(data={'url': url, 'address': address, 'fees': fees,
                                 'details': details,
                                 'latitude': latitude, 'longitude': longitude})

    return df_temp

def get_page_info(url):
    """
    Scrape one practice page and return its location, fees, opening hours and name

    The information is read from the JSON embedded in the page's javascript
    (the script block at index 9), parsed with find_json.

    Args
    ----
    url (str): the URL of the page to scrape

    Returns
    -------
    pd.DataFrame with the columns street_address, postcode, suburb, state,
    latitude, longitude and fees, then one column per opening-hours value,
    then name
    """
    page = requests.get(url)
    tree = html.fromstring(page.text)
    # NOTE(review): the script position (9) is hard-coded - confirm it still matches the page layout
    good_text = tree.xpath('.//script[@type="text/javascript"]/text()')[9]
    good_data = find_json(good_text)
    
    location = pd.json_normalize(good_data['location'])
    billing = pd.json_normalize(good_data['billingOptions'])
    name = good_data['organisation']['name']

    # flatten the opening rules into a single row:
    # stack to one row per (rule number, field), label each value with
    # '<field><rule number>', then transpose so the labels become columns
    opening_hours = pd.json_normalize(good_data['calendar']['openRule']).stack().reset_index()
    opening_hours[0] = opening_hours[0].astype(str)
    opening_hours.index = opening_hours['level_1'] + opening_hours['level_0'].astype(str)
    opening_hours = opening_hours[[0]].T.reset_index().iloc[:, 1:]
    
    # keep only the address and geocode fields and give them short names
    # (the keys must match the json_normalize column names exactly, including
    # case, because rename silently ignores keys it cannot find)
    location_cleaned = (
        location[['physicalLocation.addressLine3', 
                  'physicalLocation.postcode',
                  'physicalLocation.suburb.label', 
                  'physicalLocation.state.label', 
                  'physicalLocation.geocode.latitude', 
                  'physicalLocation.geocode.longitude']]
        .rename(columns={'physicalLocation.addressLine3': 'street_address', 
                         'physicalLocation.postcode': 'postcode', 
                         'physicalLocation.suburb.label': 'suburb', 
                         'physicalLocation.state.label': 'state', 
                         'physicalLocation.geocode.latitude': 'latitude', 
                         'physicalLocation.geocode.longitude': 'longitude'})
    )
    
    billing_cleaned = billing[['valueType.label']].rename(columns={'valueType.label': 'fees'})

    # place the three tables side by side (rows are matched on their index)
    df_temp = pd.concat([location_cleaned, billing_cleaned, opening_hours], axis=1)
    df_temp['name'] = name

    return df_temp

In [6]:
def get_locations(text, data_scheme):
    """
    Parse a scraped web page into a table of location fields
    
    Args
    ----
    text (str): the HTML of the scraped page
    data_scheme (dict): maps each output column name to the XPath
    expression that selects its values
    
    Returns
    -------
    pd.DataFrame with one column per entry in data_scheme
    """
    document = html.fromstring(text)

    # evaluate every XPath against the parsed page, keyed by its column name
    columns = {field: document.xpath(xpath) for field, xpath in data_scheme.items()}

    return pd.DataFrame(columns)

In [7]:
def find_json(text):
    """
    Given some text that contains a JSON object, find where it begins and
    ends and parse it
    
    The object is taken to start at the first '{' in the text. The JSON
    decoder itself determines where the object ends, so braces inside string
    values, a stray '}' before the object and any text after the object do
    not affect the result.
    
    Args
    ----
    text (str): the input text
    
    Return
    ------
    the parsed JSON object (a dict)
    
    Raises
    ------
    json.JSONDecodeError (a ValueError) if the text has no '{' or if no valid
    JSON object starts at the first '{'
    """
    start = text.find('{')
    if start == -1:
        raise json.JSONDecodeError('no JSON object found', text, 0)

    # raw_decode parses one JSON document starting at `start` and ignores
    # whatever follows it; the second return value (end index) is not needed
    parsed, _ = json.JSONDecoder().raw_decode(text, start)
    return parsed

## 2. Create list of suburb-postcode pairs

Using the reference postcode data (whose location is set by the ```folder``` and ```filename``` variables), create the suburb-postcode pairs for which location data will be scraped.

In [8]:
# location of the reference postcode data
folder = '/home/alex/Desktop/Data/reference_data'
filename = folder + '/postcodes_scraped_221021.csv'

# load the reference postcodes
df = pd.read_csv(filename)

In [9]:
# keep only the rows whose 'state' column equals "VIC"
# (this assumes that the reference file has a 'state' column)
df_vic = df.query('state == "VIC"')

In [10]:
# build the '<suburb>-<postcode>' strings used in the search URLs:
# spaces in the suburb become underscores and the suburb is lower-cased
suburb_postcodes = (
    df
    .query('state=="VIC"') # only want vic suburbs
    .assign(
        suburb_postcode=lambda vic: (
            vic.suburb.str.replace(' ', '_').str.lower()
            + '-'
            + vic.postcode.astype(str)
        )
    )
    .suburb_postcode
    .values
)

In [11]:
# page numbers (1 to 19) to scrape for each of the suburb-postcode pairs
# NOTE(review): the URL list in this file is built with range(1, 5) rather than this list - confirm which is intended
pages = [*range(1, 20)]

## 3. Create URL list to scrape

Create the list of URLs to scrape based on the suburb-postcode pairs that are required

In [12]:
# the pieces that the URLs are assembled from
# site root, used as the prefix for the links of the individual practices
url_base2 = 'https://www.healthdirect.gov.au'
# search results: url_base + '<suburb>-<postcode>' + url_middle + '<page number>' + url_end
url_base = 'https://www.healthdirect.gov.au/australian-health-services/results/'
url_middle, url_end = '/tihcs-aht-11222/gp-general-practice?pageIndex=', '&tab=SITE_VISIT'

In [13]:
# build the URLs to scrape: pages 1 to 4 for every suburb-postcode pair,
# keeping the pages of each pair together
urls = []
for suburb_postcode in suburb_postcodes:
    for n in range(1, 5):
        urls.append(f'{url_base}{suburb_postcode}{url_middle}{n}{url_end}')

In [14]:
# show the number of suburb-postcode pairs and the number of URLs built from them
len(suburb_postcodes), len(urls)

(3183, 12732)

## 4. Scrape the initial set of data for all the locations

For each of the URLs in the list ```urls```, request the data, parse it and append the result to the list of dataframes ```df_list```.

In [15]:
# report the size of the URL list
print(f'There are {len(urls)} URLs.')

There are 12732 URLs.


In [None]:
df_list = []   # one dataframe per successfully scraped URL
urls_bad = []  # URLs that could not be scraped

# in case the request returns an error, try again
# this makes the scraper more robust
retry_strategy = Retry(
    total=5,
    status_forcelist=[429, 500, 502, 503, 504],
    allowed_methods=["GET"],
    backoff_factor=0.5  # the keyword is `backoff_factor`; Retry has no `backoff` argument
)

adapter = HTTPAdapter(max_retries=retry_strategy)
http = requests.Session()
http.mount("https://", adapter)
http.mount("http://", adapter)

# loop through URLs, scrape each one and parse the data
# (only this slice of the URL list is scraped in this run)
for n, url in enumerate(urls[3575:6000]):
    if n % 100 == 0:
        # pause before every 100th request, presumably to avoid overloading the server
        sleep(5)
    print(f'{n}: Scraping URL: {url}')
    try:
        # the timeout stops a stalled connection from hanging the whole run
        page = http.get(url, timeout=30)
    except requests.exceptions.RequestException as error:
        # raised e.g. when the retries are exhausted or the request times out
        print(f'Unsuccessful request: {error}')
        urls_bad.append(url)
        continue
    print(f'status code: {page.status_code}')
    if page.status_code != 200:
        print('Unsuccessful request')
        urls_bad.append(url)
        # skip the append below so that a failed request adds no data
        continue
    df_list.append(get_locations(page.text, data_scheme))

Concatenate all the datasets for each suburb-postcode

In [17]:
# stack the per-URL dataframes into one table and drop the rows that are repeated
df_scraped = pd.concat(df_list).drop_duplicates()

## 5. Tidy up the data

- Remove unnecessary whitespace
- Drop duplicates

In [None]:
# clean up the scraped data
# Every step is a lambda that is evaluated on the output of the previous step.
# This way the suburb and postcode are extracted from the stripped addresses of
# the rows that remain after de-duplication, instead of being taken from
# df_scraped and matched back on its index (which repeats labels because the
# per-URL dataframes were concatenated without renumbering).
df_cleaned = (
    df_scraped
    .assign(Name=lambda d: d.Name.str.strip()) # remove unnecessary characters from name
    .assign(Address=lambda d: d.Address.str.strip()) # remove unnecessary characters from Address
    .drop_duplicates()
    .reset_index(drop=True) # renumber the rows without keeping the old index
    .assign(suburb=lambda d: d.Address.str.extract(r'(\, )([A-Z]{2,}\s{0,1}[A-Z]+)')[1]) # extract the suburb
    .assign(postcode=lambda d: d.Address.str.extract(r'([0-9]{4})\s{0,1}$', expand=False)) # extract the postcode
)

df_cleaned

In [19]:
# folder that the scraped data is written to
output_folder = '/home/alex/Desktop/Data/scraped/gp_locations'

# drop any remaining duplicates and renumber the rows before saving
df_final = df_cleaned.drop_duplicates().reset_index(drop=True)
# NOTE(review): the dataframe index is written as the first CSV column here (no index=False) - confirm this is intended
df_final.to_csv(f'{output_folder}/gp_scraped_071022_partial1.csv')

## 6. Scrape additional info from each of the practices (optional)

This scrapes the individual URLs of each of the practices to get the following additional information:

- Fees
- Opening hours
- Other details

In [None]:
# visit the page of every practice and collect its detailed info
df_all_details_temp = []

# the 'Name' column holds the link of each practice; prefixing it with the
# site root gives the URL that is scraped
for url in df_cleaned['Name'].values:
    print(f'\nRetrieving detailed info for {url}')
    df_all_details_temp.append(get_page_info(f"{url_base2}{url}"))

# combine the per-practice results, drop repeated rows and renumber the rows
df_all_details = (
    pd.concat(df_all_details_temp)
    .drop_duplicates()
    .reset_index(drop=True)
)

## 7. Output this data

In [None]:
# save the detailed practice info to the output folder, without the dataframe index
df_all_details.to_csv(f'{output_folder}/gp_locations_detailed_info_081022.csv', index=False)