The purpose of this notebook is to extract Post Office data for research purposes only. 

In its current form, it is provided to the reviewers of the paper "Geographic and Temporal Access to Basic Banking Services Offered through Post Offices in Wales". 

In [1]:
# we used Python 3.6 with the libraries below
#import python libraries
import requests # this library is for calling the url
from bs4 import BeautifulSoup #this library is for parsing webpages
from tqdm import tqdm #this library is for monitoring the duration of each task
import pandas as pd #data processing
import geopandas as gpd #spatial data processing
import time #we use this library in order to introduce time breaks bc the Post Office interuprt the extraction periodically
import yaml #we used this library to parse string encoded data from html
import logging

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd #data processing


In [2]:
# Session selection.
# A session is identified by the timestamp at which it was created
# (for example '20230105173553').
#   * To resume a session that was interrupted, set `session` to its timestamp:
#     the notebook will then reopen the pickled dataframe holding the
#     processing data of that session and continue from there.
#   * To start a brand new session, set `session = None`.
session = '20240601125909'

# Pause, in seconds, applied while running the extract so that the server
# does not answer with 429 Too Many Requests.
DELAY = 10.0

# The dataframe is written to disk each time this number of branch details
# has been processed.
SAVE_EVERY = 10

In [3]:
# Create a new scraping "session" when none is configured: download the list
# of all branch links (without the branch details), store it in a dataframe
# indexed by branch code, and pickle that dataframe under a name derived from
# the session timestamp.
if not session:
    # brand new session
    #
    # get the branch list from PO
    url = 'https://www.postoffice.co.uk/all-locations'
    page = requests.get(url)
    # stop here on an HTTP error (e.g. 429 or 5xx) instead of parsing an error page
    page.raise_for_status()
    soup = BeautifulSoup(page.content, 'html.parser')
    # changed January 2024: the links used to carry class_='bsm-link shah'
    branch_links = [branch['href'] for branch in soup.find_all('a', class_='atom xsmall-link')]
    if not branch_links:
        # without this check an empty session file would be written silently,
        # e.g. if the CSS class of the links changes again
        raise RuntimeError(f'No branch links found at {url}; the page layout may have changed.')
    df = pd.DataFrame(
        data={
            # changed January 2024: the code used to be at position 2 of the
            # split link; it is now at position 4
            'code': [link.split('/')[4] for link in branch_links],
            'link': branch_links,
            # nothing has been scraped yet for any branch
            'processed': [False] * len(branch_links)
        })
    df = df.set_index('code')
    # generate time stamp; it becomes the identifier of the new session
    session = time.strftime('%Y%m%d%H%M%S')
    print(f'New session created: {session}')
    filename = 'PO_raw_' + session +'.pickle'
    df.to_pickle(filename)

In [4]:
# Load the dataframe saved for the selected session, print a summary of it,
# and send the log messages of the detail extraction to a session-specific file.
filename = f'PO_raw_{session}.pickle'
print(f'Opening session: {session} for processing')
df = pd.read_pickle(filename)
df.info()
# NOTE(review): logging.basicConfig presumably does nothing when the root
# logger already has handlers (e.g. this cell was run before in the same
# kernel), so the log file may not follow a changed session - confirm.
logging.basicConfig(level=logging.INFO, filename=f'PO_raw_{session}.log')

Opening session: 20240601125909 for processing
<class 'pandas.core.frame.DataFrame'>
Index: 10936 entries, 1512471 to 2585049
Data columns (total 25 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   link        10936 non-null  object 
 1   processed   10936 non-null  bool   
 2   name        10931 non-null  object 
 3   tel         10931 non-null  object 
 4   street      10931 non-null  object 
 5   city        10931 non-null  object 
 6   postalcode  10931 non-null  object 
 7   country     10931 non-null  object 
 8   latitude    10931 non-null  float64
 9   longitude   10931 non-null  float64
 10  MonOpen     10936 non-null  object 
 11  MonClose    10936 non-null  object 
 12  TueOpen     10936 non-null  object 
 13  TueClose    10936 non-null  object 
 14  WedOpen     10936 non-null  object 
 15  WedClose    10936 non-null  object 
 16  ThuOpen     10936 non-null  object 
 17  ThuClose    10936 non-null  object 
 18  FriOpen     1093

In [5]:
# Display the session dataframe as the cell output.
df

Unnamed: 0_level_0,link,processed,name,tel,street,city,postalcode,country,latitude,longitude,...,WedClose,ThuOpen,ThuClose,FriOpen,FriClose,SatOpen,SatClose,services,SunOpen,SunClose
code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1512471,https://www.postoffice.co.uk/branch-finder/151...,True,Abberley,0345 611 2970,"Stockton Road, Abberley, Worcester",Worcester,WR6 6AR,GB,52.30395,-2.37739,...,13:00,09:00,17:00,09:00,17:00,09:00,12:30,[],00:00,00:00
0744700,https://www.postoffice.co.uk/branch-finder/074...,True,Abberton,0345 611 2970,"1 Lion Corner, Mersea Road, Langenhoe",Langenhoe,CO5 7LF,GB,51.83430,0.91080,...,00:00,00:00,00:00,14:00,16:00,00:00,00:00,[],00:00,00:00
1950339,https://www.postoffice.co.uk/branch-finder/195...,True,Abbey Avenue,0345 611 2970,"35 Abbey Avenue, St Albans, Hertfordshire",Hertfordshire,AL3 4BH,GB,51.74140,-0.35710,...,19:00,09:00,19:00,09:00,19:00,09:00,19:00,[],09:00,13:00
5593298,https://www.postoffice.co.uk/branch-finder/559...,True,Abbey Drive,0345 611 2970,"50-54 Abbey Drive, Jarrow, Tyne and Wear",Tyne and Wear,NE32 3QG,GB,54.98200,-1.48130,...,23:00,07:00,23:00,07:00,23:00,07:00,23:00,[],07:00,23:00
3363511,https://www.postoffice.co.uk/branch-finder/336...,True,Abbey Farm Drop + Collect,0345 611 2970,"11 Diamond Crescent, Swindon, Wiltshire",Wiltshire,SN25 2SJ,GB,51.60970,-1.79970,...,22:00,07:00,22:00,07:00,22:00,07:00,22:00,[],07:00,22:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2756110,https://www.postoffice.co.uk/branch-finder/275...,True,Ystrad Mynach,0345 611 2970,"35 Penallta Road, Ystrad Mynach, Hengoed",Hengoed,CF82 7AP,GB,51.64233,-3.23744,...,17:30,09:00,17:30,09:00,17:30,09:00,12:30,[],00:00,00:00
4746112,https://www.postoffice.co.uk/branch-finder/474...,True,Ystradfellte,0345 611 2970,"St Marys Church Hall, Ystradfellte, Aberdare",Aberdare,CF44 9JE,GB,51.81151,-3.55110,...,00:00,00:00,00:00,00:00,00:00,00:00,00:00,[],00:00,00:00
0036420,https://www.postoffice.co.uk/branch-finder/003...,True,Ystradgynlais,0345 611 2970,"30 Brecon Road, Ystradgynlais, Swansea",Swansea,SA9 1HE,GB,51.77820,-3.75660,...,17:30,09:00,17:30,09:00,17:30,09:00,13:00,[],00:00,00:00
2769093,https://www.postoffice.co.uk/branch-finder/276...,True,Zealand Road,0345 611 2970,"70 Zealand Road, Canterbury, Kent",Kent,CT1 3QB,GB,51.27100,1.07660,...,17:00,08:00,17:00,08:00,17:00,08:00,15:00,[],00:00,00:00


In [6]:
# Extract the details of every branch that is not yet marked as processed and
# store them in `df`. The cell can be interrupted and re-run: branches already
# processed are skipped, the dataframe is pickled every SAVE_EVERY handled
# branches, and it is pickled once more when the loop ends for any reason
# (normal completion, exception or manual interruption).

# keeps track of the processed branches for saving SAVE_EVERY processed branches
processed = 0
logging.info(f'(re)processing session {session}')

try:
    for i, branch in tqdm(df.iterrows(), total=df.shape[0]):

        # if the item has been already processed, it goes to the next one
        if branch.processed:
            # for a row yielded by iterrows(), `branch.name` is the index
            # label of the row (the branch code)
            logging.info(f'branch {branch.name} already processes, skipping')
            continue

        # handle HTTP Error 429 (Too Many Requests)
        # NOTE(review): requests.get is called without a timeout - confirm
        # that an unresponsive server cannot stall the extraction.
        for attempt in range(5):
            # changed January 2024: the links are now absolute URLs
            # (they used to be prefixed with 'https://postoffice.co.uk')
            page = requests.get(branch.link)
            if page.status_code != 429:
                break
            logging.warning('HTTP Error 429 (Too Many Requests) issued by the server. Will now sleep for 1 min.')
            time.sleep(60)
            # the delay only grows while it is at most 6.0 seconds; with a
            # larger configured DELAY this adjustment never triggers
            if DELAY <= 6.0:
                DELAY += 1
                logging.warning(f'Increased DELAY to {DELAY}')
        else:
            # The `else` of a `for` runs only when the loop was not left via
            # `break`, i.e. when all 5 attempts were answered with 429. We
            # wait for 5 mins and skip that branch; it stays unprocessed and
            # will be picked up when the cell is run again. (Testing
            # `attempt == 4` instead would also discard a successful fifth
            # attempt.)
            logging.warning('Still have HTTP Error 429 after 5 attempts. Will sleep for 5 mins now')
            logging.warning(f'Branch {branch.link} will need to be reprocessed')
            time.sleep(5*60)
            continue

        # get branch details
        soup = BeautifulSoup(page.content, 'html.parser')
        # it searches for a script element which embeds the details about the branch as JSON-LD
        data_raw = soup.find_all("script", type="application/ld+json")

        if len(data_raw) == 0:
            logging.warning(f'Response for branch {branch.link} is malformed; it will remain empty.')
            data = {}
            services = []
        else:
            # "data" is the info about the branch. "contents" is the text in this script
            # NOTE(review): yaml.load with FullLoader is applied to content
            # downloaded from the web; consider yaml.safe_load if the payload
            # allows it.
            data = yaml.load(data_raw[0].contents[0], Loader=yaml.FullLoader)
            # using BeautifulSoup we find the branch services as a list
            services = soup.find_all(class_='productDetail')

        # update the info in the df
        if 'name' in data:        # there are some records that do not have details
            df.at[i, 'name'] = data['name']
            df.at[i, 'tel'] = data['telephone']
            df.at[i, 'street'] = data['address']['streetAddress']
            df.at[i, 'city'] = data['address']['addressLocality']
            df.at[i, 'postalcode'] = data['address']['postalCode']
            df.at[i, 'country'] = data['address']['addressCountry']
            df.at[i, 'latitude'] = data['geo']['latitude']
            df.at[i, 'longitude'] = data['geo']['longitude']
            for hours in data['openingHoursSpecification']:
                day = hours['dayOfWeek'][:3]       # first 3 letters of the day
                df.at[i, day+'Open'] = hours['opens']
                df.at[i, day+'Close'] = hours['closes']
            # the list of service names is stored as its string representation
            df.at[i, 'services'] = f'{[service.div.text.strip() for service in services]}'
            # mark the branch as processed
            df.at[i, 'processed'] = True
            logging.info(f'branch {data["name"]} processed successfuly')
        else:
            logging.warning(f'Resoponse for branch {branch.link} is malformed; it will remain empty.')

        # save the dataframe after SAVE_EVERY branches processed
        processed += 1
        if processed % SAVE_EVERY == 0:
            df.to_pickle(filename)

        # wait for DELAY seconds to avoid 429 HTTP error (Too Many Requests)
        time.sleep(DELAY)
finally:
    # persist whatever has been collected since the last periodic save, so an
    # exception or an interruption does not lose up to SAVE_EVERY - 1 branches
    df.to_pickle(filename)

100%|████████████████████████████████████| 10936/10936 [01:08<00:00, 159.62it/s]


In [7]:
# Opening/closing cells that are still empty receive the placeholder '00:00';
# the completed dataframe is then written back to the session pickle.
day_prefixes = ('Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun')
hour_columns = [prefix + suffix
                for prefix in day_prefixes
                for suffix in ('Open', 'Close')]
for hour_column in hour_columns:
    df[hour_column] = df[hour_column].fillna('00:00')
df.to_pickle(filename)

In [8]:
# Build a GeoDataFrame from the branch table: one point per row, created from
# the longitude (x) and latitude (y) columns, with the CRS set to EPSG:4326.
gdf = gpd.GeoDataFrame(
    df,
    geometry=gpd.points_from_xy(x=df['longitude'], y=df['latitude']),
    crs='epsg:4326',
)

In [9]:
# Display the GeoDataFrame as the cell output.
gdf

Unnamed: 0_level_0,link,processed,name,tel,street,city,postalcode,country,latitude,longitude,...,ThuOpen,ThuClose,FriOpen,FriClose,SatOpen,SatClose,services,SunOpen,SunClose,geometry
code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1512471,https://www.postoffice.co.uk/branch-finder/151...,True,Abberley,0345 611 2970,"Stockton Road, Abberley, Worcester",Worcester,WR6 6AR,GB,52.30395,-2.37739,...,09:00,17:00,09:00,17:00,09:00,12:30,[],00:00,00:00,POINT (-2.37739 52.30395)
0744700,https://www.postoffice.co.uk/branch-finder/074...,True,Abberton,0345 611 2970,"1 Lion Corner, Mersea Road, Langenhoe",Langenhoe,CO5 7LF,GB,51.83430,0.91080,...,00:00,00:00,14:00,16:00,00:00,00:00,[],00:00,00:00,POINT (0.91080 51.83430)
1950339,https://www.postoffice.co.uk/branch-finder/195...,True,Abbey Avenue,0345 611 2970,"35 Abbey Avenue, St Albans, Hertfordshire",Hertfordshire,AL3 4BH,GB,51.74140,-0.35710,...,09:00,19:00,09:00,19:00,09:00,19:00,[],09:00,13:00,POINT (-0.35710 51.74140)
5593298,https://www.postoffice.co.uk/branch-finder/559...,True,Abbey Drive,0345 611 2970,"50-54 Abbey Drive, Jarrow, Tyne and Wear",Tyne and Wear,NE32 3QG,GB,54.98200,-1.48130,...,07:00,23:00,07:00,23:00,07:00,23:00,[],07:00,23:00,POINT (-1.48130 54.98200)
3363511,https://www.postoffice.co.uk/branch-finder/336...,True,Abbey Farm Drop + Collect,0345 611 2970,"11 Diamond Crescent, Swindon, Wiltshire",Wiltshire,SN25 2SJ,GB,51.60970,-1.79970,...,07:00,22:00,07:00,22:00,07:00,22:00,[],07:00,22:00,POINT (-1.79970 51.60970)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2756110,https://www.postoffice.co.uk/branch-finder/275...,True,Ystrad Mynach,0345 611 2970,"35 Penallta Road, Ystrad Mynach, Hengoed",Hengoed,CF82 7AP,GB,51.64233,-3.23744,...,09:00,17:30,09:00,17:30,09:00,12:30,[],00:00,00:00,POINT (-3.23744 51.64233)
4746112,https://www.postoffice.co.uk/branch-finder/474...,True,Ystradfellte,0345 611 2970,"St Marys Church Hall, Ystradfellte, Aberdare",Aberdare,CF44 9JE,GB,51.81151,-3.55110,...,00:00,00:00,00:00,00:00,00:00,00:00,[],00:00,00:00,POINT (-3.55110 51.81151)
0036420,https://www.postoffice.co.uk/branch-finder/003...,True,Ystradgynlais,0345 611 2970,"30 Brecon Road, Ystradgynlais, Swansea",Swansea,SA9 1HE,GB,51.77820,-3.75660,...,09:00,17:30,09:00,17:30,09:00,13:00,[],00:00,00:00,POINT (-3.75660 51.77820)
2769093,https://www.postoffice.co.uk/branch-finder/276...,True,Zealand Road,0345 611 2970,"70 Zealand Road, Canterbury, Kent",Kent,CT1 3QB,GB,51.27100,1.07660,...,08:00,17:00,08:00,17:00,08:00,15:00,[],00:00,00:00,POINT (1.07660 51.27100)


In [10]:
# Output path for the spatial export: same prefix and session as the pickle, but without extension.
# CAUTION: this rebinds `filename`, which the extraction cells use as the pickle
# path; re-running those cells after this one would write the pickle to this path.
filename = f'PO_raw_{session}'

In [11]:
# Show the export path as the cell output.
filename

'PO_raw_20240601125909'

In [12]:
# Write the GeoDataFrame to disk under the extension-less path in `filename`.
# NOTE(review): no driver is given and the path has no extension; presumably
# geopandas then writes an ESRI Shapefile into a folder of that name - confirm,
# and check that the column names and text lengths survive that format.
gdf.to_file(filename)

In [13]:
# Print the column summary (non-null counts and dtypes) of the GeoDataFrame.
gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 10936 entries, 1512471 to 2585049
Data columns (total 26 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   link        10936 non-null  object  
 1   processed   10936 non-null  bool    
 2   name        10934 non-null  object  
 3   tel         10934 non-null  object  
 4   street      10934 non-null  object  
 5   city        10934 non-null  object  
 6   postalcode  10934 non-null  object  
 7   country     10934 non-null  object  
 8   latitude    10934 non-null  float64 
 9   longitude   10934 non-null  float64 
 10  MonOpen     10936 non-null  object  
 11  MonClose    10936 non-null  object  
 12  TueOpen     10936 non-null  object  
 13  TueClose    10936 non-null  object  
 14  WedOpen     10936 non-null  object  
 15  WedClose    10936 non-null  object  
 16  ThuOpen     10936 non-null  object  
 17  ThuClose    10936 non-null  object  
 18  FriOpen     10936 non-null  object 

In [14]:
# Summarise the `processed` flag: `top`/`freq` report the most frequent value
# and how many branches carry it.
df['processed'].describe()

count     10936
unique        2
top        True
freq      10934
Name: processed, dtype: object