In [3]:
from bs4 import BeautifulSoup
import requests
import warnings
import numpy as np
import pandas as pd
warnings.filterwarnings('ignore')

COUNTIES=['Carlow', 'Cavan', 'Clare', 'Cork', 'Donegal', 'Dublin', 'Galway', 'Kerry', 'Kildare', 'Kilkenny', 'Laois', 'Leitrim', 'Limerick', 'Longford', 'Louth', 'Mayo', 'Meath', 'Monaghan', 'Offaly', 'Roscommon', 'Sligo', 'Tipperary', 'Waterford', 'Westmeath', 'Wexford', 'Wicklow']
URL="https://www.propertypriceregister.ie/Website/npsra/PPR/npsra-ppr.nsf/PPR-By-Date&Start={iteration_start}&Query=%5Bdt_execution_date%5D%3E=01/01/{year}%20AND%20%5Bdt_execution_date%5D%3C01/01/{next_year}%20AND%20%5Bdc_county%5D={county}&County={county}&Year={year}&StartMonth=01&EndMonth=12&Address="
HEADERS = {'Content-Type': 'application/x-www-form-urlencoded'}

def scrape_county(county, year, start):
    # Format URL for given county in a given year
    county_url = URL.format(
        iteration_start=str(start),
        year=str(year),
        next_year=str(year+1),
        county=county
    )
    
    # Request formatted URL
    response = requests.get(county_url, headers=HEADERS, verify=False)
    
    # Parse HTML return with bs4 library
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find the <script> tags with type="text/javascript"
    script_tags = soup.find_all('script', {'type': 'text/javascript'})
    
    # All the results are listed in this javascript variable
    search_result_text = 'var dataSearchResults = '
    
    scraped_data = []
    for script in script_tags:
        script_text = script.string
        if script_text and search_result_text in script_text:
            for l in script_text.strip().split('\n'):
                if search_result_text in l:
                    data_list = l.replace(search_result_text, '').strip()
                    # Since javascript and python arrays are equal, evaluate javascript directly
                    scraped_data = eval(data_list[:-1])
                    print('Scraped %s county in %d year: %d found so far.' % (county, year, start-1+len(scraped_data)))
    return scraped_data

def fix_data(data, county, year):
    # Return none if javascript array is in wrong format
    if len(data) != 3:
        return None
    if len(data[2].split('>')) == 1:
        return None
    sale_date = data[0]
    # Only take numeric and thousands part of the price string
    sale_price = int(''.join([c for c in data[1].split('.')[0] if c.isnumeric()]))
    # Javascript returned address contains a URL, but i only need the address text inside the tag
    sale_address = data[2].split('>')[1].replace(', </a', '').strip()
    return (sale_date, year, county, sale_price, sale_address)
    
all_data = []
for year in range(2010, 2024):
    for county in COUNTIES:
        start = 1
        while True:
            scraped = scrape_county(county, year, start)
            # Fix javascript returned list to the format i need
            for data in scraped:
                fixed_data = fix_data(data, county, year)
                if fixed_data is not None:
                    all_data.append(fixed_data)
            # If page dont contain 250 data, then it is the last page
            if len(scraped) != 250:
                break
            start += 250

Scraped Carlow county in 2010 year: 231 found so far.
Scraped Cavan county in 2010 year: 250 found so far.
Scraped Cavan county in 2010 year: 302 found so far.
Scraped Clare county in 2010 year: 250 found so far.
Scraped Clare county in 2010 year: 500 found so far.
Scraped Clare county in 2010 year: 506 found so far.
Scraped Cork county in 2010 year: 250 found so far.
Scraped Cork county in 2010 year: 500 found so far.
Scraped Cork county in 2010 year: 750 found so far.
Scraped Cork county in 2010 year: 1000 found so far.
Scraped Cork county in 2010 year: 1250 found so far.
Scraped Cork county in 2010 year: 1500 found so far.
Scraped Cork county in 2010 year: 1750 found so far.
Scraped Cork county in 2010 year: 2000 found so far.
Scraped Cork county in 2010 year: 2250 found so far.
Scraped Cork county in 2010 year: 2271 found so far.
Scraped Donegal county in 2010 year: 250 found so far.
Scraped Donegal county in 2010 year: 500 found so far.
Scraped Donegal county in 2010 year: 635 fou

In [4]:
df = pd.DataFrame(np.array(all_data), columns=('Full Date', 'Year', 'County', 'Price', 'Full Address'))

In [5]:
df.sample(20)

Unnamed: 0,Full Date,Year,County,Price,Full Address
216921,16/12/2016,2016,Kildare,295153,"37 The Crescent, Piper's Hill, Killashee"
68907,25/04/2013,2013,Cork,217000,"17 Amberley Lawn, Grange, Douglas"
323399,20/12/2018,2018,Kerry,150000,"SCARTAGLEN, KILLARNEY, CO KERRY"
268400,14/08/2017,2017,Kerry,120000,"20 MEELISH CLOSE, JOHN B KEANE RD, LISTOWEL"
326402,31/08/2018,2018,Kildare,262000,"26 BALLYMANY MANOR, NEWBRIDGE, KILDARE"
524424,25/03/2022,2022,Donegal,85000,"11 OAKFIELD CRESCENT, BUNCRANA, CO DONEGAL, F9..."
154882,07/08/2015,2015,Dublin,340000,"6 WESTON COURT, LUCAN, DUBLIN"
507572,22/06/2021,2021,Waterford,146696,"58 Mount William, Williamstown, Waterford"
324143,30/07/2018,2018,Kerry,33750,"BISHOPSCOURT, BALLYDUFF, TRALEE"
295265,11/12/2018,2018,Cork,235000,"7 WELLINGTON PLACE, SUNDAYS WELL RD, CORK"


In [6]:
df.shape

(612233, 5)

In [7]:
df.to_csv('property_sales.csv')

In [10]:
df.describe


<bound method NDFrame.describe of          Full Date  Year   County   Price  \
0       23/12/2010  2010   Carlow  140088   
1       23/12/2010  2010   Carlow  149000   
2       22/12/2010  2010   Carlow  155700   
3       21/12/2010  2010   Carlow  215859   
4       21/12/2010  2010   Carlow  220264   
...            ...   ...      ...     ...   
612228  05/01/2023  2023  Wicklow  390000   
612229  05/01/2023  2023  Wicklow  350000   
612230  04/01/2023  2023  Wicklow  635000   
612231  04/01/2023  2023  Wicklow  300000   
612232  04/01/2023  2023  Wicklow  140000   

                                             Full Address  
0       108 Browneshill Wood, Browneshill Road, Carlow...  
1                41 Gort Na Greine, Ballinabranna, Carlow  
2                  84 Sandhills, Hacketstown Road, Carlow  
3                    31 De Lacey Abbey, Rathvilly, Carlow  
4                    34 De Lacey Abbey, Rathvilly, Carlow  
...                                                   ...  
61222

In [11]:
df.info

<bound method DataFrame.info of          Full Date  Year   County   Price  \
0       23/12/2010  2010   Carlow  140088   
1       23/12/2010  2010   Carlow  149000   
2       22/12/2010  2010   Carlow  155700   
3       21/12/2010  2010   Carlow  215859   
4       21/12/2010  2010   Carlow  220264   
...            ...   ...      ...     ...   
612228  05/01/2023  2023  Wicklow  390000   
612229  05/01/2023  2023  Wicklow  350000   
612230  04/01/2023  2023  Wicklow  635000   
612231  04/01/2023  2023  Wicklow  300000   
612232  04/01/2023  2023  Wicklow  140000   

                                             Full Address  
0       108 Browneshill Wood, Browneshill Road, Carlow...  
1                41 Gort Na Greine, Ballinabranna, Carlow  
2                  84 Sandhills, Hacketstown Road, Carlow  
3                    31 De Lacey Abbey, Rathvilly, Carlow  
4                    34 De Lacey Abbey, Rathvilly, Carlow  
...                                                   ...  
612228 

In [12]:
df.dtypes

Full Date       object
Year            object
County          object
Price           object
Full Address    object
dtype: object