In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import re
import itertools

from bs4 import BeautifulSoup
import requests
import time

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service


In [2]:
# """
# Rightmove caps the number of result pages returned by a single search, so searching for 'London'
# as a whole would miss the majority of properties. Individual postcodes are searched instead.
# """

# # Gather all London postcodes

# postcode_url = r'https://www.doogal.co.uk/london_postcodes'
# postcode_page = requests.get(postcode_url)
# soup = BeautifulSoup(postcode_page.content, 'html.parser')

# ldn_pc_tags = soup.find_all('a', href =re.compile(r"UKPostcodes\?."))
# # found all tags containing london postcodes

# ldn_pc = np.array([tag.string for tag in ldn_pc_tags])
# # keep only each tag's text (the regex above selected the tags); it is split on ':' below

# # All postcodes and associated areas are stored in a dataframe for easy access and referencing later.
# pcdf = pd.DataFrame(ldn_pc, columns=['postcode'])
# pcdf = pcdf.postcode.str.split(':', expand=True).rename(columns={0:'postcode', 1:'areas'})

# # Excluding outer London 
# postcodes = pcdf[:155].postcode.values

In [12]:
class PostcodePropertyScraper:
    """Scrape Rightmove search results for one postcode into a table.

    Typical use::

        scraper = PostcodePropertyScraper('NE32', for_sale=True)
        scraper.get_properties()        # drives Chrome and fills ``scraper.dix``
        listings = scraper.get_data()   # cleaned pandas DataFrame

    Parameters
    ----------
    postcode : str
        Search term passed to Rightmove's location search, e.g. ``'NE32'``.
    for_sale : bool, default True
        ``True`` searches the "for-sale" listings, ``False`` the "to-rent" ones.
    driver_path : str, optional
        Path of the ChromeDriver executable handed to Selenium.

    Attributes
    ----------
    dix : dict of str -> list
        Raw scraped values, one list per field; every list has one entry per
        property card seen so far.
    """

    BASE_URL = "https://www.rightmove.co.uk"
    RESULTS_PER_PAGE = 100
    DEFAULT_DRIVER_PATH = r"C:\ChromeDriver\chromedriver.exe"

    # Property "type" substrings that map onto the coarse ``type_general`` label.
    _HOUSE_PATTERN = r"terrace|house(?! boat)|detached|duplex"
    _FLAT_PATTERN = r"flat|apartment"

    # The bed/bath counts live in <title> tags such as "3 bedrooms".
    _BED_RE = re.compile(r"(\d+) (?=bed)")
    _BATH_RE = re.compile(r"(\d+) (?=bath)")

    def __init__(self, postcode, for_sale=True, driver_path=DEFAULT_DRIVER_PATH):
        self.postcode = postcode
        self.for_sale = for_sale
        self.driver_path = driver_path
        # All scraped data is accumulated in this dictionary of parallel lists.
        self.dix = {'postcode': [], 'address': [], 'price': [], 'type': [],
                    'bedrooms': [], 'bathrooms': [], 'link': []}

    # TODO: support adding further postcodes, and resolving a location name
    #       to its postcodes.

    def get_properties(self):
        """Run the search in Chrome and scrape every results page into ``self.dix``.

        A webdriver is needed because the pages are rendered with JavaScript.
        The search form does not lead straight to the listings: it shows a
        refinement page first, and the listing pages are addressed by a
        location identifier rather than by the postcode, so the refinement
        step has to be carried out once per postcode to learn that identifier.

        Raises
        ------
        ValueError
            If the URL reached after the refinement page contains neither an
            ``OUTCODE%`` nor a ``REGION%`` location identifier.
        """
        driver = webdriver.Chrome(service=Service(self.driver_path))
        try:
            # Implicit waits only apply to element lookups, so set it before
            # looking for the submit button.
            driver.implicitly_wait(10)
            driver.get(self.get_url())
            driver.find_element(By.XPATH, "//*[@id='submit']").click()

            location_type, location_id = self._parse_location(driver.current_url)

            # Every results page shares one URL format; only the index changes.
            driver.get(self.get_url(location_id, 0, location_type))
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            final_page = self._page_count(soup)  # tells the loop when to stop
            self._parse_results(soup)            # first page is already loaded

            for page in range(1, final_page):
                print(f'{self.postcode}     pages remaining: {final_page - page}', end='                       \r')
                time.sleep(np.random.randint(1, 3) * 0.1)  # short pause between requests
                self.scrape(driver, self.get_url(location_id, page, location_type))

            print("scrape complete.")
        finally:
            # Always release the browser, even when scraping fails part-way.
            driver.quit()

    @staticmethod
    def _parse_location(results_url):
        """Return ``(location_type, location_id)`` found in a results-page URL."""
        # Some postcodes come up as a region rather than an outcode.
        for location_type in ("OUTCODE", "REGION"):
            match = re.search(rf"(?<={location_type}%)\w+(?=&|$)", results_url)
            if match:
                return location_type, match.group(0)
        raise ValueError(f"no OUTCODE/REGION location identifier in URL: {results_url!r}")

    @staticmethod
    def _page_count(soup):
        """Return the number of result pages shown in the pagination, at least 1."""
        total = soup.find("span", {'class': "pagination-pageInfo", 'data-bind': "text: total"})
        try:
            return max(int(total.string), 1)
        except (AttributeError, TypeError, ValueError):
            # No (readable) pagination element: treat it as a single page.
            return 1

    def scrape(self, driver, url):
        """Load ``url`` in ``driver`` and append every property card to ``self.dix``."""
        driver.get(url)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        self._parse_results(soup)

    def _parse_results(self, soup):
        """Append one complete record per property card found in ``soup``."""
        for card in soup.find_all('div', class_="l-searchResult is-list"):
            record = self._parse_card(card)
            # Append all fields together so the lists always stay equal length.
            for field, value in record.items():
                self.dix[field].append(value)

    def _parse_card(self, card):
        """Extract the fields of a single property card; absent fields become None/NaN."""

        def extract(finder, missing=None):
            # A missing tag/attribute shows up as one of these three errors.
            try:
                value = finder()
            except (AttributeError, TypeError, KeyError):
                return missing
            if value is None:
                return missing
            # Plain str drops the reference to the parse tree held by bs4 strings.
            return str(value) if isinstance(value, str) else value

        return {
            'postcode': self.postcode,
            'address': extract(lambda: card.find(
                'address',
                {'class': "propertyCard-address property-card-updates", 'itemprop': "address"},
            ).find('meta')['content']),
            'price': extract(lambda: card.find(class_="propertyCard-priceValue").string),
            # Property type is taken from the first span of the information banner.
            'type': extract(lambda: card.find('div', class_="property-information").find('span').string),
            # NaN when the number of bedrooms/bathrooms is not listed.
            'bedrooms': extract(lambda: card.find('title', string=self._BED_RE).string, np.nan),
            'bathrooms': extract(lambda: card.find('title', string=self._BATH_RE).string, np.nan),
            'link': extract(lambda: card.find('a', class_='propertyCard-link')['href']),
        }

    @staticmethod
    def _digits_only(value):
        """Strip every non-digit character from a string; pass other values through."""
        return re.sub(r"\D+", "", value) if isinstance(value, str) else value

    def get_data(self):
        """Return the scraped records as a cleaned DataFrame.

        Duplicate links are dropped, links are made absolute, ``price``,
        ``bedrooms`` and ``bathrooms`` are reduced to their digits and stored
        as float32 (NaN when absent), ``type`` is lower-cased, and three
        columns are added: ``pc_area`` (leading non-digit part of the
        postcode), ``date_accessed`` (``time.ctime()`` at call time) and
        ``type_general`` ('house', 'flat' or NaN).
        """
        # .copy() so the column assignments below act on an independent frame.
        df = pd.DataFrame(self.dix).drop_duplicates(subset='link').copy()

        df['pc_area'] = df['postcode'].str.extract(r"^(\D+)", expand=False)
        df['type'] = df['type'].str.lower()
        df['link'] = self.BASE_URL + df['link']
        df['date_accessed'] = time.ctime()

        for col in ('price', 'bedrooms', 'bathrooms'):
            # Element-wise cleaning also works when the whole column is NaN.
            digits = df[col].map(self._digits_only)
            df[col] = pd.to_numeric(digits, errors='coerce').astype('float32')

        df.loc[df['type'] == 'apartment', 'type'] = 'flat'

        # Handling NaN entries: a studio has one room and one bathroom, and a
        # one-bedroom flat is given one bathroom.
        rooms_missing = df['bathrooms'].isna() | df['bedrooms'].isna()
        df.loc[rooms_missing & (df['type'] == 'studio'), ['bedrooms', 'bathrooms']] = 1
        df.loc[df['bathrooms'].isna() & (df['type'] == 'flat') & (df['bedrooms'] == 1), 'bathrooms'] = 1

        # Refine property types; na=False keeps rows without a type unlabelled.
        is_house = df['type'].str.contains(self._HOUSE_PATTERN, na=False)
        is_flat = df['type'].str.contains(self._FLAT_PATTERN, na=False)
        type_general = pd.Series(np.nan, index=df.index, dtype=object)
        type_general.loc[is_house] = 'house'
        type_general.loc[is_flat] = 'flat'  # the flat label wins when both match
        df['type_general'] = type_general

        return df

    def get_url(self, location_id=None, iteration=None, location_type="OUTCODE"):
        """Build a Rightmove URL for this scraper's postcode and search type.

        Parameters
        ----------
        location_id : str, optional
            Identifier that follows ``OUTCODE%`` / ``REGION%`` in results URLs.
        iteration : int, optional
            ``None`` returns the initial search URL; ``0`` the first results
            page; any other value the page starting at
            ``iteration * RESULTS_PER_PAGE``.
        location_type : str, default "OUTCODE"
            Identifier prefix; pass ``"REGION"`` for locations listed as regions.
        """
        search_type = "for-sale" if self.for_sale else "to-rent"
        base = f"{self.BASE_URL}/property-{search_type}"

        if iteration is None:
            return f"{base}/search.html?searchLocation={self.postcode}"

        url = (f"{base}/find.html?locationIdentifier={location_type}%{location_id}"
               f"&numberOfPropertiesPerPage={self.RESULTS_PER_PAGE}")
        if iteration != 0:
            url += f"&index={iteration * self.RESULTS_PER_PAGE}"
        return url
            
            
       

In [13]:
# Scrape the for-sale listings for the NE32 postcode; this launches Chrome.
jarrow = PostcodePropertyScraper(postcode='NE32', for_sale=True)
jarrow.get_properties()

In [16]:
# Turn the scraped records into the cleaned listings table and display it.
jarrow_df = jarrow.get_data()
jarrow_df

Unnamed: 0,postcode,address,price,type,bedrooms,bathrooms,link,pc_area,date_accessed,type_general
0,NE32,"Newlyn Drive, Jarrow",145000.0,terraced,3.0,1.0,https://www.rightmove.co.uk/properties/1294586...,NE,Sat Nov 26 16:16:50 2022,house
1,NE32,"York Avenue, Jarrow, Tyne and Wear, NE32 5QP",495000.0,detached,4.0,,https://www.rightmove.co.uk/properties/1294197...,NE,Sat Nov 26 16:16:50 2022,house
2,NE32,"York Avenue, Jarrow",350000.0,semi-detached,3.0,1.0,https://www.rightmove.co.uk/properties/1282663...,NE,Sat Nov 26 16:16:50 2022,house
3,NE32,"York Avenue, Jarrow, Tyne and Wear, NE32",315000.0,semi-detached,3.0,1.0,https://www.rightmove.co.uk/properties/1254567...,NE,Sat Nov 26 16:16:50 2022,house
4,NE32,"Bede Burn View, Jarrow, Tyne and Wear, NE32 5PQ",310000.0,bungalow,3.0,,https://www.rightmove.co.uk/properties/1209610...,NE,Sat Nov 26 16:16:50 2022,
...,...,...,...,...,...,...,...,...,...,...
58,NE32,"Inverness Road, Jarrow",54950.0,flat,1.0,1.0,https://www.rightmove.co.uk/properties/8575632...,NE,Sat Nov 26 16:16:50 2022,flat
59,NE32,"Breamish Street, Jarrow, Tyne and Wear, NE32 5SH",40000.0,flat,3.0,,https://www.rightmove.co.uk/properties/1292120...,NE,Sat Nov 26 16:16:50 2022,flat
60,NE32,"St. Pauls Road, Jarrow, Tyne and Wear, NE32 3AS",35000.0,ground flat,2.0,,https://www.rightmove.co.uk/properties/1274935...,NE,Sat Nov 26 16:16:50 2022,flat
61,NE32,"3 & 5 Russell Street, Jarrow, Tyne And Wear, N...",29000.0,end of terrace,4.0,,https://www.rightmove.co.uk/properties/1292130...,NE,Sat Nov 26 16:16:50 2022,house


In [None]:
"""
The user inputs a list of areas -> the notebook outputs the properties available to buy or rent.



Check which area is the most expensive at the time of the search.
Do a permutation test to check the p-value.


"""
