In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import re
import itertools

from bs4 import BeautifulSoup
import requests
import time

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service


In [2]:
# """ 
# rightmove has a page limit on single searches which means the majority of properties will be missed by searching for 'London'.
# Will search individual postcodes instead.
# """

# # Gather all London postcodes

# postcode_url = r'https://www.doogal.co.uk/london_postcodes'
# postcode_page = requests.get(postcode_url)
# soup = BeautifulSoup(postcode_page.content, 'html.parser')

# ldn_pc_tags = soup.find_all('a', href =re.compile(r"UKPostcodes\?."))
# # found all tags containing london postcodes

# ldn_pc = np.array([tag.string for tag in ldn_pc_tags])
# #used regex to keep just the postcode in the tag

# # All postcodes and associated areas are stored in a dataframe for easy access and referencing later.
# pcdf = pd.DataFrame(ldn_pc, columns=['postcode'])
# pcdf = pcdf.postcode.str.split(':', expand=True).rename(columns={0:'postcode', 1:'areas'})

# # Excluding outer London 
# postcodes = pcdf[:155].postcode.values

In [20]:
class PostcodePropertyScraper:
    
    def __init__(self, postcode):
        self.postcode = postcode
        self.dix = {'postcode': [], 'price': [], 'type': [], 'bedrooms': [], 'bathrooms': [], 'link': [], 'address': [] } # All data to be stored in dictionary

    def scrape(self, driver, url):
        driver.get(url)
        driver.implicitly_wait(10)
        source = driver.page_source

        soup = BeautifulSoup(source, 'html.parser')
        props = soup.find_all('div', class_ = "l-searchResult is-list")
     

        for prop in props:

            self.dix['postcode'].append(self.postcode)

            try:
                bathrooms = prop.find('title', text= re.compile(r"(\d+) (?=bath)")).string
            except:  # if num of bathrooms isn't listed
                bathrooms = np.nan

            self.dix['bathrooms'].append(bathrooms)
           
            try:    
                beds = prop.find('title', text= re.compile(r"(\d+) (?=bed)")).string
            except:
                beds = np.nan

            self.dix['bedrooms'].append(beds)

            price = prop.find('span', class_="propertyCard-priceValue").string
           
            self.dix['price'].append(price)

            type_ = prop.find('div', class_ = "property-information").find('span').string
            # property type is always the first span element in the banner so this should (hopefully) always return correct info

            self.dix['type'].append(type_)

            link = prop.find('a', class_='propertyCard-link')['href']

            self.dix['link'].append(link)

            address = prop.find('address', {'class':"propertyCard-address property-card-updates", 'itemprop':"address"}).find('meta')['content']

            self.dix['address'].append(address)


            time.sleep(np.random.randint(1,3)*0.1)
    
    
    def get_properties(self):
        
        service = Service(r"C:\ChromeDriver\chromedriver.exe")
        driver = webdriver.Chrome(service= service) # need to use a webdriver as the webpages contain javascript

        url = fr"https://www.rightmove.co.uk/property-to-rent/search.html?searchLocation={self.postcode}&useLocationIdentifier=false&locationIdentifier=&rent=To+rent"
        driver.get(url)

        button = driver.find_element(By.XPATH, "//*[@id='submit']").click()
        #the search function does not take user immediately to properties but instead gives a refinement page. moreover, this page can't be bypassed as 
        # a property identifier code is used in url instead of the postcode for pages with listed properties; therefore this step must be carried out once for each postcode. 

        url_page1 = driver.current_url
      
        try:
            location_id = re.search(r"(?<=OUTCODE%)\w+(?=&)", url_page1).group(0)
            #second url and so on will always have same format (but different to the first). just need to change locationIdentifier    
        except:
            location_id = re.search(r"(?<=REGION%)\w+(?=&)", url_page1).group(0) 
            #some postcodes come up as a region rather than outcode 

        source = driver.page_source
        soup = BeautifulSoup(source, 'html.parser')
        final_page = int(soup.find("span", {'class': "pagination-pageInfo", 'data-bind' : "text: total"}).string) #using this to determine when iterator should end
        
        if final_page == 1:
            url = url_page1
            self.scrape(driver, url)
            return

        for i in range(final_page-1):
            if i == 0:
                url = url_page1
            else:
                # each page shows 24 properties. the url contains the index of the final property of the page. 
                url  = fr"https://www.rightmove.co.uk/property-to-rent/find.html?locationIdentifier=OUTCODE%{location_id}&index={i*24}&propertyTypes=&includeLetAgreed=false&mustHave=&dontShow=&furnishTypes=&keywords="
           
            self.scrape(driver, url)
            
            if i < final_page:
                print(f'{self.postcode}     pages remaining: {final_page - i-1}', end='                       \r')
            elif i == final_page - 1:
                print("scrape complete.")
                
            


In [21]:
jarrow = PostcodePropertyScraper('NE3')
jarrow.get_properties()


NE3     pages remaining: 1                       

In [22]:
jarrow.dix

{'postcode': ['NE3',
  'NE3',
  'NE3',
  'NE3',
  'NE3',
  'NE3',
  'NE3',
  'NE3',
  'NE3',
  'NE3',
  'NE3',
  'NE3',
  'NE3',
  'NE3',
  'NE3',
  'NE3',
  'NE3',
  'NE3',
  'NE3',
  'NE3',
  'NE3',
  'NE3',
  'NE3',
  'NE3',
  'NE3',
  'NE3',
  'NE3',
  'NE3',
  'NE3',
  'NE3',
  'NE3',
  'NE3',
  'NE3',
  'NE3',
  'NE3',
  'NE3',
  'NE3',
  'NE3',
  'NE3',
  'NE3',
  'NE3',
  'NE3',
  'NE3',
  'NE3',
  'NE3',
  'NE3',
  'NE3',
  'NE3',
  'NE3'],
 'price': ['£775 pcm',
  '£775 pcm',
  '£699 pcm',
  '£825 pcm',
  '£1,135 pcm',
  '£1,700 pcm',
  '£1,500 pcm',
  '£699 pcm',
  '£1,100 pcm',
  '£1,095 pcm',
  '£1,300 pcm',
  '£900 pcm',
  '£750 pcm',
  '£795 pcm',
  '£900 pcm',
  '£500 pcm',
  '£900 pcm',
  '£725 pcm',
  '£1,195 pcm',
  '£1,100 pcm',
  '£1,100 pcm',
  '£895 pcm',
  '£1,040 pcm',
  '£1,200 pcm',
  '£1,100 pcm',
  '£850 pcm',
  '£675 pcm',
  '£1,200 pcm',
  '£600 pcm',
  '£1,295 pcm',
  '£550 pcm',
  '£1,560 pcm',
  '£775 pcm',
  '£750 pcm',
  '£1,195 pcm',
  '£1,195 pcm',

In [4]:
# df = pd.DataFrame(dix)
# df = df.drop_duplicates(subset='link')
# df['pc_area'] = df.postcode.apply(lambda x: re.match("^\D+", x).group(0))
# df.type = df.type.str.lower()


# for col in ['price', 'bedrooms', 'bathrooms']:
#     df[col] =  df[col].str.replace("\D+", '', regex=True)
#     df[col]= pd.to_numeric( df[col], errors='coerce', downcast='signed').astype('float32')


# df.loc[df['type']=='apartment', 'type'] = 'flat'
# # df.price.apply(lambda x: re.match("\D+", str(x)).group(0))


# df.loc[((df.bathrooms.isna()) | (df.bedrooms.isna()))  & (df.type == 'studio'), ['bedrooms', 'bathrooms']] = 1
# df['price_per_bedroom'] = (df.price / df.bedrooms).astype('float32')
# df.loc[(df.bathrooms.isna()) & (df.type == 'flat') & (df.bedrooms==1), 'bathrooms'] = 1



# # refine property types
# # houses
# df.loc[df.type.str.contains("terrace"), 'type'] = 'house'
# df.loc[df.type.str.contains("house(?! boat)"), 'type'] = 'house'
# df.loc[df.type.str.contains("detached"), 'type'] = 'house'
# df.loc[df.type.str.contains("duplex"), 'type'] = 'house'

# # flats
# df.loc[df.type.str.contains("flat"), 'type'] = 'flat'
# df.loc[df.type.str.contains("apartment"), 'type'] = 'flat'