In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import re
import itertools

from bs4 import BeautifulSoup
import requests
import time
import datetime

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service


In [2]:
# """ 
# rightmove has a page limit on single searches which means the majority of properties will be missed by searching for 'London'.
# Will search individual postcodes instead.
# """

# # Gather all London postcodes

# postcode_url = r'https://www.doogal.co.uk/london_postcodes'
# postcode_page = requests.get(postcode_url)
# soup = BeautifulSoup(postcode_page.content, 'html.parser')

# ldn_pc_tags = soup.find_all('a', href =re.compile(r"UKPostcodes\?."))
# # found all tags containing london postcodes

# ldn_pc = np.array([tag.string for tag in ldn_pc_tags])
# #used regex to keep just the postcode in the tag

# # All postcodes and associated areas are stored in a dataframe for easy access and referencing later.
# pcdf = pd.DataFrame(ldn_pc, columns=['postcode'])
# pcdf = pcdf.postcode.str.split(':', expand=True).rename(columns={0:'postcode', 1:'areas'})

# # Excluding outer London 
# postcodes = pcdf[:155].postcode.values

In [13]:
class PostcodePropertyScraper:
    
    def __init__(self, postcode, for_sale=True):
        self.postcode = postcode
        self.for_sale = for_sale
        self.dix = {'postcode': [], 'address': [], 'price': [], 'type': [], 
                    'bedrooms': [], 'bathrooms': [], 'link': [] } # All data to be stored in dictionary
        
        self.df =None
        self.outcode_type = None
    
#     def add_postcode(self, new_postcode):
#         self.postcode.append(new_postcode)
        
#     def add_location(self, location): # will give postcodes for location
#         pass
         
    

    
    def get_properties(self):
        
        """Traverses through each webpage with chrome webdriver."""
        
        service = Service(r"C:\ChromeDriver\chromedriver.exe")
        driver = webdriver.Chrome(service= service) # need to use a webdriver as no. bthrms and bdrms stored in javascript.

        url = self.get_url()
        driver.get(url)

        button = driver.find_element(By.XPATH, "//*[@id='submit']").click()
        #the search function does not take user immediately to properties but instead gives a refinement page. moreover, this page can't be bypassed as 
        # a property identifier code is used in url instead of the postcode for pages with listed properties; therefore this step must be carried out once for each postcode. 

        url_page1 = driver.current_url
      
        try:
            location_id = re.search(r"(?<=OUTCODE%)\w+(?=&)", url_page1).group(0)
            self.outcode_type = 'OUTCODE'
            #second url and so on will always have same format (but different to the first). just need to change locationIdentifier    
        except:
            location_id = re.search(r"(?<=REGION%)\w+(?=&)", url_page1).group(0) 
            self.outcode_type = 'REGION'
            #some postcodes come up as a region rather than outcode 

        url = self.get_url(location_id=location_id, iteration=0)
        driver.get(url)
        source = driver.page_source
        soup = BeautifulSoup(source, 'html.parser')        
        final_page = int(soup.find("span", {'class': "pagination-pageInfo", 'data-bind' : "text: total"}).string) #using this to determine when iterator should end
        
        
        
        if final_page == 1:
            self.scrape(driver, url)
            return

        
        
        for i in range(final_page-1):
            iteration = i
            url = self.get_url(location_id, iteration)
            self.scrape(driver, url)
            
            if i < final_page - 1:
                print(f'{self.postcode}     pages remaining: {final_page - i-1}', end='                       \r')
                time.sleep(np.random.randint(1,3)*0.1)
            elif i == final_page - 1:
                print("scrape complete.")
            
            
            
            
            
    def scrape(self, driver, url):
        """Scrapes each property card listed on a page."""
        
        driver.get(url)
        driver.implicitly_wait(10)
        source = driver.page_source

        soup = BeautifulSoup(source, 'html.parser')
        properties = soup.find_all('div', class_ = "l-searchResult is-list")
     

        for prop in properties:

            self.dix['postcode'].append(self.postcode)

            try:
                bathrooms = prop.find('title', text= re.compile(r"(\d+) (?=bath)")).string
            except:  # if num of bathrooms isn't listed
                bathrooms = np.nan
            self.dix['bathrooms'].append(bathrooms)
           
            try:    
                beds = prop.find('title', text= re.compile(r"(\d+) (?=bed)")).string
            except:
                beds = np.nan
            self.dix['bedrooms'].append(beds)

            price = prop.find(class_="propertyCard-priceValue").string
            self.dix['price'].append(price)

            type_ = prop.find('div', class_ = "property-information").find('span').string
            # property type is always the first span element in the banner so this should (hopefully) always return correct info
            self.dix['type'].append(type_)
            
            link = prop.find('a', class_='propertyCard-link')['href']
            self.dix['link'].append(link)

            address = prop.find('address', {'class':"propertyCard-address property-card-updates", 'itemprop':"address"}).find('meta')['content']
            self.dix['address'].append(address)
    
         
        
    def format_data(self):
        """Creates a formatted Dataframe."""
        
    
        self.df = pd.DataFrame(self.dix)
        self.df = self.df.drop_duplicates(subset='link')
        self.df['pc_area'] = self.df.postcode.apply(lambda x: re.match("^\D+", x).group(0))
        self.df.type = self.df.type.str.lower()
        self.df['link'] = self.df.link.apply(lambda x: str('https://www.rightmove.co.uk' + x))
        self.df['date_accessed'] = datetime.date.today()

        for col in ['price', 'bedrooms', 'bathrooms']:
            self.df[col] = self.df[col].str.replace("\D+", '', regex=True)
            self.df[col]= pd.to_numeric( self.df[col], errors='coerce', downcast='signed').astype('float32')


        self.df.loc[df['type']=='apartment', 'type'] = 'flat'
        # df.price.apply(lambda x: re.match("\D+", str(x)).group(0))

        #handling nan entries
        self.df.loc[((self.df.bathrooms.isna()) | (self.df.bedrooms.isna()))  & 
                    (self.df.type == 'studio'), ['bedrooms', 'bathrooms']] = 1
        
        self.df.loc[(self.df.bathrooms.isna()) & (self.df.type == 'flat') & 
                    (self.df.bedrooms==1), 'bathrooms'] = 1
        
        # df['price_per_bedroom'] = (df.price / df.bedrooms).astype('float32')



        # refine property types
        # houses
        #write function for this -- too much repeated code
        def root_property_type(self, pattern, root):
            """Finds root property type of a property.
            
               e.g.: 1)If the listed property type is terrace; house will be returned.
                     2)If listed property is apartment; flat will be returned
                    """
            self.df.loc[self.df.type.str.contains(pattern), 'type_general'] = root
            
        df.loc[df.type.str.contains("terrace"), 'type_general'] = 'house'
        df.loc[df.type.str.contains("house(?! boat)"), 'type_general'] = 'house'
        df.loc[df.type.str.contains("detached"), 'type_general'] = 'house'
        df.loc[df.type.str.contains("duplex"), 'type_general'] = 'house'
        df.loc[df.type.str.contains("bungalow"), 'type_general'] = 'house'

        # flats
        df.loc[df.type.str.contains("flat"), 'type_general'] = 'flat'
        df.loc[df.type.str.contains("apartment"), 'type_general'] = 'flat'
        
        def get_df(self):
            return self.df
    
    
    def get_url(self, location_id=None, iteration=None):
        """Returns url for a given location"""  
        if self.for_sale:
            search_type = "for-sale"
        else:
            search_type = "to-rent"
            
        if iteration == None:
            return fr"https://www.rightmove.co.uk/property-{search_type}/search.html?searchLocation={self.postcode}"
        elif iteration == 0:
            return fr"https://www.rightmove.co.uk/property-{search_type}/find.html?locationIdentifier={self.outcode_type}%{location_id}&numberOfPropertiesPerPage=100"
        
        return fr"https://www.rightmove.co.uk/property-{search_type}/find.html?locationIdentifier={self.outcode_type}%{location_id}&numberOfPropertiesPerPage=100&index={iteration*100}"#"&propertyTypes=&mustHave=&dontShow=&furnishTypes=&keywords="
            
            
       

In [None]:
# from contextlib import contextmanager

# @contextmanager
# def sqlcontext():
    #opens connection
    #yield db -- insert data into sql db
    #close connection
    
# with sqlcontext as db:
#  db.execute("SELECT * ....")

In [14]:
jarrow = PostcodePropertyScraper('Dunston', for_sale=True)
jarrow.get_properties()

In [25]:
jarrow_df = jarrow.get_data()
# jarrow_df.loc[jarrow_df['type'] == 'bungalow', 'link'].values
jarrow_df.sort_values(by='price', ascending=False).link.values

array(['https://www.rightmove.co.uk/properties/129235994#/?channel=RES_BUY',
       'https://www.rightmove.co.uk/properties/129296768#/?channel=RES_BUY',
       'https://www.rightmove.co.uk/properties/128000681#/?channel=RES_BUY',
       'https://www.rightmove.co.uk/properties/85747683#/?channel=RES_BUY',
       'https://www.rightmove.co.uk/properties/127994480#/?channel=RES_BUY',
       'https://www.rightmove.co.uk/properties/129376193#/?channel=RES_BUY',
       'https://www.rightmove.co.uk/properties/129384284#/?channel=RES_BUY',
       'https://www.rightmove.co.uk/properties/128389418#/?channel=RES_BUY',
       'https://www.rightmove.co.uk/properties/127753010#/?channel=RES_BUY',
       'https://www.rightmove.co.uk/properties/126593606#/?channel=RES_BUY',
       'https://www.rightmove.co.uk/properties/124998386#/?channel=RES_BUY',
       'https://www.rightmove.co.uk/properties/124700141#/?channel=RES_BUY',
       'https://www.rightmove.co.uk/properties/124622516#/?channel=RES_BUY',


In [4]:
class Ex:
    def __init__(self):
        self.x = None
    def change_x(self, inp):
        x = self.x
        x = inp
    def get_x(self):
        return self.x
    
    
inst = Ex()
inst.change_x(5)
print(inst.get_x())

None


In [None]:
"""
user inputs list of areas -> outputs properties available to buy or rent



check most expensive area at time of search
do permutation test to check p value


"""
