# webscraping

In [6]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import random
import time
class RightmoveScraper:
    results = []
    
    def fetch(self, url):
        print('HTTP GET request to URL: %s' % url, end='')
        response = requests.get(url)
        print(' | Status code: %s' % response.status_code)

        return response

    def parse(self, html):
        content = BeautifulSoup(html, 'lxml')

        titles = [title.text.strip() for title in content.findAll('h2', {'class': 'propertyCard-title'})]
        addresses = [address['content'] for address in content.findAll('meta', {'itemprop': 'streetAddress'})]
        descriptions = [description.text for description in content.findAll('span', {'data-test': 'property-description'})]

        
        prices = [price.text.strip() for price in content.findAll('span', {'class': 'propertyCard-priceValue'})]
        if len(prices) == 0:
            prices = [price.text.strip() for price in content.findAll('div', {'class': 'propertyCard-priceValue'})]

        dates = [date.text.split(' ')[-1] for date in content.findAll('span', {'class': 'propertyCard-branchSummary-addedOrReduced'})]
        agent = [agent.text.split('by')[-1].strip() for agent in content.findAll('span', {'class': 'propertyCard-branchSummary-branchName'})]
        bedrooms = [title.text.split('bedroom')[0].strip() for title in content.findAll('h2', {'class': 'propertyCard-title'})]
        apartments = content.find_all("div", class_="l-searchResult is-list")
        agent_phone = [phone_number.text.strip() for phone_number in content.find_all('a', {'class': 'propertyCard-contactsPhoneNumber'}) ]
        for b in content.findAll('h3'):
            try:
                bath = b.findAll('span',{'class':'num-icon num-baths'})[0].text.strip()
            except:
                bath='No Info'

        all_apartment_links=[]
        bathrooms=[]
        for i in range(len(apartments)):

            # tracks which apartment we are on in the page
            apartment_no = apartments[i]

            # append link
            apartment_info = apartment_no.find("a", class_="propertyCard-link")
            link = "https://www.rightmove.co.uk" + apartment_info.attrs["href"]
            # url=link
            # response1=requests.get(url)
            # soup1=BeautifulSoup(response1.text,'lxml')
            # rooms=soup1.findAll('div',{'class':'_3OGW_s5TH6aUqi4uHum5Gy'})
            # bathrooms.append(rooms[2].text.split()[0][1])

            # a[2].text.split()[0][1]
            
            
            all_apartment_links.append(link)
        
        for index in range(0, len(titles)):
            self.results.append({
                'title': titles[index],
                'address': addresses[index],
                'description': descriptions[index],
                'listing_url': all_apartment_links[index],
                'price': prices[index],
                'agent': agent[index],
                'bedrooms':bedrooms[index],
                
            })

    
    def to_dataframe(self):
        return pd.DataFrame(self.results)

    def transform_run(self,url):
        response = self.fetch(url)
        self.parse(response.text)
        df = self.to_dataframe()
        df['listing_source']='Rightmove'
        if 'sale' in url:
            df['Transaction_type'] = 'sale'
        else:
            df['Transaction_type'] = 'rent'

    # # remove 'for sale' from 'title' column
        def convert_date(date_str):
            date_obj = datetime.strptime(date_str, '%d/%m/%Y')
            formatted_date_str = date_obj.strftime('%d %b %Y')
            return formatted_date_str
        
        df['property_type'] = df['title'].apply(lambda x: re.search(r'(?<=bedroom\s).*', x, re.IGNORECASE).group() if re.search(r'(?<=bedroom\s).*', x, re.IGNORECASE) else None)
        df['Postcode District'] = df['address'].str.extract(r'\b([A-Z]{1,2}\d{1,2}[A-Z]?)\b')
        df['property_type']=df.property_type.str.replace('for sale','')
        df['price'] = df['price'].str.replace(',', '')
        df['price'] = df['price'].str.replace('£', '')
        df['price'] = df['price'].str.replace('pcm', '')
        df['bedrooms'] = df['bedrooms'].apply(lambda x: 0 if x == 'Studio' or x=='Studio flat' else x) 
        df.price = df.price.str.replace('POA','')
      
        df['price'] = pd.to_numeric(df['price'])
        df = df.drop_duplicates(keep='first')
        return df

        
        


if __name__ == '__main__':
    scraper = RightmoveScraper()
    for sales_page in range(0, 3):
        index = sales_page * 24
        if index == 0:
            sales_url=f'https://www.rightmove.co.uk/property-for-sale/find.html?locationIdentifier=REGION%5E87490&propertyTypes=&includeSSTC=false&mustHave=&dontShow=&furnishTypes=&keywords='
        elif index!=0:
            sales_url=f'https://www.rightmove.co.uk/property-for-sale/find.html?locationIdentifier=REGION%5E87490&index={index}&propertyTypes=&includeSSTC=false&mustHave=&dontShow=&furnishTypes=&keywords='
        sales_data=scraper.transform_run(sales_url)
        time.sleep(random.randint(1, 3))
        
    for rent_page in range(0, 3):
        index = rent_page * 24
        if index == 0:
            rent_url=f'https://www.rightmove.co.uk/property-to-rent/find.html?searchType=RENT&locationIdentifier=REGION%5E87490&insId=1&radius=0.0&minPrice=&maxPrice=&minBedrooms=&maxBedrooms=&displayPropertyType=&maxDaysSinceAdded=&sortByPriceDescending=&_includeLetAgreed=on&primaryDisplayPropertyType=&secondaryDisplayPropertyType=&oldDisplayPropertyType=&oldPrimaryDisplayPropertyType=&letType=&letFurnishType=&houseFlatShare='
        elif index!=0:
            rent_url=f'https://www.rightmove.co.uk/property-to-rent/find.html?locationIdentifier=REGION%5E87490&index={index}&propertyTypes=&includeLetAgreed=false&mustHave=&dontShow=&furnishTypes=&keywords='
        
        rent_data=scraper.transform_run(rent_url)
        time.sleep(random.randint(1, 3))
        
           
    rightmove_data = pd.concat([sales_data, rent_data])
    
    
    
    

    
  

HTTP GET request to URL: https://www.rightmove.co.uk/property-for-sale/find.html?locationIdentifier=REGION%5E87490&propertyTypes=&includeSSTC=false&mustHave=&dontShow=&furnishTypes=&keywords= | Status code: 200
HTTP GET request to URL: https://www.rightmove.co.uk/property-for-sale/find.html?locationIdentifier=REGION%5E87490&index=24&propertyTypes=&includeSSTC=false&mustHave=&dontShow=&furnishTypes=&keywords= | Status code: 200
HTTP GET request to URL: https://www.rightmove.co.uk/property-for-sale/find.html?locationIdentifier=REGION%5E87490&index=48&propertyTypes=&includeSSTC=false&mustHave=&dontShow=&furnishTypes=&keywords= | Status code: 200
HTTP GET request to URL: https://www.rightmove.co.uk/property-to-rent/find.html?searchType=RENT&locationIdentifier=REGION%5E87490&insId=1&radius=0.0&minPrice=&maxPrice=&minBedrooms=&maxBedrooms=&displayPropertyType=&maxDaysSinceAdded=&sortByPriceDescending=&_includeLetAgreed=on&primaryDisplayPropertyType=&secondaryDisplayPropertyType=&oldDisplayPr

In [50]:
rightmove_data.head(2)

Unnamed: 0,title,address,description,listing_url,price,agent,bedrooms,listing_source,Transaction_type,property_type,Postcode District
0,2 bedroom apartment for sale,"Rossetti Court, Highgate, N6",An extremely spacious two bedroom apartment wi...,https://www.rightmove.co.uk/properties/1348191...,800000.0,"Taylor Gibbs, Highgate",2,Rightmove,sale,apartment,N6
1,5 bedroom apartment for sale,"One Hyde Park, Knightsbridge",An exceptional exclusive five bedroom apartmen...,https://www.rightmove.co.uk/properties/1301776...,60000000.0,"The Cloister, London",5,Rightmove,sale,apartment,


In [1]:
from undetected_chromedriver import Chrome  # Import Chrome from undetected_chromedriver library
import time  # Import the time module
from bs4 import BeautifulSoup  # Import BeautifulSoup from bs4 library
import pandas as pd  # Import pandas library for data manipulation
import re  # Import re library for regular expressions
import random  # Import random library for generating random values

# Create a class named ZooplaScraper
class ZooplaScraper:
    results = []  # Define a results list to store the scraped data

    # Method to parse the HTML content and extract property details
    def parse(self, html):
        print('Scraping...')
        soup = BeautifulSoup(html, 'lxml')  # Create a BeautifulSoup object

        # Extract property prices, titles, addresses, descriptions, and agents using CSS selectors
        prices = [price.text.strip() for price in soup.find_all('p', {'data-testid': "listing-price"})]
        titles = [title.text.strip() for title in soup.find_all('h2', {'data-testid': "listing-title"})]
        addresses = [address.text.strip() for address in soup.findAll('h3', {'class': '_1ankud52 _1ftx2fq9'})]
        descriptions = [description.text.strip() for description in
                        soup.find_all('p', {'class': "_1ankud53 _1ftx2fq9"})]
        agents = [agent['alt'] for agent in soup.findAll('img', {'class': "_12bxhf70"})]

        links = soup.findAll('a', {'class': '_1maljyt1'})  # Find all property links
        all_apartment_links = []  # Create a list to store all property links

        # Extract the property links and append them to the all_apartment_links list
        for link in links:
            all_apartment_links.append('https://www.zoopla.co.uk' + link['href'])

        # Iterate over the property details and add them to the results list
        for index in range(0, len(titles)):
            self.results.append({
                'title': titles[index],
                'address': addresses[index],
                'description': descriptions[index],
                'listing_url': all_apartment_links[index],
                'price': prices[index],
                'agent': agents[index]
            })

    # Method to convert the results list to a DataFrame
    def to_dataframe(self):
        return pd.DataFrame(self.results)

    # Method to run the web scraping process
    def transform_run(self, url):
        chrome = Chrome()  # Create a new instance of Chrome
        chrome.get(url)  # Navigate to the specified URL

        html = chrome.page_source  # Get the HTML content of the page
        self.parse(html)  # Call the parse method to extract property details
        df = self.to_dataframe()  # Convert the results to a DataFrame

        # Perform additional data manipulation and feature extraction on the DataFrame
        df['bedrooms'] = df['title'].apply(lambda x: x.split()[0])
        df['bedrooms'] = df['bedrooms'].apply(lambda x: 0 if x == 'Studio' or x == 'Studio flat' else x)
        df['listing_source'] = 'Zoopla'
        if 'sale' in url:
            df['Transaction_type'] = 'sale'
        else:
            df['Transaction_type'] = 'rent'
        df['title'] = df['title'].str.replace('for sale', '')
        df['property_type'] = df['title'].str.extract(r'bed\s+(.*)', flags=re.IGNORECASE)
        df['Postcode District'] = df['address'].str.extract(r'\b([A-Z]{1,2}\d{1,2}[A-Z]?)\b')
        df['price'] = df['price'].str.replace(',', '')
        df['price'] = df['price'].str.replace('£', '')
        df['price'] = df['price'].str.replace('pcm', '')
        df['price'] = df['price'].str.replace('POA', '')
        df['price'] = pd.to_numeric(df['price'])
        df = df.drop_duplicates(keep='first')
        time.sleep(10)
        chrome.quit()  # Close the browser instance
        return df

scraper = ZooplaScraper()  # Create an instance of the ZooplaScraper class

# Scrape property data for sales
for index in range(0, 2):
    if index == 0:
        sales_url = 'https://www.zoopla.co.uk/for-sale/property/london/?q=London&search_source=home'
    elif index != 0:
        sales_url = f'https://www.zoopla.co.uk/for-sale/property/london/?q=London&search_source=home&pn={index}'
    zoopla_sales_data = scraper.transform_run(sales_url)
    time.sleep(random.randint(1, 3))  # Pause for a random amount of time between requests

# Scrape property data for rentals
for index in range(0, 2):
    if index == 0:
        rent_url = 'https://www.zoopla.co.uk/to-rent/property/london/?q=London&search_source=home'
    elif index != 0:
        rent_url = f'https://www.zoopla.co.uk/to-rent/property/london/?q=London&search_source=home&pn={index}'
    zoopla_rent_data = scraper.transform_run(rent_url)
    time.sleep(random.randint(1, 3))  

zoopla_data = pd.concat([zoopla_sales_data, zoopla_rent_data])  # Concatenate the sales and rentals data


Scraping...
Scraping...
Scraping...
Scraping...


In [3]:
from undetected_chromedriver import Chrome  # Import Chrome from undetected_chromedriver library
import time  # Import the time module
from bs4 import BeautifulSoup  # Import BeautifulSoup from bs4 library
import pandas as pd  # Import pandas library for data manipulation
import re  # Import re library for regular expressions
import random  # Import random library for generating random values

# Create a class named OTM
class OTM:
    results = []  # Define a results list to store the scraped data

    # Method to parse the HTML content and extract property details
    def parse(self, html):
        soup = BeautifulSoup(html, 'lxml')  # Create a BeautifulSoup object

        # Extract property titles, prices, and addresses using CSS selectors
        titles = [title.text for title in soup.findAll('span', {'class': 'title'})]
        prices = [price.text for price in soup.findAll('div', {'class': 'otm-Price'})]
        addresses = [address.text for address in soup.findAll('span', {'class': 'address'})]

        list_of_agents = soup.findAll('div', {'class': 'otm-PropertyCardAgent'})  # Find all agent details
        agents = []  # Create a list to store agent names

        # Extract agent names and append them to the agents list
        for i in list_of_agents:
            agent_name = i.find('small').text
            agent_name = re.sub('\s*Marketed by\s*', '', agent_name)
            agents.append(agent_name)

        links = soup.findAll('a', {'class': 'days-otm'})  # Find all property links
        all_apartment_links = []  # Create a list to store all property links

        # Extract the property links and append them to the all_apartment_links list
        for link in links:
            all_apartment_links.append('https://www.onthemarket.com' + link['href'])

        # Iterate over the property details and add them to the results list
        for index in range(0, len(titles)):
            self.results.append({
                'title': titles[index],
                'address': addresses[index],
                'listing_url': all_apartment_links[index],
                'price': prices[index],
                'agent': agents[index]
            })

    # Method to convert the results list to a DataFrame
    def to_dataframe(self):
        df = pd.DataFrame(self.results)  # Create a DataFrame from the results list
        return df

    # Method to run the web scraping process
    def transform_run(self, url):
        chrome = Chrome()  # Create a new instance of Chrome
        chrome.get(url)  # Navigate to the specified URL

        html = chrome.page_source  # Get the HTML content of the page
        self.parse(html)  # Call the parse method to extract property details
        df = self.to_dataframe()  # Convert the results to a DataFrame

        # Perform additional data manipulation and feature extraction on the DataFrame
        df['bedrooms'] = df.title.apply(lambda x: x.split()[0])
        df['bedrooms'] = df['bedrooms'].apply(lambda x: 0 if x == 'Studio' or x == 'Studio flat' else x)
        df['listing_source'] = 'OTM'
        if 'sale' in url:
            df['Transaction_type'] = 'sale'
        else:
            df['Transaction_type'] = 'rent'
        df['title'] = df['title'].str.replace('for sale', '')
        df['property_type'] = df['title'].apply(
            lambda x: re.search(r'(?<=bedroom\s).*', x, re.IGNORECASE).group() if re.search(r'(?<=bedroom\s).*',
                                                                                            x, re.IGNORECASE) else None)
        df['Postcode District'] = df['address'].str.extract(r'\b([A-Z]{1,2}\d{1,2}[A-Z]?)\b')
        
        df['price'] = df['price'].str.replace('pcm', '')
        df['price'] = df['price'].apply(lambda x: re.search(r"£([\d,]+)", x).group(1).replace(",", "")
                                        if re.search(r"£([\d,]+)", x) else None)
        df['price'] = df['price'].str.replace('pcm', '')
      
        df.insert(2, 'description', '')
        
        chrome.quit()  # Close the browser instance
        return df

otm_scraper = OTM()  # Create an instance of the OTM class

# Scrape property data for sales
for index in range(0, 2):
    if index == 0:
        sales_url = 'https://www.onthemarket.com/for-sale/property/london/?view=grid'
    elif index != 0:
        sales_url = f'https://www.onthemarket.com/for-sale/property/london/?page={index}&view=grid'
    otm_sales_data = otm_scraper.transform_run(sales_url)
    time.sleep(random.randint(1, 3))  # Pause for a random amount of time between requests to avoid overwhelming the website

# Scrape property data for rentals
for index in range(0, 2):
    if index == 0:
        rent_url = 'https://www.onthemarket.com/to-rent/property/london/?view=grid'
    elif index != 0:
        rent_url = f'https://www.onthemarket.com/to-rent/property/london/?page={index}&view=grid'
    otm_rent_data = otm_scraper.transform_run(rent_url)
    time.sleep(random.randint(1, 3))  
otm_data=pd.concat([otm_sales_data,otm_rent_data])

In [10]:
otm_data.head(2)

Unnamed: 0,title,address,description,listing_url,price,agent,bedrooms,listing_source,Transaction_type,property_type,Postcode District
0,2 bedroom apartment,"Lensbury Avenue, Fulham, SW6",,https://www.onthemarket.com/details/13187474/,1000000,Benham & Reeves - Nine Elms,2,OTM,sale,apartment,SW6
1,2 bedroom terraced house,"Tramway Avenue, London N9",,https://www.onthemarket.com/details/13229075/,375000,Davis Estate Agents - London,2,OTM,sale,terraced house,N9


In [11]:
zoopla_data.head(2)

Unnamed: 0,title,address,description,listing_url,price,agent,bedrooms,listing_source,Transaction_type,property_type,Postcode District
0,3 bed end terrace house,"Wills Crescent, Whitton, Hounslow TW3",A good looking three bedroom home which offers...,https://www.zoopla.co.uk/for-sale/details/6466...,549950,Milestone Residential,3,Zoopla,sale,end terrace house,TW3
1,3 bed flat,"Earlham Grove, London E7",Ready to move in - part buy & pay rent on the ...,https://www.zoopla.co.uk/new-homes/details/646...,121250,Imagine Living,3,Zoopla,sale,flat,E7


In [9]:
# sales_data
data=pd.concat([rightmove_data,zoopla_data,otm_data])
# code to drop duplicates records using address, price, agent, bedrooms, property_type, transaction_type and post code distict column
data=da.drop_duplicates(subset=['address','price','agent','bedrooms','property_type','Transaction_type','Postcode District'],keep='first')

data

Unnamed: 0,title,address,description,listing_url,price,agent,bedrooms,listing_source,Transaction_type,property_type,Postcode District
0,2 bedroom apartment for sale,"Priory Close, London",Offered to the market is this spacious two bed...,https://www.rightmove.co.uk/properties/1318195...,325000.0,"haart, Southgate",2,Rightmove,sale,apartment,
1,5 bedroom apartment for sale,"One Hyde Park, Knightsbridge",An exceptional exclusive five bedroom apartmen...,https://www.rightmove.co.uk/properties/1301776...,60000000.0,"The Cloister, London",5,Rightmove,sale,apartment,
2,12 bedroom house for sale,"Mayfair Freehold House, Park Lane Area, W1K",This stunning 12 bedroom (including 3 staff be...,https://www.rightmove.co.uk/properties/1303065...,45000000.0,"Luxury Living Homes International, London",12,Rightmove,sale,house,W1K
3,7 bedroom house for sale,"Lygon Place, Belgravia, SW1W",Ref. LOB0798 - Set behind a gated Belgravia dr...,https://www.rightmove.co.uk/properties/1293029...,45000000.0,"Beauchamp Estates Ltd, Mayfair - Resale",7,Rightmove,sale,house,SW1W
4,6 bedroom house for sale,"Pitt Street, Kensington, W8",Ref. LOB0625 - This striking and unique house ...,https://www.rightmove.co.uk/properties/1318567...,44000000.0,"Beauchamp Estates Ltd, Mayfair - Resale",6,Rightmove,sale,house,W8
...,...,...,...,...,...,...,...,...,...,...,...
83,1 bedroom apartment to rent,"Haydn Tower, 50 Wandsworth Road, SW8",,https://www.onthemarket.com/details/4305420/,3246,CBRE - Battersea and Nine Elms Lettings,1,OTM,rent,apartment to rent,SW8
84,2 bedroom apartment to rent,"Hampton Tower, South Quay Plaza, London, E14",,https://www.onthemarket.com/details/12066733/,3700,London Property Guru - Hammersmith,2,OTM,rent,apartment to rent,E14
85,2 bedroom apartment to rent,Battersea Power Station London SW11,,https://www.onthemarket.com/details/13229224/,3683,JLL - Nine Elms,2,OTM,rent,apartment to rent,SW11
86,1 bedroom flat to rent,"Chapter Road, London, NW2",,https://www.onthemarket.com/details/13229218/,1400,Homeview Estates - Kilburn,1,OTM,rent,flat to rent,NW2
