# Clean Notebook for Webscraping of Blueground

Clean means: You can start from top an run to bottom without an error

# We load the libraries

In [1]:
# import all the libraries

import time # to pause the code
import requests # to get the content of the website
from bs4 import BeautifulSoup # to parse the html
import re # to use regular expressions
import pandas as pd # to use pandas
import numpy as np # to use numpy

# we create empty dataframes

In [2]:
# create an empty dataframe
df_full = pd.DataFrame()
df_object = pd.DataFrame()
df_search = pd.DataFrame()

# we set up the link and stuff for the loop

Test: Links: 

https://www.theblueground.com/furnished-apartments-london-uk?currency=GBP&language=en&offset=10&items=18

In [3]:
# set up the link to the website

weblink = 'https://www.theblueground.com/furnished-apartments-london-uk?currency=GBP&language=en&'

# set up the first page to scrape
pagesite = 10 # we set it to 10 to test the code

# create an empty list to store the blank slates
blank_slates_lst = [] 

# set the stop condition
stop_loop = "We’re sorry! We can’t seem to find any apartments that match your search." 


## we need the following structure for the dataframe:
+ platform_id	
+ platform	
+ neighbourhood	
+ property_type	
+ bedrooms	
+ bathroom	
+ price_pcm	
+ title	
+ furnished	
+ available_from	
+ size

### List of Neighbourhoods from Blueground

bayswater
bermondsey
bromley-by-bow
camden-town
canary-wharf
charing-cross
chelsea
city-of-london
clerkenwell
covent-garden
croydon
earls-court
farringdon
fitzrovia
fulham
hackney
hammersmith
highgate
holborn
islington
kensington
kentish-town
kings-cross
knightsbridge
ladbroke-grove
limehouse
maida-vale
marylebone
mayfair
notting-hill
old-street
paddington
piccadilly
pimlico
queens-park
shoreditch
soho
south-bank
south-kensington
southwark
stockwell
vauxhall
walthamstow
wandsworth
wapping
waterloo
westminster
whitechapel-brick-lane

-----

# we load all the functions

platform_id --> from Detail-Page

platform --> we can add the Origin-Platform during the Loop

neighbourhood

In [4]:
def get_neighborhoods(bs):
    neighborhood_names = bs.find_all("div", {"class":"name-place"})
    neighborhood_names_lst = (neighborhood_name.get_text() for neighborhood_name in neighborhood_names)
    neighborhood_names_lst = [neighborhood_name.strip() for neighborhood_name in neighborhood_names_lst]
    neighborhood_names_lst = [i.rsplit(',', 1)[-1] for i in neighborhood_names_lst]
    return neighborhood_names_lst

property_type 
on Blueground their is only "Studio" or "Apartment"

> later we have to change the Type  "Jr. Bedroom" to "Studio"

In [5]:
def get_property_type(bs):
    # get the property_type out of the apartment amenities
    lst_name = []
    property_typs = bs.find_all(class_="main-amenities")
    property_typs_lst = [property_type.get_text() for property_type in property_typs]
    property_typs_lst = [property_type.strip() for property_type in property_typs_lst]
    property_typs_lst = [i.split('o', 1)[0] for i in property_typs_lst]
    property_typs_lst = [i.replace('Bedr', 'Bedroom') for i in property_typs_lst]
    property_typs_lst = [i.replace('Studi', 'Studio') for i in property_typs_lst]
    property_typs_lst = [i.rsplit(' ', 1)[-1] for i in property_typs_lst]
    #property_typs_lst = [i.replace('Bedroom', 'Apartment') for i in property_typs_lst]
    return property_typs_lst

bedrooms

In [6]:
def get_bedrooms(bs):
    # get the main-amenities of the apartments
    lst_name = []
    bedrooms = bs.find_all(class_="main-amenities")
    bedrooms_lst = [bedroom.get_text() for bedroom in bedrooms]
    bedrooms_lst = [bedroom.strip() for bedroom in bedrooms_lst]
    bedrooms_lst = [i.split('o', 1)[0] for i in bedrooms_lst]
    bedrooms_lst = [i.replace('Bedr', 'Bedroom') for i in bedrooms_lst]
    bedrooms_lst = [i.replace('Studi', 'Studio') for i in bedrooms_lst]
    return bedrooms_lst

bathroom

In [7]:
def get_bathroom(bs):
    # get the main-amenities of the apartments
    lst_name = []
    bathrooms= bs.find_all(class_="main-amenities")
    bathrooms_lst = [bathroom.get_text() for bathroom in bathrooms]
    bathrooms_lst = [bathroom.strip() for bathroom in bathrooms_lst]
    bathrooms_lst = [i.rsplit('o', 1)[-1] for i in bathrooms_lst]
    bathrooms_lst = [i.replace('m', '') for i in bathrooms_lst]

    return bathrooms_lst

price_pcm (price per month)

In [8]:
def get_price_pcm(bs):
    # get the price per month
    lst_name = []
    prices = bs.find_all(class_= "price__amount")
    for price in prices:
        lst_name.append(
            price.get_text()
                .strip()
        )
    return lst_name

 title

In [9]:
def get_object_title(bs):
    # get the names of all the apartments
    lst_name = []
    object_titles = bs.find_all(class_="listing-name")
    for object_title in object_titles:
        lst_name.append(
            object_title.get_text()
                .strip()
        )
    return lst_name


furnished --> on Blueground we have only furnished Studios/Apartments

available_from

In [10]:
def get_availability(bs):
    # get the availability of the apartments
    lst_availability = []
    availability = bs.find_all(class_="availability__date")
    for avail in availability:
        lst_availability.append(
            avail.get_text()
                .strip()
        )
    return lst_availability


-----

# We need this for extra work

url

In [11]:
def get_url_to_detail_page(bs, maximus):
    url_lst = []
    count = 0
    while count <= int(maximus):
        
        # this will get us the link to the detail page
        class_with_link = bs.find_all(class_="ui-image-carousel")
        # with the [] we can select the elment we want to get
        for a in class_with_link[count].find_all('a', href=True):
            url_lst.append(a['href'])
        count += 1
    #print(url_lst)
    return url_lst

----

# the Loop to grab everything

In [12]:
# https://flexiple.com/python/check-if-list-is-empty-python/
# Solution 3: Using len() function
# The len() function returns the number of items in a list. If the list is empty, it returns 0.
while len(blank_slates_lst) == 0: # start and endpoint of the for-loop
    # pause the loop for 3 seconds to reduce the load on the server
    time.sleep(3)


    # get the content of the website
    page = requests.get(weblink +  f'offset={ pagesite }&items=18')
    # parse the html and save it into a BeautifulSoup instance
    html = page.content
    bs = BeautifulSoup(html, 'html.parser')

    # create a pandas dataframe for the names and prices
    blueground_dict = {
        #'platform_id', --> we get this from the detail page
        'platform': 'blueground',
        'neighbourhood': get_neighborhoods(bs),
        'property_type': get_property_type(bs),
        'bedrooms': get_bedrooms(bs),
        'bathroom': get_bathroom(bs),
        'price_pcm': get_price_pcm(bs),
        'title': get_object_title(bs),
        'furnished': 'furnished',	
        'available_from': get_availability(bs),
        #'size': , --> we get this from the detail page
        'title': get_object_title(bs)
        }

    # we now have a dataframe, we can use this to get a counter for the URL
    df_page = pd.DataFrame(blueground_dict)

    # we create a variable to store the number of rows in the dataframe
    maximus = df_page.index.max() # we give it the max value of the index


    if np.isnan(maximus):
        break
    else:
        df_page['get_url_to_detail_page'] = pd.Series(get_url_to_detail_page(bs, maximus))
    # we can now add the dataframe to the full dataframe
    df_search = df_search.append(df_page, ignore_index=True)
    
    # check if we reached the end of the pages
    blank_slates = bs.find_all(class_="blank-slate__criteria")
    blank_slates_lst = (blank_slate.get_text() for blank_slate in blank_slates)
    blank_slates_lst = [blank_slate.strip() for blank_slate in blank_slates_lst]
    # print the list to make sure it works
    #print(blank_slates_lst)

    # increase the pagesite by 1
    pagesite += 1

# set the number of rows and column_width  to maximum
pd.set_option('display.max_rows', None) 
pd.set_option('display.max_colwidth', None)
display(df_search);

  df_search = df_search.append(df_page, ignore_index=True)
  df_search = df_search.append(df_page, ignore_index=True)
  df_search = df_search.append(df_page, ignore_index=True)
  df_search = df_search.append(df_page, ignore_index=True)


Unnamed: 0,platform,neighbourhood,property_type,bedrooms,bathroom,price_pcm,title,furnished,available_from,get_url_to_detail_page
0,blueground,Canary Wharf,Bedroom,2 Bedroom,2 Bath,5220,"Harbour Wy.,",furnished,22 May 2023,/furnished-apartments-london-uk/london-canary-wharf-165
1,blueground,Soho,Bedroom,1 Bedroom,1 Bath,3980,"Bateman St,",furnished,30 May 2023,/furnished-apartments-london-uk/london-soho-088
2,blueground,Mayfair,Studio,Studio,1 Bath,3500,"Green St,",furnished,01 Jun 2023,/furnished-apartments-london-uk/london-mayfair-094
3,blueground,Fitzrovia,Bedroom,2 Bedroom,1.5 Bath,4430,"Tottenham Court Rd,",furnished,02 Jun 2023,/furnished-apartments-london-uk/london-fitzrovia-063
4,blueground,Old Street,Bedroom,1 Bedroom,1 Bath,4500,"City Rd,",furnished,02 Jun 2023,/furnished-apartments-london-uk/london-old-street-125
5,blueground,Canary Wharf,Bedroom,2 Bedroom,2 Bath,4630,"Baltimore Wharf,",furnished,03 Jun 2023,/furnished-apartments-london-uk/london-canary-wharf-133
6,blueground,Whitechapel/Brick Lane,Bedroom,1 Bedroom,1 Bath,3180,"Dock St,",furnished,05 Jun 2023,/furnished-apartments-london-uk/london-whitechapel-brick-lane-182
7,blueground,Piccadilly,Bedroom,Jr. 1 Bedroom,1 Bath,4070,"Haymarket,",furnished,06 Jun 2023,/furnished-apartments-london-uk/london-piccadilly-036
8,blueground,Canary Wharf,Bedroom,1 Bedroom,1 Bath,3590,"Marsh Wall, S Quay Square,",furnished,08 Jun 2023,/furnished-apartments-london-uk/london-canary-wharf-139
9,blueground,Vauxhall,Bedroom,1 Bedroom,1 Bath,3260,"St George Wharf,",furnished,11 Jun 2023,/furnished-apartments-london-uk/london-vauxhall-130
