## Import libraries

In [None]:
import time
import random

import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import pandas as pd
from datetime import datetime
import math 

from sql_functions import *    

import psycopg2    

# Targeted London neighbourhoods and the Rightmove location identifier of each.
# The two lists are positional: the n-th identifier belongs to the n-th name.
# NOTE(review): 'Tower of Hamlets' looks like a misspelling of 'Tower Hamlets';
# kept as-is because the value is stored in the database - confirm before renaming.
neighbourhoods = [
    'Hammersmith and Fulham',
    'Kensington and Chelsea',
    'Camden',
    'City of Westminster',
    'City of London',
    'Hackney',
    'Lambeth',
    'Tower of Hamlets',
    'Islington',
]
rightmove_loc_id = [
    '5E61407',
    '5E61229',
    '5E93941',
    '5E61233',
    '5E61224',
    '5E93953',
    '5E93971',
    '5E61417',
    '5E93965',
]

# Lookup table: location identifier -> neighbourhood name,
# e.g. '5E61407' -> 'Hammersmith and Fulham', ..., '5E93965' -> 'Islington'.
zip_iterator = zip(rightmove_loc_id, neighbourhoods)
locations_dict = {loc_id: name for loc_id, name in zip_iterator}

################furnishTypes_lst = ['furnished', 'unfurnished', 'partFurnished']   # partFurnished gives back more than delta

#---------------------------------------------

# get internal IDs
def get_ids_rm(bs):
    """Return the ``id`` attribute of every search-result card as a string.

    Args:
        bs: Parsed search-result page (object offering ``find_all``).

    Returns:
        One string per element with class 'l-searchResult is-list'. Every
        value is passed through ``str``, so a card without an ``id``
        attribute shows up as the text 'None'.
    """
    result_cards = bs.find_all(class_ = 'l-searchResult is-list')
    return [str(card.get('id')) for card in result_cards]
# ids_lst

#------------------------- new new
#-------------------------

# get property type
def get_property_type_rm(bs):
    """Return the property type of every listing on a search-result page.

    The type is taken to be the last whitespace-separated word of each
    ``h2.propertyCard-title`` heading.

    Args:
        bs: Parsed search-result page (object offering ``find_all``).

    Returns:
        One string per title heading. A heading without any text yields ''
        instead of raising IndexError, so the list keeps one entry per
        heading and stays aligned with the other per-listing columns.
    """
    types_lst = []
    # find_all is the supported spelling; the camelCase findAll is a deprecated alias
    for title in bs.find_all('h2', {'class': 'propertyCard-title'}):
        words = title.text.split()
        types_lst.append(words[-1] if words else '')
    return types_lst


# get number of bedrooms
def get_bedrooms_rm(bs):
    """Return the bedroom information of every listing on a search-result page.

    For each ``h2.propertyCard-title`` heading the text in front of the first
    occurrence of 'bedroom' is returned, stripped of surrounding whitespace.
    A title that does not contain 'bedroom' is returned in full (stripped).

    Args:
        bs: Parsed search-result page (object offering ``find_all``).

    Returns:
        One string per title heading; values are not converted to numbers.
    """
    # find_all is the supported spelling; the camelCase findAll is a deprecated alias
    titles = bs.find_all('h2', {'class': 'propertyCard-title'})
    return [title.text.split('bedroom')[0].strip() for title in titles]

# get number of bathrooms
def get_bathrooms_rm(bs):
    """Return the bathroom information found in the feature blocks of a page.

    Args:
        bs: Parsed search-result page (object offering ``find_all``).

    Returns:
        For every non-empty element with class 'propertyCard-features', the
        text in front of the first 'bathroom', stripped of whitespace.
        Elements with empty text are left out, so the result can be shorter
        than the number of listings.
    """
    bathrooms_lst = []
    for feature_block in bs.find_all(class_= "propertyCard-features"):
        feature_text = feature_block.get_text()
        if not feature_text:
            # empty strings are dropped before formatting
            continue
        bathrooms_lst.append(feature_text.split('bathroom')[0].strip())
    return bathrooms_lst


# function for monthly prices
def get_prices_rm(bs):
    """Return the cleaned monthly price text of every listing on a page.

    Args:
        bs: Parsed search-result page (object offering ``find_all``).

    Returns:
        For every non-empty element with class 'propertyCard-priceValue', its
        text without the ' pcm' suffix, the pound sign and thousands
        separators. Values stay strings on purpose: a price can be missing or
        non-numeric (e.g. "POA"), so no float conversion is attempted here.
    """
    prices_lst = []
    for price_tag in bs.find_all(class_= "propertyCard-priceValue"):
        raw_price = price_tag.get_text()
        if raw_price:
            cleaned = raw_price.removesuffix(' pcm')
            cleaned = cleaned.replace('£', '')
            cleaned = cleaned.replace(',', '')
            prices_lst.append(cleaned)
    return prices_lst


# def get_sec_prices_rm(bs):
#     sec_prices = bs.find_all(class_= "propertyCard-secondaryPriceValue")
#     sec_prices_lst = [sec_price.get_text() for sec_price in sec_prices]
#     sec_prices_lst = list(filter(None, sec_prices_lst)) #delete any empty strings from list
#     #format sec_prices
#     sec_prices_lst = [sec_price.removesuffix(' pw').replace('£', '').replace(',', '') for sec_price in sec_prices_lst]
#     #sec_prices_lst = [float(sec_price) for sec_price in sec_prices_lst]
#     return(sec_prices_lst)

#---------------------------------------------

def _search_url_rm(loc_id, index, furniture):
    """Return the Rightmove rental search URL for one page of results."""
    return (
        "https://www.rightmove.co.uk/property-to-rent/find.html"
        "?locationIdentifier=REGION%{}&index={}&propertyTypes=&includeLetAgreed=false"
        "&mustHave=&dontShow=houseShare%2Cretirement%2Cstudent"
        "&furnishTypes={}&keywords="
    ).format(loc_id, index, furniture)


def _fetch_soup_rm(url):
    """Download ``url`` and return its content parsed with BeautifulSoup."""
    # a timeout keeps one stalled connection from blocking the whole scrape
    page = requests.get(url, timeout=30)
    return BeautifulSoup(page.content, 'html.parser')


def page_results(loc_id, furniture=''):
    """Scrape all search-result pages of one location into a dataframe.

    Args:
        loc_id: Rightmove location identifier; must be a key of
            ``locations_dict``, otherwise the neighbourhood lookup raises
            KeyError.
        furniture: Value for the ``furnishTypes`` query parameter. Defaults to
            '' (parameter sent empty) so the function can be called with the
            location only. The same value is now used for the result count
            and for every paged request, so both describe the same search.

    Returns:
        DataFrame with the columns platform_id, platform, neighbourhood,
        property_type, bedrooms and prices_pcm, one row per listing. An empty
        DataFrame is returned when the page shows no result counter.

    Raises:
        requests.RequestException: if a page cannot be downloaded in time.
        ValueError: if the per-listing lists of a page differ in length
            (raised by the DataFrame constructor).
    """
    # results are paged in steps of 24 (value kept from the original implementation)
    results_per_page = 24

    first_page = _fetch_soup_rm(_search_url_rm(loc_id, 0, furniture))

    # get number of search results
    count_tag = first_page.find(class_="searchHeader-resultCount")
    if count_tag is None:
        return pd.DataFrame()
    num_results = int(count_tag.get_text().replace(',', ''))
    num_pages = math.ceil(num_results / results_per_page)

    page_frames = []
    for page_number in range(num_pages):
        index = page_number * results_per_page
        # the first page is already loaded; only fetch the following ones
        if index == 0:
            bs = first_page
        else:
            bs = _fetch_soup_rm(_search_url_rm(loc_id, index, furniture))

        # Column name -> values extracted from this page; scalar values are
        # broadcast to every row by the DataFrame constructor.
        rightmove_dict = {
            'platform_id': get_ids_rm(bs),
            'platform': 'Rightmove',
            'neighbourhood': locations_dict[loc_id],
            'property_type': get_property_type_rm(bs),
            'bedrooms': get_bedrooms_rm(bs),
            'prices_pcm': get_prices_rm(bs),
        }
        page_frames.append(pd.DataFrame(rightmove_dict))

    if not page_frames:
        return pd.DataFrame()

    # one concat after the loop instead of re-copying the frame on every page
    return pd.concat(page_frames, ignore_index=True)

# see above: furnishTypes_lst = ['furnished', 'unfurnished', 'partFurnished']
# see above: rightmove_loc_id = ['5E61407', '5E61229', '5E93941', '5E61233', '5E61224', '5E93953', '5E93971', '5E61417', '5E93965']

#------------------------------------------------------------
#------------------------------------------------------------

# Scrape every targeted location and combine the per-location frames once.
# The furnish type is passed explicitly as '' (no filter) because
# page_results takes it as its second argument.
location_frames = [page_results(loc_id, '') for loc_id in rightmove_loc_id]
df_complete_rm = pd.concat(location_frames, ignore_index=True)

# append column with today's date
df_complete_rm['scraping_date'] = datetime.today().strftime('%Y-%m-%d')

#------------------------------------------------------------
#------------------------------------------------------------

# Drop IDs that are not unique
df_complete_rm.drop_duplicates(subset=['platform_id'], inplace=True)

# convert/clean platform_id: string operations on a Series go through the
# .str accessor (a Series itself has no removeprefix method)
df_complete_rm['platform_id'] = df_complete_rm['platform_id'].str.replace(
    r'^property-', '', regex=True)

In [11]:
# Write the scraped search results to the project database.

# schema created for this project
schema = 'capstone_jmrs'
# database connection; get_engine comes from the sql_functions star import
# (presumably it returns None when no connection can be set up - confirm)
engine = get_engine()

# give the table a unique name
table_name = 'rightmove_1'

# import the table to sql; an existing table of the same name is replaced
if engine is not None:
    try:
        df_complete_rm.to_sql(name=table_name,
                        con=engine,
                        if_exists='replace',
                        schema=schema,
                        index=False,
                        chunksize=5000,
                        method='multi')
    except Exception as error:
        # Exception already covers psycopg2.DatabaseError; report and drop the engine
        print(error)
        engine = None
    else:
        print(f"The {table_name} table was imported successfully.")

The rightmove_1 table was imported successfully.


# Detail Pages 
## Important: the correct order of importing from and exporting to the database still needs to be implemented!

In [None]:
# Read the platform_ids of the scraped listings back from the database.
# NOTE(review): this reads from rightmove_3, while the search results in this
# file are written to rightmove_1 - confirm which table is intended.

schema = 'capstone_jmrs'

sql = f"""
SELECT platform_id 
FROM {schema}.rightmove_3
"""

df = get_dataframe(sql)
ids = df['platform_id'].tolist()

# One dict per property; the dataframes are built once after the loop instead
# of concatenating inside it (which also required frames that were never
# initialised in this file).
details_rows = []
basics_rows = []

# trailing number of a detail entry, e.g. the '12' at the end of a bedrooms entry
trailing_number = re.compile(r'(\d+)\s*$')

#------------------------------------------------------------
### START OF LOOP ###
#------------------------------------------------------------
for platform_id in ids:
    # short random pause between requests
    time.sleep(random.randint(2, 5) / 10)

    # get content of detail website
    try:
        page = requests.get(
            f"https://www.rightmove.co.uk/properties/{platform_id}#/?channel=RES_LET",
            timeout=30)
    except requests.RequestException as error:
        # a single unreachable page should not abort the whole run
        print(f"Skipping property {platform_id}: {error}")
        continue
    html = page.content
    bs = BeautifulSoup(html, 'html.parser')
#------------------------------------------------------------
# Part A: details
    # get property details via explicit class
    details = bs.find(
        'div', class_='_4hBezflLdgDMdFtURKTWh')

    # skip property if no details are available (e.g. the listing was removed)
    if details is None:
        continue

    # default values are kept whenever a particular detail is not on the page
    row_dict = {'property_id': platform_id, 'bedrooms': 'NA', 'bathrooms': 'NA',
                'size': 'NA', 'property_type': 'NA'}

    # the number of detail entries is not fixed, so each entry is matched by keyword
    details_lst = [detail.get_text().strip() for detail in details]
    for detail in details_lst:
        if 'TYPE' in detail:
            row_dict['property_type'] = detail.removeprefix('PROPERTY TYPE')
        elif 'BEDROOMS' in detail:
            # take the whole trailing number, not just the last character,
            # so that counts of 10 or more are not truncated
            number = trailing_number.search(detail)
            if number:
                row_dict['bedrooms'] = number.group(1)
        elif 'BATHROOM' in detail:
            number = trailing_number.search(detail)
            if number:
                row_dict['bathrooms'] = number.group(1)
        elif 'SIZE' in detail:
            # the value inside the parentheses is used; presumably the
            # square-metre figure - confirm against a live page
            in_parentheses = re.search(r'\((.*?)\)', detail)
            if in_parentheses:
                row_dict['size'] = in_parentheses.group(1).removesuffix(' sq. m.').replace(',', '')

    details_rows.append(row_dict)
#------------------------------------------------------------
# Part B: basics
    # via another class: get other basic details, here called "basics"
    # (let availability, let type, furnish type, ...)
    basics = bs.find_all(
        'div', class_='_2RnXSVJcWbWv4IpBC1Sng6')

    # every entry reads like 'Label: value'; split it into dict key and value
    itemDict = {}
    for basic in basics:
        parts = basic.get_text().strip().split(': ')
        if not parts[0]:
            continue
        # an entry without ': ' has no value part; keep the default instead of failing
        itemDict[parts[0]] = parts[1] if len(parts) > 1 else 'NA'
    # add explicit id value to dict
    itemDict['platform_id'] = platform_id

    basics_rows.append(itemDict)
#------------------------------------------------------------
### END OF LOOP ###
#------------------------------------------------------------

# build the complete dataframes from the collected rows
df_details_complete = pd.DataFrame(details_rows)
df_basics_complete = pd.DataFrame(basics_rows)

# Part C:
# drop columns from basics_df we don't need; errors='ignore' because not every
# label is present on every page (e.g. 'Council Tax' can be missing)
df_basics_complete.drop(columns=['Deposit', 'Min. Tenancy', 'Council Tax'],
                        errors='ignore', inplace=True)

# pythonise column names by label instead of by position, so the result does
# not depend on the column order and platform_id is never renamed by accident
df_basics_complete.rename(columns={
    'Let available date': 'available_from',
    'Let type': 'let_type',
    'Furnish type': 'furnished',
}, inplace=True)

# concatenate both details and basics df side by side to new_df
# (both frames hold one row per scraped property, in the same order)
new_df = pd.concat([df_details_complete, df_basics_complete], axis=1)

# add date of scraping
new_df['scraping_date'] = datetime.today().strftime('%Y-%m-%d')

# drop duplicates
new_df.drop_duplicates(inplace=True)


# Write the scraped detail data to the project database.

# schema created for this project
schema = 'capstone_jmrs'
# database connection; get_engine comes from the sql_functions star import
# (presumably it returns None when no connection can be set up - confirm)
engine = get_engine()

# give the table a unique name
table_name = 'rightmove_details'

# import the table to sql; an existing table of the same name is replaced
if engine is not None:
    try:
        new_df.to_sql(name=table_name,
                        con=engine,
                        if_exists='replace',
                        schema=schema,
                        index=False,
                        chunksize=5000,
                        method='multi')
    except Exception as error:
        # Exception already covers psycopg2.DatabaseError; report and drop the engine
        print(error)
        engine = None
    else:
        print(f"The {table_name} table was imported successfully.")

# first run took approx 124minutes

In [7]:
# import time
# import random

# import requests
# from bs4 import BeautifulSoup
# import pandas as pd
# import re
# import pandas as pd
# from datetime import datetime
# import math 

# from sql_functions import *    

# import psycopg2    

# page = requests.get(f"https://www.rightmove.co.uk/properties/127325693#/?channel=RES_LET")
# html = page.content
# bs = BeautifulSoup(html, 'html.parser')


# Scratch cell: inspect the "basics" entries of the page currently held in bs.
basics = bs.find_all('div', class_='_2RnXSVJcWbWv4IpBC1Sng6')

# text of every entry, stripped and split at ': ' into [label, value, ...]
basics_lst = [entry.get_text().strip().split(': ') for entry in basics]

# label -> value mapping; shown as the cell output
itemDict = {label_and_value[0]: label_and_value[1] for label_and_value in basics_lst}
itemDict

{'Let available date': 'Ask agent',
 'Deposit': 'Ask agentA deposit provides security for a landlord against damage, or unpaid rent by a tenant.Read more about deposit in our glossary page.',
 'Min. Tenancy': 'Ask agentHow long the landlord offers to let the property for.Read more about tenancy length in our glossary page.',
 'Let type': 'Long term',
 'Furnish type': 'Furnished'}

## new paragraph