## Import libraries

In [9]:
import time
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import pandas as pd
from datetime import datetime
import math 

from sql_functions import *    

import psycopg2    

# Targeted neighbourhoods and the Rightmove location ID that belongs to each.
# The two lists are index-aligned: rightmove_loc_id[i] is the ID of neighbourhoods[i].
# NOTE(review): 'Tower of Hamlets' looks like a misspelling of the borough
# "Tower Hamlets"; it is left untouched here because the value ends up in the output data.
neighbourhoods = [
    'Hammersmith and Fulham',
    'Kensington and Chelsea',
    'Camden',
    'City of Westminster',
    'City of London',
    'Hackney',
    'Lambeth',
    'Tower of Hamlets',
    'Islington',
]
rightmove_loc_id = [
    '5E61407',  # Hammersmith and Fulham
    '5E61229',  # Kensington and Chelsea
    '5E93941',  # Camden
    '5E61233',  # City of Westminster
    '5E61224',  # City of London
    '5E93953',  # Hackney
    '5E93971',  # Lambeth
    '5E61417',  # Tower of Hamlets
    '5E93965',  # Islington
]

# lookup table: location ID -> neighbourhood name
zip_iterator = zip(rightmove_loc_id, neighbourhoods)
locations_dict = dict(zip_iterator)

# furnishing filters that are scraped one after another
furnishTypes_lst = ['furnished', 'unfurnished', 'partFurnished']   # partFurnished gives back more than delta

#---------------------------------------------

# get internal IDs
def get_ids_rm(bs):
    """Collect the ``id`` attribute of every search-result element on a page.

    Args:
        bs: Parsed page; only its ``find_all(class_=...)`` method is used
            (the caller in this file passes a BeautifulSoup object).

    Returns:
        list: One string per element with class ``'l-searchResult is-list'``,
        in the order ``find_all`` returns them. Every value goes through
        ``str()``, so an element whose ``.get('id')`` is ``None`` contributes
        the string ``'None'``.
    """
    result_cards = bs.find_all(class_='l-searchResult is-list')
    return [str(card.get('id')) for card in result_cards]
# ids_lst


# function for prices monthly and weekly(=sec_prices)
def get_prices_rm(bs):
    """Extract the cleaned monthly price text of every listing on a page.

    Args:
        bs: Parsed page; only its ``find_all(class_=...)`` method is used.

    Returns:
        list: The text of every ``"propertyCard-priceValue"`` element with a
        trailing ``' pcm'``, the pound sign and thousands separators removed.
        Elements with empty text are skipped, so the list can be shorter than
        the number of matched elements.
    """
    cleaned_prices = []
    for price_tag in bs.find_all(class_="propertyCard-priceValue"):
        raw_text = price_tag.get_text()
        if not raw_text:
            # skip elements that carry no price text at all
            continue
        without_unit = raw_text.removesuffix(' pcm')
        cleaned_prices.append(without_unit.replace('£', '').replace(',', ''))

    # Values deliberately stay strings: a missing or non-numeric price
    # (e.g. "POA") would make a float conversion fail.
    return cleaned_prices


def get_sec_prices_rm(bs):
    """Extract the cleaned weekly (secondary) price text of every listing.

    Args:
        bs: Parsed page; only its ``find_all(class_=...)`` method is used.

    Returns:
        list: The text of every ``"propertyCard-secondaryPriceValue"`` element
        with a trailing ``' pw'``, the pound sign and thousands separators
        removed. Empty texts are dropped; the values are kept as strings.
    """
    raw_texts = (
        tag.get_text()
        for tag in bs.find_all(class_="propertyCard-secondaryPriceValue")
    )
    return [
        text.removesuffix(' pw').replace('£', '').replace(',', '')
        for text in raw_texts
        if text  # drop empty strings
    ]

#---------------------------------------------

# step between the `index` values of consecutive result pages
# (original note: "don't know why, but have to use 24 instead of 25 (results per page)")
_RESULTS_STEP_RM = 24

# seconds to wait for the server before a request is aborted
_REQUEST_TIMEOUT_RM = 30


def _fetch_soup_rm(loc_id, index, furniture):
    """Download one Rightmove search-result page and return it parsed."""
    url = "https://www.rightmove.co.uk/property-to-rent/find.html?locationIdentifier=REGION%{}&index={}&propertyTypes=&includeLetAgreed=false&mustHave=&dontShow=houseShare%2Cretirement%2Cstudent&furnishTypes={}&keywords=".format(loc_id, index, furniture)
    # without a timeout a stalled connection would block the whole scrape
    page = requests.get(url, timeout=_REQUEST_TIMEOUT_RM)
    return BeautifulSoup(page.content, 'html.parser')


def page_results(loc_id, furniture):
    """Scrape all result pages of one Rightmove search into a data frame.

    Args:
        loc_id: Rightmove location ID; it is inserted after ``REGION%`` in
            the search URL and used as key into ``locations_dict``.
        furniture: Value of the ``furnishTypes`` URL parameter; it is also
            stored unchanged in the ``furniture`` column.

    Returns:
        pandas.DataFrame: One row per listing with the columns
        ``platform_id``, ``platform``, ``neighbourhood``, ``prices_pcm`` and
        ``furniture``. A frame without columns is returned when the search
        reports zero results.

    Raises:
        ValueError: If the first page has no element with class
            ``searchHeader-resultCount`` (the number of pages cannot be
            derived), or if its text is not an integer.
        KeyError: If ``loc_id`` is not a key of ``locations_dict`` and at
            least one result page is processed.
    """
    # the first page is needed for the result count and is reused below as page 0
    first_soup = _fetch_soup_rm(loc_id, 0, furniture)

    count_tag = first_soup.find(class_="searchHeader-resultCount")
    if count_tag is None:
        raise ValueError(
            "No result count found on the Rightmove page for location "
            "{!r} / furnishing {!r}".format(loc_id, furniture)
        )
    num_results = int(count_tag.get_text().replace(',', ''))
    num_pages = math.ceil(num_results / _RESULTS_STEP_RM)

    page_frames = []
    for page_no in range(num_pages):
        # value of the `index` URL parameter for this result page
        page_index = page_no * _RESULTS_STEP_RM
        if page_index == 0:
            bs = first_soup
        else:
            bs = _fetch_soup_rm(loc_id, page_index, furniture)

        # Column name -> values for this page. Scalars are repeated for every
        # listing of the page. Further columns (property type, bedrooms,
        # bathrooms, title, available-from date, size) are not scraped yet.
        # NOTE(review): the ID and price lists are expected to have the same
        # length; otherwise the DataFrame constructor should fail.
        rightmove_dict = {
            'platform_id': get_ids_rm(bs),
            'platform': 'Rightmove',
            'neighbourhood': locations_dict[loc_id],
            'prices_pcm': get_prices_rm(bs),
            'furniture': furniture,
        }
        page_frames.append(pd.DataFrame(rightmove_dict))

    if not page_frames:
        # pd.concat rejects an empty list; keep returning an empty frame instead
        return pd.DataFrame()

    # a single concat at the end instead of re-copying the growing frame per page
    return pd.concat(page_frames, ignore_index=True)

# see above: furnishTypes_lst = ['furnished', 'unfurnished', 'partFurnished']
# see above: rightmove_loc_id = ['5E61407', '5E61229', '5E93941', '5E61233', '5E61224', '5E93953', '5E93971', '5E61417', '5E93965']

#------------------------------------------------------------
#------------------------------------------------------------

# Collect the listings of every location / furnishing combination in one data frame.
# Start from an empty data frame ...
df_complete_rm = pd.DataFrame()

# ... and append the result of each search as soon as it has been scraped
for loc_id in rightmove_loc_id:
    for furniture in furnishTypes_lst:
        df_complete_rm = pd.concat([df_complete_rm, page_results(loc_id, furniture)], ignore_index=True)

# append column with today's date (formatted as YYYY-MM-DD; same value in every row)
df_complete_rm['scraping_date'] = datetime.today().strftime('%Y-%m-%d')



In [None]:
# 
# Strip the 'property-' prefix from the platform IDs. A pandas Series has no
# removeprefix() method of its own; the element-wise string methods live on
# the .str accessor.
df_complete_rm['platform_id'] = df_complete_rm['platform_id'].str.removeprefix('property-')#.astype(int)

In [10]:
# show the combined scraping result as the cell output
df_complete_rm

Unnamed: 0,platform_id,platform,neighbourhood,prices_pcm,furniture,scraping_date
0,property-127236689,Rightmove,Hammersmith and Fulham,3683,furnished,2022-09-23
1,property-127366049,Rightmove,Hammersmith and Fulham,3750,furnished,2022-09-23
2,property-127365830,Rightmove,Hammersmith and Fulham,2600,furnished,2022-09-23
3,property-126019895,Rightmove,Hammersmith and Fulham,1350,furnished,2022-09-23
4,property-127361573,Rightmove,Hammersmith and Fulham,1750,furnished,2022-09-23
...,...,...,...,...,...,...
9720,property-124925873,Rightmove,Islington,2817,partFurnished,2022-09-23
9721,property-84868017,Rightmove,Islington,5750,partFurnished,2022-09-23
9722,property-79835193,Rightmove,Islington,3792,partFurnished,2022-09-23
9723,property-65417791,Rightmove,Islington,1582,partFurnished,2022-09-23


In [None]:
Idea: load everything into the DB, then search for duplicates and delete them:

--> look for the most frequent entries and, for those, check the furniture entries in particular

In [11]:
# write the scraped data frame to the SQL database

# schema created for this project
schema = 'capstone_jmrs'
# database engine; get_engine is not defined in this notebook
# (presumably it comes from the sql_functions star import — TODO confirm)
engine = get_engine()

# give the table a unique name
table_name = 'rightmove_1'

# export the data frame; an existing table with the same name is replaced
if engine is not None:
    try:
        df_complete_rm.to_sql(
            name=table_name,
            con=engine,
            if_exists='replace',
            schema=schema,
            index=False,
            chunksize=5000,
            method='multi',
        )
        print(f"The {table_name} table was imported successfully.")

    except Exception as error:
        # Database errors derive from Exception as well, so one clause covers them.
        # The error is only reported; the engine is dropped so it is not reused.
        print(error)
        engine = None

The rightmove_1 table was imported successfully.


## end of working script

## new paragraph

In [None]:
# # Static URL:
# # Neighbourhood 1 - Hammersmith and Fulham; furnished
# page = requests.get("https://www.rightmove.co.uk/property-to-rent/find.html?locationIdentifier=REGION%5E61407&propertyTypes=&includeLetAgreed=false&mustHave=&dontShow=houseShare%2Cretirement%2Cstudent&furnishTypes=furnished&keywords=")
# html = page.content
# bs = BeautifulSoup(html, 'html.parser')


 ## NOT WORKING

In [None]:
# NOTE(review): experimental cell (it sits under the "NOT WORKING" heading).
# It uses the same selector and attribute as get_ids_rm, so it collects
# listing IDs rather than bedroom counts — a bedroom-specific selector is
# still missing.
beds = bs.find_all(class_ = 'l-searchResult is-list')
beds_lst = [bed.get('id') for bed in beds]
# _lst = [(str(x)) for x in ids_lst]

In [None]:
# experiment: read the 'aria-hidden' attribute of every 'property-information' element
beds = bs.find_all(class_='property-information')
beds_lst = [info_tag.get('aria-hidden') for info_tag in beds]
# _lst = [(str(x)) for x in ids_lst]
# show the collected values as the cell output
beds_lst

In [None]:
# NOTE(review): `df` is not defined in any visible cell, and the file name says
# "spotahome" although this notebook scrapes Rightmove — confirm both before running.
# `datetime` is the class imported at the top of the notebook; no `dt` alias is imported there.
today = datetime.today().strftime('%Y-%m-%d %H:%M') # to set the date in the csv filename
df.to_csv('spotahome_{}.csv'.format(today), sep='\t')