In [3]:
# import all libraries that are needed

import requests                 # requests on websites
from bs4 import BeautifulSoup   # html parsing

import pandas as pd             # pandas for dataframe

import math                     # for math methods
import time                     # for sleep timer

from sql_functions import *     # functions from file for upload on schema

import psycopg2                 # for upload on engine

import datetime as dt           # for the csv file with the current date and time

# libraries that are not needed right now
#import re
#import json
#import numpy as np
#from zipfile import *


# declaration of functions to get specific information from the website

# Creating a function to get all the descriptions
def get_description(bs):
    # find all the descriptions and save them to an empty list
    lst_name = []
    descriptions = bs.find_all(
        class_='homecard-content__title__HomecardContent___OmV4c homecard-content__title--rebranding-style__HomecardContent___OmV4c')
    # iterate over the descriptions to get the text and strip the strings and save them in a list
    for description in descriptions:
        lst_name.append(
            description.get_text()
                .strip()
        )
    return lst_name


# Creating a function to get all the housing types
def get_housing(bs):
    # find all the housing types and save them to an empty list
    lst_name = []
    housings = bs.find_all(
        class_='homecard-content__type__HomecardContent___OmV4c homecard-content__type--rebranding-style__HomecardContent___OmV4c')
    # iterate over the housing types to get the text and strip the strings and save them in a list
    for housing in housings:
        lst_name.append(
            housing.get_text()
                .strip()
        )
    return lst_name


# Creating a function to get all the available dates
def get_available(bs):
    # find all the available dates and save them to an empty list
    lst_name = []
    availables = bs.find_all(
        class_='homecard-content__available-from__HomecardContent___OmV4c homecard-content__available-from--rebranding-style__HomecardContent___OmV4c')
    # iterate over the available appartements to get the text and strip the string and save them in a list
    for available in availables:
        lst_name.append(
            available.get_text()
                .strip()
        )
    return lst_name


# Creating a function to get all the prices
def get_price(bs):
    # find all the prices and save them to an empty list
    lst_name = []
    prices = bs.find_all(class_='price__Price___OmV4c')
    # iterate over the prices to get the text and strip the strings and save them in a list
    for price in prices:
        lst_name.append(
            price.get_text()
                .strip()
                .replace('£', '')
        )
    return lst_name


# Creating a function to get all the prices per period
def get_prices_period(bs):
    # find all the aprices per period and save them to an empty list
    lst_name = []
    prices_period = bs.find_all(
        class_='price-monthly__Price___OmV4c price-monthly--rebranding-style__Price___OmV4c')
    # iterate over the prices per period to get the text and strip the string and save them in a list
    for price_period in prices_period:
        lst_name.append(
            price_period.get_text()
                .strip()
                .replace('/', '')
        )
    return lst_name


# Creating a function to get all the ID's
def get_ids(bs):
    # find all the prices and save them to an empty list
    lst_name = []
    ids = bs.find_all(class_ = 'l-list__item')
    # iterate over the prices to get the text and strip the strings and save them in a list
    for id in ids:
        lst_name.append(
            id.get('data-homecard-scroll')
                .strip()
        )
    return lst_name

    

# Create dictionary in which every location ID gets assigned a location name
location_dict = {219: 'lambeth',
                 231: 'hammersmith_and_fulham',
                 232: 'kensington_and_chelsea',
                 233: 'city_of_westminster',
                 234: 'camden',
                 235: 'tower_of_hamlets',
                 236: 'islington',
                 237: 'hackney',
                 241: 'city_of_London'
                 }


# Creating a function to get the search result from all pages
# the website spotahome shows 60 search results per page. To iterate trough all the pages, we get the information how many search results are there, then divide it by 60 and round it up to get the number of pages.
def page_results(property_type, location):
    # get the url from the website with the property type and the location as a variable to iterate trough it
    page = requests.get(
        'https://www.spotahome.com/s/london--uk/for-rent:{}?areaId[]={}'.format(property_type, location))
    html = page.content
    bs = BeautifulSoup(html, 'html.parser')

    # Extracting the total number of search results
    results = bs.find_all('h1', {'class': 'search-title__title'})

    # define the variable result_text in case the first search gives us no results
    result_text = 0

    # iterate over the results to get the text and strip the string
    for result in results:
        result_text = result.find("strong").get_text().strip()

    # convert the extracted string to an integer to perform mathematical operations
    result_converted = int(result_text)

    # divide the converted result by 60 since one pages shows 60 results and round it up to get the number of pages
    page_site = result_converted / 60
    page_site = math.ceil(page_site)

    # convert the number of pages from a float to an integer to iterate trough the pages
    page_converted = int(page_site)

    # create an empty data frame to store the results from every loop cycle
    df_search = pd.DataFrame()

    # split the url to get access to the part where the page is definde
    begin = 'https://www.spotahome.com/s/london--uk/for-rent:{}'.format(
        property_type)
    end = '?areaId[]={}'.format(location)

    # range is including in the beginning and excluding in the end so we add plus 1 to iterate through all calculated pages
    page_converted = page_converted + 1

    # for loop to get the page numbers
    for page_number in range(page_converted):
        # sleep timer to reduce the traffic for the server
        # time.sleep(0.5)

        # get the url from the website with the property type, the location and the page number as a variable to iterate trough it
        page = requests.get(begin+f'/page:{page_number}'+end)
        html = page.content
        bs = BeautifulSoup(html, 'html.parser')

        # Create a dictionary to store the results from every loop cycle.
        # The keys are the column names and the values are the functions we created before.
        # The functions are called with the beautiful soup object as a parameter.
        spotahome_dict = {
            'id': get_ids(bs),
            'description': get_description(bs),
            'housing_type': get_housing(bs),
            'property_type': property_type,
            'location': location_dict[location],
            'available': get_available(bs),
            'prices': get_price(bs),
            'prices_per': get_prices_period(bs)
        }
        # the ditionary is stored in a dataframe
        df_page = pd.DataFrame(data=spotahome_dict)

        # the temporary data frame gets appended to the data frame we created earlier outside the for loop
        # for every iteration, the data frame page stores the results in the data frame search
        df_search = df_search.append(df_page)
    # the data frame search gets returned to the for loop to access it outside the function
    return (df_search)



# creating a list with different property types given from the website
property_types = ['studios', 'apartments/bedrooms:1',
                  'apartments/bedrooms:2', 'apartments/bedrooms:3', 'apartments/bedrooms:3more']
# creating a list with the different location IDs given from the website
locations = [219, 231, 232, 233, 234, 235, 236, 237, 241]

# creating an empty data frame
df_complete = pd.DataFrame()
# for loop to get the different property types
for property_type in property_types:
    # for loop to get the different locations
    for location in locations:
        # append the result from data frame search by calling the function page_results with the property type and the location as a parameter to data frame complete
        df_complete = df_complete.append(page_results(property_type, location))

  df_search = df_search.append(df_page)
  df_complete = df_complete.append(page_results(property_type, location))
  df_search = df_search.append(df_page)
  df_complete = df_complete.append(page_results(property_type, location))
  df_search = df_search.append(df_page)
  df_complete = df_complete.append(page_results(property_type, location))
  df_search = df_search.append(df_page)
  df_complete = df_complete.append(page_results(property_type, location))
  df_search = df_search.append(df_page)
  df_complete = df_complete.append(page_results(property_type, location))
  df_search = df_search.append(df_page)
  df_complete = df_complete.append(page_results(property_type, location))
  df_search = df_search.append(df_page)
  df_complete = df_complete.append(page_results(property_type, location))
  df_search = df_search.append(df_page)
  df_complete = df_complete.append(page_results(property_type, location))
  df_search = df_search.append(df_page)
  df_complete = df_complete.append(page_results(

In [4]:
# display the data frame
display(df_complete)

Unnamed: 0,id,description,housing_type,property_type,location,available,prices,prices_per
0,298871,"Studio Apartment for rent in Kensal Green, London",Studio,studios,hammersmith_and_fulham,From 15 October,1452,month
1,654298,The Armadale Road Residence II,Studio,studios,hammersmith_and_fulham,From 10 October,2650,month
2,654299,The Armadale Road Residence III,Studio,studios,hammersmith_and_fulham,From 20 December,2500,month
3,113567,Studio flat with double bed to rent in Kensing...,Studio,studios,hammersmith_and_fulham,From 18 June 2023,1249,month
0,607194,"Studio for rent in Earls Court, London",Studio,studios,kensington_and_chelsea,From 30 September,3033,month
...,...,...,...,...,...,...,...,...
3,844665,4-bedroom apartment for rent in London,Apartment,apartments/bedrooms:3more,hackney,From 20 November,8250-10950,month
4,843245,4-bedroom apartment for rent in London,Apartment,apartments/bedrooms:3more,hackney,From 11 January 2023,9000-11250,month
5,398966,4-bedroom apartment in Hackney,Apartment,apartments/bedrooms:3more,hackney,From 6 January 2024,2100,month
6,602243,4 bedrooms warehouse conversion for rent in Ha...,Apartment,apartments/bedrooms:3more,hackney,From 4 August 2023,9000-11250,month


In [5]:
# get information about the data frame
# results for 5 different property types and 9 different locations with 60 search results per page is 844 on 22.09.2022 10:00 h
df_complete.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 841 entries, 0 to 7
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   id             841 non-null    object
 1   description    841 non-null    object
 2   housing_type   841 non-null    object
 3   property_type  841 non-null    object
 4   location       841 non-null    object
 5   available      841 non-null    object
 6   prices         841 non-null    object
 7   prices_per     841 non-null    object
dtypes: object(8)
memory usage: 59.1+ KB


In [7]:
def get_details(bs):
    details = bs.find_all(class_='property-title__details')
    details_lst = (detail.get_text() for detail in details)
    details_lst = [detail.strip() for detail in details_lst]
    return details_lst

df2 = pd.DataFrame()

for idx, row in df_complete.iterrows():
    page = requests.get(
        'https://www.spotahome.com/london/for-rent:{}/{}'.format(row['housing_type'].lower() + 's', row['id']))
    html = page.content
    bs = BeautifulSoup(html, 'html.parser')

    details_dict = {
        'id': row['id'],
        'housing': row['housing_type'].lower() + 's',
        'details': get_details(bs)
    }
    df1 = pd.DataFrame(data=details_dict)
    df2 = df2.append(df1)


  df2 = df2.append(df1)
  df2 = df2.append(df1)
  df2 = df2.append(df1)
  df2 = df2.append(df1)
  df2 = df2.append(df1)
  df2 = df2.append(df1)
  df2 = df2.append(df1)
  df2 = df2.append(df1)
  df2 = df2.append(df1)
  df2 = df2.append(df1)
  df2 = df2.append(df1)
  df2 = df2.append(df1)
  df2 = df2.append(df1)
  df2 = df2.append(df1)
  df2 = df2.append(df1)
  df2 = df2.append(df1)
  df2 = df2.append(df1)
  df2 = df2.append(df1)
  df2 = df2.append(df1)
  df2 = df2.append(df1)
  df2 = df2.append(df1)
  df2 = df2.append(df1)
  df2 = df2.append(df1)
  df2 = df2.append(df1)
  df2 = df2.append(df1)
  df2 = df2.append(df1)
  df2 = df2.append(df1)
  df2 = df2.append(df1)
  df2 = df2.append(df1)
  df2 = df2.append(df1)
  df2 = df2.append(df1)
  df2 = df2.append(df1)
  df2 = df2.append(df1)
  df2 = df2.append(df1)
  df2 = df2.append(df1)
  df2 = df2.append(df1)
  df2 = df2.append(df1)
  df2 = df2.append(df1)
  df2 = df2.append(df1)
  df2 = df2.append(df1)
  df2 = df2.append(df1)
  df2 = df2.appe

In [8]:
display(df2)

Unnamed: 0,id,housing,details
0,298871,studios,Studio1 bathrooms
0,654298,studios,Studio1 bathrooms
0,654299,studios,Studio1 bathrooms
0,113567,studios,Studio1 bathrooms
0,607194,studios,Studio
...,...,...,...
0,844665,apartments,Apartment4 bedrooms2 bathrooms
0,843245,apartments,Apartment4 bedrooms2 bathrooms
0,398966,apartments,Apartment5 bedrooms2 bathrooms85 m2
0,602243,apartments,Apartment4 bedrooms2 bathrooms


In [None]:
# # import the data frame to DBeaver

# # call the schema created for this project
# schema = 'capstone-jmrs'
# # get the function to connect to the database
# engine = get_engine()

# # give the table a unique name
# table_name = 'spotahome'

# # import the table to sql
# if engine!=None:
#     try:
#         df_complete.to_sql(name=table_name,
#                         con=engine,
#                         if_exists='replace',
#                         schema=schema, 
#                         index=False,
#                         chunksize=5000, 
#                         method='multi')
#         print(f"The {table_name} table was imported successfully.")
    
#     except (Exception, psycopg2.DatabaseError) as error:
#         print(error)
#         engine = None

In [None]:
# # create a .csv file with the current date and time
# today = dt.datetime.today().strftime('%Y-%m-%d-%H-%M')
# df_complete.to_csv('spotahome_{}.csv'.format(today), sep='\t')