# [Spotahome.com](https://www.spotahome.com/s/london--uk/for-rent:apartments/for-rent:studios/bedrooms:1/bedrooms:2/bedrooms:3/bedrooms:3more?areaId[]=219&areaId[]=231&areaId[]=232&areaId[]=233&areaId[]=234&areaId[]=235&areaId[]=236&areaId[]=237&areaId[]=241)

## Dataframe with information from search result page (runtime ~1m 20s)

In [None]:
import requests                 # requests on websites
from bs4 import BeautifulSoup   # html parsing
import pandas as pd             # pandas for data frame
import math                     # for math methods
import time                     # for sleep timer
import random
from sql_functions import *     # functions from file for upload on schema
import psycopg2                 # for upload on engine
import datetime as dt           # for the csv file with the current date and time
import re


def get_description(bs):
    """Return the title text of every listing card on a search-result page.

    Parameters
    ----------
    bs : object exposing ``find_all`` (a BeautifulSoup document in this notebook)

    Returns
    -------
    list of str
        Whitespace-stripped text of each matching element, in document order
        as returned by ``find_all``. Empty when nothing matches.
    """
    title_class = 'homecard-content__title__HomecardContent___OmV4c homecard-content__title--rebranding-style__HomecardContent___OmV4c'
    return [node.get_text().strip() for node in bs.find_all(class_=title_class)]


def get_housing(bs):
    """Return the housing-type label of every listing card on a search-result page.

    Parameters
    ----------
    bs : object exposing ``find_all`` (a BeautifulSoup document in this notebook)

    Returns
    -------
    list of str
        Whitespace-stripped text of each matching element, in the order
        ``find_all`` yields them.
    """
    matches = bs.find_all(
        class_='homecard-content__type__HomecardContent___OmV4c homecard-content__type--rebranding-style__HomecardContent___OmV4c')
    labels = []
    for tag in matches:
        raw_text = tag.get_text()
        labels.append(raw_text.strip())
    return labels


def get_available(bs):
    """Return the availability text of every listing card on a search-result page.

    Parameters
    ----------
    bs : object exposing ``find_all`` (a BeautifulSoup document in this notebook)

    Returns
    -------
    list of str
        Text of each matching element with surrounding whitespace stripped and
        every occurrence of ``'From '`` removed.
    """
    available_class = 'homecard-content__available-from__HomecardContent___OmV4c homecard-content__available-from--rebranding-style__HomecardContent___OmV4c'
    stripped = (tag.get_text().strip() for tag in bs.find_all(class_=available_class))
    # Drop the "From " lead-in so only the remaining text is kept
    return [text.replace('From ', '') for text in stripped]


def get_price(bs):
    """Return the (lower) price of every listing card on a search-result page.

    Parameters
    ----------
    bs : object exposing ``find_all`` (a BeautifulSoup document in this notebook)

    Returns
    -------
    list of str
        For each matching element: the text with ``'£'`` removed, cut at the
        first ``'-'`` and stripped of surrounding whitespace. The values stay
        strings; any thousands separators are left untouched.
    """
    prices = []
    for tag in bs.find_all(class_='price__Price___OmV4c'):
        text = tag.get_text().replace('£', '')
        # Keep only the part before the first '-'; strip AFTER splitting so a
        # range written as "low - high" does not leave a trailing space on "low".
        prices.append(text.split('-')[0].strip())
    return prices


def get_prices_period(bs):
    """Return the price-period text of every listing card on a search-result page.

    Parameters
    ----------
    bs : object exposing ``find_all`` (a BeautifulSoup document in this notebook)

    Returns
    -------
    list of str
        Text of each matching element with surrounding whitespace stripped and
        every ``'/'`` removed.
    """
    period_class = 'price-monthly__Price___OmV4c price-monthly--rebranding-style__Price___OmV4c'
    periods = []
    for tag in bs.find_all(class_=period_class):
        trimmed = tag.get_text().strip()
        periods.append(trimmed.replace('/', ''))
    return periods


def get_ids(bs):
    """Return the listing ID of every list item on a search-result page.

    The ID is read from each item's ``data-homecard-scroll`` attribute.

    Parameters
    ----------
    bs : object exposing ``find_all`` (a BeautifulSoup document in this notebook)

    Returns
    -------
    list of str
        Whitespace-stripped attribute values, in the order ``find_all`` yields
        the items. Items that do not carry the attribute are skipped.
    """
    listing_ids = []
    for item in bs.find_all(class_='l-list__item'):
        listing_id = item.get('data-homecard-scroll')
        # .get() returns None when the attribute is absent; calling .strip() on
        # it would raise AttributeError, so such items are left out.
        # NOTE(review): presumably these are non-listing list items — confirm.
        if listing_id is None:
            continue
        listing_ids.append(listing_id.strip())
    return listing_ids


# Lookup table: location ID -> location name
location_dict = {
    219: 'Lambeth',
    231: 'Hammersmith and Fulham',
    232: 'Kensington and Chelsea',
    233: 'City of Westminster',
    234: 'Camden',
    235: 'Tower Hamlets',
    236: 'Islington',
    237: 'Hackney',
    241: 'City of London',
}


def page_results(property_type, location):
    """Scrape every search-result page for one property type in one area.

    Spotahome shows 60 search results per page. The total number of results is
    read from the search heading, divided by 60 and rounded up to obtain the
    number of pages, and each page is then fetched and parsed.

    Parameters
    ----------
    property_type : str
        URL path fragment placed after ``for-rent:`` (e.g. ``'studios'``).
    location : int
        Area ID sent as the ``areaId[]`` query parameter; must be a key of
        ``location_dict``.

    Returns
    -------
    pandas.DataFrame
        One row per listing card with the columns ``platform_id``,
        ``platform``, ``neighborhood``, ``property_type``, ``housing_type``,
        ``price_pcm``, ``title``, ``furnished`` and ``available_from``.

    Raises
    ------
    KeyError
        If ``location`` is not a key of ``location_dict``.
    ValueError
        If the result count is not an integer string, or the per-page column
        lists have different lengths.
    requests.exceptions.RequestException
        On connection problems, including a request exceeding the timeout.
    """
    results_per_page = 60
    # Without a timeout a stalled connection would block the scrape forever.
    request_timeout = 30  # seconds

    begin = f'https://www.spotahome.com/s/london--uk/for-rent:{property_type}'
    end = f'?areaId[]={location}'

    page = requests.get(begin + end, timeout=request_timeout)
    bs = BeautifulSoup(page.content, 'html.parser')

    # Extract the total number of search results; it stays 0 when the heading
    # (or the <strong> holding the number) is missing.
    result_text = 0
    for result in bs.find_all('h1', {'class': 'search-title__title'}):
        counter = result.find('strong')
        if counter is not None:
            result_text = counter.get_text().strip()

    # math.ceil already returns an int, so no further conversion is needed.
    page_count = math.ceil(int(result_text) / results_per_page)

    # One frame per page, concatenated once at the end instead of re-copying
    # the growing frame on every iteration.
    page_frames = []

    # range() excludes its end, hence the +1 to reach the last calculated page.
    # NOTE(review): the loop also requests page 0 — confirm whether the site
    # serves that as a copy of page 1, which would duplicate those listings.
    for page_number in range(page_count + 1):
        # short random pause between requests
        time.sleep(random.randint(2, 6) / 10)
        page = requests.get(begin + f'/page:{page_number}' + end,
                            timeout=request_timeout)
        bs = BeautifulSoup(page.content, 'html.parser')

        # Keys are the column names. List values come from the extractor
        # functions called on the parsed page; scalar values are repeated for
        # every row of the page.
        spotahome_dict = {
            'platform_id': get_ids(bs),
            'platform': 'spotahome',
            'neighborhood': location_dict[location],
            'property_type': property_type,
            'housing_type': get_housing(bs),
            'price_pcm': get_price(bs),
            'title': get_description(bs),
            'furnished': 'furnished',
            'available_from': get_available(bs),
        }
        page_frames.append(pd.DataFrame(data=spotahome_dict))

    # The loop always runs at least once (page 0), so the list is never empty.
    return pd.concat(page_frames, axis=0, ignore_index=True)


# URL fragments of the property types and the area IDs to scrape;
# every (property type, area) combination is fetched.
property_types = ['studios', 'apartments/bedrooms:1',
                  'apartments/bedrooms:2', 'apartments/bedrooms:3', 'apartments/bedrooms:3more']
locations = [219, 231, 232, 233, 234, 235, 236, 237, 241]

# Collect one frame per combination and concatenate once at the end;
# concatenating inside the loop would copy the growing frame on every step.
search_frames = []
for property_type in property_types:
    for location in locations:
        search_frames.append(page_results(property_type, location))

df_complete = pd.concat(search_frames, axis=0, ignore_index=True)


In [None]:
# Render the combined search-result frame, then print its summary
# (columns, non-null counts, dtypes).
display(df_complete)
df_complete.info()

## Dataframe with information from every detail page for every apartment advert

In [None]:
# One dictionary per listing; all of them are turned into a single data frame
# after the loop instead of concatenating a one-row frame on every iteration.
detail_rows = []

# with iterrows we walk through all listings collected from the search pages
# and fetch the detail page of each one
for idx, row in df_complete.iterrows():
    # short random pause between requests
    time.sleep(random.randint(2,6)/10)
    page = f"https://www.spotahome.com/london/for-rent:{row['housing_type'].lower() + 's'}/{row['platform_id']}"
    # a timeout keeps one stalled connection from blocking the whole scrape
    website = requests.get(page, timeout=30)
    results = BeautifulSoup(website.content, 'html.parser')

    # the id is stored first, so a listing whose details cannot be parsed
    # still yields a row
    row_dict = {'id': row['platform_id']}

    # find() returns None when the details container is missing from the page
    container = results.find('div', class_='property-title__details')
    details = container.find_all('span') if container is not None else []

    # the first span holds the property type, which we already have, so it is
    # skipped; slicing (unlike pop) also copes with an empty list
    for detail in details[1:]:
        # each remaining text is split at its first space into
        # "<value> <key>"; the part after the space becomes the column name
        value, _, key = detail.get_text().strip().partition(' ')
        if not key:
            # no space in the text, so there is no key to file the value under
            continue
        row_dict[key] = value

    detail_rows.append(row_dict)

# one row per listing, with a unique running index
df_details_complete = pd.DataFrame(detail_rows)


In [None]:
# A bare expression is only rendered when it is the last statement of a cell,
# so display() is called explicitly before printing the summary.
display(df_details_complete)
df_details_complete.info()

## Import df_complete to DBeaver

In [None]:
# # call the schema created for this project
# schema = 'capstone_jmrs'
# # get the function to connect to the database
# engine = get_engine()

# # give the table a unique name
# table_name = 'spotahome_df_complete_2'

# # import the table to sql
# if engine != None:
#     try:
#         df_complete.to_sql(name=table_name,
#                            con=engine,
#                            if_exists='replace',
#                            schema=schema,
#                            index=False,
#                            chunksize=5000,
#                            method='multi')
#         print(f"The {table_name} table was imported successfully.")

#     except (Exception, psycopg2.DatabaseError) as error:
#         print(error)
#         engine = None

## Import df_details_complete to DBeaver

In [None]:
# # call the schema created for this project
# schema = 'capstone_jmrs'
# # get the function to connect to the database
# engine = get_engine()

# # give the table a unique name
# table_name = 'spotahome_df_details_complete'

# # import the table to sql
# if engine != None:
#     try:
#         df_details_complete.to_sql(name=table_name,
#                                    con=engine,
#                                    if_exists='replace',
#                                    schema=schema,
#                                    index=False,
#                                    chunksize=5000,
#                                    method='multi')
#         print(f"The {table_name} table was imported successfully.")

#     except (Exception, psycopg2.DatabaseError) as error:
#         print(error)
#         engine = None
