# Web scrapping for house pricing information

In [1]:
from bs4 import BeautifulSoup
from requests import get  #  make queries look like they are from actual browser
from time import sleep  # to slow down scrapping and not overload website
from random import randint
import pandas as pd
import itertools
import matplotlib.pyplot as plt
import re
import seaborn as sns; sns.set_theme(style='darkgrid')

In [41]:
web = "https://www.etuovi.com/myytavat-asunnot?haku=M1606820702&sivu=1"
headers = ({'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'})


## Get the source code in HTML

In [42]:
response = get(web, headers)

In [43]:
print(response)

<Response [200]>


Print the first 1000 characters of the source code.

In [44]:
response.text[:1000]

'\n<!DOCTYPE html>\n<html lang="fi">\n    <head>\n        <script>\n            window.etuovi = {\n                ad: {\n                    qskvconsent: false,\n                    placements: {}\n                }\n            }\n        </script>\n        <script type="text/javascript" src="https://acdn.adnxs.com/ast/ast.js"></script> <script type="text/javascript" src="https://nexus.ensighten.com/alma/etuovivaltti/Bootstrap.js"></script>\n        <meta charset="UTF-8">\n        <meta httpEquiv="x-ua-compatible" content="ie=edge,chrome=1" />\n        <meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=1, shrink-to-fit=no" />\n        <title data-rh="true">Myytävät asunnot Espoo · Helsinki · Vantaa: 6\xa0221 kpl - Etuovi.com</title>\n        <meta data-rh="true" name="author" content="Alma Mediapartners Oy"/><meta data-rh="true" name="description" content="Etuovi.com:issa on juuri nyt 6\xa0221 kohdetta tuoteryhmässä Myytävät asunnot alueella Espoo · H

In [45]:
html_soup = BeautifulSoup(response.text, 'html.parser')

In [57]:
print(html_soup.prettify())

<!DOCTYPE html>
<html lang="fi">
 <head>
  <script>
   window.etuovi = {
                ad: {
                    qskvconsent: false,
                    placements: {}
                }
            }
  </script>
  <script src="https://acdn.adnxs.com/ast/ast.js" type="text/javascript">
  </script>
  <script src="https://nexus.ensighten.com/alma/etuovivaltti/Bootstrap.js" type="text/javascript">
  </script>
  <meta charset="utf-8"/>
  <meta content="ie=edge,chrome=1" httpequiv="x-ua-compatible">
   <meta content="width=device-width, initial-scale=1.0, user-scalable=1, shrink-to-fit=no" name="viewport"/>
   <title data-rh="true">
    Myytävät asunnot Espoo · Helsinki · Vantaa: 6 221 kpl - Etuovi.com
   </title>
   <meta content="Alma Mediapartners Oy" data-rh="true" name="author"/>
   <meta content="Etuovi.com:issa on juuri nyt 6 221 kohdetta tuoteryhmässä Myytävät asunnot alueella Espoo · Helsinki · Vantaa. Tee helppo haku ja löydä uusi kotisi jo tänään!" data-rh="true" name="descripti

In [46]:
house_containers = html_soup.find_all('div', class_="ListPage__cardContainer__39dKQ")

## Process first result

In [101]:
house = house_containers[0] # get first element


In [103]:
link = "https://www.etuovi.com" + house.find('a', class_="styles__cardLink__2Oh5I")['href']
title = house.find_all('h5')[0].text
address = house.find_all('h4')[0].text
price_str = house.find(string=re.compile(".*€")).replace('\xa0', '')
price = float(re.sub(r"\s+|€", "", price_str))
size_html = house.find('div', class_="flexboxgrid__col-xs__26GXk flexboxgrid__col-md-4__2DYW-")
size_str = size_html.find_all('span')[1].text.replace(',', '.')
size = float(re.sub(r".m²", "", size_str))
year_html = house.find('div', class_="flexboxgrid__col-xs-3__3Kf8r flexboxgrid__col-md-4__2DYW-")
year = int(year_html.find_all('span')[1].text)

print(f"""
-Title: {title}
-Address: {address}
-Link: {link}
-Price: {price}
-Size: {size}
-Year: {year}""")


-Title: Kerrostalo | 2h, k, parveke
-Address: Castréninkatu 9-11 B, Kallio, Helsinki
-Link: https://www.etuovi.com/kohde/698608?haku=M1606820702
-Price: 315000.0
-Size: 55.0
-Year: 1973


## Parse through all the website

Now that we have found the right way to extract information from the HTML source code, we can parse the entire source code looking for all the ads.

In [161]:
def parse_etuovi():
    """
    Method to parse through all the entries in "https://www.etuovi.com" with 
    respect to the cities of Helsinki, Espoo and Vantaa
    Inputs
    ------
        - 
    """
    N_TOTAL = 200  # total number of pages to look for ads
    current = 0  # number of current page

    # Initialize empty lists for variables to store
    links = []
    titles = []
    addresses = []
    prices = []
    sizes = []
    years = []

    for page in range(N_TOTAL):

        web = "https://www.etuovi.com/myytavat-asunnot?haku=M1606820702&sivu=" + str(page)
        headers = ({'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'})
        try:
            response = get(web, headers)
        except:
            print("Invalid web request.")

        # Create HTML soup
        html_soup = BeautifulSoup(response.text, 'html.parser')
        house_containers = html_soup.find_all('div', class_="ListPage__cardContainer__39dKQ")

        if house_containers != []:
            # Iterate through every container
            for house in house_containers:
                try:
                    # Extract data
                    link = "https://www.etuovi.com" + house.find('a', class_="styles__cardLink__2Oh5I")['href']
                    title = house.find_all('h5')[0].text
                    address = house.find_all('h4')[0].text
                    price_str = house.find(string=re.compile(".*€")).replace('\xa0', '')
                    price = float(re.sub(r"\s+|€", "", price_str))
                    size_html = house.find('div', class_="flexboxgrid__col-xs__26GXk flexboxgrid__col-md-4__2DYW-")
                    size_str = size_html.find_all('span')[1].text.replace(',', '.')
                    size = float(re.sub(r".m²", "", size_str))
                    year_html = house.find('div', class_="flexboxgrid__col-xs-3__3Kf8r flexboxgrid__col-md-4__2DYW-")
                    year = int(year_html.find_all('span')[1].text)
                                    
                    # Append data to each list
                    links.append(link)
                    titles.append(title)
                    addresses.append(address)
                    prices.append(price)
                    sizes.append(size)
                    years.append(year)
                except:
                    print(f"Error extracting data from {house}.")

        # Wait 1-2 seconds to avoid overloading the website
        sleep(randint(1,2))
        print(f"Page #{page} scrapped.")

        # Update page counter
        page += 1

    # Store information in a DataFrame
    df = pd.DataFrame({"Link": links, "Title": titles,
                        "Address": addresses, "Price":prices,
                        "Size": sizes, "Year": years})
    return df

In [162]:
%%time 
df = parse_etuovi()

Page #0 scrapped.
Page #1 scrapped.
Error extracting data from <div class="ListPage__cardContainer__39dKQ"><a class="styles__cardLink__2Oh5I" href="/kohde/q48482?haku=M1606820702" id="q48482" theme="[object Object]"><div class="theme__card__3k-E3 styles__listitem__3gJxH" data-react-toolbox="card"><div class="flexboxgrid__row__wfmuy flexboxgrid__start-xs__napeK flexboxgrid__between-xs__1FR8U"><div class="flexboxgrid__col-xs-12__1I1LS flexboxgrid__col-sm-5__3cpPD flexboxgrid__col-md-3__1YPhN flexboxgrid__col-lg-3__2YYeD styles__imageContainer__2ocS9"><div class="theme__cardMedia__2HFB6 theme__wide__AaIGk styles__cardMedia__1ob_N"><div class="theme__content__2Zc9X"><div class="theme__cardActions__28OcX styles__itemActionsTop__1rc7O"><button class="theme__button__1YqFK theme__flat__13aFK theme__button__1YqFK theme__squared__17Uvn theme__neutral__1F1Jf Button__button__3K-jn" data-react-toolbox="button" disabled="" type="button">Uusi<em>24<!-- -->H</em></button></div><div class="theme__cardA

In [164]:
df.shape

(5228, 6)

In [169]:
# Save data frame
df.to_excel("helsinki_house_price.xls")