## Lägenhetsjägaren (The Apartment Hunter, 2022)

29 January 2022

A Swedish drama about a man who must find a flat. But not all is what it seems...

In [1]:
import os
import re

import numpy as np
import pandas as pd
import requests
from lxml import html

from cleaning_utils import clean_price_column
from scraping_utils import get_data_from_page, create_urllist

In [2]:
# HTTP headers passed as `headers=` to the requests.get calls in this notebook.
# NOTE(review): the Cookie entry is a long hardcoded value that looks like it was
# copied from a browser session -- confirm whether it has to be refreshed before a re-run.
headers_info = dict([
    ('Host', 'www.realestate.com.au'),
    ('User-Agent', 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:105.0) Gecko/20100101 Firefox/105.0'),
    ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8'),
    ('Cookie', 'reauid=165f301772670000e401f2629c030000cf740100; mid=11604324607491749210; utag_main=v_id:0183a1b78bfa000df68706d9f1870504e004900900bd0; split_audience=c; KP2_UIDz-ssn=0dwTSAWPurUNrYSnIzsFxpGTMNjc8M6Sjmoc9m5E6JesN6RTHSO4bnaxbIKDsl8lvUTczXiLmsl3Fxwn9Hf73Ypfh4VHaSSQkUdJYbX22exsC4HXWEwNzJ7I733M3Zvw0jHUTAa2cJpKzWN3H4cp09pV; KP2_UIDz=0dwTSAWPurUNrYSnIzsFxpGTMNjc8M6Sjmoc9m5E6JesN6RTHSO4bnaxbIKDsl8lvUTczXiLmsl3Fxwn9Hf73Ypfh4VHaSSQkUdJYbX22exsC4HXWEwNzJ7I733M3Zvw0jHUTAa2cJpKzWN3H4cp09pV; Country=AU; fullstory_audience_split=B'),
    ('Upgrade-Insecure-Requests', '1'),
])

## Scrape data

This gathers the high-level data for properties in each of the suburb-postcode combinations specified below.

In [3]:
# Suburb/postcode search terms handed to create_urllist, one entry per suburb.
suburb_postcodes = [
    'fairfield,+vic+3078',
    'brunswick,+vic+3056',
    'carlton,+vic+3053',
    'hawthorn,+vic+3122',
    'camberwell,+vic+3124',
    'moonee+ponds,+vic+3039',
    'fitzroy,+vic+3065',
    'elsternwick,+vic+3185',
]

In [4]:
# Build the URLs that the scraping loop requests, from the suburb/postcode search terms.
# NOTE(review): the second argument (10) presumably sets how many result pages are
# generated per suburb -- confirm against scraping_utils.create_urllist.
urls = create_urllist(suburb_postcodes, 10)

In [None]:
# Fetch every URL in `urls`, parse the HTML and keep one DataFrame per page in `dfs`.
dfs = []

for url in urls:
    print(f'Scraping URL: {url}')
    # Without a timeout requests waits indefinitely, so a single stalled connection
    # would hang the whole scrape; fail after 30 seconds instead.
    page = requests.get(url, headers=headers_info, timeout=30)
    tree = html.fromstring(page.text)
    # get_data_from_page extracts the listing data from the parsed page
    # (see scraping_utils); its result is wrapped in a DataFrame.
    df_temp = pd.DataFrame(get_data_from_page(tree))
    dfs.append(df_temp)

In [6]:
# Stack the per-page frames into one frame and drop rows that are exact duplicates.
# pd.concat keeps each page frame's own index labels, so the same labels repeat
# across pages; reset_index(drop=True) renumbers the rows so that every label is unique.
df = pd.concat(dfs).drop_duplicates().reset_index(drop=True)

## Data Cleaning

Prices are recorded in a free-text field, often alongside other, unnecessary text. The cleaning step removes that extra text and creates columns for the minimum and maximum price specified for each property.

In [7]:
# Rebind `df` to the frame returned by cleaning_utils.clean_price_column.
# NOTE(review): because `df` is overwritten, re-running this cell feeds the
# already-cleaned frame back into the helper -- confirm that it tolerates that.
df = clean_price_column(df)

## Scrape additional data for each property

This gathers the data available on each property's individual page, in particular the description text.

In [10]:
# XPath selecting the text nodes inside the span that carries the
# "property-description__content" class.
path_text = './/span[@class="property-description__content"]/text()'

# Collects one description string per scraped property page.
texts = list()

In [None]:
# For each property in `df`, fetch its own page and collect the description text.
url_base = 'https://www.realestate.com.au'

# Start from an empty list every time this cell runs. Appending to a list that was
# created in a different cell meant that re-running this cell piled up extra entries,
# so len(texts) no longer matched len(df) when the descriptions are assigned to df.
texts = []

for n, url in enumerate(df.link.values):
    print(f'Scraping property {n}')
    # The `link` column holds site-relative paths; prefix them with the site root.
    property_url = f'{url_base}{url}'
    # Without a timeout requests waits indefinitely, so a single stalled connection
    # would hang the whole scrape; fail after 30 seconds instead.
    page = requests.get(property_url, headers=headers_info, timeout=30)
    tree = html.fromstring(page.text)
    # The XPath returns a list of text nodes; join them into one string.
    property_text = ' '.join(tree.xpath(path_text))
    texts.append(property_text)

In [12]:
# Add the scraped texts as a `description` column; `texts` must hold exactly
# one entry per row of `df`, in row order.
df = df.assign(description=texts)

## Write the data

In [14]:
# Destination folder for the scraped dataset.
# NOTE(review): this is an absolute, machine-specific path -- adjust it before
# running the notebook on another machine.
output_folder = '/home/alex/Desktop/Data/scraped/apartments'

# Create the folder (and any missing parents) first, so that to_csv does not fail on
# a missing directory after the long scrape has already finished.
os.makedirs(output_folder, exist_ok=True)

# index=False: the row index is not written to the CSV.
df.to_csv(f'{output_folder}/scraped_161022.csv', index=False)