# Problem 1: Scraping house prices

In [None]:
'''
Author: amamiya-yuuko-1225 1913250675@qq.com
Date: 2024-11-11 11:33:16
LastEditors: amamiya-yuuko-1225 1913250675@qq.com
Description: 
'''
import pandas as pd
from bs4 import BeautifulSoup

# Lower-case Swedish month names mapped to the three-letter English
# abbreviations used in the normalised "dd-Mon-yyyy" date format.
map_swe_month_to_eng_abbr = {
    # first quarter
    "januari": "Jan", "februari": "Feb", "mars": "Mar",
    # second quarter
    "april": "Apr", "maj": "May", "juni": "Jun",
    # third quarter
    "juli": "Jul", "augusti": "Aug", "september": "Sep",
    # fourth quarter
    "oktober": "Oct", "november": "Nov", "december": "Dec",
}

def uniformize_date(s: str) -> str:
    """Normalise a raw sale date to the form "10-Feb-2021".

    Two input shapes are handled:

    * With a space (e.g. "10 februari 2021"): the string is split on single
      spaces, the second token is translated through
      ``map_swe_month_to_eng_abbr`` and the first three tokens are joined
      with "-".
    * Without a space (e.g. "10-Feb-21"): "20" is inserted directly after
      the last "-", which expands a two-digit year to four digits.

    param {str} s: raw date
    return {str} formatted date
    raises KeyError: the month token is not a key of the month mapping
    """
    if " " in s:
        tokens = s.split(" ")
        day = tokens[0]
        month = map_swe_month_to_eng_abbr[tokens[1]]
        year = tokens[2]
        return f"{day}-{month}-{year}"

    # rpartition keeps everything up to and including the last "-" in
    # (head, dash); when there is no "-" both are empty and "20" is prefixed.
    head, dash, tail = s.rpartition('-')
    return head + dash + "20" + tail

def extract_df_from_html(path: str) -> pd.DataFrame:
    """Scrape the sold-property advertisements from a single saved HTML page.

    Every ``<li class="sold-results__normal-hit">`` element becomes one row.
    All values are kept as strings; a field that cannot be found in a
    listing is stored as the string ``"NaN"``.

    param {str} path: path of the HTML file
    return {pd.DataFrame} one row per advertisement with the columns
        "Date of sale (dd-m-yyyy)", "Address", "Location", "Area (m^2)",
        "No. of rooms", "Area of the plot (m^2)" and "Closing price (kr)".
        The frame is empty (but keeps these columns) when the page holds
        no listings.
    """
    columns = [
        "Date of sale (dd-m-yyyy)",
        "Address",
        "Location",
        "Area (m^2)",
        "No. of rooms",
        "Area of the plot (m^2)",
        "Closing price (kr)",
    ]

    # Decode explicitly instead of relying on the platform default: the
    # parsing below matches non-ASCII text ("Såld", no-break spaces).
    # NOTE(review): assumes the saved pages are UTF-8 - confirm.
    with open(path, 'r', encoding="utf-8") as f:
        html = f.read()

    soup = BeautifulSoup(html, 'html.parser')

    # One dict of column -> value per advertisement.
    records = []

    for cell in soup.find_all('li', class_='sold-results__normal-hit'):
        ad_dict = {}

        # Label text looks like "Såld <date>"; drop the prefix before
        # normalising the date.
        raw_date = cell.find("span", class_="hcl-label hcl-label--state hcl-label--sold-at").text.strip().replace("Såld ", "")
        ad_dict["Date of sale (dd-m-yyyy)"] = uniformize_date(raw_date)

        ad_dict["Address"] = cell.find("h2", class_="sold-property-listing__heading qa-selling-price-title hcl-card__title").text.strip()

        # The location is the second direct text node of the inner <div>.
        location_div = cell.find("div", class_="sold-property-listing__location").div
        ad_dict["Location"] = location_div.find_all(string=True, recursive=False)[1].replace(' ', '').replace('\n', '')

        # Tokens are separated by no-break spaces,
        # e.g. ['161+55', 'm²', '5', 'rum'].
        area_and_rum = cell.find("div", class_="sold-property-listing__subheading sold-property-listing__area").text.replace(' ', '').replace('\n', '').split("\u00A0")
        ad_dict["Area (m^2)"] = area_and_rum[0]
        ad_dict["No. of rooms"] = area_and_rum[2] if 'rum' in area_and_rum else 'NaN'

        land_area_element = cell.find("div", class_="sold-property-listing__land-area")
        if land_area_element is not None:
            # [:-6] drops the trailing six characters - presumably the
            # "m²tomt" unit suffix; confirm against the saved pages.
            ad_dict["Area of the plot (m^2)"] = land_area_element.text.replace(" ", '').replace('\n', '').replace('\xa0', '')[:-6]
        else:
            ad_dict["Area of the plot (m^2)"] = "NaN"

        # The closing price sits between the "Slutpris" label and the next
        # "kr". Fetch the cell text once, and fall back to "NaN" when either
        # marker is missing instead of slicing with a -1 index.
        cell_text = cell.text
        index_of_slutpris = cell_text.find("Slutpris")
        index_of_kr = cell_text.find("kr", index_of_slutpris) if index_of_slutpris != -1 else -1
        if index_of_kr == -1:
            ad_dict["Closing price (kr)"] = "NaN"
        else:
            ad_dict["Closing price (kr)"] = cell_text[index_of_slutpris + len("Slutpris"):index_of_kr].replace(" ", '').replace("\u00A0", '')

        records.append(ad_dict)

    # Passing ``columns`` keeps the schema stable even for an empty page,
    # where concatenating an empty list of Series would raise ValueError.
    return pd.DataFrame(records, columns=columns)


In [85]:
import os

# Directory that holds the saved HTML result pages.
DATA_DIR = "/home/amamiya/dsai/lab2/kungalv_slutpriser"

df_list = []

# Scrape every file below DATA_DIR and collect one DataFrame per file.
for root, dirs, files in os.walk(DATA_DIR):
    # os.walk yields entries in arbitrary order; sorting the directory list
    # in place and the file names makes the row order of the exported CSV
    # reproducible between runs and machines.
    dirs.sort()
    for f in sorted(files):
        df = extract_df_from_html(os.path.join(root, f))
        df_list.append(df)

# os.walk silently yields nothing for a missing or empty directory; fail
# with a clear message rather than pandas' "No objects to concatenate".
if not df_list:
    raise FileNotFoundError(f"No files found under {DATA_DIR!r}")

data = pd.concat(df_list)

# index=False (the documented boolean) leaves the row index out of the CSV.
data.to_csv("data.csv", index=False, encoding="UTF-8")