In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
import datetime

# Gathering data

In [2]:
url = 'https://www.boliga.dk/resultat?zipCodes=2720'
r = requests.get(url)
content=r.content
soup=BeautifulSoup(content)

In [3]:
now = datetime.datetime.now()
retrieved = now.strftime(("%Y-%m-%d %H:%M:%S"))

In [4]:
tag_list=soup.find("app-housing-list-results").div.div.next_sibling.find_all("app-housing-list-item")

In [5]:
data_list = []
for tag in tag_list:
    # checking skuffesag i.e. hidden listings - they won't be used
    if tag.find("app-listing-information-hidden"):
        #print("skuffesag")
        continue
        
    # dividing listing container in top, middle and bottom parts for readability
    top_info = tag.find("app-listing-information-lg").div.div
    middle_info = top_info.parent.contents[3].div.div
    bottom_info = middle_info.parent.contents[2].div
    
    # information scraping
    link = tag.a['href']
    address=top_info.div.div.span.string
    #print(address)
    city=address.parent.next_sibling.string
    #print(city)
    price=top_info.div.next_sibling.div.contents[1]
    #print(price)
    price_m2=price.parent.next_sibling.string
    #print(pricePerM2)
    housing_type = middle_info.contents[1].label.contents[1].string
    date_added = middle_info.contents[2].p.contents[0]
    rooms = bottom_info.contents[0].span.string
    area = bottom_info.contents[1].span.string
    energy = bottom_info.contents[2].span.string
    year_built = bottom_info.contents[3].span.string
    ground = bottom_info.contents[4].span.string
    monthly_cost = bottom_info.contents[5].span.string
    
    # appending data dictionnary to data_list. BeautifulSoup converts HTML non-breaking spaces (&nbsp) to \xa0 (unicode),
    # so we need to remove them.
    data_list.append({
        "link": link,
        "address": str(address).replace(u'\xa0', u' '),
        "city": city,
        "housing_type": housing_type,
        "price": str(price).replace(u'\xa0', u' '),
        "price_m2": str(price_m2).replace(u'\xa0', u' '),
        "area": area,
        "rooms": rooms,
        "ground": ground,
        "energy": energy,
        "year_built": year_built,
        "monthly_cost": monthly_cost,
        "date_added": date_added,
        "retrieved": retrieved
    })
    

AttributeError: 'Comment' object has no attribute 'div'

In [13]:
data_list

[{'link': '/bolig/1805981/vinkelager_3_1_th_2720_vanloese',
  'address': 'Vinkelager 3, 1. th., ',
  'city': '2720 Vanløse',
  'housing_type': 'Ejerlejlighed',
  'price': ' 2.275.000 kr. ',
  'price_m2': ' 39.912 kr. / m² ',
  'area': ' 57 m² ',
  'rooms': ' Værelser: 2 ',
  'ground': ' 0 m² ',
  'energy': ' Energimærke: D ',
  'year_built': ' 1957 ',
  'monthly_cost': ' Ejerudgift: 2.483 kr. / md. ',
  'date_added': ' Oprettet 16. jul. 2021 ',
  'retrieved': '2021-07-16 15:35:41'},
 {'link': '/bolig/1805618/flakholmen_23__st_5_2720_vanloese',
  'address': 'Flakholmen 23  st. 5., ',
  'city': '2720 Vanløse',
  'housing_type': 'Ejerlejlighed',
  'price': ' 2.750.000 kr. ',
  'price_m2': ' 40.441 kr. / m² ',
  'area': ' 68 m² ',
  'rooms': ' Værelser: 3 ',
  'ground': ' 0 m² ',
  'energy': ' Energimærke: E ',
  'year_built': ' 1937 ',
  'monthly_cost': ' Ejerudgift: 3.058 kr. / md. ',
  'date_added': ' Oprettet 14. jul. 2021 ',
  'retrieved': '2021-07-16 15:35:41'},
 {'link': '/bolig/180

In [14]:
df = pd.DataFrame(data_list)
df

Unnamed: 0,link,address,city,housing_type,price,price_m2,area,rooms,ground,energy,year_built,monthly_cost,date_added,retrieved
0,/bolig/1805981/vinkelager_3_1_th_2720_vanloese,"Vinkelager 3, 1. th.,",2720 Vanløse,Ejerlejlighed,2.275.000 kr.,39.912 kr. / m²,57 m²,Værelser: 2,0 m²,Energimærke: D,1957,Ejerudgift: 2.483 kr. / md.,Oprettet 16. jul. 2021,2021-07-16 15:35:41
1,/bolig/1805618/flakholmen_23__st_5_2720_vanloese,"Flakholmen 23 st. 5.,",2720 Vanløse,Ejerlejlighed,2.750.000 kr.,40.441 kr. / m²,68 m²,Værelser: 3,0 m²,Energimærke: E,1937,Ejerudgift: 3.058 kr. / md.,Oprettet 14. jul. 2021,2021-07-16 15:35:41
2,/bolig/1805552/frederiksgaards_alle_8b_1_th_27...,"Frederiksgårds Allé 8B, 1. th.,",2720 Vanløse,Ejerlejlighed,2.795.000 kr.,35.379 kr. / m²,79 m²,Værelser: 3,0 m²,Energimærke: D,1934,Ejerudgift: 4.254 kr. / md.,Oprettet 14. jul. 2021,2021-07-16 15:35:41
3,/bolig/1805127/aadalsvej_23b_st_th_2720_vanloese,"Ådalsvej 23B, st. th,",2720 Vanløse,Ejerlejlighed,2.250.000 kr.,40.178 kr. / m²,56 m²,Værelser: 2,0 m²,Energimærke: D,1935,Ejerudgift: 2.676 kr. / md.,Oprettet 13. jul. 2021,2021-07-16 15:35:41
4,/bolig/1805293/jyllingevej_62__2_tv_2720_vanloese,"Jyllingevej 62 2. tv.,",2720 Vanløse,Ejerlejlighed,3.195.000 kr.,28.526 kr. / m²,112 m²,Værelser: 3,0 m²,Energimærke: D,1934,Ejerudgift: 4.359 kr. / md.,Oprettet 13. jul. 2021,2021-07-16 15:35:41
5,/bolig/1804961/kirkebjerg_alle_43_1_tv_2720_va...,"Kirkebjerg Allé 43, 1 tv,",2720 Vanløse,Ejerlejlighed,2.045.000 kr.,40.098 kr. / m²,51 m²,Værelser: 2,0 m²,Energimærke: D,1936,Ejerudgift: 1.783 kr. / md.,Oprettet 12. jul. 2021,2021-07-16 15:35:41
6,/bolig/1804132/groennehoej_9_2_th_2720_vanloese,"Grønnehøj 9, 2. th.,",2720 Vanløse,Ejerlejlighed,2.195.000 kr.,34.841 kr. / m²,63 m²,Værelser: 2,0 m²,Energimærke: D,1936,Ejerudgift: 3.163 kr. / md.,Oprettet 9. jul. 2021,2021-07-16 15:35:41
7,/bolig/1804004/arnestedet_21_st_th_2720_vanloese,"Arnestedet 21, ST. TH,",2720 Vanløse,Ejerlejlighed,2.750.000 kr.,47.413 kr. / m²,58 m²,Værelser: 2,0 m²,Energimærke: D,1935,Ejerudgift: 2.608 kr. / md.,Oprettet 9. jul. 2021,2021-07-16 15:35:41
8,/bolig/1804087/markskellet_11_2_tv_2720_vanloese,"Markskellet 11, 2. tv.,",2720 Vanløse,Ejerlejlighed,3.095.000 kr.,40.194 kr. / m²,77 m²,Værelser: 2,0 m²,Energimærke: C,1936,Ejerudgift: 3.628 kr. / md.,Oprettet 9. jul. 2021,2021-07-16 15:35:41
9,/bolig/1803631/aalekistevej_110b_1_th_2720_van...,"Ålekistevej 110B, 1. th.,",2720 Vanløse,Ejerlejlighed,2.450.000 kr.,45.370 kr. / m²,54 m²,Værelser: 2,0 m²,Energimærke: D,1937,Ejerudgift: 2.240 kr. / md.,Oprettet 8. jul. 2021,2021-07-16 15:35:41


# Cleaning data

- add new column with ID extracted from link
- remove comma at the end of address values
- extract only relevant numbers from combined values
- remove spaces at start and end of strings
- transform date into format that can be converted to DateTime datatype
- put floor and side to lowercase in address
- fix data types

In [15]:
df_clean = df.copy()

In [None]:
df_clean["id"] = df_clean.link.str.split("/")[2]

In [16]:
# remove comma and space at the end of address values
df_clean.address = df_clean.address.str.rstrip(", ")

In [17]:
# extract numbers from strings for relevant columns
df_clean.price = df_clean.price.str.strip().str.rstrip(" kr.").str.replace(".","")

  df_clean.price = df_clean.price.str.strip().str.rstrip(" kr.").str.replace(".","")


In [18]:
df_clean.price_m2 = df_clean.price_m2.str.strip().str.rstrip(" kr. / m²").str.replace(".","")

  df_clean.price_m2 = df_clean.price_m2.str.strip().str.rstrip(" kr. / m²").str.replace(".","")


In [19]:
df_clean.area = df_clean.area.str.strip().str.rstrip(" m²")

In [20]:
df_clean.monthly_cost = df_clean.monthly_cost.str.strip().str.rstrip(" kr. / md.").str.lstrip("Ejerudgift: ")\
                                                                                    .str.replace(".","")

  df_clean.monthly_cost = df_clean.monthly_cost.str.strip().str.rstrip(" kr. / md.").str.lstrip("Ejerudgift: ")\


In [21]:
df_clean.head()

Unnamed: 0,link,address,city,housing_type,price,price_m2,area,rooms,ground,energy,year_built,monthly_cost,date_added,retrieved
0,/bolig/1805981/vinkelager_3_1_th_2720_vanloese,"Vinkelager 3, 1. th.",2720 Vanløse,Ejerlejlighed,2275000,39912,57,Værelser: 2,0 m²,Energimærke: D,1957,2483,Oprettet 16. jul. 2021,2021-07-16 15:35:41
1,/bolig/1805618/flakholmen_23__st_5_2720_vanloese,Flakholmen 23 st. 5.,2720 Vanløse,Ejerlejlighed,2750000,40441,68,Værelser: 3,0 m²,Energimærke: E,1937,3058,Oprettet 14. jul. 2021,2021-07-16 15:35:41
2,/bolig/1805552/frederiksgaards_alle_8b_1_th_27...,"Frederiksgårds Allé 8B, 1. th.",2720 Vanløse,Ejerlejlighed,2795000,35379,79,Værelser: 3,0 m²,Energimærke: D,1934,4254,Oprettet 14. jul. 2021,2021-07-16 15:35:41
3,/bolig/1805127/aadalsvej_23b_st_th_2720_vanloese,"Ådalsvej 23B, st. th",2720 Vanløse,Ejerlejlighed,2250000,40178,56,Værelser: 2,0 m²,Energimærke: D,1935,2676,Oprettet 13. jul. 2021,2021-07-16 15:35:41
4,/bolig/1805293/jyllingevej_62__2_tv_2720_vanloese,Jyllingevej 62 2. tv.,2720 Vanløse,Ejerlejlighed,3195000,28526,112,Værelser: 3,0 m²,Energimærke: D,1934,4359,Oprettet 13. jul. 2021,2021-07-16 15:35:41
