In [1]:
import time
import json
import random
import requests
import pandas as pd

from tqdm import tqdm
from bs4 import BeautifulSoup

In [2]:
def getDataFromPage(url):
    request = requests.get(url)
    data =  None

    if request.status_code == 200:
        soup = BeautifulSoup(request.text, "html.parser")

        data = soup.find_all('script')
        split_data = data[7].text.split("window.__INITIAL_STATE__=")
        data = split_data[1]
    return data

In [3]:
def getPageCount(data):
    result = data["resultList"]["search"]["fullSearch"]["result"]
    page_count = result["pageCount"]

    return page_count

In [4]:
def createDataFrame(url):
    list_rent:list = []
    data = getDataFromPage(url)

    if data:
        json_data = json.loads(data)
        num_pages = getPageCount(json_data)

        for page in tqdm(range(1, num_pages + 1)):
            url_page = f"https://www.immoscout24.ch/de/immobilien/mieten/ort-basel?pn={page}"

            try:
              scraping_data = getDataFromPage(url_page)
              json_scraping_data = json.loads(scraping_data)
              scraping_result = json_scraping_data["resultList"]["search"]["fullSearch"]["result"]
              listings = scraping_result["listings"]
              list_rent.append(listings)
              print(f"Count: {page}")
              time.sleep(random.randint(1, 4))
            except:
              continue

    return list_rent

In [5]:
def createDataset(data):
  dataset:list = []

  try:

    for page in enumerate(data):
      for listing in enumerate(page[1]):

        street = listing[1]["listing"]["address"].get("street") or None,
        located = listing[1]["listing"]["address"].get("postalCode") or None,
        characteristics = listing[1]["listing"].get('characteristics') or None

        if characteristics is not None:
          dataset.append(
              {
                  "id": listing[1].get("id") or None,
                  "categories": listing[1]["listing"].get("categories") or None,
                  "price (CHF)": listing[1]["listing"]["prices"]["rent"].get("gross") or None,
                  "latitude": listing[1]["listing"]["address"]["geoCoordinates"].get("latitude") or None,
                  "longitude": listing[1]["listing"]["address"]["geoCoordinates"].get("longitude") or None,
                  "city": listing[1]["listing"]["address"].get("locality") or None,
                  "living_space (m2)": characteristics.get("livingSpace") or None,
                  "num_rooms": characteristics.get("numberOfRooms") or None,
                  "floor": characteristics.get("floor") or None,
                  "num_floors": characteristics.get("numberOfFloors") or None,
                  "year_built": characteristics.get("yearBuilt") or None,
                  "is_new_building": characteristics.get("isNewBuilding") or None,
                  "is_old_building": characteristics.get("isOldBuilding") or None,
                  "year_last_renovated": characteristics.get("yearLastRenovated") or None,
                  "street": street[0] if street is not None else None,
                  "postal_code": located[0] if located is not None else None,
              }
          )
  except NameError as e:
    print(e)
  return dataset

In [6]:
# URL BASE
url = "https://www.immoscout24.ch/de/immobilien/mieten/ort-basel?nrs=100"
# DATA
data = createDataFrame(url)

  0%|          | 0/48 [00:00<?, ?it/s]

Count: 1


  2%|▏         | 1/48 [00:02<02:12,  2.82s/it]

Count: 2


  4%|▍         | 2/48 [00:06<02:34,  3.37s/it]

Count: 3


  6%|▋         | 3/48 [00:11<02:59,  3.99s/it]

Count: 4


  8%|▊         | 4/48 [00:13<02:19,  3.16s/it]

Count: 5


 10%|█         | 5/48 [00:17<02:27,  3.42s/it]

Count: 6


 12%|█▎        | 6/48 [00:19<02:14,  3.20s/it]

Count: 7


 15%|█▍        | 7/48 [00:23<02:19,  3.41s/it]

Count: 8


 17%|█▋        | 8/48 [00:28<02:34,  3.85s/it]

Count: 9


 19%|█▉        | 9/48 [00:32<02:29,  3.84s/it]

Count: 10


 21%|██        | 10/48 [00:36<02:26,  3.85s/it]

Count: 11


 23%|██▎       | 11/48 [00:40<02:32,  4.12s/it]

Count: 12


 25%|██▌       | 12/48 [00:44<02:25,  4.05s/it]

Count: 13


 27%|██▋       | 13/48 [00:47<02:08,  3.66s/it]

Count: 14


 29%|██▉       | 14/48 [00:51<02:05,  3.69s/it]

Count: 15


 31%|███▏      | 15/48 [00:56<02:12,  4.03s/it]

Count: 16


 33%|███▎      | 16/48 [00:59<02:05,  3.93s/it]

Count: 17


 35%|███▌      | 17/48 [01:02<01:52,  3.62s/it]

Count: 18


 38%|███▊      | 18/48 [01:04<01:34,  3.16s/it]

Count: 19


 40%|███▉      | 19/48 [01:08<01:36,  3.33s/it]

Count: 20


 42%|████▏     | 20/48 [01:12<01:37,  3.47s/it]

Count: 21


 44%|████▍     | 21/48 [01:14<01:20,  2.97s/it]

Count: 22


100%|██████████| 48/48 [01:17<00:00,  1.61s/it]


In [7]:
dataset = createDataset(data)

In [8]:
df = pd.DataFrame(dataset)
df.to_csv("rents_basel.csv", index=False)