In [1]:
import json
import random
import time
from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

In [7]:
def getDataFromPage(url, timeout=30):
    """Download a page and return the text that follows ``window.__INITIAL_STATE__=``.

    Every ``<script>`` tag of the page is inspected and the first one that
    contains the marker is used, so the function does not depend on the
    script sitting at a fixed position in the document.

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).
        Without a timeout ``requests`` may block indefinitely.

    Returns
    -------
    str or None
        The script text found after the marker (up to the next occurrence of
        the marker, if any), or ``None`` when the response status is not 200
        or no script contains the marker.

    Raises
    ------
    requests.exceptions.RequestException
        If the request itself fails (connection error, timeout, ...).
    """
    marker = "window.__INITIAL_STATE__="

    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        return None

    soup = BeautifulSoup(response.text, "html.parser")
    for script in soup.find_all("script"):
        parts = script.text.split(marker)
        # More than one part means the marker occurred in this script.
        if len(parts) > 1:
            return parts[1]

    # Page loaded fine but carries no embedded state.
    return None

In [8]:
def getPageCount(data):
    """Return the ``pageCount`` value stored in the search result of ``data``.

    Parameters
    ----------
    data : dict
        Nested mapping containing
        ``resultList -> search -> fullSearch -> result -> pageCount``.

    Returns
    -------
    The value stored under ``pageCount``.

    Raises
    ------
    KeyError
        If any key along the path is missing.
    """
    node = data
    # Walk the fixed key path one level at a time.
    for key in ("resultList", "search", "fullSearch", "result", "pageCount"):
        node = node[key]
    return node

In [24]:
def _buildPageUrl(url, page):
    """Return ``url`` with its ``pn`` query parameter set to ``page``.

    ``pn`` is placed first and all other query parameters keep their order;
    an existing ``pn`` parameter is replaced.
    """
    parts = urlsplit(url)
    other_params = [
        (key, value)
        for key, value in parse_qsl(parts.query, keep_blank_values=True)
        if key != "pn"
    ]
    query = urlencode([("pn", page)] + other_params)
    return urlunsplit(parts._replace(query=query))


def createDataFrame(url):
    """Scrape every result page of a search and collect the listings per page.

    The first request to ``url`` is only used to read the number of result
    pages. Each page is then fetched from ``url`` with the ``pn`` query
    parameter set to the page number, so the function follows whatever
    search ``url`` points to.

    Parameters
    ----------
    url : str
        Search URL without page number.

    Returns
    -------
    list
        One entry per successfully scraped page; each entry is that page's
        ``listings`` value. Empty when the first request yields no data.
        Despite the function name, this is a plain list, not a DataFrame.

    Notes
    -----
    A page that fails to download or parse is reported and skipped, so one
    bad page does not abort the run. ``KeyboardInterrupt`` is not caught and
    stops the scrape. A random 1-4 second pause follows every page request,
    including failed ones.
    """
    list_rent: list = []
    data = getDataFromPage(url)

    if not data:
        return list_rent

    num_pages = getPageCount(json.loads(data))

    for page in tqdm(range(1, num_pages + 1)):
        url_page = _buildPageUrl(url, page)

        try:
            scraping_data = getDataFromPage(url_page)
            if not scraping_data:
                raise ValueError("no embedded page state found")

            json_scraping_data = json.loads(scraping_data)
            scraping_result = json_scraping_data["resultList"]["search"]["fullSearch"]["result"]
            list_rent.append(scraping_result["listings"])
            print(f"Count: {page}")
        except Exception as error:
            # Best effort: report the page and keep going instead of hiding the failure.
            print(f"Skipping page {page}: {error!r}")

        # Pause between requests even after a failure, to avoid hammering the server.
        time.sleep(random.randint(1, 4))

    return list_rent

In [73]:
def _getNested(mapping, *keys):
    """Follow ``keys`` through nested dicts; return None if the path breaks or the value is empty.

    ``None``, empty strings and empty containers become ``None``. Numeric
    ``0`` and ``False`` are kept because they are real values.
    """
    node = mapping
    for key in keys:
        if not isinstance(node, dict):
            return None
        node = node.get(key)

    if node is None:
        return None
    if isinstance(node, (str, list, tuple, dict)) and not node:
        return None
    return node


def createDataset(data):
    """Flatten scraped pages into one flat dict per listing.

    Parameters
    ----------
    data : list
        Iterable of pages, each page being an iterable of entries shaped like
        ``{"id": ..., "listing": {"categories": ..., "characteristics": {...},
        "prices": {"rent": {...}}, "address": {..., "geoCoordinates": {...}}}}``.
        ``None`` is treated as no pages.

    Returns
    -------
    list of dict
        One dict per listing with the keys ``id``, ``categories``,
        ``living_space (m2)``, ``num_rooms``, ``floor``, ``num_floors``,
        ``price (CHF)``, ``year_built``, ``is_new_building``,
        ``is_old_building``, ``year_last_renovated``, ``latitude``,
        ``longitude``, ``city``, ``street`` and ``postal_code``.

    Notes
    -----
    A field is ``None`` when it, or any mapping on the way to it, is missing
    or empty; a missing sub-mapping never aborts the conversion. Values such
    as ``0`` or ``False`` are preserved. Entries that are not dicts are
    skipped.
    """
    dataset: list = []

    for page in data or []:
        for entry in page or []:
            if not isinstance(entry, dict):
                # Nothing can be extracted from a malformed entry.
                continue

            listing = entry.get("listing")
            characteristics = _getNested(listing, "characteristics")
            address = _getNested(listing, "address")
            coordinates = _getNested(address, "geoCoordinates")

            dataset.append(
                {
                    "id": _getNested(entry, "id"),
                    "categories": _getNested(listing, "categories"),
                    "living_space (m2)": _getNested(characteristics, "livingSpace"),
                    "num_rooms": _getNested(characteristics, "numberOfRooms"),
                    "floor": _getNested(characteristics, "floor"),
                    "num_floors": _getNested(characteristics, "numberOfFloors"),
                    "price (CHF)": _getNested(listing, "prices", "rent", "gross"),
                    "year_built": _getNested(characteristics, "yearBuilt"),
                    "is_new_building": _getNested(characteristics, "isNewBuilding"),
                    "is_old_building": _getNested(characteristics, "isOldBuilding"),
                    "year_last_renovated": _getNested(characteristics, "yearLastRenovated"),
                    "latitude": _getNested(coordinates, "latitude"),
                    "longitude": _getNested(coordinates, "longitude"),
                    "city": _getNested(address, "locality"),
                    "street": _getNested(address, "street"),
                    "postal_code": _getNested(address, "postalCode"),
                }
            )

    return dataset

In [71]:
# Base search URL: rental listings for the city of Basel. The `r=50000` query
# parameter is sent as-is (presumably a search radius — TODO confirm).
url = "https://www.immoscout24.ch/en/real-estate/rent/city-basel?r=50000"
# Scrape the result pages. Despite its name, createDataFrame returns a plain
# list holding one `listings` payload per successfully scraped page.
data = createDataFrame(url)

In [74]:
# Flatten the scraped pages into a list of flat dicts, one per listing.
dataset = createDataset(data)

In [78]:
# Build a DataFrame from the listing dicts and save it as CSV without the index column.
df = pd.DataFrame(dataset)
df.to_csv("rents_basel.csv", index=False)