In [1]:
# Import necessary libraries to run the code
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import math
import re
from tqdm import tqdm
import random

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
#from selenium.webdriver import Chrome
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

In [2]:
def get_driver():
    options = Options()
    options.add_argument("--headless=new")
    options.add_argument("--disable-gpu")
    options.add_argument("--window-size=1920,1080")
    options.add_argument("--disable-notifications")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option("useAutomationExtension", False)
    options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36")

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
    return driver

In [3]:
def baseURL():
    codes = pd.read_csv("downloaded_files/code-postaux-belge.csv",sep=";")
    codes = codes[['Code', 'Localite']]
    codes['Localite'] = codes['Localite'].str.replace(' ', '-', regex=False).str.replace("'", "", regex=False)
    
    codes = codes.apply(lambda row: str(row.Code) + '-' + row.Localite, axis=1)
    
    rootURL = 'https://immovlan.be/fr/immobilier?transactiontypes=a-vendre&propertytypes=maison,appartement&towns='

    baseURLs = []
    for code in codes:
        url = rootURL + code + '&page='
        baseURLs.append(url)
    
    return baseURLs
    

In [4]:
def scrape_listing_data():
    base = baseURL()[:10]  # limit for testing purposes
    data = []

    for b in tqdm(base, desc="Scraping URLs"):
        url = b + '1'
        driver = get_driver()
        driver.get(url)
 
        try:
            agree_button = WebDriverWait(driver, 5).until(
                EC.element_to_be_clickable((By.XPATH, '//*[@id="didomi-notice-agree-button"]/span'))
            )
            agree_button.click()
        except Exception:
            pass

        html = driver.page_source
        soup = BeautifulSoup(html, 'html.parser')

        total_results = None
        total_pages = None

        result_div = soup.find('div', class_='col-12 mb-2')
        if result_div:
            text = result_div.text.strip()
            match = re.search(r'^\d+', text)
            if match:
                total_results = int(match.group())
                total_pages = math.ceil(total_results / 20)

        data.append({
            'url': b,
            'total_results': total_results,
            'total_pages': total_pages
        })

        # Random delay to mimic human behavior
        time.sleep(random.uniform(1, 3))

    driver.quit()
    
    df = pd.DataFrame(data)

    # Optional save to CSV
    df.to_csv("Scrape_TotalPages.csv", index=False)
    
    return df

In [5]:
def ads():

    pages = scrape_listing_data()
    links = []

    for i in tqdm(range(5), desc="Scraping page links"):
#   for i in range(5):        #range(len(ad))
        ad = pages.iloc[i]
        

        for p in range(1, ad['total_pages']+1):
            url = ad['url'] + str(p) 

            driver = get_driver()
            driver.get(url)

            # Wait for the button to be clickable and click it
            try:
                agree_button = WebDriverWait(driver, 5).until(
                    EC.element_to_be_clickable((By.XPATH, '//*[@id="didomi-notice-agree-button"]/span'))
                )
                url = agree_button.click()

            except Exception as e:
                pass

            html = driver.page_source
            soup = BeautifulSoup(html, "lxml")

            for a in soup.find_all('a', href=True):
                if '/detail/' in a['href']:       
                    if a['href'] not in links:
                        links.append(a['href'])

            time.sleep(random.uniform(1, 3))  # polite delay

    driver.quit()
    return links
      #  print(f"Total links scraped: {len(links)}")
      #  print(*links, sep="\n")

In [6]:
links = ads()
print(f"Total links scraped: {len(links)}")
print(*links, sep="\n")

Scraping URLs: 100%|██████████| 10/10 [01:33<00:00,  9.34s/it]
Scraping page links: 100%|██████████| 5/5 [13:06<00:00, 157.34s/it]


Total links scraped: 1359
https://immovlan.be/fr/detail/duplex/a-vendre/1000/bruxelles/vbc89694
https://immovlan.be/fr/detail/appartement/a-vendre/1000/bruxelles/vbc89261
https://immovlan.be/fr/detail/maison/a-vendre/1000/bruxelles/vbc87245
https://immovlan.be/fr/detail/maison/a-vendre/1000/bruxelles/vbc91207
https://immovlan.be/fr/detail/maison/a-vendre/1000/bruxelles/vbc91080
https://immovlan.be/fr/detail/immeuble-mixte/a-vendre/1000/bruxelles/vbc91049
https://immovlan.be/fr/detail/appartement/a-vendre/1000/bruxelles/vbc90875
https://immovlan.be/fr/detail/maison/a-vendre/1000/bruxelles/vbc90874
https://immovlan.be/fr/detail/appartement/a-vendre/1000/bruxelles/vbc90725
https://immovlan.be/fr/detail/appartement/a-vendre/1000/bruxelles/rbt71788
https://immovlan.be/fr/detail/appartement/a-vendre/1000/bruxelles/rbt71787
https://immovlan.be/fr/detail/appartement/a-vendre/1000/bruxelles/rbt71786
https://immovlan.be/fr/detail/appartement/a-vendre/1000/bruxelles/vbc90698
https://immovlan.be/f

Locality
Type of property (House/apartment)
Subtype of property (Bungalow, Chalet, Mansion, ...)
Price
Type of sale (Exclusion of life sales)
Number of rooms
Living Area
Fully equipped kitchen (Yes/No)
Furnished (Yes/No)
Open fire (Yes/No)
Terrace (Yes/No)
If yes: Area
Garden (Yes/No)
If yes: Area
Surface of the land
Surface area of the plot of land
Number of facades
Swimming pool (Yes/No)
State of the building (New, to be renovated, ...)

In [9]:
df = pd.DataFrame({'URL':links})
df.to_csv("Ad_listing.csv", index=False)

url = df.loc[0, "URL"]

driver = get_driver()
driver.get(url)

html = driver.page_source
soup = BeautifulSoup(html, "lxml")

# <span class="detail__header_title_main"> Duplex à vendre <span class="d-none d-lg-inline">- Bruxelles</span> <span class="vlancode">VBC89694</span> </span>
title_span = soup.find('span', class_="detail__header_title_main")
if title_span:
    # Get all text parts, including nested spans
    full_text = title_span.get_text(separator=" ", strip=True)

    # Extract title (first part) and ref (vlancode)
    raw_text = title_span.contents[0].strip()  # "Duplex à vendre"
    ref_span = title_span.find('span', class_="vlancode")
    ref = ref_span.text.strip() if ref_span else ""

    # Add to dataframe
    df.loc[0, "Ref"] = ref
    df.loc[0, "Title"] = raw_text
 
else:
    print("Title span not found.")

# <span class="detail__header_price_data">  299 000 €  </span>
price_span = soup.find('span', class_="detail__header_price_data")
if price_span:
    # Get all text parts, including nested spans
    price = price_span.text.strip()
    df.loc[0, "Price"] = price
else:
    print("Price span not found.")







# time.sleep(random.uniform(1, 3))  # polite delay

driver.quit()

# print()

In [10]:
df.head()

Unnamed: 0,URL,Ref,Title,Price
0,https://immovlan.be/fr/detail/duplex/a-vendre/...,VBC89694,Duplex à vendre,299 000 €
1,https://immovlan.be/fr/detail/appartement/a-ve...,,,
2,https://immovlan.be/fr/detail/maison/a-vendre/...,,,
3,https://immovlan.be/fr/detail/maison/a-vendre/...,,,
4,https://immovlan.be/fr/detail/maison/a-vendre/...,,,
