In [None]:
# Import necessary libraries to run the code
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import math
import re
from tqdm import tqdm
import random

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
#from selenium.webdriver import Chrome
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

In [None]:
def get_driver():
    """Create and return a headless Chrome WebDriver configured for scraping.

    The browser is started through webdriver_manager's ChromeDriverManager,
    with a fixed window size, a desktop Chrome user-agent string and the
    automation switches/extension turned off. After start-up a script is run
    that redefines ``navigator.webdriver`` so that it evaluates to ``undefined``.

    Returns:
        The started ``webdriver.Chrome`` instance. The caller is responsible
        for calling ``quit()`` on it.
    """
    # Command-line switches, added in this exact order.
    chrome_flags = (
        "--headless=new",
        "--disable-gpu",
        "--window-size=1920,1080",
        "--disable-notifications",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--disable-blink-features=AutomationControlled",
        "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36",
    )
    # Experimental options that remove the usual automation markers.
    experimental_settings = {
        "excludeSwitches": ["enable-automation"],
        "useAutomationExtension": False,
    }

    chrome_options = Options()
    for flag in chrome_flags:
        chrome_options.add_argument(flag)
    for setting_name, setting_value in experimental_settings.items():
        chrome_options.add_experimental_option(setting_name, setting_value)

    chrome_service = Service(ChromeDriverManager().install())
    browser = webdriver.Chrome(service=chrome_service, options=chrome_options)
    browser.execute_script(
        "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"
    )
    return browser

In [None]:
def baseURL(csv_path="downloaded_files/code-postaux-belge.csv"):
    """Build one paginated search-URL prefix per postal code / locality pair.

    The CSV is expected to be ``;``-separated and to contain at least the
    columns ``Code`` and ``Localite``. Each row becomes a ``<Code>-<Localite>``
    town slug in which spaces are replaced by ``-`` and apostrophes removed.

    Args:
        csv_path: Location of the postal-code CSV. Defaults to the path the
            notebook originally hard-coded.

    Returns:
        list[str]: URL prefixes ending in ``&page=``; the caller appends the
        page number. Rows with a missing ``Code`` or ``Localite`` are skipped.
        An empty CSV yields an empty list.

    Raises:
        KeyError: if the ``Code`` or ``Localite`` column is absent.
    """
    rootURL = 'https://immovlan.be/fr/immobilier?transactiontypes=a-vendre&propertytypes=maison,appartement&towns='

    # Read the code as text so it is never rendered as a float ("1000.0")
    # when the column contains blanks.
    codes = pd.read_csv(csv_path, sep=";", dtype={"Code": str})

    # dropna() returns a new frame, so nothing is assigned into a slice of the
    # original (which would trigger SettingWithCopyWarning).
    codes = codes[['Code', 'Localite']].dropna()

    localite = (
        codes['Localite']
        .astype(str)
        .str.replace(' ', '-', regex=False)
        .str.replace("'", "", regex=False)
    )

    # Vectorised concatenation instead of a row-wise apply: apply(axis=1) on an
    # empty frame returns a DataFrame, and iterating that yields column names.
    urls = rootURL + codes['Code'] + '-' + localite + '&page='
    return urls.tolist()
    

In [None]:
def scrape_listing_data(limit=10, results_per_page=20):
    """Fetch page 1 of each search URL and record its result/page counts.

    For every URL prefix returned by ``baseURL()`` the first result page is
    loaded, the cookie-consent button is clicked if it shows up, and the
    leading integer of the ``div.col-12.mb-2`` text is taken as the number of
    results. The summary is also written to ``Scrape_TotalPages.csv``.

    Args:
        limit: Maximum number of URL prefixes to visit (the original code
            hard-coded 10 for testing). ``None`` visits all of them.
        results_per_page: Page size used to turn the result count into a page
            count (the original code hard-coded 20).

    Returns:
        pandas.DataFrame with columns ``url``, ``total_results`` and
        ``total_pages``. The two counts are ``None``/NaN for pages where no
        count could be parsed.
    """
    base = baseURL()
    if limit is not None:
        base = base[:limit]

    data = []

    # One browser for the whole crawl. Previously a new Chrome was started per
    # URL and only the last one was ever closed.
    driver = get_driver() if base else None
    consent_given = False

    try:
        for b in tqdm(base, desc="Scraping URLs"):
            driver.get(b + '1')

            # The consent choice persists within a browser session, so stop
            # waiting for the banner once it has been accepted.
            if not consent_given:
                try:
                    agree_button = WebDriverWait(driver, 5).until(
                        EC.element_to_be_clickable((By.XPATH, '//*[@id="didomi-notice-agree-button"]/span'))
                    )
                    agree_button.click()
                    consent_given = True
                except Exception:
                    # Best effort: no banner (or not clickable) on this page.
                    pass

            soup = BeautifulSoup(driver.page_source, 'html.parser')

            total_results = None
            total_pages = None

            result_div = soup.find('div', class_='col-12 mb-2')
            if result_div:
                match = re.search(r'^\d+', result_div.text.strip())
                if match:
                    total_results = int(match.group())
                    total_pages = math.ceil(total_results / results_per_page)

            data.append({
                'url': b,
                'total_results': total_results,
                'total_pages': total_pages
            })

            # Random delay to mimic human behavior
            time.sleep(random.uniform(1, 3))
    finally:
        # Always release the browser, even if a page load raised.
        if driver is not None:
            driver.quit()

    # Explicit columns keep the frame's shape stable when nothing was scraped.
    df = pd.DataFrame(data, columns=['url', 'total_results', 'total_pages'])

    # Optional save to CSV
    df.to_csv("Scrape_TotalPages.csv", index=False)

    return df

In [None]:
def ads(max_locations=5):
    """Collect the unique advert detail links from the paginated result pages.

    Calls ``scrape_listing_data()`` to learn how many result pages each search
    URL has, then visits every page of the first ``max_locations`` searches and
    keeps each ``<a href>`` that contains ``/detail/``.

    Args:
        max_locations: Number of rows of the page summary to crawl (the
            original code hard-coded 5). ``None`` crawls every row.

    Returns:
        list[str]: Distinct ``href`` values in first-seen order.
    """
    pages = scrape_listing_data()
    links = []
    seen = set()  # constant-time duplicate check; `links` keeps the order

    # Clamp so that a summary with fewer rows than requested does not raise.
    n_locations = len(pages) if max_locations is None else min(max_locations, len(pages))

    driver = None
    consent_given = False

    try:
        for i in tqdm(range(n_locations), desc="Scraping page links"):
            ad = pages.iloc[i]

            # A search whose page count could not be parsed is stored as
            # None/NaN, which also turns the other counts into floats.
            total_pages = ad['total_pages']
            if pd.isna(total_pages):
                continue

            for p in range(1, int(total_pages) + 1):
                # Start the single shared browser only when there is work.
                if driver is None:
                    driver = get_driver()

                driver.get(ad['url'] + str(p))

                # Accept the cookie banner once; the choice persists for the
                # rest of the browser session.
                if not consent_given:
                    try:
                        agree_button = WebDriverWait(driver, 5).until(
                            EC.element_to_be_clickable((By.XPATH, '//*[@id="didomi-notice-agree-button"]/span'))
                        )
                        agree_button.click()
                        consent_given = True
                    except Exception:
                        # Best effort: no banner (or not clickable) here.
                        pass

                soup = BeautifulSoup(driver.page_source, "lxml")

                for a in soup.find_all('a', href=True):
                    href = a['href']
                    if '/detail/' in href and href not in seen:
                        seen.add(href)
                        links.append(href)

                time.sleep(random.uniform(1, 3))  # polite delay
    finally:
        # Always release the browser, even if a page load raised.
        if driver is not None:
            driver.quit()

    return links

In [None]:
# Run the link crawl, then report how many advert links were found and list
# them one per line.
links = ads()
print(f"Total links scraped: {len(links)}")
print("\n".join(links))

In [None]:
# Put the collected advert links in a one-column frame and persist them.
df = pd.DataFrame({'ad_URL': links})
df.to_csv("Ad_listing.csv", index=False)
print(len(df))
# Keep the preview as the last expression: a notebook only renders the final
# expression of a cell, so a mid-cell `df.head()` is silently discarded.
df.head()

In [None]:
def _parse_advert(soup):
    """Extract the fields of one advert detail page into a column -> value dict.

    The fixed columns (``Ref``, ``Titre``, ``Prix``, ``Addresse``, ``Localite``,
    ``Description``) are always present and default to ``""`` when the matching
    element is missing. Every ``h4``/``p`` pair found in the
    ``general-info w-100`` section adds one more column, named after the
    sanitised ``h4`` label.
    """
    # First locate the isolated pieces in the page
    title_tag = soup.find('span', class_="detail__header_title_main")
    ref_tag = soup.find('span', class_="vlancode")
    price_tag = soup.find('span', class_="detail__header_price_data")
    address_block = soup.find('div', class_='d-lg-block d-none')
    description_tag = soup.find('div', class_="dynamic-description active")

    # then clean the data
    title = ""
    if title_tag is not None and title_tag.contents:
        first_child = title_tag.contents[0]
        # Only a text node can be stripped; a nested tag has no usable text here.
        if isinstance(first_child, str):
            title = first_child.strip()

    # Keep digits only; a missing price becomes "" rather than None.
    price = re.sub(r"[^\d]", "", price_tag.text.strip()) if price_tag is not None else ""

    address = ""
    locality = ""
    if address_block is not None:
        spans = address_block.find_all('span')
        if len(spans) > 0:
            address = spans[0].text.strip()
        if len(spans) > 1:
            locality = spans[1].text.strip()

    fields = {
        "Ref": ref_tag.text.strip() if ref_tag is not None else "",
        "Titre": title,
        "Prix": price,
        "Addresse": address,
        "Localite": locality,
        "Description": description_tag.text.strip() if description_tag is not None else "",
    }

    # All remaining details live in the class "general-info w-100": walk its
    # data rows and use each h4 label (sanitised) as the column name for the
    # value held in the sibling p element.
    info_section = soup.find("div", class_="general-info w-100")
    if info_section is not None:
        for data_row in info_section.find_all("div", class_="data-row-wrapper"):
            for item in data_row.find_all("div"):
                h4 = item.find("h4")
                p = item.find("p")
                if h4 and p:
                    label = h4.get_text(strip=True)
                    label_clean = re.sub(r"[^\w\s]", "", label).strip().replace(" ", "_")
                    fields[label_clean] = p.get_text(strip=True)

    return fields


# Slice of adverts to scrape in this run; the stop is clamped to the number of
# rows actually available so a short link list does not raise.
START_INDEX = 500
STOP_INDEX = 550

driver = get_driver()
try:
    for i in tqdm(range(START_INDEX, min(STOP_INDEX, len(df))), desc="Adverts scraped:"):
        url = None  # defined up front so the error message can always print it
        try:
            url = df.loc[i, "ad_URL"]
            driver.get(url)

            soup = BeautifulSoup(driver.page_source, "lxml")

            # Write every extracted field into the row; unseen labels create
            # new columns on the fly.
            for column, value in _parse_advert(soup).items():
                df.loc[i, column] = value

        except Exception as exc:
            # Keep going on a bad advert, but report what actually failed.
            # KeyboardInterrupt is not caught, so the run can still be aborted.
            print(f"Failed to scrape index {i} — {url}: {exc!r}")
finally:
    # Always release the browser, even when the loop is interrupted.
    driver.quit()

print(df.head(20))

In [None]:
# Spot-check a single record: print every column of the row labelled 15.
print(df.loc[15, :])