In [1]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from bs4 import BeautifulSoup
import os
from datetime import datetime

def file_config(start_url, i=2):
    """Start a headless Chrome session and open ``start_url``.

    Args:
        start_url: URL to load once the browser is running.
        i: Maximum number of load attempts. A new attempt is made only after
            a ``TimeoutException``.

    Returns:
        The Chrome WebDriver. It is returned even when every attempt timed
        out (a warning is printed in that case), so the caller always owns
        the driver and is responsible for calling ``quit()`` on it.

    Raises:
        Any non-timeout exception raised while loading the page. The browser
        is shut down before the exception propagates so no Chrome process is
        left behind.
    """
    # Driver configuration
    driver_path = r"/usr/local/bin/chromedriver"
    options = webdriver.ChromeOptions()
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    options.add_argument('--headless')
    try:
        # Selenium 4 passes the chromedriver binary through a Service object;
        # the `executable_path` keyword was removed in Selenium 4.10.
        from selenium.webdriver.chrome.service import Service
        driver = webdriver.Chrome(service=Service(driver_path), options=options)
    except TypeError:
        # Older Selenium releases do not accept `service=`.
        driver = webdriver.Chrome(executable_path=driver_path, options=options)
    driver.set_page_load_timeout(6000)

    loaded = False
    attempts = i
    try:
        while attempts > 0 and not loaded:
            try:
                driver.get(start_url)
                driver.maximize_window()
                loaded = True
            except TimeoutException:
                attempts -= 1
    except BaseException:
        # Do not leak the browser process when loading fails unexpectedly.
        driver.quit()
        raise

    if not loaded:
        print(f"Impossible de charger {start_url} après {i} tentative(s)")
    return driver

# Accumulator for the scraped listings: scrape_data() appends one dict per listing.
data = []

# Open the rental listings index page in a Chrome session built by file_config().
driver = file_config(r'https://batirici-immobilier.com/location/')

# Notebook-level explicit wait (60 s) bound to that session.
wait = WebDriverWait(driver, 60)


def form_check():
    """Close the footer contact form when it appears.

    Waits up to 5 seconds for the form's close button to become visible and
    clicks it. If the wait times out, the function returns without doing
    anything.
    """
    close_button_locator = (By.CSS_SELECTOR, '#footer-contact-form > div.contact_close_button')
    short_wait = WebDriverWait(driver, 5)
    try:
        short_wait.until(EC.visibility_of_element_located(close_button_locator)).click()
    except TimeoutException:
        return
def _text_or_none(node, selector):
    """Return the stripped text of the first ``selector`` match under ``node``, or None."""
    if node is None:
        return None
    match = node.select_one(selector)
    if match is None:
        return None
    return match.text.strip()


def _load_listing(link, attempts=3):
    """Open ``link`` in the shared driver; return True once it loaded without a timeout."""
    while attempts > 0:
        try:
            driver.get(link)
            driver.refresh()
            return True
        except TimeoutException:
            attempts -= 1
    return False


def _parse_listing(soup, link):
    """Build the record for one listing page, or return None if its main container is missing."""
    details = soup.select_one('#all_wrapper > div > div.container.content_wrapper > div')
    if details is None:
        # select_one() returns None instead of raising, so test it explicitly.
        return None

    # Header area: each field is read independently so that one missing
    # element no longer wipes out its neighbours.
    price = _text_or_none(details, "div.notice_area.col-md-12 > div.price_area")
    if price is not None:
        price = price.replace('\t', '').replace('\n', '')
    title = _text_or_none(details, "div.notice_area.col-md-12 > h1")
    if title is not None:
        title = title.replace('\t', '').replace('\n', '')

    localisation = _text_or_none(details, "div.notice_area.col-md-12 > div.property_categs")
    immo_type = _text_or_none(
        details,
        "div.col-md-9.rightmargin.full_width_prop > div > div.wpestate_property_media_section_wrapper > div.status-wrapper.verticalstatus > div",
    )

    # Overview section
    superficie = _text_or_none(details, '#single-overview-section > div > ul:nth-child(4) > li:nth-child(2)')
    if superficie is not None:
        superficie = f"{superficie} m2"
    nb_chambres = _text_or_none(soup, '#single-overview-section > div > ul:nth-child(2) > li:nth-child(2)')
    if nb_chambres is not None:
        # Keep only the leading token of the cell text.
        nb_chambres = nb_chambres.split(' ')[0]
    nb_salle_de_bain = _text_or_none(details, '#single-overview-section > div > ul:nth-child(3) > li:nth-child(2)')
    if nb_salle_de_bain is not None:
        nb_salle_de_bain = nb_salle_de_bain.split(' ')[0]

    # Description and advertiser sidebar
    description = _text_or_none(details, '#wpestate_property_description_section > p')
    if description is not None:
        description = description.replace('\n', ' ')
    annonceur = _text_or_none(
        soup,
        '#sidebar_contact > div > div.agent_unit_widget_sidebar_wrapper > div > div.agent_unit_widget_sidebar_details_wrapper',
    )
    if annonceur is not None:
        annonceur = annonceur.replace('\n', ' ')

    current_datetime = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    return {
        'title': title,
        'price': price,
        'localisation': localisation,
        'superficie': superficie,
        "type d'immobilier": immo_type,
        'nb_pieces': nb_chambres,
        "nb_salle_de_bain": nb_salle_de_bain,
        'scraping_date': current_datetime,
        "annonceur": annonceur,
        "link": link,
        'description': description
    }


def scrape_data(cards):
    """Visit every listing linked from ``cards`` and append one record per listing to ``data``.

    Args:
        cards: WebElements of the listing cards on the index page. The first
            anchor inside each card supplies the listing URL.

    Returns:
        The module-level ``data`` list, extended with one dict per listing
        that could be loaded and parsed.

    Notes:
        * All links are read before navigating, because the card elements
          come from the index page and the driver leaves that page.
        * A listing is skipped (with a printed message) when it cannot be
          loaded after three timeouts or when parsing raises; it is skipped
          silently when its main container is missing.
        * Uses the module-level ``driver`` and ``form_check``.
    """
    links = []
    for card in cards:
        try:
            link = card.find_element(By.CSS_SELECTOR, 'a').get_attribute('href')
        except NoSuchElementException:
            # Card without an anchor: nothing to visit.
            continue
        if link:
            links.append(link)

    for link in links:
        if not _load_listing(link):
            # Parsing now would read whatever page the driver still holds
            # and record it under the wrong link.
            print(f"Page ignorée (délai dépassé) : {link}")
            continue
        form_check()

        try:
            # BeautifulSoup parses the rendered page to extract the listing details
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            record = _parse_listing(soup, link)
        except Exception as exc:
            print(f"Annonce ignorée ({link}) : {exc!r}")
            continue
        if record is None:
            continue

        data.append(record)
        print(data[-1])
        print(len(data))

    return data

# Page counter: printed as "page_<i>" and then incremented by the collection cell.
i = 1

In [None]:
# Collect every listing card on the current index page, scrape each listing,
# and always close the browser afterwards, even when the scrape fails.
try:
    cards = driver.find_elements(By.CSS_SELECTOR, 'div.property-unit-information-wrapper')

    links = []

    scrape_data(cards)

    print(len(data))

    print(f"page_{i}")
    i += 1
except Exception as exc:
    print("Une erreur s'est produit lors de la collecte sur Batirici")
    # Show what actually went wrong instead of hiding it.
    print(repr(exc))
finally:
    # Release the Chrome process whether or not the collection succeeded.
    driver.quit()

In [None]:
import locale
import platform

# Turn the scraped records into a DataFrame and save it as a dated CSV.
data = pd.DataFrame(data)
data

if data.empty:
    print("Aucune annonce collectée : le fichier CSV sera vide")

# Use French month names for the output folders when that locale is installed.
# Otherwise keep the default locale rather than losing the scraped data to a
# locale.Error before it is written.
try:
    locale.setlocale(locale.LC_TIME, 'fr_FR.UTF-8')
except locale.Error:
    print("Locale 'fr_FR.UTF-8' indisponible : noms de mois dans la locale par défaut")

# Current date and time
current_datetime = datetime.now()
formatted_date = current_datetime.strftime("%Y-%m-%d %H:%M:%S")
formatted_month = current_datetime.strftime("%B")
formatted_day = current_datetime.strftime("%d")

# Same dataset tree, addressed natively on Windows and through /mnt/d elsewhere.
if platform.system() == 'Windows':
    datasets_root = 'D:\\Bureau\\MemoiresStages\\Travaux_techniques\\Scrapping\\Datasets'
else:
    datasets_root = '/mnt/d/Bureau/MemoiresStages/Travaux_techniques/Scrapping/Datasets'

# Layout: <root>/<month>/<day>_<month>/Batirici_<day>_<month>.csv
day_folder = f'{formatted_day}_{formatted_month}'
dynamic_path = os.path.join(datasets_root, formatted_month, day_folder, f'Batirici_{day_folder}.csv')

os.makedirs(os.path.dirname(dynamic_path), exist_ok=True)

data.to_csv(dynamic_path, index=False)