In [1]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from bs4 import BeautifulSoup
import os
from datetime import datetime

def file_config(start_url, i=2):
    """Start a headless Chrome session and try to load `start_url`.

    Parameters
    ----------
    start_url : str
        URL to open once the browser is running.
    i : int, default 2
        Maximum number of load attempts. An attempt is retried only when it
        ends with a page-load `TimeoutException`.

    Returns
    -------
    selenium.webdriver.Chrome
        The live driver. It is returned even when every attempt timed out
        (a message is printed in that case), which keeps the original
        best-effort contract for callers.

    Raises
    ------
    Exception
        Any non-timeout error raised while configuring or loading is
        re-raised after the browser has been shut down, so no orphan Chrome
        process is left behind.
    """
    # Driver configuration
    driver_path = r"/usr/local/bin/chromedriver"
    options = webdriver.ChromeOptions()
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    options.add_argument('--headless')
    # NOTE(review): recent Selenium 4 releases replaced `executable_path` with
    # `Service(...)` — confirm against the Selenium version pinned for this notebook.
    driver = webdriver.Chrome(executable_path=driver_path, options=options)

    try:
        # NOTE(review): the timeout is expressed in seconds, so 6000 is 100 minutes;
        # confirm this is intentional.
        driver.set_page_load_timeout(6000)

        loaded = False
        attempts_left = i
        while attempts_left > 0 and not loaded:
            try:
                driver.get(start_url)
                driver.maximize_window()
                loaded = True
            except TimeoutException:
                attempts_left -= 1

        if not loaded:
            # Make the failure visible instead of silently handing back a blank browser.
            print(f"Impossible de charger {start_url} après {i} tentative(s) (timeout)")
    except Exception:
        # Do not leak the browser process when something unexpected goes wrong.
        driver.quit()
        raise

    return driver


# Open the first results page (page=0) of the AICI property search.
driver = file_config(
    start_url=r'https://aici.ci/fr/recherche-immobilier?typeoffre=3&typebien=All&field_nbre_pieces=All&page=0'
)

# Explicit-wait helper bound to this driver, with a 100-second timeout.
wait = WebDriverWait(driver, 100)

# Accumulator for the scraped listings: scrape_data() appends one dict per listing.
data = []

In [2]:
# CSS selectors, relative to one listing card on the results page.
_CARD_LINK_SELECTOR = 'div.col-sm-12.bien-ville_titre > div.bien-titre > a'
_CARD_CITY_SELECTOR = 'div.col-sm-12.bien-ville_titre > div.bien_ville'

# CSS selectors for a listing's detail page.
_DETAILS_SELECTOR = 'body > div.dialog-off-canvas-main-canvas > div.main-container.container.js-quickedit-main-content > div > section > div.region.region-content > article > div'
_PRICE_SELECTOR = "div.row.bs-2col-bricked > div.col-sm-4.bs-region.bs-region--top-right > section.block.block-layout-builder.block-field-blocknodebiens-immobiliersfield-prix-en-fcfa.clearfix > div"
_TITLE_SELECTOR = "div.row.bs-2col-bricked > div.col-sm-8.bs-region.bs-region--top-left > section.block.block-layout-builder.block-field-blocknodebiens-immobilierstitle.clearfix > span"
_TYPE_SELECTOR = "div.row.bs-2col-bricked > div.col-sm-4.bs-region.bs-region--top-right > section.block.block-layout-builder.block-field-blocknodebiens-immobiliersfield-type-de-bien.clearfix > div"
_SURFACE_SELECTOR = 'div.col-sm-4.bs-region.bs-region--top-right > section.block.block-layout-builder.block-field-blocknodebiens-immobiliersfield-surface.clearfix'
_ROOMS_SELECTOR = 'div.col-sm-4.bs-region.bs-region--top-right > section.block.block-layout-builder.block-field-blocknodebiens-immobiliersfield-nbre-pieces.clearfix'
_BATHROOMS_SELECTOR = 'div.col-sm-4.bs-region.bs-region--top-right > section.block.block-layout-builder.block-field-blocknodebiens-immobiliersfield-salles-bain.clearfix'
_DESCRIPTION_SELECTOR = 'div.col-sm-8.bs-region.bs-region--top-left'


def _text_or_none(node, selector):
    """Return the stripped text of the first match of `selector` under `node`, or None."""
    if node is None:
        return None
    match = node.select_one(selector)
    return match.text.strip() if match is not None else None


def _card_summary(card):
    """Return `(detail_url, city_or_None)` read from one listing card element."""
    link = card.find_element(By.CSS_SELECTOR, _CARD_LINK_SELECTOR).get_attribute('href')
    try:
        city = card.find_element(By.CSS_SELECTOR, _CARD_CITY_SELECTOR).get_attribute('textContent')
    except NoSuchElementException:
        city = None
    return link, (city.strip() if city is not None else None)


def scrape_data(cards):
    """Visit every listing behind `cards` and append one record per listing to `data`.

    Parameters
    ----------
    cards : iterable of selenium WebElement
        Listing cards taken from the results page currently shown by the
        module-level `driver`.

    Side effects
    ------------
    Navigates the module-level `driver` to each detail page, appends a dict to
    the module-level `data` list for each listing that has a details container,
    a price and a title, and prints the new record and the running total.

    Notes
    -----
    * The link and city of every card are read before any navigation, because
      the card elements stop being usable once the driver leaves the page.
    * The city is read from each card itself, so each record carries the city
      of its own listing.
    * Every optional field (type, surface, rooms, bathrooms, description) is
      extracted independently; a missing one becomes None without discarding
      the others.
    """
    listings = [_card_summary(card) for card in cards]

    for link, localisation in listings:
        # Up to three load attempts; retry only on page-load timeouts. After the
        # last failed attempt the page currently held by the browser is still parsed.
        for _ in range(3):
            try:
                driver.get(link)
                driver.refresh()
                break
            except TimeoutException:
                continue

        # Parse the detail page with BeautifulSoup
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # select_one returns None (it does not raise) when nothing matches.
        details = soup.select_one(_DETAILS_SELECTOR)
        if details is None:
            continue

        # A listing without a price or a title is skipped.
        price = _text_or_none(soup, _PRICE_SELECTOR)
        title = _text_or_none(soup, _TITLE_SELECTOR)
        if price is None or title is None:
            continue

        immo_type = _text_or_none(details, _TYPE_SELECTOR)
        superficie = _text_or_none(details, _SURFACE_SELECTOR)
        nb_pieces = _text_or_none(details, _ROOMS_SELECTOR)
        nb_salle_de_bain = _text_or_none(details, _BATHROOMS_SELECTOR)

        description = _text_or_none(details, _DESCRIPTION_SELECTOR)
        if description is not None:
            description = description.replace('\n', ' ')

        annonceur = "AICI"
        current_datetime = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

        data.append({
            'title': title,
            'price': price,
            'localisation': localisation,
            'superficie': superficie,
            "type d'immobilier": immo_type,
            'nb_pieces': nb_pieces,
            "nb_salle_de_bain": nb_salle_de_bain,
            'scraping_date': current_datetime,
            "annonceur": annonceur,
            "link": link,
            'description': description
        })

        print(data[-1])
        print(len(data))

# Page counter: the pagination loop prints it as "page_{i}" and increments it
# after each results page has been scraped.
i = 1

In [None]:
# Stop paginating once at least this many listings have been collected.
MAX_LISTINGS = 1000

while True:

    try:
        # Wait until the results container is visible
        cards_presence = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, '#views-bootstrap-recherche-immobilier-page-1')))
        # Build the list of listing cards
        cards = cards_presence.find_elements(By.CSS_SELECTOR, 'div > div > div.views-field.views-field-nothing > span > div > div')

        # `>=` rather than `==`: a page adds several rows at once, so the count
        # can jump past the limit without ever being equal to it.
        if len(data) >= MAX_LISTINGS:
            break

        # Resolve the "next page" URL BEFORE scraping: scrape_data() navigates
        # away, after which elements of this page can no longer be queried.
        # Only the pager lookup is guarded, so a missing element inside a card
        # is not mistaken for "no next page".
        try:
            page_item = driver.find_element(By.CSS_SELECTOR, 'body > div > div.main-container.container.js-quickedit-main-content > div > section > div.region.region-content > div > div > nav > ul > li.pager__item.pager__item--next')
            next_page_url = page_item.find_element(By.CSS_SELECTOR, 'a').get_attribute('href')
        except NoSuchElementException:
            next_page_url = None

        scrape_data(cards)
        if next_page_url is not None:
            driver.get(next_page_url)

        print(len(data))
        print(f"page_{i}")
        i += 1

        # No pager link means this was the last results page.
        if next_page_url is None:
            break
    except Exception as exc:
        # Report what went wrong instead of hiding it; `Exception` (not a bare
        # except) lets KeyboardInterrupt stop the cell normally.
        print("Une erreur s'est produit lors de la collecte sur AICI")
        print(repr(exc))
        break

# The browser is not used after the collection loop: release the Chrome process.
driver.quit()

print("fin du scraping")

In [5]:
import locale
import platform

# Turn the scraped records into a DataFrame (one row per listing).
data = pd.DataFrame(data)

# Use the French locale so that "%B" yields French month names in the output path.
# If that locale is not installed, keep the current one rather than failing
# before the scraped data has been written to disk.
try:
    locale.setlocale(locale.LC_TIME, 'fr_FR.UTF-8')
except locale.Error:
    print("Locale 'fr_FR.UTF-8' indisponible : le nom du mois utilisera la locale courante")

# Current date, used to build the dated output folders
current_datetime = datetime.now()
formatted_month = current_datetime.strftime("%B")
formatted_day = current_datetime.strftime("%d")

# Root of the dataset tree: Windows drive path, or the same drive under /mnt/d otherwise.
if platform.system() == 'Windows':
    base_dir = 'D:\\Bureau\\MemoiresStages\\Travaux_techniques\\Scrapping\\Datasets'
else:
    base_dir = '/mnt/d/Bureau/MemoiresStages/Travaux_techniques/Scrapping/Datasets'

# Layout: <base>/<month>/<day>_<month>/AICI_<day>_<month>.csv
day_folder = f'{formatted_day}_{formatted_month}'
dynamic_path = os.path.join(base_dir, formatted_month, day_folder, f'AICI_{day_folder}.csv')

os.makedirs(os.path.dirname(dynamic_path), exist_ok=True)

data.to_csv(dynamic_path, index=False)