In [None]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup
import os
from datetime import datetime

def file_config(start_url, i=2):
    """Start a headless Chrome session and open *start_url*.

    Args:
        start_url: URL to load once the browser is running.
        i: Maximum number of load attempts; an attempt is retried only when
            it ends with a ``TimeoutException``.

    Returns:
        The Chrome WebDriver instance. If every attempt timed out, a warning
        is printed and the driver is still returned (best effort), so the
        caller may retry the navigation itself.

    Raises:
        Any non-timeout exception raised while configuring the browser or
        loading the page; the browser is shut down before it propagates.
    """
    # Local import keeps this block self-contained within the notebook.
    from selenium.webdriver.chrome.service import Service

    # Driver configuration
    driver_path = r"/usr/local/bin/chromedriver"
    options = webdriver.ChromeOptions()
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    options.add_argument('--headless')

    try:
        # Selenium >= 4.10 no longer accepts ``executable_path``; the driver
        # location has to be supplied through a Service object.
        driver = webdriver.Chrome(service=Service(driver_path), options=options)
    except TypeError:
        # Older Selenium releases do not know the ``service`` keyword.
        driver = webdriver.Chrome(executable_path=driver_path, options=options)

    try:
        driver.set_page_load_timeout(6000)
        loaded = False
        attempts = i
        while attempts > 0:
            try:
                driver.get(start_url)
                driver.maximize_window()
            except TimeoutException:
                attempts -= 1
            else:
                loaded = True
                break
        if not loaded:
            print(f"Échec du chargement de {start_url} après {i} tentative(s).")
    except BaseException:
        # Do not leave an orphaned browser process behind on unexpected errors.
        driver.quit()
        raise
    return driver

# Open the browser on the first page of the search results.
driver = file_config(
    start_url=r'https://ci.coinafrique.com/search?category=51&keyword=location&page=1',
)

# Module-level explicit wait with a 30-second ceiling.
wait = WebDriverWait(driver, timeout=30)

def notif_check():
    """Dismiss the notification prompt if it shows up within 5 seconds.

    Waits for the button carrying an empty ``data-declined-notification``
    attribute to become visible and clicks it. A ``TimeoutException`` is
    ignored, so the function is a no-op when the prompt never appears.
    """
    decline_locator = (By.XPATH, '//button[@data-declined-notification=""]')
    short_wait = WebDriverWait(driver, 5)
    try:
        short_wait.until(EC.visibility_of_element_located(decline_locator)).click()
    except TimeoutException:
        return

In [None]:
# Accumulator for the scraped listings: scrape_data() appends one dict per ad.
data: list = []
# Counter incremented once per results page by the pagination loop.
i: int = 1

In [None]:
def _select_text(node, selector):
    """Return the stripped text of the first match of *selector* under *node*, or None."""
    match = node.select_one(selector)
    if match is None:
        return None
    return match.text.strip()


def _load_listing(link, attempts=3):
    """Open *link* in the shared driver; return True once a load finishes without timing out."""
    for _ in range(attempts):
        try:
            driver.get(link)
            driver.refresh()
        except TimeoutException:
            continue
        return True
    return False


def _parse_listing(soup, link):
    """Build the record dict for one listing page, or return None if its details box is absent."""
    details = soup.select_one('body > main > div.row.fullwidth > div.details__main.mainwidth > div:nth-child(2) > div > div > div')
    if details is None:
        # select_one() returns None (it does not raise) when nothing matches.
        return None

    # details group 1: price, title, location and property type
    header = "div.ad__info__box.ad__info__box-priceAndTitle"
    price = _select_text(details, f"{header} p.price")
    title = _select_text(details, f"{header} h1.title-ad")

    # Location and type sit in consecutive spans, either at positions 2/3 or
    # at positions 1/2, so they are resolved as a pair.
    span = header + " div > div > p > span:nth-child({}) > span"
    localisation = _select_text(details, span.format(2))
    immo_type = _select_text(details, span.format(3))
    if localisation is None or immo_type is None:
        localisation = _select_text(details, span.format(1))
        immo_type = _select_text(details, span.format(2))
        if localisation is None or immo_type is None:
            localisation = None
            immo_type = None

    # details group 2: each figure is read on its own so that one missing
    # entry does not discard the others.
    spec = "div.ad__info__box.ad__info__box-details div > ul > li:nth-child({}) > span.qt"
    nb_pieces = _select_text(details, spec.format(1))
    nb_salle_de_bain = _select_text(details, spec.format(2))
    area = _select_text(details, spec.format(3))
    superficie = f"{area} m2" if area is not None else None

    # details group 3: free-text description, flattened to a single line
    description = _select_text(details, 'div.ad__info__box.ad__info__box-descriptions p:nth-child(2)')
    if description is not None:
        description = description.replace('\n', ' ')

    annonceur = _select_text(soup, 'body > main > div.row.fullwidth > div.details__main.mainwidth > div.card.z-depth-0.round.card-publications.hide-on-large-only > div > div > div.col.s9.l9 > div > p.username > a')
    if annonceur is not None:
        annonceur = annonceur.replace('\n', ' ')

    return {
        'title': title,
        'price': price,
        'localisation': localisation,
        'superficie': superficie,
        "type d'immobilier": immo_type,
        'nb_pieces': nb_pieces,
        "nb_salle_de_bain": nb_salle_de_bain,
        'scraping_date': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "annonceur": annonceur,
        "link": link,
        'description': description,
    }


def scrape_data(cards):
    """Visit every listing referenced by *cards* and append its details to ``data``.

    Args:
        cards: Selenium elements of the current results page; the ``href`` of
            the first ``<a>`` inside each card is the listing URL.

    Returns:
        The module-level ``data`` list, extended with one dict per listing
        that could be loaded and parsed.

    A listing is skipped (with a printed notice) when its page keeps timing
    out, when its details container is missing, or when parsing fails.
    """
    notif_check()

    # Collect every URL before navigating: the card elements are no longer
    # usable once the driver leaves the results page.
    links = []
    for card in cards:
        anchors = card.find_elements(By.CSS_SELECTOR, 'a')
        href = anchors[0].get_attribute('href') if anchors else None
        if href:
            links.append(href)

    for link in links:
        if not _load_listing(link):
            # Without this guard the previously loaded page would be parsed
            # and stored under the wrong link.
            print(f"Chargement impossible, annonce ignorée : {link}")
            continue

        notif_check()
        try:
            # BeautifulSoup is used to read the listing details.
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            record = _parse_listing(soup, link)
        except Exception as exc:
            # Best effort: one broken page must not stop the run, but
            # KeyboardInterrupt / SystemExit are allowed to propagate.
            print(f"Annonce ignorée ({link}) : {exc!r}")
            continue

        if record is None:
            print(f"Détails introuvables, annonce ignorée : {link}")
            continue

        data.append(record)
        print(record)
    return data


In [None]:
# Walk the result pages, scraping each one, until the last page is reached or
# more than 6000 records have been collected. The browser is always closed,
# even when a wait times out or the run is interrupted.
try:
    while len(data) <= 6000:
        # Wait until the listing grid is visible, then collect its cards.
        cards_presence = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, 'body > main > div.container > div.column.four-fifth > div.row.adcard__listing')))
        cards = cards_presence.find_elements(By.CSS_SELECTOR, 'div.col')

        pagination = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, 'div.pagination.custom')))

        # The last pagination link is the "next page" control; its href has
        # to be read before scrape_data() navigates away from this page.
        page_items = pagination.find_elements(By.CSS_SELECTOR, 'ul > li > a')
        next_href = page_items[-1].get_attribute('href') if page_items else None

        # A missing href, or one containing '#', means there is no next page.
        is_last_page = not next_href or '#' in next_href
        if is_last_page:
            print("Dernier élément désactivé, fin de la pagination.")

        scrape_data(cards)

        # Progress line instead of dumping the whole accumulated list, which
        # grew with every page.
        print(f"Page {i} terminée : {len(data)} annonces collectées")
        i += 1

        if is_last_page:
            break

        start_url = next_href
        driver.get(start_url)

    print("fin du scraping")
finally:
    driver.quit()



In [None]:
import locale
import platform

# Turn the accumulated records into a DataFrame.
data = pd.DataFrame(data)
data

# French month names are used in the output folder and file names. If the
# French locale is not installed, keep the current locale rather than losing
# the scraped data to a locale.Error.
try:
    locale.setlocale(locale.LC_TIME, 'fr_FR.UTF-8')
except locale.Error:
    print("Locale 'fr_FR.UTF-8' indisponible : les noms de mois utilisent la locale courante.")

# Current date and time
current_datetime = datetime.now()
formatted_date = current_datetime.strftime("%Y-%m-%d %H:%M:%S")
formatted_month = current_datetime.strftime("%B")
formatted_day = current_datetime.strftime("%d")

# Output location: <Datasets>/<month>/<day>_<month>/CoinAfriqueCi_<day>_<month>.csv,
# rooted on the D: drive on Windows and on its /mnt/d mount otherwise.
if platform.system() == 'Windows':
    dynamic_path = f'D:\\Bureau\\MemoiresStages\\Travaux_techniques\\Scrapping\\Datasets\\{formatted_month}\\{formatted_day}_{formatted_month}\\CoinAfriqueCi_{formatted_day}_{formatted_month}.csv'
else:
    dynamic_path = f'/mnt/d/Bureau/MemoiresStages/Travaux_techniques/Scrapping/Datasets/{formatted_month}/{formatted_day}_{formatted_month}/CoinAfriqueCi_{formatted_day}_{formatted_month}.csv'

# Create the dated folder hierarchy if needed before writing.
os.makedirs(os.path.dirname(dynamic_path), exist_ok=True)

data.to_csv(dynamic_path, index=False)