In [None]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup
import os
from datetime import datetime

def file_config(start_url, i=2, page_load_timeout=6000):
    """Start a headless Chrome session and open ``start_url`` in it.

    Args:
        start_url: URL to load once the browser is running.
        i: Maximum number of load attempts. A new attempt is made only after
            a page-load timeout; any other error is raised immediately.
        page_load_timeout: Value passed to ``driver.set_page_load_timeout``
            (Selenium interprets it as seconds).
            NOTE(review): the historical default of 6000 is 100 minutes,
            which makes the retry logic almost irrelevant -- confirm that
            this is not a seconds/milliseconds mix-up.

    Returns:
        The ``webdriver.Chrome`` instance. It is returned even when every
        attempt timed out (a warning is printed in that case), so callers
        always receive a live session.

    Raises:
        Any non-timeout exception raised while configuring or loading the
        page. The browser is quit before the exception propagates so that no
        Chrome process is left behind.
    """
    # Local import: only this function needs it.
    from selenium.webdriver.chrome.service import Service

    driver_path = r"/usr/local/bin/chromedriver"
    options = webdriver.ChromeOptions()
    for flag in ('--no-sandbox', '--disable-dev-shm-usage', '--headless'):
        options.add_argument(flag)

    try:
        # Selenium 4: the chromedriver location is given through a Service.
        driver = webdriver.Chrome(service=Service(driver_path), options=options)
    except TypeError:
        # Older Selenium releases do not know the ``service`` keyword.
        driver = webdriver.Chrome(executable_path=driver_path, options=options)

    try:
        driver.set_page_load_timeout(page_load_timeout)
        loaded = False
        attempts = i
        while attempts > 0 and not loaded:
            try:
                driver.get(start_url)
                driver.maximize_window()
                loaded = True
            except TimeoutException:
                attempts -= 1
        if not loaded:
            print(f"Attention : la page {start_url} n'a pas pu être chargée après {i} tentative(s).")
    except BaseException:
        # Do not leak the browser process when start-up fails or is interrupted.
        driver.quit()
        raise
    return driver


# First page of the expat.com search results for "location" in Côte d'Ivoire.
START_URL = r'https://www.expat.com/fr/rechercheresultat/afrique/cote-d-ivoire/?q=location&go-search=1&currentDestination=11630'

# Browser session used by the scraping cells below.
driver = file_config(start_url=START_URL)

# Explicit-wait helper bound to that session, with a 30-second ceiling.
wait = WebDriverWait(driver, 30)



In [None]:
# Accumulator for the scraped listings: scrape_data appends one dict per listing.
data = []
# Number of the search-results page currently being processed.
i = 1

In [None]:
# CSS prefixes, relative to the listing's main container, shared by several field selectors.
_ATTRIBUTES = 'div:nth-child(3) > div.col-md-8 > div.item-attributes-container'
_HEADER = 'div:nth-child(1) > div.col-md-8 > div.housing-content-col.housing-content-col-top.property-item'


def _select_text(root, selector):
    """Return the stripped text of the first node under ``root`` matching ``selector``, or None."""
    node = root.select_one(selector)
    return node.text.strip() if node is not None else None


def _load_with_retries(url, attempts=3):
    """Open ``url`` in the shared ``driver``; return False if every attempt timed out."""
    while attempts > 0:
        try:
            driver.get(url)
            driver.refresh()
            return True
        except TimeoutException:
            attempts -= 1
    return False


def _parse_listing(soup, link):
    """Build the record dict for one listing page, or return None when the page must be skipped."""
    details = soup.select_one('#main-content > div > div')
    if details is None:
        return None

    # Only listings that advertise a monthly price, bedrooms, bathrooms and
    # a living area are kept.
    summary = _select_text(details, _ATTRIBUTES)
    if summary is None:
        return None
    required_labels = ('par mois', 'Chambres', 'Salles de bains', 'Surface habitable')
    if not all(label in summary for label in required_labels):
        return None

    # Price and title are mandatory: a listing without either is skipped.
    price = _select_text(
        details,
        f'{_ATTRIBUTES} > p.item-attributes-container__row.item-attributes-container__row--price > span',
    )
    title = _select_text(details, f'{_HEADER} > div.classified-ads-header-container > h1')
    if price is None or title is None:
        return None

    # Both classes sit on the same <span>, so they are chained with a dot.
    # The previous selector "span.value property-address" looked for a
    # <property-address> element and therefore never matched.
    localisation = _select_text(details, f'{_ATTRIBUTES} > p > span.value.property-address')
    immo_type = None  # no selector is defined for this field

    # Area / rooms / bathrooms are read by row position. They are kept
    # all-or-nothing: if one row is missing the positions cannot be trusted.
    superficie = _select_text(details, f'{_ATTRIBUTES} > p:nth-child(6) > span')
    # Unlike its siblings, this selector is anchored at #main-content.
    nb_pieces = _select_text(
        details,
        '#main-content > div > div > div:nth-child(3) > div.col-md-8 > div.item-attributes-container > p:nth-child(4) > span',
    )
    nb_salle_de_bain = _select_text(details, f'{_ATTRIBUTES} > p:nth-child(5) > span')
    if None in (superficie, nb_pieces, nb_salle_de_bain):
        superficie = nb_pieces = nb_salle_de_bain = None

    description = _select_text(details, f'{_HEADER} > div.housing-text')
    if description is not None:
        description = description.replace('\n', ' ')

    annonceur = _select_text(details, f'{_ATTRIBUTES} > p:nth-child(8) > span')

    return {
        'title': title,
        'price': price,
        'localisation': localisation,
        'superficie': superficie,
        "type d'immobilier": immo_type,
        'nb_pieces': nb_pieces,
        "nb_salle_de_bain": nb_salle_de_bain,
        'scraping_date': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "annonceur": annonceur,
        "link": link,
        'description': description,
    }


def scrape_data(cards):
    """Visit every listing linked from ``cards`` and append its details to ``data``.

    Uses the module-level ``driver`` to open each listing and the module-level
    ``data`` list as the accumulator. Each stored record is also printed.

    Args:
        cards: Selenium elements whose ``href`` attribute points to a listing.

    Returns:
        The module-level ``data`` list, after the new records were appended.

    Listings are skipped (not stored) when the page cannot be loaded within
    three attempts, when the expected page structure or one of the required
    labels is missing, when price or title is absent, or when parsing fails
    unexpectedly (the error is printed).
    """
    # The hrefs are collected before any navigation, because the loop below
    # drives the same browser away from the results page.
    links = [card.get_attribute('href') for card in cards]

    for link in links:
        if not link:
            continue
        if not _load_with_retries(link):
            # Parsing now would read the previously loaded page and store it
            # under the wrong link.
            print(f"Chargement impossible, annonce ignorée : {link}")
            continue

        try:
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            record = _parse_listing(soup, link)
        except Exception as exc:
            print(f"Annonce ignorée ({link}) : {exc!r}")
            continue

        if record is None:
            continue
        data.append(record)
        print(data[-1])

    return data


In [None]:
# Walk the paginated search results. The loop ends when a page has no
# listing links, or when any step fails (for instance the result list never
# becomes visible within the wait timeout).
try:
    while True:
        try:
            # Wait until the result list container is visible
            cards_presence = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, '#search-results-list')))
            # Collect the listing links of the current page
            cards = cards_presence.find_elements(By.CSS_SELECTOR, 'div > p.search-result__title > a')
            if not cards:
                # A visible but empty list would otherwise make the loop
                # request page after page forever.
                break
            scrape_data(cards)
            i += 1
            driver.get(f'https://www.expat.com/fr/rechercheresultat/afrique/cote-d-ivoire/{i}/?q=location&go-search=1&currentDestination=11630')
            driver.refresh()
        except Exception as exc:
            print("Une erreur c'est produite lors de la collecte sur expat.")
            # Show the cause instead of hiding it behind the generic message.
            print(f"Détail : {exc!r}")
            break

    print("fin du scraping")
finally:
    # Always release the browser, including on a manual kernel interrupt.
    driver.quit()

In [None]:
import locale
import platform

# French month names, used when the French locale is not installed. They are
# intended to match what "%B" yields under fr_FR, so the output folders keep
# the same names on every machine.
FRENCH_MONTHS = (
    "janvier", "février", "mars", "avril", "mai", "juin",
    "juillet", "août", "septembre", "octobre", "novembre", "décembre",
)

# Turn the list of scraped records into a table.
data = pd.DataFrame(data)

# Current date and time, used to build the output path.
current_datetime = datetime.now()

try:
    # Use the French locale for the month name when it is available
    locale.setlocale(locale.LC_TIME, 'fr_FR.UTF-8')
    formatted_month = current_datetime.strftime("%B")
except locale.Error:
    # Locale not installed: do not abort the save after the whole scrape.
    formatted_month = FRENCH_MONTHS[current_datetime.month - 1]

formatted_date = current_datetime.strftime("%Y-%m-%d %H:%M:%S")
formatted_day = current_datetime.strftime("%d")

# Output layout: <month>/<day>_<month>/expat_<day>_<month>.csv under a fixed
# base folder (a Windows drive path, or its /mnt/d equivalent elsewhere).
if platform.system() == 'Windows':
    dynamic_path = f'D:\\Bureau\\MemoiresStages\\Travaux_techniques\\Scrapping\\Datasets\\{formatted_month}\\{formatted_day}_{formatted_month}\\expat_{formatted_day}_{formatted_month}.csv'
else:
    dynamic_path = f'/mnt/d/Bureau/MemoiresStages/Travaux_techniques/Scrapping/Datasets/{formatted_month}/{formatted_day}_{formatted_month}/expat_{formatted_day}_{formatted_month}.csv'

os.makedirs(os.path.dirname(dynamic_path), exist_ok=True)

data.to_csv(dynamic_path, index=False)