In [None]:
# Install the Python packages used by the scraper.
# NOTE(review): bs4 is imported below but not installed here - presumably it is
# preinstalled in the notebook environment; confirm.
!pip install selenium webdriver_manager pandas tqdm fake-useragent
# Refresh the apt package index, then install Chromium and its matching driver.
!apt-get update
!apt install -y chromium-chromedriver chromium-browser

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm
import time
import random
import re
from fake_useragent import UserAgent

# Browser configuration: one random user agent is picked once and shared by
# the Chrome command line and by init_driver().
ua = UserAgent()
user_agent = ua.random

chrome_options = Options()

# Command-line switches, applied in the order listed.
for _switch in (
    f'user-agent={user_agent}',
    '--headless',
    '--no-sandbox',
    '--disable-dev-shm-usage',
    '--disable-gpu',
    '--remote-debugging-port=9222',
    '--window-size=1920,1080',
    '--disable-blink-features=AutomationControlled',
):
    chrome_options.add_argument(_switch)

# Experimental options that remove the automation switch and extension.
for _option_name, _option_value in (
    ("excludeSwitches", ["enable-automation"]),
    ('useAutomationExtension', False),
):
    chrome_options.add_experimental_option(_option_name, _option_value)

# Use the Chromium binary installed through apt.
chrome_options.binary_location = '/usr/bin/chromium-browser'

def init_driver():
    """Start a Chrome driver configured with the module-level options.

    The driver is created from ``chrome_options`` and its user agent is then
    overridden through the DevTools protocol with ``user_agent``.

    Returns:
        The started ``webdriver.Chrome`` instance.

    Raises:
        Exception: any error raised while starting or configuring the browser
            is printed and re-raised unchanged.
    """
    driver = None
    try:
        driver = webdriver.Chrome(options=chrome_options)
        driver.execute_cdp_cmd('Network.setUserAgentOverride', {"userAgent": user_agent})
        return driver
    except Exception as e:
        print(f"Erreur d'initialisation : {str(e)}")
        # If the browser started but configuration failed, shut it down so the
        # process is not leaked; the caller never receives the handle.
        if driver is not None:
            try:
                driver.quit()
            except Exception:
                # Cleanup is best-effort: the original error is the one to report.
                pass
        raise

def clean_text(text):
    """Normalise a scraped text fragment.

    Missing values (as detected by ``pd.isna``) are returned untouched. Any
    other value is converted to ``str``, runs of whitespace are collapsed to a
    single space, non-breaking spaces are replaced, the result is stripped,
    and 'La Manouba' is rewritten to 'Manouba'.
    """
    if pd.isna(text):
        return text

    collapsed = re.sub(r'\s+', ' ', str(text))
    trimmed = collapsed.replace('\xa0', ' ').strip()
    return trimmed.replace('La Manouba', 'Manouba')

def load_all_listings(driver, url, max_attempts=10):
    """Open ``url`` and keep scrolling until no more listings appear.

    Each iteration scrolls to the bottom of the page, waits, counts the
    listing elements, and clicks the "show more" button when it is present.

    Args:
        driver: Selenium driver used to load and scroll the page.
        url: Address of the search results page.
        max_attempts: Value the stall counter must reach for the loop to stop.
            The counter is incremented once when the listing count is
            unchanged and once more when the page height is unchanged, so a
            single idle iteration can add 2. It is reset to 0 whenever new
            listings appear or the "show more" button is clicked.

    Returns:
        A ``BeautifulSoup`` tree of the fully scrolled page source.
    """
    # Local import so this function stays self-contained.
    from selenium.common.exceptions import WebDriverException

    driver.get(url)
    time.sleep(random.uniform(5, 8))

    last_height = driver.execute_script("return document.body.scrollHeight")
    listings_count = 0
    attempts = 0

    print("Chargement de toutes les annonces...")

    with tqdm(desc="Chargement des annonces") as pbar:
        while attempts < max_attempts:
            # Scroll to the bottom of the page.
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(random.uniform(3, 5))

            # Extra pause to let newly requested items render.
            time.sleep(random.uniform(2, 4))

            # Count the listings currently present in the DOM.
            current_listings = driver.find_elements(By.CSS_SELECTOR, "app-ad-preview article.preview-wrapper")
            new_listings_count = len(current_listings)

            if new_listings_count == listings_count:
                attempts += 1
                time.sleep(random.uniform(2, 3))
            else:
                listings_count = new_listings_count
                attempts = 0
                # The total is unknown up front, so the bar tracks the count.
                pbar.total = listings_count
                pbar.update(listings_count - pbar.n)

            # Click the "show more" button when it exists. Only Selenium
            # errors (button absent, stale element, ...) are ignored, so
            # Ctrl-C still interrupts this long-running loop.
            try:
                voir_plus = driver.find_element(By.CSS_SELECTOR, "button.show-more-btn")
                driver.execute_script("arguments[0].click();", voir_plus)
                time.sleep(random.uniform(4, 6))
                attempts = 0
            except WebDriverException:
                pass

            # An unchanged page height also counts as a stalled attempt.
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                attempts += 1
            last_height = new_height

    print(f"\nTotal d'annonces chargées : {listings_count}")
    return BeautifulSoup(driver.page_source, 'html.parser')

def scrape_all_announces():
    """Scrape every listing of the search page into a DataFrame.

    Starts a driver, loads the fully scrolled search page, then extracts one
    row per listing. A listing whose extraction raises (for example because
    an expected element is missing) is skipped; the number of skipped
    listings is printed once the loop is done. The driver is always quit.

    Returns:
        A ``pd.DataFrame`` with the columns 'Gouvernorat', 'Délégation',
        'Localité', 'Texte annonce', 'Prix', 'Date' and 'Surface'. Prices and
        surfaces are digit-only strings (possibly empty).
    """
    driver = init_driver()
    search_url = "https://9annas.tn/search/Terrain%20agricole"

    try:
        # Load the page with every listing expanded.
        soup = load_all_listings(driver, search_url)
        all_listings = soup.select('app-ad-preview article.preview-wrapper')

        print(f"\nDébut du traitement de {len(all_listings)} annonces...")

        data = []
        skipped = 0
        for listing in tqdm(all_listings, desc="Extraction des données"):
            try:
                title = clean_text(listing.select_one('h1.preview-title').text)
                price = clean_text(listing.select_one('h2.preview-price').text)
                desc = clean_text(listing.select_one('div.preview-description p').text)

                # Surface: digits following "Surface:" with commas removed.
                surface = ''
                surface_match = re.search(r'Surface:\s*([\d,]+)', desc)
                if surface_match:
                    surface = surface_match.group(1).replace(',', '').strip()

                # Location block.
                location = listing.select_one('div.preview-location-info')
                if location:
                    localite = clean_text(location.select_one('p.preview-location-main-info').text)
                    parts = clean_text(location.select_one('p.preview-location-secondary-info').text).split()

                    # The locality is also stored as the delegation.
                    delegation = localite
                    # NOTE(review): with several words the first one is
                    # dropped, so a multi-word value loses its first word -
                    # confirm this is intended.
                    if len(parts) > 1:
                        gouvernorat = ' '.join(parts[1:])
                    else:
                        gouvernorat = parts[0] if parts else ''
                else:
                    localite = delegation = gouvernorat = ''

                # Date: keep only the relative age, e.g. strip 'Depuis' and
                # everything from 'sur' onwards.
                date = clean_text(listing.select_one('p.preview-date-large, div.preview-date-short').text)
                date = re.sub(r'sur.*', '', date).replace('Depuis', '').strip()

                # Price: keep digits only.
                price_clean = re.sub(r'[^\d]', '', price)

                data.append([
                    gouvernorat, delegation, localite,
                    f"{title} {desc}",
                    price_clean,
                    date,
                    surface
                ])
            except Exception:
                # Per-listing boundary: one malformed listing must not abort
                # a long scrape, but the loss is counted, not hidden.
                skipped += 1
                continue

        if skipped:
            print(f"\n{skipped} annonces ignorées (champs manquants ou illisibles)")

        return pd.DataFrame(data, columns=[
            'Gouvernorat', 'Délégation', 'Localité',
            'Texte annonce', 'Prix', 'Date', 'Surface'
        ])

    finally:
        driver.quit()

# Run the full scrape, convert the numeric columns, save the CSV and print a
# short summary. Any error is reported as a single line.
print("Lancement du scraping complet...")
try:
    start_time = time.time()
    data = scrape_all_announces()

    if data.empty:
        print("\nAucune donnée récupérée.")
    else:
        # Post-processing: non-numeric prices and surfaces become NaN.
        for _numeric_col in ('Prix', 'Surface'):
            data[_numeric_col] = pd.to_numeric(data[_numeric_col], errors='coerce')

        print(f"\n{len(data)} annonces scrapées avec succès en {time.time()-start_time:.2f} secondes!")
        data.to_csv('terrains_agricoles_complet.csv', index=False, encoding='utf-8-sig')

        print("\nAperçu des données :")
        preview = data.head()
        print(preview.to_markdown(index=False))

        print("\nStatistiques :")
        print(f"Nombre total d'annonces : {len(data)}")
        print(f"Prix moyen : {data['Prix'].mean():,.0f} DT")
        print(f"Surface moyenne : {data['Surface'].mean():,.0f} m²")

except Exception as e:
    print(f"\nERREUR : {str(e)}")

Hit:1 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:2 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
Hit:3 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:4 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Hit:5 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:6 http://security.ubuntu.com/ubuntu jammy-security InRelease
Hit:7 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Reading package lists... Done
W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Reading package lists... Done
Building dependency tree... Done
Reading

Chargement des annonces: 100%|██████████| 5022/5022 [53:26<00:00,  1.57it/s]



Total d'annonces chargées : 5022

Début du traitement de 5022 annonces...


Extraction des données: 100%|██████████| 5022/5022 [00:04<00:00, 1130.29it/s]



4775 annonces scrapées avec succès en 3245.26 secondes!

Aperçu des données :
| Gouvernorat   | Délégation     | Localité       | Texte annonce                                                                                                                                                                                                                                                                                                                                                                                                                                                                            |    Prix | Date     |   Surface |
|:--------------|:---------------|:---------------|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [5]:
import pandas as pd
from datetime import datetime, timedelta

# Load the scraped listings.
df = pd.read_csv('terrains_agricoles_complet.csv')

# 1. Rename governorates (exact, whole-value matches only).
replacements = dict([
    ('Médenine', 'Medenine'),
    ('Arous', 'Ben arous'),
    ('Bouzid', 'Sidi bouzid'),
    ('Gabès', 'Gabes'),
    ('Kef', 'Le Kef'),
    ('Kébili', 'Kebili'),
])

renamed_gov = df['Gouvernorat'].replace(replacements)
df['Gouvernorat'] = renamed_gov

# 2. Keep only rows whose governorate is neither missing nor empty.
has_gov = df['Gouvernorat'].notna() & (df['Gouvernorat'] != '')
df = df[has_gov]

# 3. Transformer le format de date
def convert_to_date(time_str, today=None):
    """Convert a relative age such as '3 jours' to a 'dd/mm/YYYY' date.

    The unit is detected by substring, checked in the order 'heure', 'jour',
    'mois' (one month is approximated as 30 days). The amount is the first
    run of digits found in the string.

    Args:
        time_str: Relative age text. Non-string values (e.g. NaN from a
            missing cell) are returned unchanged instead of raising.
        today: Reference ``datetime`` the age is subtracted from. Defaults to
            3 April 2025, the scraping date used by this notebook.

    Returns:
        The computed date formatted as 'dd/mm/YYYY'. When the text has no
        recognised unit or no number, the reference date itself is returned.
    """
    # Local import: the cell defining this function only imports pandas and
    # datetime.
    import re

    if today is None:
        today = datetime(2025, 4, 3)

    # Leave missing values untouched rather than crashing on `in`.
    if not isinstance(time_str, str):
        return time_str

    fallback = today.strftime('%d/%m/%Y')

    # Without a number there is nothing to subtract. This also covers words
    # that merely contain a unit, such as "aujourd'hui".
    number = re.search(r'\d+', time_str)
    if number is None:
        return fallback
    amount = int(number.group())

    if 'heure' in time_str:
        delta = timedelta(hours=amount)
    elif 'jour' in time_str:
        delta = timedelta(days=amount)
    elif 'mois' in time_str:
        # Approximation: 1 month = 30 days.
        delta = timedelta(days=amount * 30)
    else:
        return fallback

    return (today - delta).strftime('%d/%m/%Y')

# Replace each relative age with an absolute 'dd/mm/YYYY' date.
converted_dates = df['Date'].apply(convert_to_date)
df['Date'] = converted_dates

# 4. Drop the Surface column (no error if it is already absent).
df = df.drop(['Surface'], axis=1, errors='ignore')

# Preview the result.
preview = df.head()
print(preview)

# Save the cleaned data.
df.to_csv('Land_Prices_9annas.csv', index=False)

  Gouvernorat      Délégation        Localité  \
0      Nabeul         Kélibia         Kélibia   
1      Nabeul         Kélibia         Kélibia   
2    Kairouan  Kairouan Ville  Kairouan Ville   
3      Nabeul        Hammamet        Hammamet   
4     Manouba    Borj El Amri    Borj El Amri   

                                       Texte annonce     Prix        Date  
0  Un terrain de 2350 mètre carré au cap bon prof...  1410000  02/04/2025  
1  Offre de 913 mètre carré au cap bon un terrain...   730400  02/04/2025  
2  A Vendre Terrain Agricole A vendre , Terrain a...   360000  02/04/2025  
3  Investissement sur je vous offre un terrain ag...   160000  02/04/2025  
4  Terrain agricole de 5000 m² à Borj El Amri À V...   200000  02/04/2025  


In [4]:
# Print a summary of the cleaned frame: index range, columns, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4754 entries, 0 to 4774
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Gouvernorat    4754 non-null   object
 1   Délégation     4754 non-null   object
 2   Localité       4754 non-null   object
 3   Texte annonce  4754 non-null   object
 4   Prix           4754 non-null   int64 
 5   Date           4754 non-null   object
dtypes: int64(1), object(5)
memory usage: 260.0+ KB
