In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from google.colab import files

# Function to clean and extract specific fields from the scraped data
def extract_fields(data):
    """Flatten raw scraped listings into the final output columns.

    Args:
        data: Iterable of dicts with the keys ``"Location"``,
            ``"Description"``, ``"Price"`` and ``"Date"`` (the shape
            produced by ``scrape_page``).

    Returns:
        A list of dicts with the keys ``"Gouvernorat"``, ``"Délégation"``,
        ``"Localité"``, ``"Texte annonce"``, ``"Prix"`` and ``"Date"``.
        A missing input key, or a location/price component that is blank
        after cleaning, is reported as ``"N/A"``.
    """
    missing = "N/A"
    extracted_data = []

    for entry in data:
        # Strip any markup left in the location text before splitting it.
        raw_location = entry.get("Location") or ""
        location_text = BeautifulSoup(raw_location, 'html.parser').get_text().strip()

        # The text is cut on the "Délégation :" and "Localité :" labels.
        # str.split always yields at least one element, so the first part is
        # always present; an empty result falls back to the placeholder.
        head, *after_delegation = location_text.split("Délégation :")
        gouvernorat = head.replace("Gouvernorat :", "").strip() or missing

        delegation = missing
        localite = missing
        if after_delegation:
            delegation_part, *after_localite = after_delegation[0].split("Localité :")
            delegation = delegation_part.strip() or missing
            if after_localite:
                localite = after_localite[0].strip() or missing

        # Drop the currency label so only the amount remains.
        raw_price = entry.get("Price") or missing
        prix = raw_price.replace("Dinar Tunisien (TND)", "").strip() or missing

        extracted_data.append({
            "Gouvernorat": gouvernorat,
            "Délégation": delegation,
            "Localité": localite,
            "Texte annonce": entry.get("Description", missing),
            "Prix": prix,
            "Date": entry.get("Date", missing),
        })

    return extracted_data

# Helper: pull the plain text out of a tag's onmouseover tooltip
def _tooltip_text(tag):
    """Return the text embedded in *tag*'s ``onmouseover`` attribute.

    The ``return escape('`` prefix and ``');`` suffix are removed, then any
    HTML inside is reduced to plain text. Returns ``"N/A"`` when *tag* is
    ``None``.
    """
    if tag is None:
        return "N/A"
    raw = tag['onmouseover'].replace("return escape('", "").replace("');", "")
    return BeautifulSoup(raw, 'html.parser').get_text().strip()


# Function to scrape a single page
def scrape_page(url, timeout=30):
    """Fetch one listing page and return its rows as raw records.

    Args:
        url: Address of the listing page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of dicts with the keys ``"Location"``, ``"Description"``,
        ``"Price"`` and ``"Date"``, one per ``<tr class="Tableau1">`` row.
        An empty list is returned when the request fails, the status code is
        not 200, or the page has no matching rows. A field whose element is
        absent from a row is reported as ``"N/A"``.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Report and return nothing rather than crash the caller's loop,
        # mirroring how a non-200 response is handled below.
        print(f"Failed to fetch page: {url} ({exc})")
        return []

    if response.status_code != 200:
        print(f"Failed to fetch page: {url}")
        return []

    soup = BeautifulSoup(response.content, 'html.parser')
    page_data = []

    for row in soup.find_all('tr', class_='Tableau1'):
        # First <a> with a tooltip carries the location.
        location = row.find('a', onmouseover=True)

        # Second <a> with both a tooltip and an href carries the description.
        # Check the length first: indexing a short result list would raise
        # IndexError and abort the whole page.
        links = row.find_all('a', onmouseover=True, href=True)
        description = links[1] if len(links) > 1 else None

        price = row.find('td', style="CURSOR:pointer;text-align: right;", onmouseover=True)

        # Second <td> with this style and a tooltip carries the date.
        date_cells = row.find_all('td', style="CURSOR:pointer;", onmouseover=True)
        date = date_cells[1] if len(date_cells) > 1 else None

        # Keep only the date token (e.g. "31/01/2025") after the label.
        date_info = _tooltip_text(date).split("Insérée le :")[-1].strip().split(" ")[0]

        page_data.append({
            "Location": _tooltip_text(location),
            "Description": _tooltip_text(description),
            "Price": _tooltip_text(price),
            "Date": date_info,
        })

    return page_data

# Driver: walk the paginated search results, then export them.
# The page number is appended directly to the end of this query string.
base_url = "http://www.tunisie-annonce.com/AnnoncesImmobilier.asp?rech_cod_cat=1&rech_cod_rub=101&rech_cod_typ=10104&rech_cod_sou_typ=1010402&rech_cod_pay=TN&rech_cod_reg=&rech_cod_vil=&rech_cod_loc=&rech_prix_min=&rech_prix_max=&rech_surf_min=&rech_surf_max=&rech_age=&rech_photo=&rech_typ_cli=&rech_order_by=31&rech_page_num="

all_data = []
page_num = 1

# Request consecutive pages until one yields no rows.
while True:
    url = f"{base_url}{page_num}"
    print(f"Scraping page {page_num}...")
    page_data = scrape_page(url)

    if page_data:
        all_data += page_data
        page_num += 1
        continue

    # An empty result is taken as the end of the listing.
    print("No more pages found.")
    break

print(f"Scraped {len(all_data)} entries.")

# Reduce the raw records to the final columns and tabulate them.
extracted_data = extract_fields(all_data)
df = pd.DataFrame(extracted_data)

# Render the table in the notebook output.
display(df)

# Write the CSV with a BOM ('utf-8-sig'), then hand the file to the
# Colab download helper.
csv_filename = "scraped_data.csv"
df.to_csv(csv_filename, index=False, encoding='utf-8-sig')
files.download(csv_filename)

Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping page 4...
Scraping page 5...
Scraping page 6...
Scraping page 7...
Scraping page 8...
Scraping page 9...
Scraping page 10...
Scraping page 11...
Scraping page 12...
Scraping page 13...
Scraping page 14...
Scraping page 15...
Scraping page 16...
Scraping page 17...
Scraping page 18...
Scraping page 19...
Scraping page 20...
Scraping page 21...
Scraping page 22...
Scraping page 23...
Scraping page 24...
Scraping page 25...
Scraping page 26...
Scraping page 27...
Scraping page 28...
Scraping page 29...
Scraping page 30...
Scraping page 31...
Scraping page 32...
Scraping page 33...
Scraping page 34...
Scraping page 35...
Scraping page 36...
Scraping page 37...
Scraping page 38...
Scraping page 39...
Scraping page 40...
Scraping page 41...
Scraping page 42...
Scraping page 43...
Scraping page 44...
Scraping page 45...
Scraping page 46...
Scraping page 47...
Scraping page 48...
Scraping page 49...
Scraping page 50...
Scraping 

Unnamed: 0,Gouvernorat,Délégation,Localité,Texte annonce,Prix,Date
0,Nabeul,Hammamet,Zone Hoteliere,Train 312 m hammamet sudparticulier vend terr...,78 000,10/10/2024
1,Nabeul,Kelibia,Dar Allouche,Terrain agricole près de la merterrain agricol...,800 000,01/02/2025
2,Nabeul,Korba,Tazarka,Terrain seif tazerkaun lot de terrain de 250 m...,30 000,21/10/2022
3,Ariana,Mnihla,Jardins d el Menzah,Terrain pour des promoteurs au jardin del menz...,11 900 000,25/01/2025
4,Sousse,Kalaa El Kebira,Kalaa El Kebira,Terrain pour investissementagence immobilière ...,329 550,14/10/2022
...,...,...,...,...,...,...
1438,Nabeul,Grombalia,Belli Halte,3 hectares à belli gare grombalial agence imm...,850 000,02/11/2024
1439,Nabeul,Bou Argoub,Borj Hafaiedh,Un hectare dagrumes à bou argoubl agence #immo...,370 000,02/11/2024
1440,Sousse,Akouda,Chatt Meriem,Des terrains vue de mer à koussour gharnatala ...,310 050,13/08/2022
1441,Sousse,Kalaa El Kebira,Kalaa El Kebira,Belle senia clôturé 7520m à kalaa kbira el hen...,450 000,25/07/2024


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>