In [1]:
pip install requests beautifulsoup4


Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install selenium


Collecting selenium
  Obtaining dependency information for selenium from https://files.pythonhosted.org/packages/c9/33/b9da8be5b122b8c3c82c35f515ba0a84a9af3ba9629ae9fd5bbba820d592/selenium-4.23.1-py3-none-any.whl.metadata
  Downloading selenium-4.23.1-py3-none-any.whl.metadata (7.1 kB)
Collecting trio~=0.17 (from selenium)
  Obtaining dependency information for trio~=0.17 from https://files.pythonhosted.org/packages/77/34/461280cc615614a9e434c6e23373371a2436f71d045303cadabb1d775eba/trio-0.26.0-py3-none-any.whl.metadata
  Downloading trio-0.26.0-py3-none-any.whl.metadata (8.8 kB)
Collecting trio-websocket~=0.9 (from selenium)
  Obtaining dependency information for trio-websocket~=0.9 from https://files.pythonhosted.org/packages/48/be/a9ae5f50cad5b6f85bd2574c2c923730098530096e170c1ce7452394d7aa/trio_websocket-0.11.1-py3-none-any.whl.metadata
  Downloading trio_websocket-0.11.1-py3-none-any.whl.metadata (4.7 kB)
Collecting typing_extensions~=4.9 (from selenium)
  Obtaining dependency info

In [4]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import pandas as pd
import time
import logging

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Set up the Selenium WebDriver
driver = webdriver.Chrome()  # Ensure the path to chromedriver is correct

try:
    # URL of the page to scrape
    url = 'https://www.amenbank.com.tn/fr/reseau-agences.html'
    driver.get(url)
    
    # Wait until the governorate dropdown is available
    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, 'gov')))

    # Extract governorate options
    select = Select(driver.find_element(By.ID, 'gov'))
    governorates = [option.get_attribute('value') for option in select.options if option.get_attribute('value').strip()]

    # Prepare a list to hold the data
    data = []

    # Iterate over each governorate
    for gov in governorates:
        logging.info(f"Processing governorate: {gov}")
        
        # Select the governorate from the dropdown
        select.select_by_value(gov)
        
        # Click the search button
        search_button = driver.find_element(By.CLASS_NAME, 'SearchAgency')
        search_button.click()
        
        # Wait for the results to load
        time.sleep(5)  # Adjust the sleep time if necessary
        
        # Parse the page content
        try:
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            
            # Find all agency blocks
            agencies = soup.find_all('div', class_='box_item bg-white mb-3')
            
            # Extract the data for each agency
            for agency in agencies:
                name = agency.find('h6').text.strip()
                
                # Extract the address, city, phone, and email
                p_tags = agency.find_all('p')
                
                address = p_tags[0].text.strip() if len(p_tags) > 0 else ''
                city = p_tags[1].text.strip() if len(p_tags) > 1 else ''
                phone = p_tags[2].text.strip() if len(p_tags) > 2 else ''
                email = p_tags[3].text.strip() if len(p_tags) > 3 else ''
                
                # Append the data to the list
                data.append({
                    'Governorate': gov,
                    'Name': name,
                    'Address': address,
                    'City': city,
                    'Phone': phone,
                    'Email': email
                })
                
        except Exception as e:
            logging.error(f"Failed to parse data for governorate: {gov}. Error: {str(e)}")
    
except Exception as e:
    logging.error(f"An error occurred: {str(e)}")

finally:
    # Close the WebDriver
    driver.quit()

# Create a DataFrame from the list
df = pd.DataFrame(data)

# Save the DataFrame to a CSV file
df.to_csv('agencies.csv', index=False, encoding='utf-8')

logging.info("Data has been written to agencies.csv")


2024-07-29 09:43:50,256 - INFO - Processing governorate: Ariana
2024-07-29 09:43:55,379 - INFO - Processing governorate: Béja
2024-07-29 09:44:00,556 - INFO - Processing governorate: Ben Arous
2024-07-29 09:44:05,775 - INFO - Processing governorate: Bizerte
2024-07-29 09:44:10,923 - INFO - Processing governorate: Gabes
2024-07-29 09:44:16,101 - INFO - Processing governorate: Gafsa
2024-07-29 09:44:21,257 - INFO - Processing governorate: Jendouba
2024-07-29 09:44:26,476 - INFO - Processing governorate: Kairouan
2024-07-29 09:44:31,647 - INFO - Processing governorate: Kasserine
2024-07-29 09:44:36,799 - INFO - Processing governorate: Kebili
2024-07-29 09:44:41,950 - INFO - Processing governorate: Kef
2024-07-29 09:44:47,181 - INFO - Processing governorate: Mahdia
2024-07-29 09:44:52,284 - INFO - Processing governorate: Manouba
2024-07-29 09:44:57,425 - INFO - Processing governorate: Mednine
2024-07-29 09:45:02,698 - INFO - Processing governorate: Monastir
2024-07-29 09:45:07,840 - INFO -

In [1]:
df.head(20)

NameError: name 'df' is not defined