In [1]:
from datetime import datetime,date
from functools import wraps
from selenium.webdriver import Chrome
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException, NoSuchWindowException, ElementNotInteractableException, StaleElementReferenceException
from selenium.webdriver.remote.webelement import WebElement
from selenium.webdriver.support.ui import Select, WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from urllib.parse import urlparse, unquote

import csv
import hashlib
import logging
import os
import requests
import re
import time

from pathlib import Path
from typing import Tuple

In [2]:
# Attach the default handler to the root logger, then let INFO records through.
logging.basicConfig()
logging.root.setLevel(logging.INFO)

In [3]:
# Notebook-wide settings: municipality name, download folder and listing URL.
comune = "brindisi"
download_path = "./ap"
url_comune = "https://servizi.comune.brindisi.it/openweb/albo/albo_pretorio.php"

In [4]:
def retry(max_attempts=3, delay=10, exceptions=(Exception,)):
    """Build a decorator that re-runs a function when it raises.

    Args:
        max_attempts: maximum number of calls before giving up.
        delay: seconds to sleep between two consecutive attempts.
        exceptions: exception class (or tuple of classes) that triggers a
            retry; any other exception propagates immediately.

    Returns:
        A decorator. The decorated function returns the result of the first
        successful call.

    Raises:
        RuntimeError: when every attempt failed. The last caught exception is
            attached as ``__cause__`` so the real failure stays visible.
    """
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            last_error = None
            for attempt in range(1, max_attempts + 1):
                try:
                    return func(*args, **kwargs)
                except exceptions as e:
                    last_error = e
                    # Lazy %-formatting: the exception is actually rendered in
                    # the message instead of being passed as a stray argument.
                    logging.info("Attempt %s failed: %s", attempt, e)
                    # No point in waiting once there is no attempt left.
                    if attempt < max_attempts:
                        time.sleep(delay)
            raise RuntimeError(
                f"Function {func.__name__} failed after {max_attempts} attempts"
            ) from last_error
        return wrapper
    return decorator

# Driver manager

In [5]:
class WebDriverManager:
    """Owns a single Selenium Chrome driver and re-creates it when it is gone.

    Args:
        DIR_DOWNLOAD: directory Chrome is configured to download files into.
        chromedriver_path: location of the chromedriver executable; defaults
            to the executable sitting next to the notebook.
    """

    def __init__(self, DIR_DOWNLOAD, chromedriver_path=r".\chromedriver.exe"):
        # Keep the directory that was passed in instead of relying on a
        # notebook-level global.
        self.dir_download = DIR_DOWNLOAD
        self.chromedriver_path = chromedriver_path
        self.driver = None

    def start_driver(self, url):
        """Start Chrome on ``url`` unless a live driver already exists.

        Returns:
            The live driver (either the existing one or the one just started).
        """
        if not self.is_driver_open():
            # Release a dead driver first so its process is not leaked.
            if self.driver is not None:
                try:
                    self.driver.quit()
                except Exception as e:
                    logging.info(f'Could not quit stale driver: {e}')
                self.driver = None

            opts = Options()
            prefs = {
                # Chrome expects an absolute path for the download directory.
                "download.default_directory": os.path.abspath(self.dir_download),
                "directory_upgrade": True,
                "profile.default_content_settings.popups": 0,
            }
            opts.add_experimental_option("prefs", prefs)
            self.driver = Chrome(service=Service(self.chromedriver_path), options=opts)

            self.driver.get(url)
            time.sleep(10)  # Waiting for main page to load
        return self.driver

    def is_driver_open(self):
        """Return True when a driver exists and still answers a basic query."""
        if self.driver:
            try:
                # Reading any property fails when the browser window is gone.
                self.driver.current_url
                return True
            except NoSuchWindowException:
                return False
            except Exception as e:
                logging.info(f'Driver seems to be closed: {e}')
                return False
        return False

    def get_driver(self, link):
        """Return a live driver, (re)starting Chrome on ``link`` if needed.

        An already-open driver is returned as is: it is NOT navigated to
        ``link``.
        """
        if not self.is_driver_open():
            if self.driver is None:
                logging.info('Starting driver')
            else:
                logging.warning('Driver is not responding, reopening')
            self.driver = self.start_driver(link)

        return self.driver

    def close_driver(self):
        """Quit the browser (if any) and forget the driver instance."""
        if self.driver:
            try:
                self.driver.quit()
            finally:
                # Drop the reference even when quit() itself fails.
                self.driver = None

In [6]:
# One shared manager for the whole notebook, downloading into download_path.
driver_manager = WebDriverManager(DIR_DOWNLOAD=download_path)

In [7]:
# Open (or reuse) the browser; a fresh browser starts on the albo listing page.
driver = driver_manager.get_driver(link=url_comune)



# Loop through all pages of a section

In [8]:
# Listing container first, then one element per atto row inside it.
body_e = driver.find_element(by=By.ID, value='tabella_albo')
rows_e = body_e.find_elements(by=By.CLASS_NAME, value='paginated_element')

## Processing dates

In [9]:
# Each date lives in the div whose label carries the matching caption.
xpath_affissione = ".//div[label/text()='Data affissione:']"
xpath_fine_pubblicazione = ".//div[label/text()='Fine Pubblicazione:']"

for row_e in rows_e:
    data_affissione_div = row_e.find_element(By.XPATH, xpath_affissione)
    fine_pubblicazione_div = row_e.find_element(By.XPATH, xpath_fine_pubblicazione)
    date_texts = (data_affissione_div.text, fine_pubblicazione_div.text)
    print(*date_texts)
    

06/06/2024 14/06/2024
06/06/2024 21/06/2024
06/06/2024 21/06/2024
06/06/2024 21/06/2024
06/06/2024 21/06/2024
06/06/2024 21/06/2024
06/06/2024 06/07/2024
05/06/2024 06/06/2024
05/06/2024 06/06/2024
05/06/2024 06/06/2024
05/06/2024 06/06/2024
05/06/2024 06/06/2024
05/06/2024 06/06/2024
05/06/2024 06/06/2024
05/06/2024 06/06/2024


In [10]:
# while True:
#     try:
#         # Waiting for the tabella element
#         body_e = WebDriverWait(driver, 10).until(
#             EC.presence_of_element_located((By.ID, 'tabella_albo'))
#         )

#         # Getting each row, one for each atto
#         rows_e = body_e.find_elements(By.CLASS_NAME, 'paginated_element')

#         # Process rows
#         if rows_e:
#             for r_e in rows_e:
#                 try:
#                     progressivo = r_e.find_element(By.TAG_NAME, 'div').text
#                     print(progressivo)
#                 except StaleElementReferenceException:
#                     # Skip this row if it became stale
#                     continue
#         else:
#             break

#         # Finding next page button
#         next_button = WebDriverWait(driver, 10).until(
#             EC.element_to_be_clickable((By.XPATH, "//a[@class='button' and @title='Successiva']"))
#         )

#         # Use JavaScript to click
#         driver.execute_script("arguments[0].click();", next_button)

#         # Waiting for the new content to load
#         time.sleep(2)

#     except Exception as e:
#         print("No more pages or an error occurred:", e)
#         break


## Getting the publication number (numero progressivo)

In [11]:
# Re-read the listing so the row elements are fresh for the next cell.
body_e = driver.find_element(by=By.ID, value='tabella_albo')
rows_e = body_e.find_elements(by=By.CLASS_NAME, value='paginated_element')

In [12]:
# The progressive number sits in the div labelled 'Numero progressivo:'.
xpath_progressivo = ".//div[label/text()='Numero progressivo:']"

for row_e in rows_e:
    print(row_e.find_element(By.XPATH, xpath_progressivo).text)

2024/0002737
2024/0002736
2024/0002735
2024/0002734
2024/0002733
2024/0002732
2024/0002731
2024/0002730
2024/0002729
2024/0002728
2024/0002727
2024/0002726
2024/0002725
2024/0002724
2024/0002723


In [47]:
def get_filename_from_cd(cd):
    """Extract the file name from a ``Content-Disposition`` header value.

    Both the plain ``filename=`` parameter (quoted or not) and the extended
    ``filename*=charset'lang'value`` form are understood; the extended form
    wins when both are present. Parameter names are matched case-insensitively.

    Args:
        cd: raw header value, or None/empty when the header is missing.

    Returns:
        The bare file name (any directory part is dropped, because the value
        comes from the server and is later used to open a local file), or None
        when no usable file name is present.
    """
    if not cd:
        return None

    fname = None

    # Extended parameter, e.g. filename*=UTF-8''atto%20n1.pdf
    extended = re.search(r"\bfilename\*\s*=\s*([^;]+)", cd, flags=re.IGNORECASE)
    if extended:
        raw = extended.group(1).strip().strip('"')
        parts = raw.split("'", 2)
        if len(parts) == 3:
            charset, _language, encoded = parts
            try:
                fname = unquote(encoded, encoding=charset or 'utf-8', errors='replace')
            except LookupError:
                # Unknown charset label: fall back to UTF-8 decoding.
                fname = unquote(encoded)
        else:
            fname = unquote(raw)

    if not fname:
        # Plain parameter: a quoted string, or a bare token that ends at the
        # next ';' so later quoted parameters are not mistaken for the name.
        plain = re.search(r'\bfilename\s*=\s*(?:"([^"]*)"|([^;]*))', cd, flags=re.IGNORECASE)
        if plain:
            fname = plain.group(1) if plain.group(1) is not None else plain.group(2)

    if not fname:
        return None

    # Keep only the last path component, whatever separator the server used.
    fname = fname.replace('\\', '/').rsplit('/', 1)[-1].strip()
    return fname or None

In [56]:
def download_single_file_external_page(link_file_external: str) -> None:
    """Open an intermediate page and download the PDF behind its button.

    The page is loaded in the shared ``driver``, the ``href`` of the element
    matching ``btn.btn-primary`` is read, and the target is downloaded with
    ``requests`` only when its Content-Type is PDF. The file is stored inside
    ``download_path``.

    Args:
        link_file_external: URL of the page that contains the download button.

    Returns:
        None. Every failure is logged and swallowed (best-effort download).
    """
    try:
        driver.get(link_file_external)
        scarica_button = driver.find_element(By.CLASS_NAME, 'btn.btn-primary')
        link_file = scarica_button.get_attribute('href')
        if not link_file:
            logging.info(f'Could not get file from external link: no href on {link_file_external}')
            return

        # Send a HEAD request to check the content type before downloading.
        response = requests.head(link_file, allow_redirects=True, timeout=30)
        content_type = response.headers.get('Content-Type', '')
        if 'application/pdf' not in content_type:
            return

        response = requests.get(link_file, timeout=30)
        # Do not save an HTTP error page as if it were the document.
        response.raise_for_status()

        file_name = get_filename_from_cd(response.headers.get('Content-Disposition'))
        if not file_name:
            # No usable Content-Disposition: derive the name from the final URL.
            file_name = unquote(urlparse(response.url).path)
        # The name comes from the server: keep only its last path component.
        file_name = file_name.replace('\\', '/').rsplit('/', 1)[-1].strip() or 'download.pdf'

        os.makedirs(download_path, exist_ok=True)
        with open(os.path.join(download_path, file_name), 'wb') as file:
            file.write(response.content)
    except Exception as e:
        logging.info(f'Could not get file from external link: {e}')


In [57]:
def download_single_file_atto_page(link: str):
    """Download the document behind ``link`` into ``download_path``.

    A HEAD request decides what to do: a PDF is downloaded directly with
    ``requests``; any other content type is treated as an intermediate page and
    handed to ``download_single_file_external_page``; a response without a
    Content-Type header is only reported.

    The browser is not pointed at ``link`` here: the direct download does not
    need it, and the intermediate-page helper loads the page itself.

    Args:
        link: URL found on the atto detail page.

    Raises:
        requests.RequestException: on network errors or an HTTP error status of
            the direct download (the caller is expected to handle it).
    """
    # Send a HEAD request to check the content type
    response = requests.head(link, allow_redirects=True, timeout=30)
    content_type = response.headers.get('Content-Type')

    if content_type is None:
        logging.warning(f"No Content-Type header: {link}")
        return

    if 'application/pdf' not in content_type:
        download_single_file_external_page(link_file_external=link)
        return

    # This is a PDF file, download it
    response = requests.get(link, timeout=30)
    # Do not save an HTTP error page as if it were the document.
    response.raise_for_status()

    file_name = get_filename_from_cd(response.headers.get('Content-Disposition'))
    if not file_name:
        # No usable Content-Disposition: derive the name from the final URL.
        file_name = unquote(urlparse(response.url).path)
    # The name comes from the server: keep only its last path component.
    file_name = file_name.replace('\\', '/').rsplit('/', 1)[-1].strip() or 'download.pdf'

    os.makedirs(download_path, exist_ok=True)
    with open(os.path.join(download_path, file_name), 'wb') as file:
        file.write(response.content)

In [58]:
def try_download_file():
    """Download every file linked from the atto detail page currently open.

    The links are read from the element matching
    ``card-body.pe-0.ps-4.ps-md-5`` in the shared ``driver``. All hrefs are
    collected before any download starts, because downloading may navigate the
    browser away from the page. Each link is handled independently, so one
    failing download does not prevent the remaining ones.
    """
    try:
        body_e = driver.find_element(By.CLASS_NAME, 'card-body.pe-0.ps-4.ps-md-5')

        # Find the link elements and resolve their targets up front.
        list_link_e = body_e.find_elements(By.TAG_NAME, 'a')
        list_link = [l_e.get_attribute('href') for l_e in list_link_e]
    except Exception as e:
        logging.warning(f'Could not download files: {e}')
        return

    print(f'Found {len(list_link_e)} elements')
    for l in list_link:
        if not l:
            # Anchor without an href: nothing to request.
            continue
        try:
            download_single_file_atto_page(l)
        except Exception as e:
            logging.warning(f'Could not download file {l}: {e}')

# Downloading files

In [59]:
# body_e = driver.find_element(By.ID, 'tabella_albo')

# rows_e = body_e.find_elements(By.CLASS_NAME, 'paginated_element')

In [60]:
# links_atti = [r_e.find_element(By.TAG_NAME, 'a').get_attribute('href') for r_e in rows_e]

In [61]:
# links_atti

In [62]:
# for link in links_atti:
#     driver.get(link)
#     try_download_file()

In [63]:
# Try the download helpers on one known atto detail page.
driver.get(url='https://servizi.comune.brindisi.it/openweb/albo/albo_dettagli.php?id=11479')
try_download_file()

Found 4 elements


# SectionsAlbo class: collecting the atti (oggetti) of a section

In [None]:
class SectionsAlbo:
    """Collect the atti of one albo section and download their files.

    On construction the listing page is opened, the requested section is
    selected, and every result page is walked to build ``dict_links_oggetti``:
    ``{detail link: {'n_atto', 'start_date', 'end_date'}}``.

    Args:
        driver_manager: provider of the shared Selenium driver.
        section_otion: value of the section option to select (parameter
            spelling kept for backward compatibility).
        dir_download: directory where downloaded files are stored; created when
            missing.
        last_date_update: when not None, only atti whose start date is later
            than this are kept.
        url: listing page URL; defaults to the notebook-level ``url_comune``.
    """

    # Selectors and formats taken from the exploratory cells of this notebook.
    ID_SECTION_SELECT = 'tendinaTipoPratiche'
    ID_TABLE = 'tabella_albo'
    CLASS_ROW = 'paginated_element'
    XPATH_NEXT_PAGE = "//a[@class='button' and @title='Successiva']"
    LABEL_N_ATTO = 'Numero progressivo:'
    LABEL_START_DATE = 'Data affissione:'
    LABEL_END_DATE = 'Fine Pubblicazione:'
    DATE_FORMAT = '%d/%m/%Y'
    REQUEST_TIMEOUT = 30

    def __init__(self, driver_manager: "WebDriverManager", section_otion: str, dir_download: str,
                 last_date_update: datetime, url: str = None):
        self.driver_manager = driver_manager
        self.url = url if url is not None else url_comune
        self.dir_download = self.check_dir_download(dir_download)
        self.last_date_update = last_date_update
        self.driver = self.get_section(section_option=section_otion)
        self.dict_links_oggetti = self.loop_all_pages()


    def check_dir_download(self, dir_download):
        """Create ``dir_download`` when it does not exist and return it."""
        os.makedirs(dir_download, exist_ok=True)
        return dir_download


    def get_section(self, section_option: str):
        """Open the listing page, select ``section_option`` and return the driver."""
        driver = self.driver_manager.get_driver(self.url)
        # get_driver() leaves an already-open browser where it is, so load the
        # listing explicitly.
        driver.get(self.url)

        select_e = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.ID, self.ID_SECTION_SELECT))
        )
        Select(select_e).select_by_value(section_option)
        # NOTE(review): assumes that changing the select is enough to filter the
        # listing; if the page needs an explicit search/submit click, add it here.
        time.sleep(2)
        return driver


    def _get_field_text(self, ogg_e: "WebElement", label: str) -> str:
        """Return the text of the div whose label is ``label``, without the caption."""
        field_e = ogg_e.find_element(By.XPATH, f".//div[label/text()='{label}']")
        return field_e.text.replace(label, '').strip()


    def _get_date(self, ogg_e: "WebElement", label: str) -> datetime:
        """Parse the dd/mm/yyyy date shown in the field labelled ``label``."""
        text = self._get_field_text(ogg_e, label)
        match = re.search(r'\d{2}/\d{2}/\d{4}', text)
        if not match:
            raise ValueError(f'no dd/mm/yyyy date in {text!r}')
        return datetime.strptime(match.group(0), self.DATE_FORMAT)


    def get_n_atto(self, ogg_e: "WebElement")->str:
        """Return the progressive number of the atto row, or None on failure."""
        try:
            return self._get_field_text(ogg_e, self.LABEL_N_ATTO)
        except Exception as e:
            logging.warning(f'SectionsAlbo: Could not get n_atto: {e}')
            return None


    def get_start_date(self, ogg_e: "WebElement") -> datetime:
        """Return the 'Data affissione:' date of the row, or None on failure."""
        try:
            return self._get_date(ogg_e, self.LABEL_START_DATE)
        except Exception as e:
            logging.warning(f'SectionsAlbo: Could not get start date: {e}')
            return None


    def get_end_date(self, ogg_e: "WebElement") -> datetime:
        """Return the 'Fine Pubblicazione:' date of the row, or None on failure."""
        try:
            return self._get_date(ogg_e, self.LABEL_END_DATE)
        except Exception as e:
            logging.warning(f'SectionsAlbo: Could not get end date: {e}')
            return None


    def get_oggetto_link(self, ogg_e: "WebElement")->str:
        """Return the href of the first anchor in the row, or None on failure."""
        try:
            a_element = ogg_e.find_element(By.TAG_NAME, 'a')
            return a_element.get_attribute('href')
        except Exception as e:
            logging.warning(f'SectionsAlbo: Could not get link from oggetto {e}')
            return None


    @staticmethod
    def rename_file(file_name:str, link_ogg: str)->str:
        """Prefix ``file_name`` with an 8-hex-digit hash of ``link_ogg``."""
        # Generate a unique identifier for the file
        link_hash = hashlib.sha256(link_ogg.encode()).hexdigest()[:8]

        return f'{link_hash}_{file_name}'


    def download_file(self, file_name:str, link_ogg:str)->Path:
        """Download ``link_ogg`` into ``dir_download`` as ``file_name``.

        Returns:
            The path of the saved file, or None when the download failed (the
            failure is logged).
        """
        try:
            response = requests.get(link_ogg, timeout=self.REQUEST_TIMEOUT)
            # Do not save an HTTP error page as if it were the document.
            response.raise_for_status()

            file_path = Path(self.dir_download, file_name)
            with open(file_path, 'wb') as f:
                f.write(response.content)

            return file_path
        except Exception as e:
            logging.warning(f'SectionsAlbo: It was not possible to download file: {e}')
            return None


    def loop_all_pages(self) -> dict:
        """Walk the result pages and collect ``{link: info}`` for every atto.

        Pagination follows the 'Successiva' button. The walk stops when a page
        brings no link that was not seen before, or when no visible next button
        is found, so it cannot loop forever on the last page.
        """
        all_oggetti_dict = {}
        seen_links = set()

        while True:
            try:
                body_e = WebDriverWait(self.driver, 10).until(
                    EC.presence_of_element_located((By.ID, self.ID_TABLE))
                )
                rows_e = body_e.find_elements(By.CLASS_NAME, self.CLASS_ROW)
            except Exception as e:
                logging.warning(f'SectionsAlbo: Could not read the atti table: {e}')
                break

            page_had_new = False
            for r_e in rows_e:
                for link, info in self.get_main_info_oggetto(oggetto=r_e).items():
                    if link is None or link in seen_links:
                        continue
                    seen_links.add(link)
                    page_had_new = True

                    start_date = info['start_date']
                    is_old = (self.last_date_update is not None
                              and start_date is not None
                              and start_date <= self.last_date_update)
                    if not is_old:
                        all_oggetti_dict[link] = info

            if not page_had_new:
                break

            next_buttons = [b for b in self.driver.find_elements(By.XPATH, self.XPATH_NEXT_PAGE)
                            if b.is_displayed()]
            if not next_buttons:
                break

            # Click through JavaScript, as in the exploratory pagination cell.
            self.driver.execute_script("arguments[0].click();", next_buttons[0])
            time.sleep(2)  # Waiting for the new content to load

        return all_oggetti_dict


    def get_main_info_oggetto(self, oggetto: "WebElement", info_atto: str = None) -> dict:
        """Return ``{link: {'n_atto', 'start_date', 'end_date'}}`` for one row.

        ``info_atto`` is accepted for backward compatibility and is not used:
        every value is read from the row element itself.
        """
        n_atto = self.get_n_atto(ogg_e=oggetto)
        start_date = self.get_start_date(ogg_e=oggetto)
        end_date = self.get_end_date(ogg_e=oggetto)
        link = self.get_oggetto_link(ogg_e=oggetto)

        return {link: dict(n_atto=n_atto,
                           start_date=start_date,
                           end_date=end_date)}


    def get_dict_objects(self):
        """Visit every collected atto, download its files and describe them.

        Returns:
            ``{original file name: info}`` where ``info`` is the atto info
            extended with ``file_name``, ``link`` and ``internal_path``
            (``internal_path`` is None when the download failed).
        """
        dict_ogg = {}
        logging.info(f'Found {len(self.dict_links_oggetti)} oggetti')
        for link, dict_info in self.dict_links_oggetti.items():
            try:
                self.driver.get(link)

                # NOTE(review): try_download_file() in this notebook locates the
                # attachments through 'card-body.pe-0.ps-4.ps-md-5'; confirm that
                # the 'documento' container and the "- name -" sibling text
                # exist on this site.
                body_e = self.driver.find_element(By.CLASS_NAME, 'documento')
                files_e = body_e.find_elements(By.TAG_NAME, 'a')
            except Exception as e:
                logging.warning(f'SectionsAlbo: Could not read files of {link}: {e}')
                continue

            for f_e in files_e:
                f_link = f_e.get_attribute('href')
                if not f_link:
                    continue

                # Getting the sibling text where we can find the file name;
                # an anchor without a following node yields null.
                sibling_text = self.driver.execute_script("""
                                                          var sibling = arguments[0].nextSibling;
                                                          return sibling ? sibling.nodeValue : null;
                                                          """, f_e)

                match = re.search(r'- (.*?) -', (sibling_text or '').strip())
                if match:
                    f_name = match.group(1)
                else:
                    # No "- name -" text: fall back to the last URL component.
                    f_name = unquote(os.path.basename(urlparse(f_link).path))
                if not f_name:
                    logging.warning(f'SectionsAlbo: Could not get file name for {f_link}')
                    continue

                file_name = self.rename_file(f_name, f_link)

                # Update dict_info with new keys and values
                updated_dict_info = dict_info.copy()
                updated_dict_info.update(dict(file_name=file_name,
                                              link=f_link,
                                              internal_path=self.download_file(file_name=file_name, link_ogg=f_link)))

                dict_ogg[f_name] = updated_dict_info

        return dict_ogg


In [None]:
# Build the collector for section 'MAT01' with no date filter.
sections_manager = SectionsAlbo(
    driver_manager,
    'MAT01',
    download_path,
    None,
)

In [None]:
# Inspect the mapping that SectionsAlbo.__init__ stored from loop_all_pages().
sections_manager.dict_links_oggetti

In [None]:
# Visit every collected link, download its files and show the resulting dict.
sections_manager.get_dict_objects()

In [None]:
def get_all_sections(driver_manager: WebDriverManager,
                     url_comune: str) -> dict:
    """Map every section name of the albo to its select-option value.

    Each option of the ``tendinaTipoPratiche`` select is read once, so a name
    is always paired with the value of the very same option. Options with an
    empty value and the catch-all option named 'Tutti' are skipped.

    Args:
        driver_manager: provider of the shared Selenium driver.
        url_comune: URL of the albo listing page that contains the select.

    Returns:
        ``{section name: option value}``.
    """
    driver = driver_manager.get_driver(url_comune)

    # get_driver() only loads the URL when it has to (re)start the browser, so
    # make sure the listing page is the one currently displayed.
    if driver.current_url != url_comune:
        driver.get(url_comune)

    # find select menu
    select_menu_e = driver.find_element(By.ID, 'tendinaTipoPratiche')

    sections = {}
    for option_e in select_menu_e.find_elements(By.TAG_NAME, 'option'):
        option_value = option_e.get_attribute('value')
        section_name = option_e.text.strip()
        if not option_value or section_name == 'Tutti':
            continue
        sections[section_name] = option_value

    return sections
    

In [None]:
# Read the available sections straight from the listing page.
sections = get_all_sections(driver_manager, url_comune)

In [None]:
# Show the mapping returned by get_all_sections (section name -> option value).
sections

In [None]:
# Scrape every section into its own sub-folder of download_path.
for section_name, section_value in sections.items():
    print(section_name)
    section_dir = f'{download_path}/{section_name}'
    section_albo = SectionsAlbo(
        driver_manager=driver_manager,
        section_otion=section_value,
        dir_download=section_dir,
        last_date_update=None,
    )
    section_albo.get_dict_objects()