In [1]:
from datetime import datetime,date
from functools import wraps
from selenium.webdriver import Chrome
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException, NoSuchWindowException, StaleElementReferenceException, InvalidSessionIdException
from selenium.webdriver.remote.webelement import WebElement
from selenium.webdriver.support.ui import Select, WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from urllib.parse import urlparse, unquote

import csv
import hashlib
import logging
import os
import requests
import re
import time
import pdb
import threading

from pathlib import Path
from typing import Tuple, Union

In [2]:
# Configure the root logger with the default handler/format (basicConfig only
# does this when the root logger has no handlers yet), then make sure
# INFO-level messages and above are emitted.
logging.basicConfig()
logging.getLogger().setLevel(logging.INFO)

In [3]:
# Notebook configuration.
comune = 'brindisi'  # presumably the municipality identifier; not read by any cell visible here
download_path = './pt'  # base directory for downloads (passed to WebDriverManager and used to build per-section folders)
url_comune = 'https://servizi.comune.brindisi.it/openweb/trasparenza/'  # home page opened by the driver and by get_sections()

In [4]:
def retry(max_attempts=3, delay=10, exceptions=(Exception,)):
    """Build a decorator that re-runs a function when it raises one of ``exceptions``.

    Args:
        max_attempts: Total number of calls made before giving up.
        delay: Seconds to sleep between two consecutive attempts.
        exceptions: Tuple of exception types that trigger a new attempt; any
            other exception propagates immediately.

    Returns:
        A decorator. The decorated function returns the result of the first
        successful call.

    Raises:
        RuntimeError: When every attempt failed (or ``max_attempts`` is less
            than 1). The last caught exception is chained as ``__cause__``.
    """
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            last_error = None
            for attempt in range(1, max_attempts + 1):
                try:
                    return func(*args, **kwargs)
                except exceptions as e:
                    last_error = e
                    # The exception is interpolated into the message itself:
                    # passing it as an extra positional argument makes logging
                    # treat it as a %-format argument and fail to render.
                    logging.info(f"Attempt {attempt} failed: {e}")
                    # No point in waiting when there is no further attempt.
                    if attempt < max_attempts:
                        time.sleep(delay)
            raise RuntimeError(
                f"Function {func.__name__} failed after {max_attempts} attempts"
            ) from last_error
        return wrapper
    return decorator

In [5]:
# File-name suffixes checked with str.endswith(); in the cells visible here they
# are only referenced by commented-out code.
extensions = ('.pdf', '.odt', '.xls', '.csv', '.ods', '.xlsx', '.doc', '.zip', '.docx', '.sxc')

# Driver manager

In [6]:
class WebDriverManager:
    """Own a single Selenium Chrome driver and recreate it when it stops responding.

    Args:
        DIR_DOWNLOAD: Directory Chrome is configured to save downloads into.
        driver_path: Path of the chromedriver executable handed to ``Service``.
            Defaults to the location the notebook originally hard-coded.
    """

    def __init__(self, DIR_DOWNLOAD, driver_path=r".\chromedriver.exe"):
        self.driver = None
        self.output = DIR_DOWNLOAD
        self.driver_path = driver_path

    def _chrome_prefs(self):
        """Return the Chrome preferences used for every new driver."""
        return {
            # Chrome expects an absolute download directory, so the configured
            # (possibly relative) path is resolved here.
            "download.default_directory": os.path.abspath(self.output),
            "directory_upgrade": True,
            "profile.default_content_settings.popups": 0,
        }

    def _discard_driver(self):
        """Best-effort shutdown of a driver that is about to be replaced."""
        if self.driver is None:
            return
        try:
            self.driver.quit()
        except Exception as e:
            logging.info(f'Could not quit the previous driver: {e}')
        finally:
            self.driver = None

    def start_driver(self, url):
        """Start Chrome and open ``url`` unless a responsive driver already exists.

        Args:
            url: Page loaded right after the browser starts.

        Returns:
            The active driver instance (the existing one when it is still open).
        """
        if not self.is_driver_open():
            # Release the old browser/chromedriver process before replacing it.
            self._discard_driver()

            opts = Options()
            opts.add_experimental_option("prefs", self._chrome_prefs())
            self.driver = Chrome(service=Service(self.driver_path), options=opts)

            self.driver.get(url)
            time.sleep(10)  # Waiting for main page to load
        return self.driver

    def is_driver_open(self):
        """Return True when a driver exists and still answers a trivial request."""
        if self.driver:
            try:
                # Accessing a property forces a round trip to the browser.
                self.driver.current_url
                return True
            except NoSuchWindowException:
                return False
            except Exception as e:
                logging.info(f'Driver seems to be closed: {e}')
                return False
        return False

    def get_driver(self, link):
        """Return a driver positioned on ``link``, restarting the browser if needed.

        Args:
            link: URL to load.

        Returns:
            The active driver instance.
        """
        if not self.is_driver_open():
            logging.warning(f'Driver is not responding, reopening')
            self.driver = self.start_driver(link)
        else:
            logging.info('Driver is running, getting items from current section page')
            self.driver.get(link)
        return self.driver

    def close_driver(self):
        """Quit the browser, if any, and forget the driver reference."""
        if self.driver:
            try:
                self.driver.quit()
            finally:
                # Drop the reference even when quit() raises, so the next
                # get_driver() call starts from a clean state.
                self.driver = None

Initialize the webdriver and click the cookies button.

In [7]:
# Create the shared driver manager and open the portal home page; `driver` is the
# module-level browser handle used by get_sections() and get_subsections().
driver_manager = WebDriverManager(download_path)
driver = driver_manager.get_driver(link=url_comune)



In [12]:
# body_e = driver.find_element(By.ID, 'lista_trasparenza_categorie')   

In [13]:
# sections_e = body_e.find_elements(By.TAG_NAME, 'h3')

In [14]:
# for s_e in sections_e:
#     s_name = s_e.text
#     a_e = s_e.find_element(By.TAG_NAME, 'a')
#     s_link = a_e.get_attribute('href')
#     print(f'{s_name}: {s_link}')

In [15]:
def get_sections() -> dict:
    """Collect the sections listed on the portal home page.

    Loads ``url_comune`` in the module-level ``driver`` and walks the direct
    ``div`` children of the element with id ``lista_trasparenza_categorie``.
    Each child is expected to hold an ``h3`` heading wrapping a link.

    Returns:
        dict: ``{heading text: {'href': link target}}``.
    """
    driver.get(url_comune)
    container = driver.find_element(By.ID, 'lista_trasparenza_categorie')

    found = {}
    for block in container.find_elements(By.XPATH, './div'):
        heading = block.find_element(By.TAG_NAME, 'h3')
        title = heading.text
        target = heading.find_element(By.TAG_NAME, 'a').get_attribute('href')
        found[title] = {'href': target}

    return found

In [16]:
# sections = get_sections()

In [17]:
# sections

In [18]:
def get_subsections(s_link: str) -> dict:
    """Collect the subsection links shown on a section page.

    Loads ``s_link`` in the module-level ``driver`` and reads the first link of
    every direct ``div`` child of the element with id ``lista_servizi_privati``.

    Args:
        s_link: URL of the section page.

    Returns:
        ``{link text: {'href': link target}}``, or ``None`` when the container
        or one of the links is missing (``NoSuchElementException``).
    """
    driver.get(s_link)

    found = {}
    try:
        container = driver.find_element(By.ID, 'lista_servizi_privati')
        for block in container.find_elements(By.XPATH, './div'):
            anchor = block.find_element(By.TAG_NAME, 'a')
            label = anchor.text
            target = anchor.get_attribute('href')
            found[label] = {'href': target}
    except NoSuchElementException:
        # Nothing is returned in this case, not even entries already collected.
        return None
    return found

In [19]:
# for s, s_values in sections.items():
#     s_link = s_values['href']
#     print(f'{s}: {s_link}')
#     subsections = get_subsections(s_link=s_link)
#     if subsections:
#         for ss, ss_values in subsections.items():
#             ss_link = ss_values['href']
#             print(f'\t{ss}: {ss_link}')

In [27]:
class SectionsTableFileExtractor:
    """Collect the atti listed in a paginated section table and download their attachments.

    On construction the section page is opened and every page of the table is
    walked (see ``loop_all_pages``); the links found are stored in
    ``dict_links_oggetti``. ``get_dict_objects`` then visits each link and
    downloads the attached files into ``dir_download``.

    Args:
        driver_manager: Object exposing ``get_driver(link=...)`` that returns a
            Selenium driver.
        link_section: URL of the section table.
        dir_download: Directory the files are written to (created if missing).
        last_date_update: Rows whose start date is not later than this value
            are skipped; ``None`` disables the filter. A ``datetime`` or a
            ``date`` is accepted.
        files_in_db: File names already stored; files with these names are
            left out of the dictionary returned by ``get_dict_objects``.
    """

    # Layouts tried when parsing the date text scraped from a row.
    # NOTE(review): the portal's date format is not visible in this notebook;
    # day-first layouts are assumed - confirm against a real page.
    DATE_FORMATS = ('%d/%m/%Y', '%d-%m-%Y', '%Y-%m-%d')

    # Seconds before an HTTP request is abandoned, so one unresponsive link
    # cannot block the whole run.
    REQUEST_TIMEOUT = 60

    def __init__(self,
                 driver_manager,
                 link_section: str,
                 dir_download: str,
                 last_date_update: datetime,
                 files_in_db: list[str]):
        self.driver             = driver_manager.get_driver(link=link_section)
        self.link_section       = link_section
        self.dir_download       = self.check_dir_download(dir_download)
        self.last_date_update   = last_date_update
        self.files_in_db        = files_in_db
        # Scraping runs last so every attribute above is already available.
        self.dict_links_oggetti = self.loop_all_pages()

    def check_dir_download(self, dir_download):
        """Create ``dir_download`` when it does not exist and return it unchanged."""
        os.makedirs(dir_download, exist_ok=True)
        return dir_download

    @staticmethod
    def _read_labelled_value(ogg_e: WebElement, label: str) -> str:
        """Return the text that follows ``label`` inside the div holding that label.

        Raises:
            NoSuchElementException: When no div with such a label exists.
        """
        holder = ogg_e.find_element(By.XPATH, f'.//div[label[text()="{label}"]]')
        return holder.text.split(label)[-1].strip()

    def get_n_atto(self, ogg_e: WebElement) -> Union[str, None]:
        """Return the text after the 'Nr. Atto:' label, or None when unavailable."""
        try:
            return self._read_labelled_value(ogg_e, 'Nr. Atto:')
        except NoSuchElementException:
            return None
        except Exception as e:
            logging.warning(f'SectionsTable: Could not get n_atto: {e}')
            return None

    def get_start_date(self, ogg_e: WebElement) -> Union[str, None]:
        """Return the text after 'Data Atto:' (or, failing that, 'Data inizio:').

        Returns None when neither label is present. Errors other than a missing
        'Data Atto:' element raised by the first lookup propagate, which lets
        the caller detect stale rows.
        """
        try:
            return self._read_labelled_value(ogg_e, 'Data Atto:')
        except NoSuchElementException:
            pass

        try:
            return self._read_labelled_value(ogg_e, 'Data inizio:')
        except NoSuchElementException:
            return None
        except Exception as e:
            logging.warning(f'SectionsTable: Could not get start date: {e}')
            return None

    def get_end_date(self, ogg_e: WebElement) -> Union[str, None]:
        """Return the text after the 'Data fine:' label, or None when unavailable."""
        try:
            return self._read_labelled_value(ogg_e, 'Data fine:')
        except NoSuchElementException:
            return None
        except Exception as e:
            logging.warning(f'SectionsTable: Could not get end date: {e}')
            return None

    @staticmethod
    def get_oggetto_link(ogg_e: WebElement) -> Union[str, None]:
        """Return the href of the first link inside ``ogg_e``, or None on failure."""
        try:
            a_element = ogg_e.find_element(By.TAG_NAME, 'a')
            return a_element.get_attribute('href')
        except Exception as e:
            logging.warning(f'SectionsTable: Could not get link from oggetto {e}')
            return None

    @staticmethod
    def rename_file(file_name: str, link_ogg: str) -> str:
        """Prefix ``file_name`` with the first 8 hex digits of the SHA-256 of ``link_ogg``."""
        file_id = hashlib.sha256(link_ogg.encode()).hexdigest()[:8]
        return f'{file_id}_{file_name}'

    @staticmethod
    def _sanitize_file_name(file_name: str) -> str:
        """Replace path separators and characters invalid in file names with '_'.

        The name comes from an HTTP header or a URL, so it must not be able to
        point outside the download directory.
        """
        return re.sub(r'[\\/:*?"<>|]', '_', file_name)

    @classmethod
    def _parse_date(cls, text):
        """Parse ``text`` with ``DATE_FORMATS``; return a ``date`` or None."""
        if not text:
            return None
        for fmt in cls.DATE_FORMATS:
            try:
                return datetime.strptime(text.strip(), fmt).date()
            except ValueError:
                continue
        return None

    def _is_after_last_update(self, start_date_text) -> bool:
        """Tell whether a row must be collected given ``last_date_update``.

        Rows are kept when no cut-off is configured or when their date cannot
        be determined, because they cannot be proven to be already stored.
        """
        if self.last_date_update is None:
            return True

        row_date = self._parse_date(start_date_text)
        if row_date is None:
            return True

        cutoff = self.last_date_update
        # datetime and date objects cannot be compared with each other.
        if isinstance(cutoff, datetime):
            cutoff = cutoff.date()
        return cutoff < row_date

    def get_main_info_oggetto(self, ogg_e: WebElement) -> dict:
        """Return ``{link: {'n_atto', 'start_date', 'end_date'}}`` for one table row."""
        n_atto     = self.get_n_atto(ogg_e=ogg_e)
        start_date = self.get_start_date(ogg_e=ogg_e)
        end_date   = self.get_end_date(ogg_e=ogg_e)
        link       = self.get_oggetto_link(ogg_e=ogg_e)

        return {link: dict(n_atto=n_atto,
                           start_date=start_date,
                           end_date=end_date)}

    def get_atti_from_page(self) -> dict:
        """Collect the rows of the table currently displayed by the driver.

        Returns:
            dict: ``{link: info}`` for every row newer than ``last_date_update``.
            Whatever was collected before an error is still returned.
        """
        all_oggetti_page = {}
        try:
            # Waiting for the table element
            body_e = WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((By.ID, 'table_delibere'))
            )

            # One row per atto
            rows_e = body_e.find_elements(By.CLASS_NAME, 'paginated_element')

            for r_e in rows_e:
                try:
                    start_date = self.get_start_date(r_e)
                    if not self._is_after_last_update(start_date):
                        continue  # atto already in DB, skipping it

                    dict_ogg = self.get_main_info_oggetto(ogg_e=r_e)
                    # A row whose link could not be read is keyed by None and
                    # cannot be visited later, so it is dropped here.
                    dict_ogg.pop(None, None)
                    all_oggetti_page.update(dict_ogg)
                except StaleElementReferenceException:  # Skip this row if it became stale
                    continue
        except Exception as e:
            logging.warning(f'SectionsTable: Could not get the atti from this page: {e}')

        return all_oggetti_page

    def loop_all_pages(self) -> dict:
        """Walk every page of the section table and merge the rows found.

        Paging stops when the 'Successiva' link cannot be found or clicked
        within 10 seconds, or when a page only repeats links already seen.

        Returns:
            dict: ``{link: info}`` for the whole section.
        """
        all_oggetti_dict = {}
        self.driver.get(self.link_section)

        page_number = 1
        while True:
            try:
                logging.info(f'Getting atti from page: {page_number}')
                dict_page = self.get_atti_from_page()

                # Guard against paging forever when "next" keeps showing the
                # same content.
                if dict_page and all(link in all_oggetti_dict for link in dict_page):
                    break
                all_oggetti_dict.update(dict_page)

                # Finding next page button
                next_button = WebDriverWait(self.driver, 10).until(
                    EC.element_to_be_clickable((By.XPATH, "//a[@class='button' and @title='Successiva']"))
                )

                # Use JavaScript to click
                self.driver.execute_script("arguments[0].click();", next_button)
                page_number += 1

                # Waiting for the new content to load
                time.sleep(2)

            except Exception as e:
                logging.debug(f'SectionsTable: Stopping pagination at page {page_number}: {e}')
                break
        return all_oggetti_dict

    @staticmethod
    def get_filename_from_cd(cd):
        """Extract the ``filename=`` value from a Content-Disposition header.

        Returns None when the header is missing or carries no ``filename=``.
        """
        if not cd:
            return None
        fname = None
        if 'filename=' in cd:
            fname = cd.split('filename=')[1]
            if '"' in fname:
                fname = fname.split('"')[1]
            else:
                fname = fname.split(';')[0].strip()
        return fname

    def general_download(self, link_file: str) -> Tuple[str, str, Path]:
        """Download ``link_file`` into ``dir_download``.

        The file name comes from the Content-Disposition header (prefixed with
        a hash of the link) or, failing that, from the last URL segment.

        Returns:
            ``(file_name, link_file, file_path)``, or ``(None, None, None)``
            when the download fails.
        """
        try:
            response = requests.get(link_file, timeout=self.REQUEST_TIMEOUT)
            # Do not store the body of an error response as if it were the file.
            response.raise_for_status()

            cd = response.headers.get('Content-Disposition')
            b_name = self.get_filename_from_cd(cd)
            raw_name = self.rename_file(b_name, link_file) if b_name else link_file.split('/')[-1]
            file_name = self._sanitize_file_name(raw_name)

            # Construct the full path of the file
            file_path = Path(self.dir_download, file_name)

            with open(file_path, 'wb') as file:
                file.write(response.content)

            return file_name, link_file, file_path
        except Exception as e:
            logging.info(f'SectionsTable: Could not download file from {link_file}: {e}')
            return None, None, None

    def download_single_file_external_page(self, link_file_external: str) -> Tuple[str, str, Path]:
        """Open an intermediate page and download the file behind its primary button.

        Returns:
            ``(file_name, link_file, file_path)``, or ``(None, None, None)``
            when the page or the file cannot be retrieved.
        """
        try:
            self.driver.get(link_file_external)
            scarica_button = self.driver.find_element(By.CLASS_NAME, 'btn.btn-primary')
            link_full = scarica_button.get_attribute('href')
            # Keep what precedes the CSRF parameter; the whole link is used
            # when the parameter is absent.
            link_file = link_full.split('&CSRF')[0]

            return self.general_download(link_file=link_file)
        except Exception as e:
            logging.info(f'SectionsTable: Could not get file from external link: {e}')
            return None, None, None

    def download_single_file_atto_page(self, link_ogg: str) -> Tuple[str, str, Path]:
        """Download the file behind one attachment link.

        A HEAD request is sent first: when the response declares a PDF the link
        is downloaded directly, otherwise the link is treated as an
        intermediate page and handed to ``download_single_file_external_page``.

        Args:
            link_ogg (str): Attachment link.

        Returns:
            Tuple[str, str, Path]: Name of the file, link of the file, path
            where it has been saved; ``(None, None, None)`` when nothing could
            be downloaded.
        """
        try:
            # Send a HEAD request to check the content type
            response = requests.head(link_ogg, allow_redirects=True, timeout=self.REQUEST_TIMEOUT)

            content_type = response.headers.get('Content-Type')
            if content_type is None:
                return None, None, None

            # If the link has the file, download it
            if 'application/pdf' in content_type:
                return self.general_download(link_file=link_ogg)
            # Else maybe it's an external link, try to download it
            return self.download_single_file_external_page(link_file_external=link_ogg)
        except Exception as e:
            logging.warning(f'SectionsTable: Could not download files from {link_ogg}: {e}')
            return None, None, None

    def _find_attachments_container(self):
        """Return the element holding the attachment links of the current page, or None."""
        try:
            return self.driver.find_element(By.CLASS_NAME, 'card-body.pe-0.ps-4.ps-md-5')
        except NoSuchElementException:
            pass
        try:
            return self.driver.find_element(By.ID, 'allegati')
        except NoSuchElementException:
            return None

    @retry(max_attempts=5, delay=10, exceptions=(StaleElementReferenceException, InvalidSessionIdException))
    def get_single_object(self, link_ogg: str, dict_info: dict, dict_ogg: dict):
        """Open one atto page and download every file linked from its attachments area.

        Args:
            link_ogg: URL of the atto page.
            dict_info: Metadata copied into each file entry.
            dict_ogg: Output dictionary, updated in place with one entry per
                downloaded file whose name is not in ``files_in_db``.
        """
        self.driver.get(link_ogg)

        body_e = self._find_attachments_container()
        if body_e is None:
            logging.warning(f'SectionsTable: Could not get allegati from {link_ogg}')
            return

        try:
            list_link_e = body_e.find_elements(By.TAG_NAME, 'a')
            list_link = [l_e.get_attribute('href') for l_e in list_link_e]

            for l in list_link:
                f_name, f_link, f_path = self.download_single_file_atto_page(l)
                if f_name and f_link and f_path and (f_name not in self.files_in_db):
                    dict_ogg[f_name] = dict(**dict_info,
                                            file_name     = f_name,
                                            link          = f_link,
                                            internal_path = f_path)
        except Exception as e:
            logging.warning(f'SectionsTable: Could not download files: {e}')

    def get_dict_objects(self):
        """Download the attachments of every collected atto.

        Returns:
            dict: ``{file name: metadata}`` for the downloaded files.
        """
        dict_ogg = {}
        total_oggetti = len(self.dict_links_oggetti)
        logging.info(f'Found {total_oggetti} oggetti to process')

        if self.dict_links_oggetti:
            for index, (link, dict_info) in enumerate(self.dict_links_oggetti.items()):
                # Log progress every 20 items
                if index % 20 == 0 and index > 0:
                    logging.info(f'Processing oggetto {index} of {total_oggetti}')
                try:
                    self.get_single_object(link_ogg  = link,
                                           dict_info = dict_info,
                                           dict_ogg  = dict_ogg)
                except Exception as e:
                    # Keep going with the next atto, but leave a trace of the failure.
                    logging.warning(f'SectionsTable: Could not process {link}: {e}')

            logging.info(f'Completed processing all {total_oggetti} oggetti')
        else:
            logging.info(f'SectionsTable: All files up-to-date!')
        return dict_ogg

In [28]:
# sections_test = SectionsTableFileExtractor(driver_manager=driver_manager,
#                            link_section='https://servizi.comune.brindisi.it/openweb/pratiche/registri.php?sezione=provvOrgani',
#                            dir_download=f'{download_path}/Provvedimenti',
#                            last_date_update=None,
#                            files_in_db=[])

In [29]:
# sections_test.dict_links_oggetti

In [30]:
# dict_download = sections_test.get_dict_objects()

In [31]:
# Build an extractor for the "concorsi" table; the constructor already walks
# every page of the table and fills `dict_links_oggetti`.
# last_date_update=None disables the date filter, files_in_db=[] excludes nothing.
sections_test = SectionsTableFileExtractor(driver_manager=driver_manager,
                           link_section='https://servizi.comune.brindisi.it/openweb/pratiche/registri.php?sezione=concorsi',
                           dir_download=f'{download_path}/Concorsi',
                           last_date_update=None,
                           files_in_db=[])

INFO:root:Driver is running, getting items from current section page
INFO:root:Getting atti from page: 1


In [32]:
# Show the links (and per-row metadata) collected while paging through the table.
sections_test.dict_links_oggetti

{'https://servizi.comune.brindisi.it/openweb/pratiche/bando_trasparenza.php?sezione=concorsi&id=1020&CSRF=a74fc953556b186b6a2c1039add0ac56': {'n_atto': None,
  'start_date': None,
  'end_date': None},
 'https://servizi.comune.brindisi.it/openweb/pratiche/bando_trasparenza.php?sezione=concorsi&id=2266&CSRF=a74fc953556b186b6a2c1039add0ac56': {'n_atto': None,
  'start_date': None,
  'end_date': None}}

In [33]:
# Visit every collected link and download its attachments into the section folder.
dict_download = sections_test.get_dict_objects()

INFO:root:Found 2 oggetti to process
INFO:root:Completed processing all 2 oggetti


# Files class

In [None]:
# class FileDataExtractor:
#     def __init__(self, 
#                  base_name: str, 
#                  href: str, 
#                  dir_download: str,
#                  files_in_db: list) -> dict:
#         self.base_name    = base_name
#         self.href         = href
#         self.dir_download = self.check_dir_download(dir_download)
#         self.files_in_db  = files_in_db
#         self.response     = requests.get(href)
#         self.id           = self.get_unique_id()
#         self.file_name    = f'{self.id}_{self.sanitize_file_name(self.base_name)}'
#         self.file_path    = self.download_file()

#     @staticmethod
#     def sanitize_file_name(file_name):
#         # Replace invalid characters with underscores
#         fixed_name = re.sub(r'[\/:*?"<>|]', '_', file_name)
#         return fixed_name

#     def get_unique_id(self)->str:
#         # Generating a unique identifier for the file
#         return hashlib.sha256(self.href.encode()).hexdigest()[:8]
    
#     def check_dir_download(self, dir_download):
#         # Creating directory if does not exist:
#         if not os.path.exists(dir_download):
#             os.makedirs(dir_download)
#         return dir_download

#     def download_file(self):
#         if self.file_name not in self.files_in_db:
#             try:
#                 # Construct the full path of the file
#                 file_path = Path(self.dir_download, self.file_name)

#                 # Save the file
#                 with open(file_path, 'wb') as f:
#                     f.write(self.response.content)

#                 return file_path
#             except:
#                 logging.warning(f'Could not download file {self.href}')
    
#     def get_file_dict(self)->dict:
#         if self.file_path:
#             file_dict = dict(link=self.href,
#                              internal_path=self.file_path)
#             return {self.file_name: file_dict}

# Section class

In [None]:
# class SectionManager():
#     def __init__(self, 
#                  ss_link:str, 
#                  dir_download: str, 
#                  links_visited: list,
#                  files_in_db: list,
#                  driver) -> None:
#         self.ss_link        = ss_link
#         self.dir_download   = dir_download
#         self.links_v        = links_visited # list of links already scraped
#         self.driver         = driver
#         self.files_in_db    = files_in_db
#         self.files_to_write = {}


#     def download_accordion(self) -> None:
#         """Downloading files from hidden menu"""
#         # Expanding attachment menu
#         menu_e = self.driver.find_element(By.ID, 'ui-accordion-1-header-0').find_element(By.TAG_NAME, 'a')
#         self.driver.execute_script("arguments[0].click();", menu_e)

#         allegati_e = self.driver.find_element(By.CLASS_NAME, 'group-documenti.field-group-html-element.documenti')
#         file_elements = allegati_e.find_elements(By.CLASS_NAME, 'file')

#         self.get_list_elements(list_e=file_elements)


#     def download_file_elements(self) -> None:
#         """Downloading files from file elements"""
#         file_elements = self.driver.find_elements(By.CLASS_NAME, 'file')

#         self.get_list_elements(list_e=file_elements)


#     def get_list_elements(self, list_e: list[WebElement]):
#         """This method takes each web element and reads its href attribute, which
#         contains the file name; a unique id is then assigned from the link,
#         the file is saved, and the dictionary of files for the given
#         section is updated with the info of each file for later writing to the DB.

#         Args:
#             list_e (list[WebElement]): List of web elements that we will download 
#             and generate a dictionary with metadata with.
#         """
#         # if len(list_e) > 0:
#         #     logging.info(f'Found {len(list_e)} files to download')
#         for f_e in list_e:
#             a_e = f_e.find_element(By.TAG_NAME, 'a')
#             link_file = a_e.get_attribute('href')
#             if link_file.endswith(extensions):
#                 self.get_single_element(link_file)
#             else:
#                 pass

    
#     def get_single_element(self, link_f) -> None:
#         link_parts = link_f.split('/')
#         file_name=link_parts[-1]

#         file_dict = FileDataExtractor(base_name    = file_name,
#                                       href         = link_f,
#                                       dir_download = self.dir_download,
#                                       files_in_db  = self.files_in_db).get_file_dict()
        
#         if file_dict:
#             self.files_to_write.update(file_dict)

    
#     def get_microsections_field_items(self):
#         subsections = {}
#         try:
#             # self.driver.get(ms_link) ##TODO
#             body_e = self.driver.find_element(By.CLASS_NAME, 'field-items')
#             a_elements = body_e.find_elements(By.TAG_NAME, 'a')

#             for a_e in a_elements:
#                 ms_link = a_e.get_attribute('href')

#                 subsections[ms_link] = {'href': ms_link}
#         except:
#             pass

#         return subsections
    

#     def get_microsections_field_title(self, ms_link: str) -> dict:
#         try: 
#             # self.driver.get(ms_link) ##TODO
#             self.driver.find_element(By.CLASS_NAME, "view-content")
#         except NoSuchElementException:
#             return {}
        
#         page_number = 0
#         subsections = {}
#         previous_links = set()
        
#         while True:
#             self.driver.get(f'{ms_link}?page={page_number}')
        
#             try: 
#                 body_e = self.driver.find_element(By.CLASS_NAME, "view-content")
#             except NoSuchElementException:
#                 return subsections
            
#             content_e = body_e.find_elements(By.CLASS_NAME, 'views-field.views-field-title')

#             if content_e:
#                 current_links = set()
#                 for c_e in content_e:
#                     a_e = c_e.find_element(By.TAG_NAME, 'a')
#                     ms_link = a_e.get_attribute('href')
                    
#                     current_links.add(ms_link)
#                     subsections[ms_link] = {'href': ms_link}
                
#                 # Check if current page links are the same as previous page links
#                 if current_links == previous_links:
#                     return subsections
#                 previous_links = current_links
#             else:
#                 return subsections
            
#             page_number += 1
    

#     def get_microsections_cards(self, ms_link: str) -> dict:
#         try: 
#             # driver.get(ms_link) ## TODO
#             self.driver.find_element(By.ID, "views-bootstrap-grid-1")
#         except NoSuchElementException:
#             return {}
        
#         page_number = 0
#         subsections = {}
#         previous_links = set()
        
#         while True:
#             self.driver.get(f'{ms_link}?page={page_number}')
        
#             try:
#                 body_e = self.driver.find_element(By.ID, "views-bootstrap-grid-1")
#             except NoSuchElementException:
#                 return subsections
            
#             cards_e = body_e.find_elements(By.CLASS_NAME, 'views-field.views-field-title')

#             if cards_e:
#                 for c_e in cards_e:
#                     current_links = set()
#                     a_e = c_e.find_element(By.TAG_NAME, 'a')
#                     ms_link = a_e.get_attribute('href')

#                     current_links.add(ms_link)
#                     subsections[ms_link] = {'href': ms_link}

#                     # Check if current page links are the same as previous page links
#                     if current_links == previous_links:
#                         return subsections
#                     previous_links = current_links
#             else:
#                 return subsections
            
#             page_number += 1


#     def get_all_types_microsections(self, ms_link: str) -> dict:
#         self.driver.get(ms_link)
#         self.links_v.append(ms_link)

#         sections = self.get_microsections_field_items()
#         if not sections:
#             sections = self.get_microsections_cards(ms_link)
#             if not sections:
#                 sections = self.get_microsections_field_title(ms_link)

#         return sections
    

#     def download_recursive_microsections(self, ms_link, depth=0, max_depth=10) -> None:
#         if depth > max_depth:
#             return 

#         microsections = self.get_all_types_microsections(ms_link)
#         if microsections:
#             for values in microsections.values():
#                 link = values['href']
#                 if link and ('https://www.comune.arezzo.it' in link) and (link not in self.links_v):
#                     if link.endswith(extensions):
#                         self.get_single_element(link_f=link)
#                         # Adding the link to those already visited
#                         self.links_v.append(link)
#                     else: 
#                         try:
#                             self.download_accordion()
#                         except:
#                             pass
#                         try:
#                             self.download_file_elements()
#                         except:
#                             pass

#                         # links_visited.append(link)
#                         self.download_recursive_microsections(link, depth + 1, max_depth)
#                 else: 
#                     pass

    
#     def get_files_section(self) -> dict:
#         self.download_recursive_microsections(ms_link=self.ss_link)
#         return self.files_to_write


In [None]:
# s_test = 'http://www.comune.arezzo.it/altri-contenuti'

In [None]:
# section_manager_test = SectionManager(ss_link=s_test,
#                                       dir_download=f'{download_path}/Altri contenuti',
#                                       links_visited=[],
#                                       files_in_db=[],
#                                       driver=driver)

In [None]:
# dict_to_write = section_manager_test.get_files_section()

In [None]:
# ---------------------------------------------------------------------------
# DISABLED DRAFT: the whole class below is commented out and is not executed.
# It sketches the top-level scraper: it lists the sections of the site
# (get_sections), the subsections/direct download links of each section
# (get_subsections), and delegates file collection to FileDataExtractor and
# SectionManager. The postgres/MinIO write steps are themselves commented out.
# Lines prefixed with "NOTE(review)" are reviewer remarks to address before
# re-enabling the code; everything else is the original draft, unchanged.
# ---------------------------------------------------------------------------
# class AmministrazioneTrasparenteScrapper:
#     def __init__(self, driver_manager, minio_manager, postgres_manager, url_comune: str, dir_output: str) -> None:
#         self.url_comune       = url_comune
#         self.driver           = driver_manager.get_driver(self.url_comune)
#         self.driver_manager   = driver_manager
#         self.minio_manager    = minio_manager
#         self.postgres_manager = postgres_manager
# NOTE(review): get_sections() runs here, i.e. a page load happens during construction,
# before self.output and self.links_visited are assigned.
#         self.sections         = self.get_sections()
#         self.output           = dir_output
#         self.links_visited    = []


# Closes the driver and both storage connections; fails if a manager is None.
#     def close_all_connections(self):
#         logging.info('Closing all connections')
#         self.driver_manager.close_driver()
#         self.postgres_manager.close_connection()
#         self.minio_manager.close_connection()

    
# Loads a section page and splits its anchors into subsections
# ({name: href}) and direct download links (hrefs ending with one of `extensions`).
# NOTE(review): the body catches Exception itself, so the exceptions listed in
# @retry never reach the decorator and no retry can happen.
#     @retry(max_attempts=5, delay=10, exceptions=(StaleElementReferenceException, InvalidSessionIdException))
#     def get_subsections(self, s_link: str) -> Tuple[dict, list[str]]:
#         try:
#             self.driver.get(s_link)
#             time.sleep(1)

# NOTE(review): bare `driver` (a global) is used here instead of `self.driver`.
#             body_e = driver.find_element(By.CLASS_NAME, 'field-items')
#             a_elements = body_e.find_elements(By.TAG_NAME, 'a')

#             if a_elements:
#                 subsections = {}
#                 links_download = []
#                 for a_e in a_elements:
#                     ss_link = a_e.get_attribute('href')

# `extensions` is the module-level tuple of downloadable file suffixes.
#                     if not ss_link.endswith(extensions): 
#                         ss_name = a_e.text.strip()
#                         subsections[ss_name] = ss_link
#                     else:
#                         links_download.append(ss_link)
#             else:
#                 logging.info(f'No subsections found')
#         except NoSuchElementException:
#             pass
#         except Exception as e:
#             logging.warning(f'Could not get subsections for {s_link}: {e}')
#             pass

# NOTE(review): `subsections` / `links_download` are only bound inside `if a_elements:`;
# when no anchors are found or an exception is swallowed above, this return raises
# UnboundLocalError. Initialise both before the try block.
#         return subsections, links_download

    
# Opens the landing page, expands the menu and returns {section name: {'href': link}}.
# NOTE(review): as in get_subsections, the broad `except Exception` below prevents
# @retry from ever seeing the exceptions it is configured for.
#     @retry(max_attempts=5, delay=10, exceptions=(StaleElementReferenceException, InvalidSessionIdException))
#     def get_sections(self):
#         sections = {}

#         try:
# NOTE(review): uses the global `url_comune`, not `self.url_comune` set in __init__.
#             self.driver.get(url_comune)
#             # Click on expand sections button
#             expand_button = self.driver.find_element(By.ID, 'menu-lista-button')
#             expand_button.click()

#             body_e = self.driver.find_element(By.CLASS_NAME, 'nav.navbar-nav.list-group')
#             li_elements = body_e.find_elements(By.TAG_NAME, 'li')

#             s_names = [li_e.text for li_e in li_elements]
#             link_sections = [li_e.find_element(By.TAG_NAME, 'a').get_attribute('href') for li_e in li_elements]

#             for s, s_link in zip(s_names, link_sections):
#                 sections[s] = {'href': s_link}

# NOTE(review): the caught exception `e` is not included in the warning, so the cause is lost.
#         except Exception as e:
#             logging.warning(f'Could not get sections')

#         return sections
    

# Builds the file dict for one download link (file name = last path segment of the link)
# via FileDataExtractor and merges it into files_to_write.
# NOTE(review): annotated `-> None` but the method returns files_to_write.
#     def get_single_element(self, link_f: str, files_to_write: dict, dir_download: str, files_in_db: list[str]) -> None:
#         link_parts = link_f.split('/')
#         file_name=link_parts[-1]

#         file_dict = FileDataExtractor(base_name    = file_name,
#                                       href         = link_f,
#                                       dir_download = dir_download,
#                                       files_in_db  = files_in_db).get_file_dict()
        
#         if file_dict:
#             files_to_write.update(file_dict)
        
#         return files_to_write
    

# Runs get_single_element over every link and returns the accumulated dict
# (empty dict when links_download is empty).
#     def download_files_section(self, links_download: list[str], dir_download: str, files_in_db: list[str]) -> dict:
#         files_to_write = {}
#         if links_download:
#             for l in links_download:
#                 # Updating files_to_write: 
#                 files_to_write= self.get_single_element(link_f         = l,
#                                                         files_to_write = files_to_write,
#                                                         dir_download   = dir_download,
#                                                         files_in_db    = files_in_db)
#         return files_to_write

    
#     def get_subsections_files(self, ss_link: str, dir_download: str, files_in_db: list[str], files_section: list[str]):
#         """Function that gets all the files for a given subsection (that are not already present in the DB) and writes it to 
#         MinIO and postgres. NOTE(review): the MinIO/postgres write step (WriteFiles) is currently commented out below,
#         so at the moment the collected files are only printed.

#         Args:
#             ss_link (str): Link (URL) of the given subsection
#             dir_download (str): Directory of the subsection as a str
#             files_in_db (list[str]): List of file names already written in the DB
#                 (NOTE(review): currently overwritten with [] at the top of the body)
#             files_section (list[str]): A list of the links of the files in the section to be downloaded (not subsection)

#         Raises:
#             TimeoutError: Intended to abort the operation if a given section takes more than 30 minutes
#                 (the timer is started with 30 * 60 seconds). NOTE(review): the timer callback runs in the
#                 Timer's own thread, so this exception is not raised in, nor caught by, the calling code.
#         """
#         # id_struttura = StrutturaTable(postgres_manager=self.postgres_manager,
#         #                               sottosezione_lv1=s_name,
#         #                               sottosezione_lv2=ss_name).get_id_struttura
#         # files_in_db = DocumentiTable(postgres_manager=self.postgres_manager,
#         #                              id_struttura=id_struttura).get_files_already_in_db()
#         files_in_db = [] ##TODO remove

# The section-level links are downloaded first; `files_section` is rebound from a
# list of links to the resulting dict of file entries.
#         files_section = self.download_files_section(links_download = files_section,
#                                                     dir_download   = dir_download,
#                                                     files_in_db    = files_in_db)
# NOTE(review): leftover debug print (also at the second print below); prefer logging.
#         print(files_section)
        
# NOTE(review): threading.Timer invokes this in a separate thread; raising here only
# terminates that thread and does not interrupt SectionManager(...).get_files_section().
#         def abort_process():
#             raise TimeoutError(f"Process for subsection took too long and was aborted.")
        
#         # Start the timer
#         timer = threading.Timer(30 * 60, abort_process)
#         timer.start()

#         try:
#             files_to_write = SectionManager(ss_link       = ss_link,
#                                             dir_download  = dir_download,
#                                             links_visited = self.links_visited,
#                                             files_in_db   = files_in_db,
#                                             driver        = self.driver).get_files_section()
#             files_to_write.update(files_section)
#             print(files_to_write)

# NOTE(review): cancel() is only reached on success; if the call above raises, the
# timer keeps running and fires later. Move the cancel into a `finally` block.
#             # Stop the timer if the process completes in time
#             timer.cancel()

#             # WriteFiles(id_struttura=id_struttura,
#             #            dict_files=files_to_write,
#             #            dir_download=self.output,
#             #            postgres_manager=self.postgres_manager,
#             #            minio_manager=self.minio_manager
#             #            ).write_all_documents_to_db()
#         except TimeoutError as e:
#             logging.warning(e)
#         except Exception as e:
#             logging.error(f"An error occurred while processing subsection: {e}")


# Walks every section, resolves its subsections and processes each one into
# '<output>/<section>/<subsection>'.
#     def get_sections_files(self):
#         for s_name, s_values in self.sections.items():
#             s_link = s_values['href']
# NOTE(review): this logs the *section* name under the label 'SUBSECTION' — presumably meant 'SECTION'.
#             logging.info(f'SUBSECTION: {s_name}')
#             subsections, links_download = self.get_subsections(s_link = s_link)
#             for ss_name, ss_values in subsections.items():
# NOTE(review): get_subsections stores plain href strings as values
# (subsections[ss_name] = ss_link), so indexing with 'href' raises TypeError.
#                 ss_link = ss_values['href']
#                 logging.info(f'\tSUBSECTION: {ss_name}')
#                 dir_dowload = f'{self.output}/{s_name.strip()}/{ss_name.strip()}'
                
# NOTE(review): the same section-level `links_download` is passed for every
# subsection, so those files are processed once per subsection.
#                 self.get_subsections_files(ss_link=ss_link,
#                                            dir_download=dir_dowload,
#                                            files_in_db=[],
#                                            files_section=links_download)
        
#         # self.close_all_connections()

In [None]:
# Disabled manual test: builds a scraper instance without storage back-ends.
# NOTE(review): minio_manager and postgres_manager are None, so
# close_all_connections() would raise AttributeError if it were called.
# `driver_manager` is presumably created in an earlier cell — confirm before re-enabling.
# amministrazione_test = AmministrazioneTrasparenteScrapper(driver_manager=driver_manager,
#                                                           minio_manager=None,
#                                                           postgres_manager=None,
#                                                           url_comune=url_comune,
#                                                           dir_output=download_path)

In [None]:
# amministrazione_test.sections

In [None]:
# Disabled manual smoke test of get_subsections_files on one subsection plus two direct file links.
# NOTE(review): these URLs point at comune.arezzo.it while this notebook is configured with
# comune = 'brindisi' — presumably copied from another comune's notebook; update before re-enabling.
# amministrazione_test.get_subsections_files(ss_link='http://www.comune.arezzo.it/disposizioni-generali',
#                                            dir_download=f'{download_path}/Disposizioni generali',
#                                            files_in_db=[],
#                                            files_section=['https://www.comune.arezzo.it/sites/default/files/gc_2022_316.pdf', 
#                                                           'https://www.comune.arezzo.it/sites/default/files/allegato_a_piao_2022_2024.pdf'])