In [14]:
from datetime import datetime,date
from functools import wraps
from selenium.webdriver import Chrome
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException, NoSuchWindowException, StaleElementReferenceException, InvalidSessionIdException
from selenium.webdriver.remote.webelement import WebElement
from selenium.webdriver.support.ui import Select, WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from urllib.parse import urlparse, unquote

import csv
import hashlib
import logging
import os
import requests
import re
import time
import pdb
import threading

from pathlib import Path
from typing import Tuple, Union

In [2]:
# Install the default handler on the root logger (does nothing if the root
# logger already has handlers).
logging.basicConfig()
# Set the root threshold explicitly so the INFO progress messages emitted by
# the scraper classes below are not filtered out.
logging.getLogger().setLevel(logging.INFO)

In [3]:
# Municipality identifier (not referenced by the other cells visible here).
comune = 'arezzo'
# Root folder under which the downloaded files are stored.
download_path = './pt'
# Entry page of the crawl; section links are discovered starting from here.
url_comune = 'https://www.comune.arezzo.it/area-tematica/amministrazione-trasparente'

In [4]:
def retry(max_attempts=3, delay=10, exceptions=(Exception,)):
    """Build a decorator that re-runs a function when it raises a listed exception.

    Args:
        max_attempts: Maximum number of calls made before giving up.
        delay: Seconds to sleep between two consecutive attempts.
        exceptions: Tuple of exception types that trigger a new attempt; any
            other exception propagates immediately.

    Returns:
        A decorator. The decorated function returns the result of the first
        successful call.

    Raises:
        RuntimeError: When every attempt failed (or ``max_attempts`` is not
            positive). The last caught exception is attached as ``__cause__``.
    """
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            last_error = None
            for attempt in range(1, max_attempts + 1):
                try:
                    return func(*args, **kwargs)
                except exceptions as e:
                    last_error = e
                    # Use lazy %-formatting: handing `e` to logging as an extra
                    # argument of an already formatted message has no
                    # placeholder to fill and breaks the log record.
                    logging.info("Attempt %d failed: %s", attempt, e)
                    # Waiting is pointless once no attempt is left.
                    if attempt < max_attempts:
                        time.sleep(delay)
            raise RuntimeError(
                f"Function {func.__name__} failed after {max_attempts} attempts"
            ) from last_error
        return wrapper
    return decorator

In [5]:
# Link suffixes treated as downloadable documents. Kept as a tuple because it is
# passed directly to str.endswith(), which accepts a tuple of suffixes.
extensions = ('.pdf', '.odt', '.xls', '.csv', '.ods', '.xlsx', '.doc', '.zip', '.docx', '.sxc')

# Driver manager

In [6]:
class WebDriverManager:
    """Create, reuse and dispose of a single Chrome WebDriver session.

    Args:
        DIR_DOWNLOAD: Folder passed to Chrome as ``download.default_directory``.
        chromedriver_path: Path of the chromedriver executable handed to
            ``Service``. Defaults to the binary located next to the notebook.
    """

    # The two classes belong to the same button, so the lookup is written as an
    # explicit CSS selector instead of a compound value for By.CLASS_NAME.
    COOKIE_BUTTON_SELECTOR = ".agree-button.eu-cookie-compliance-secondary-button"

    def __init__(self, DIR_DOWNLOAD, chromedriver_path=r".\chromedriver.exe"):
        self.driver = None
        self.output = DIR_DOWNLOAD
        self.chromedriver_path = chromedriver_path

    def start_driver(self, url):
        """Open `url` in a new Chrome session unless a live one already exists.

        Args:
            url: Page loaded right after the browser starts.

        Returns:
            The WebDriver instance (new or already running).
        """
        if self.driver is None or not self.is_driver_open():
            # Dispose of a dead session first so its process is not leaked.
            self._discard_driver()

            opts = Options()
            service = Service(self.chromedriver_path)
            prefs = {
                # Resolved to an absolute path so the preference does not depend
                # on how the browser interprets a relative directory.
                "download.default_directory": os.path.abspath(self.output),
                "directory_upgrade": True,
                "profile.default_content_settings.popups": 0,
            }
            opts.add_experimental_option("prefs", prefs)
            self.driver = Chrome(service=service, options=opts)

            self.driver.get(url)
            self.check_cookies()
            time.sleep(10)  # Waiting for main page to load
        return self.driver

    def check_cookies(self):
        """Click the cookie-consent button when the page shows one."""
        try:
            time.sleep(2)
            cookie_button = self.driver.find_element(By.CSS_SELECTOR, self.COOKIE_BUTTON_SELECTOR)
            cookie_button.click()
        except NoSuchElementException:
            # No banner on this page: nothing to accept.
            pass

    def is_driver_open(self):
        """Return True when the driver exists and still answers a simple query."""
        if not self.driver:
            return False
        try:
            # Reading any property forces a round-trip to the browser.
            self.driver.current_url
            return True
        except NoSuchWindowException:
            return False
        except Exception as e:
            logging.info(f'Driver seems to be closed: {e}')
            return False

    def get_driver(self, link):
        """Return a driver positioned on `link`, restarting the browser if needed.

        Args:
            link: URL to load.

        Returns:
            The WebDriver instance.
        """
        if not self.is_driver_open():
            logging.warning('Driver is not responding, reopening')
            self.driver = self.start_driver(link)
        else:
            logging.info('Driver is running, getting items from current section page')
            self.driver.get(link)
            self.check_cookies()

        return self.driver

    def close_driver(self):
        """Quit the browser and forget the driver, even when quitting fails."""
        if self.driver:
            try:
                self.driver.quit()
            finally:
                self.driver = None

    def _discard_driver(self):
        """Best-effort shutdown of a driver that stopped responding."""
        if self.driver is None:
            return
        try:
            self.driver.quit()
        except Exception as e:
            logging.info(f'Could not quit stale driver: {e}')
        self.driver = None

Initialize the webdriver and click the cookie-consent button.

In [7]:
# WebDriverManager requires the download directory; reuse the notebook-level
# `download_path` setting instead of calling the constructor without arguments.
driver_manager = WebDriverManager(download_path)
driver = driver_manager.get_driver(link=url_comune)



In [8]:
def get_sections_and_subsections():
    """Collect the sections of the start page and the links listed on each one.

    Uses the notebook-level `driver` and `url_comune`.

    Returns:
        dict: ``{section name: {'href': section link, subsection name: link, ...}}``.
        Every section found in the menu is present with at least its ``'href'``;
        the dict is empty when the menu itself could not be read.
    """
    sections = {}

    try:
        driver.get(url_comune)
        # Click on expand sections button
        driver.find_element(By.ID, 'menu-lista-button').click()

        body_e = driver.find_element(By.CLASS_NAME, 'nav.navbar-nav.list-group')
        li_elements = body_e.find_elements(By.TAG_NAME, 'li')

        # Names and links are read before navigating anywhere else, while the
        # menu elements are still attached to the current page.
        s_names = [li_e.text for li_e in li_elements]
        link_sections = [li_e.find_element(By.TAG_NAME, 'a').get_attribute('href') for li_e in li_elements]
    except Exception as e:
        # Include the cause: without it the failure cannot be diagnosed.
        logging.warning(f'Could not get sections: {e}')
        return sections

    for s, s_link in zip(s_names, link_sections):
        # Register the section up-front so that pages without a 'field-items'
        # container are kept, exactly like pages whose container has no links.
        sections[s] = {'href': s_link}
        try:
            driver.get(s_link)
            time.sleep(1)

            body_e = driver.find_element(By.CLASS_NAME, 'field-items')
            for a_e in body_e.find_elements(By.TAG_NAME, 'a'):
                sections[s][a_e.text.strip()] = a_e.get_attribute('href')
        except NoSuchElementException:
            # Section page lists no subsections.
            continue
        except Exception as e:
            logging.warning(f'Could not get subsections for {s}: {e}')

    return sections

# Files class

In [9]:
class FileDataExtractor:
    """Download one remote file and describe it for later storage.

    Constructing an instance creates the target directory and, unless the file
    name is already listed in `files_in_db`, downloads the file immediately.

    Args:
        base_name: File name taken from the link (last URL segment).
        href: URL of the file.
        dir_download: Directory where the file is written (created if missing).
        files_in_db: File names that are already stored and must be skipped.

    Attributes set by the constructor: ``id`` (8-character hash of `href`),
    ``file_name`` (``<id>_<sanitized base_name>``), ``response`` (the HTTP
    response, or None when no request was made) and ``file_path`` (path of the
    saved file, or None when nothing was saved).
    """

    # Seconds to wait for the server before a download is given up.
    REQUEST_TIMEOUT = 60

    def __init__(self,
                 base_name: str,
                 href: str,
                 dir_download: str,
                 files_in_db: list) -> None:
        self.base_name    = base_name
        self.href         = href
        self.dir_download = self.check_dir_download(dir_download)
        self.files_in_db  = files_in_db
        # Filled in by download_file(); stays None for files that are skipped.
        self.response     = None
        self.id           = self.get_unique_id()
        self.file_name    = f'{self.id}_{self.sanitize_file_name(self.base_name)}'
        self.file_path    = self.download_file()

    @staticmethod
    def sanitize_file_name(file_name):
        """Replace the characters \\ / : * ? " < > | with underscores."""
        # The backslash is escaped explicitly: in the character class '\/' only
        # matches the forward slash.
        return re.sub(r'[\\/:*?"<>|]', '_', file_name)

    def get_unique_id(self) -> str:
        """Return the first 8 hex digits of the SHA-256 of the link."""
        return hashlib.sha256(self.href.encode()).hexdigest()[:8]

    def check_dir_download(self, dir_download):
        """Create `dir_download` when it does not exist and return it unchanged."""
        os.makedirs(dir_download, exist_ok=True)
        return dir_download

    def download_file(self):
        """Fetch the file and write it into the download directory.

        Returns:
            pathlib.Path of the saved file, or None when the file is already
            listed in `files_in_db` or the download/write failed.
        """
        # Skip before any network traffic happens.
        if self.file_name in self.files_in_db:
            return None

        file_path = Path(self.dir_download, self.file_name)
        try:
            self.response = requests.get(self.href, timeout=self.REQUEST_TIMEOUT)
            # Do not store an HTTP error page under the document's name.
            self.response.raise_for_status()
            with open(file_path, 'wb') as f:
                f.write(self.response.content)
        except (requests.RequestException, OSError) as e:
            logging.warning(f'Could not download file {self.href}: {e}')
            return None

        return file_path

    def get_file_dict(self) -> dict:
        """Return ``{file_name: {'link', 'internal_path'}}`` or None if nothing was saved."""
        if not self.file_path:
            return None
        file_dict = dict(link=self.href,
                         internal_path=self.file_path)
        return {self.file_name: file_dict}

# Section class

In [20]:
class SectionManager:
    """Crawl one subsection page and its nested pages, downloading the files found.

    Args:
        ss_link: URL where the crawl starts.
        dir_download: Directory that receives the downloaded files.
        links_visited: List of links already processed. It is shared with the
            caller and appended to in place.
        files_in_db: File names already stored; they are not downloaded again.
        driver: Selenium WebDriver used for navigation.
    """

    def __init__(self,
                 ss_link: str,
                 dir_download: str,
                 links_visited: list,
                 files_in_db: list,
                 driver) -> None:
        self.ss_link        = ss_link
        self.dir_download   = dir_download
        self.links_v        = links_visited  # list of links already scraped
        self.driver         = driver
        self.files_in_db    = files_in_db
        self.files_to_write = {}

    def download_accordion(self) -> None:
        """Expand the attachment accordion of the current page and download its files.

        Raises:
            NoSuchElementException: When the page has no accordion header or no
                documents group.
        """
        menu_e = self.driver.find_element(By.ID, 'ui-accordion-1-header-0').find_element(By.TAG_NAME, 'a')
        # Clicked through JavaScript rather than WebElement.click().
        self.driver.execute_script("arguments[0].click();", menu_e)

        allegati_e = self.driver.find_element(By.CLASS_NAME, 'group-documenti.field-group-html-element.documenti')
        file_elements = allegati_e.find_elements(By.CLASS_NAME, 'file')

        self.get_list_elements(list_e=file_elements)

    def download_file_elements(self) -> None:
        """Download every element of class 'file' found on the current page."""
        file_elements = self.driver.find_elements(By.CLASS_NAME, 'file')

        self.get_list_elements(list_e=file_elements)

    def get_list_elements(self, list_e: list[WebElement]):
        """Download the file behind each web element whose link has a known suffix.

        For every element the first anchor's href is read; links ending with one
        of the notebook-level `extensions` are handed to `get_single_element`.

        Args:
            list_e (list[WebElement]): Elements that each contain an anchor
                pointing to a file.
        """
        for f_e in list_e:
            a_e = f_e.find_element(By.TAG_NAME, 'a')
            link_file = a_e.get_attribute('href')
            # An anchor without href yields None, which has no endswith().
            if link_file and link_file.endswith(extensions):
                self.get_single_element(link_file)

    def get_single_element(self, link_f) -> None:
        """Download one file and record its metadata in `files_to_write`.

        Args:
            link_f: URL of the file; its last path segment is used as base name.
        """
        # The same link can be reachable through several widgets or pages;
        # fetch it only once per manager.
        if any(meta.get('link') == link_f for meta in self.files_to_write.values()):
            return

        file_name = link_f.split('/')[-1]

        file_dict = FileDataExtractor(base_name    = file_name,
                                      href         = link_f,
                                      dir_download = self.dir_download,
                                      files_in_db  = self.files_in_db).get_file_dict()

        if file_dict:
            self.files_to_write.update(file_dict)

    def get_microsections_field_items(self):
        """Return the links inside the 'field-items' container of the current page.

        Returns:
            dict: ``{link: {'href': link}}``; empty when the container is missing.
        """
        subsections = {}
        try:
            body_e = self.driver.find_element(By.CLASS_NAME, 'field-items')
            for a_e in body_e.find_elements(By.TAG_NAME, 'a'):
                item_link = a_e.get_attribute('href')
                subsections[item_link] = {'href': item_link}
        except NoSuchElementException:
            # Page is not of this kind.
            pass
        except Exception as e:
            logging.warning(f'Could not read field items: {e}')

        return subsections

    def get_microsections_field_title(self, ms_link: str) -> dict:
        """Return the title links of a paginated 'view-content' listing.

        Args:
            ms_link: URL of the listing; pages are requested as ``?page=N``.

        Returns:
            dict: ``{link: {'href': link}}``; empty when the current page has no
            'view-content' container.
        """
        return self._collect_paginated_links(ms_link, By.CLASS_NAME, "view-content")

    def get_microsections_cards(self, ms_link: str) -> dict:
        """Return the title links of a paginated 'views-bootstrap-grid-1' card grid.

        Args:
            ms_link: URL of the listing; pages are requested as ``?page=N``.

        Returns:
            dict: ``{link: {'href': link}}``; empty when the current page has no
            such grid.
        """
        return self._collect_paginated_links(ms_link, By.ID, "views-bootstrap-grid-1")

    def _collect_paginated_links(self, ms_link: str, by: str, container: str) -> dict:
        """Walk ``ms_link?page=0,1,...`` and gather the title links of each page.

        The walk stops when a page lacks the container, has no title elements,
        or shows exactly the same set of links as the previous page.
        """
        try:
            # Probe the page the driver is currently on before paginating.
            self.driver.find_element(by, container)
        except NoSuchElementException:
            return {}

        page_number = 0
        subsections = {}
        previous_links = set()

        while True:
            # `ms_link` must stay the listing URL: links found on the page are
            # kept in a separate variable so pagination keeps hitting the listing.
            self.driver.get(f'{ms_link}?page={page_number}')

            try:
                body_e = self.driver.find_element(by, container)
            except NoSuchElementException:
                return subsections

            title_elements = body_e.find_elements(By.CLASS_NAME, 'views-field.views-field-title')
            if not title_elements:
                return subsections

            current_links = set()
            for t_e in title_elements:
                item_link = t_e.find_element(By.TAG_NAME, 'a').get_attribute('href')
                current_links.add(item_link)
                subsections[item_link] = {'href': item_link}

            # Compare whole pages (not single items) to detect a repeated page.
            if current_links == previous_links:
                return subsections
            previous_links = current_links

            page_number += 1

    def _discover_microsections(self, ms_link: str) -> dict:
        """Try the three page layouts in turn on the page the driver is showing."""
        sections = self.get_microsections_field_items()
        if not sections:
            sections = self.get_microsections_cards(ms_link)
            if not sections:
                sections = self.get_microsections_field_title(ms_link)
        return sections

    def _download_page_files(self, page_link: str) -> None:
        """Download accordion and plain 'file' attachments of the current page."""
        for scrape in (self.download_accordion, self.download_file_elements):
            try:
                scrape()
            except NoSuchElementException:
                # The page simply does not have this kind of attachment widget.
                pass
            except Exception as e:
                logging.warning(f'Could not download files from {page_link}: {e}')

    def get_all_types_microsections(self, ms_link: str) -> dict:
        """Open `ms_link`, mark it visited and return the nested links it lists.

        Returns:
            dict: ``{link: {'href': link}}`` from the first layout that yields links.
        """
        self.driver.get(ms_link)
        self.links_v.append(ms_link)

        return self._discover_microsections(ms_link)

    def download_recursive_microsections(self, ms_link, depth=0, max_depth=10) -> None:
        """Download the files of `ms_link` and of every nested page reachable from it.

        Args:
            ms_link: Page to process.
            depth: Current recursion depth.
            max_depth: Pages deeper than this are not opened.
        """
        if depth > max_depth:
            return

        self.driver.get(ms_link)
        self.links_v.append(ms_link)
        # Scrape attachments right after loading the page, while the driver is
        # guaranteed to be on it (discovery below may move to '?page=N').
        self._download_page_files(ms_link)

        microsections = self._discover_microsections(ms_link)
        for values in microsections.values():
            link = values['href']
            # NOTE(review): the scheme is part of the check, so 'http://' links of
            # the same site are skipped — confirm this is intended.
            if not link or ('https://www.comune.arezzo.it' not in link) or (link in self.links_v):
                continue

            if link.endswith(extensions):
                self.get_single_element(link_f=link)
                # Adding the link to those already visited
                self.links_v.append(link)
            else:
                self.download_recursive_microsections(link, depth + 1, max_depth)

    def get_files_section(self) -> dict:
        """Crawl from `ss_link` and return the metadata of the downloaded files."""
        self.download_recursive_microsections(ms_link=self.ss_link)
        return self.files_to_write


In [21]:
# Single section URL used to try SectionManager on its own.
s_test = 'http://www.comune.arezzo.it/altri-contenuti'

In [22]:
# Manual check: crawl `s_test` with empty "visited" and "already stored" lists,
# reusing the notebook-level driver.
section_manager_test = SectionManager(ss_link=s_test,
                                      dir_download=f'{download_path}/Altri contenuti',
                                      links_visited=[],
                                      files_in_db=[],
                                      driver=driver)

In [23]:
# Runs the crawl; the result maps each saved file name to its link and local path.
dict_to_write = section_manager_test.get_files_section()

In [28]:
class AmministrazioneTrasparenteScrapper:
    """Walk every section of the start page and download the files of its subsections.

    Args:
        driver_manager: Object providing ``get_driver(url)`` and ``close_driver()``.
        minio_manager: Object storage manager, or None; only its
            ``close_connection()`` is used here.
        postgres_manager: Database manager, or None; only its
            ``close_connection()`` is used here.
        url_comune: Start page listing the sections.
        dir_output: Root directory for the downloaded files.
    """

    # Minutes after which a subsection crawl is reported as overrunning.
    SUBSECTION_TIMEOUT_MINUTES = 30

    def __init__(self, driver_manager, minio_manager, postgres_manager, url_comune: str, dir_output: str) -> None:
        self.url_comune       = url_comune
        self.driver           = driver_manager.get_driver(self.url_comune)
        self.driver_manager   = driver_manager
        self.minio_manager    = minio_manager
        self.postgres_manager = postgres_manager
        self.sections         = self.get_sections()
        self.output           = dir_output
        self.links_visited    = []

    def close_all_connections(self):
        """Close the browser and the storage connections that were provided."""
        logging.info('Closing all connections')
        self.driver_manager.close_driver()
        # The storage managers are optional, so None must not be dereferenced.
        if self.postgres_manager is not None:
            self.postgres_manager.close_connection()
        if self.minio_manager is not None:
            self.minio_manager.close_connection()

    # NOTE(review): every exception is handled inside the method, so the retry
    # decorator never sees one and never triggers — confirm the intent.
    @retry(max_attempts=5, delay=10, exceptions=(StaleElementReferenceException, InvalidSessionIdException))
    def get_subsections(self, s_link: str) -> Tuple[dict, list[str]]:
        """Split the links of a section page into subsections and direct file links.

        Args:
            s_link: URL of the section page.

        Returns:
            ``(subsections, links_download)`` where `subsections` maps the link
            text to its URL and `links_download` lists the URLs ending with one
            of the notebook-level `extensions`. Both are empty when the page
            could not be read.
        """
        # Defined before the try block so that the return below is always valid.
        subsections = {}
        links_download = []

        try:
            self.driver.get(s_link)
            time.sleep(1)

            # Use this instance's driver, not the notebook-level one.
            body_e = self.driver.find_element(By.CLASS_NAME, 'field-items')
            a_elements = body_e.find_elements(By.TAG_NAME, 'a')

            if not a_elements:
                logging.info(f'No subsections found')

            for a_e in a_elements:
                ss_link = a_e.get_attribute('href')
                if not ss_link:
                    # Anchor without a target: nothing to follow or download.
                    continue

                if ss_link.endswith(extensions):
                    links_download.append(ss_link)
                else:
                    subsections[a_e.text.strip()] = ss_link
        except NoSuchElementException:
            pass
        except Exception as e:
            logging.warning(f'Could not get subsections for {s_link}: {e}')

        return subsections, links_download

    # NOTE(review): same remark as get_subsections — the broad handler below
    # prevents the retry decorator from ever triggering.
    @retry(max_attempts=5, delay=10, exceptions=(StaleElementReferenceException, InvalidSessionIdException))
    def get_sections(self):
        """Read the section menu of the start page.

        Returns:
            dict: ``{section name: {'href': section link}}``; empty on failure.
        """
        sections = {}

        try:
            # Use the URL given to this instance, not the notebook-level variable.
            self.driver.get(self.url_comune)
            # Click on expand sections button
            self.driver.find_element(By.ID, 'menu-lista-button').click()

            body_e = self.driver.find_element(By.CLASS_NAME, 'nav.navbar-nav.list-group')
            li_elements = body_e.find_elements(By.TAG_NAME, 'li')

            s_names = [li_e.text for li_e in li_elements]
            link_sections = [li_e.find_element(By.TAG_NAME, 'a').get_attribute('href') for li_e in li_elements]

            for s, s_link in zip(s_names, link_sections):
                sections[s] = {'href': s_link}

        except Exception as e:
            logging.warning(f'Could not get sections: {e}')

        return sections

    def get_single_element(self, link_f: str, files_to_write: dict, dir_download: str, files_in_db: list[str]) -> dict:
        """Download one file and add its metadata to `files_to_write`.

        Args:
            link_f: URL of the file; its last path segment is the base name.
            files_to_write: Dict updated in place with the new entry.
            dir_download: Directory that receives the file.
            files_in_db: File names that must not be downloaded again.

        Returns:
            The same `files_to_write` dict.
        """
        file_name = link_f.split('/')[-1]

        file_dict = FileDataExtractor(base_name    = file_name,
                                      href         = link_f,
                                      dir_download = dir_download,
                                      files_in_db  = files_in_db).get_file_dict()

        if file_dict:
            files_to_write.update(file_dict)

        return files_to_write

    def download_files_section(self, links_download: list[str], dir_download: str, files_in_db: list[str]) -> dict:
        """Download every link of `links_download` and return the collected metadata."""
        files_to_write = {}
        for link in links_download or []:
            files_to_write = self.get_single_element(link_f         = link,
                                                     files_to_write = files_to_write,
                                                     dir_download   = dir_download,
                                                     files_in_db    = files_in_db)
        return files_to_write

    def get_subsections_files(self, ss_link: str, dir_download: str, files_in_db: list[str], files_section: list[str]):
        """Download the section-level files and crawl one subsection.

        Args:
            ss_link (str): URL of the subsection.
            dir_download (str): Directory that receives the files.
            files_in_db (list[str]): File names already stored; they are skipped.
            files_section (list[str]): Links of files listed on the section page
                itself (not on the subsection).

        Returns:
            dict: Metadata of the downloaded files, keyed by file name. When the
            crawl fails only the section-level files are included.

        Note:
            A warning is logged if the crawl runs longer than
            ``SUBSECTION_TIMEOUT_MINUTES``. The crawl is not interrupted: an
            exception raised from the timer thread would never reach this call.
        """
        if files_in_db is None:
            files_in_db = []

        files_to_write = self.download_files_section(links_download = files_section,
                                                     dir_download   = dir_download,
                                                     files_in_db    = files_in_db)
        logging.info(files_to_write)

        def report_overrun():
            logging.warning(f"Process for subsection {ss_link} is taking longer than "
                            f"{self.SUBSECTION_TIMEOUT_MINUTES} minutes.")

        timer = threading.Timer(self.SUBSECTION_TIMEOUT_MINUTES * 60, report_overrun)
        # A pending watchdog must not keep the interpreter alive.
        timer.daemon = True
        timer.start()

        try:
            crawled = SectionManager(ss_link       = ss_link,
                                     dir_download  = dir_download,
                                     links_visited = self.links_visited,
                                     files_in_db   = files_in_db,
                                     driver        = self.driver).get_files_section()
            # Section-level entries win on a key collision.
            crawled.update(files_to_write)
            files_to_write = crawled
            logging.info(files_to_write)
        except TimeoutError as e:
            logging.warning(e)
        except Exception as e:
            logging.error(f"An error occurred while processing subsection: {e}")
        finally:
            # Always stop the watchdog, also when the crawl failed.
            timer.cancel()

        return files_to_write

    def get_sections_files(self):
        """Download the files of every subsection of every known section.

        Connections are not closed here; call `close_all_connections` afterwards.
        """
        for s_name, s_values in self.sections.items():
            s_link = s_values['href']
            logging.info(f'SECTION: {s_name}')
            subsections, links_download = self.get_subsections(s_link = s_link)
            # `subsections` maps each name directly to its URL string.
            # NOTE(review): the section-level links are passed to every
            # subsection, so they are fetched once per subsection — confirm.
            for ss_name, ss_link in subsections.items():
                logging.info(f'\tSUBSECTION: {ss_name}')
                dir_download = f'{self.output}/{s_name.strip()}/{ss_name.strip()}'

                self.get_subsections_files(ss_link=ss_link,
                                           dir_download=dir_download,
                                           files_in_db=[],
                                           files_section=links_download)

In [29]:
# Manual check without storage back-ends: both managers are None, so only the
# browser and the local download folder are used. The constructor already
# loads the section menu.
amministrazione_test = AmministrazioneTrasparenteScrapper(driver_manager=driver_manager,
                                                          minio_manager=None,
                                                          postgres_manager=None,
                                                          url_comune=url_comune,
                                                          dir_output=download_path)

INFO:root:Driver is running, getting items from current section page


In [30]:
# Display the sections collected by the constructor ({name: {'href': link}}).
amministrazione_test.sections

{'Disposizioni generali': {'href': 'http://www.comune.arezzo.it/disposizioni-generali'},
 'Organizzazione': {'href': 'http://www.comune.arezzo.it/organizzazione'},
 'Consulenti e collaboratori': {'href': 'http://www.comune.arezzo.it/consulenti-collaboratori'},
 'Personale': {'href': 'http://www.comune.arezzo.it/personale'},
 'Bandi di concorso': {'href': 'http://www.comune.arezzo.it/bandi-concorso'},
 'Performance': {'href': 'http://www.comune.arezzo.it/performance'},
 'Enti controllati': {'href': 'http://www.comune.arezzo.it/enti-controllati'},
 'Attività e procedimenti': {'href': 'http://www.comune.arezzo.it/attivita-procedimenti'},
 'Provvedimenti': {'href': 'http://www.comune.arezzo.it/provvedimenti'},
 'Controlli sulle imprese (Obbligo abrogato)': {'href': 'http://www.comune.arezzo.it/controlli-sulle-imprese-obbligo-abrogato'},
 'Bandi di gara e contratti': {'href': 'http://www.comune.arezzo.it/bandi-gara-contratti-0'},
 'Sovvenzioni, contributi, sussidi, vantaggi economici': {'hr

In [41]:
# Manual check on one section: download two known section-level PDFs and then
# crawl the page for further attachments.
amministrazione_test.get_subsections_files(ss_link='http://www.comune.arezzo.it/disposizioni-generali',
                                           dir_download=f'{download_path}/Disposizioni generali',
                                           files_in_db=[],
                                           files_section=['https://www.comune.arezzo.it/sites/default/files/gc_2022_316.pdf', 
                                                          'https://www.comune.arezzo.it/sites/default/files/allegato_a_piao_2022_2024.pdf'])

{'c00be297_gc_2022_316.pdf': {'link': 'https://www.comune.arezzo.it/sites/default/files/gc_2022_316.pdf', 'internal_path': WindowsPath('pt/Disposizioni generali/c00be297_gc_2022_316.pdf')}, '8feeea1d_allegato_a_piao_2022_2024.pdf': {'link': 'https://www.comune.arezzo.it/sites/default/files/allegato_a_piao_2022_2024.pdf', 'internal_path': WindowsPath('pt/Disposizioni generali/8feeea1d_allegato_a_piao_2022_2024.pdf')}}
{'e1b0cbc0_fileinnercontentproxy.2015-10-20.0338883112.pdf': {'link': 'https://www.comune.arezzo.it/sites/default/files/statuti_regolamenti/files/pdf/fileinnercontentproxy.2015-10-20.0338883112.pdf', 'internal_path': WindowsPath('pt/Disposizioni generali/e1b0cbc0_fileinnercontentproxy.2015-10-20.0338883112.pdf')}, 'bd9be40a_regolamento_compensazione_urbanistica.pdf': {'link': 'https://www.comune.arezzo.it/sites/default/files/regolamento_compensazione_urbanistica.pdf', 'internal_path': WindowsPath('pt/Disposizioni generali/bd9be40a_regolamento_compensazione_urbanistica.pdf'