In [None]:
from datetime import datetime,date
from functools import wraps
from selenium.webdriver import Chrome
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException, NoSuchWindowException, StaleElementReferenceException, InvalidSessionIdException
from selenium.webdriver.remote.webelement import WebElement
from selenium.webdriver.support.ui import Select, WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from urllib.parse import urlparse, unquote

import csv
import hashlib
import logging
import os
import requests
import re
import time
import pdb
import threading

from pathlib import Path
from typing import Tuple, Union

In [None]:
# Install the default handler on the root logger, then lower the root level
# to INFO so the progress messages emitted by the scraper are shown.
logging.basicConfig()
logging.getLogger().setLevel(logging.INFO)

In [None]:
# Notebook configuration: the municipality being scraped, the local folder
# handed to the driver manager as download directory, and the entry page of
# the transparency portal.
comune = "brindisi"
download_path = "./pt"
url_comune = "https://servizi.comune.brindisi.it/openweb/trasparenza/"

In [None]:
def retry(max_attempts=3, delay=10, exceptions=(Exception,)):
    """Build a decorator that re-runs a function when it raises.

    Args:
        max_attempts (int): Maximum number of calls before giving up.
        delay (float): Seconds to wait between two consecutive attempts.
        exceptions (tuple): Exception types that trigger a new attempt; any
            other exception propagates immediately.

    Returns:
        Callable: A decorator wrapping the target function.

    Raises:
        RuntimeError: Raised by the wrapped function once every attempt has
            failed; the last caught exception is attached as ``__cause__``.
    """
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            last_error = None
            for attempt in range(1, max_attempts + 1):
                try:
                    return func(*args, **kwargs)
                except exceptions as e:
                    last_error = e
                    # Lazy %-formatting: the exception is a real log argument,
                    # not a stray positional with no placeholder.
                    logging.info("Attempt %d failed: %s", attempt, e)
                    # No point in waiting once there is nothing left to retry.
                    if attempt < max_attempts:
                        time.sleep(delay)
            raise RuntimeError(
                f"Function {func.__name__} failed after {max_attempts} attempts"
            ) from last_error
        return wrapper
    return decorator

In [None]:
# File suffixes listed as documents of interest.
extensions = (
    '.pdf',
    '.odt',
    '.xls',
    '.csv',
    '.ods',
    '.xlsx',
    '.doc',
    '.zip',
    '.docx',
    '.sxc',
)

# Driver manager

In [None]:
class WebDriverManager:
    """Owns a single Chrome driver and recreates it when the browser is gone.

    Args:
        DIR_DOWNLOAD: Folder configured as Chrome's download directory.
        chromedriver_path: Location of the chromedriver executable. Defaults
            to the executable sitting in the current working directory.
    """

    def __init__(self, DIR_DOWNLOAD, chromedriver_path=r".\chromedriver.exe"):
        self.driver = None
        self.output = DIR_DOWNLOAD
        self.chromedriver_path = chromedriver_path

    def _build_prefs(self):
        """Return the Chrome preferences used for every new browser session."""
        return {
            # Chrome expects an absolute path for the download directory.
            "download.default_directory": os.path.abspath(self.output),
            "download.directory_upgrade": True,
            "profile.default_content_settings.popups": 0,
        }

    def _discard_driver(self):
        """Best-effort shutdown of a driver that is no longer usable."""
        if self.driver is None:
            return
        try:
            self.driver.quit()
        except Exception as e:
            logging.debug(f'Could not quit the previous driver: {e}')
        finally:
            self.driver = None

    def start_driver(self, url):
        """Start a new browser on ``url`` unless a live one already exists.

        Args:
            url: Page opened right after the browser starts.

        Returns:
            The active driver instance.
        """
        if not self.is_driver_open():
            # A dead session may still hold a chromedriver process: release it
            # before spawning a replacement.
            self._discard_driver()

            opts = Options()
            opts.add_experimental_option("prefs", self._build_prefs())
            self.driver = Chrome(service=Service(self.chromedriver_path), options=opts)

            self.driver.get(url)
            time.sleep(10)  # Waiting for main page to load
        return self.driver

    def is_driver_open(self):
        """Tell whether the managed driver still answers to commands."""
        if self.driver:
            try:
                # Any round trip to the browser fails when the window is gone.
                self.driver.current_url
                return True
            except NoSuchWindowException:
                return False
            except Exception as e:
                logging.info(f'Driver seems to be closed: {e}')
                return False
        return False

    def get_driver(self, link):
        """Return a live driver positioned on ``link``.

        A running browser is simply navigated to ``link``; otherwise a new
        one is started on it.
        """
        if self.is_driver_open():
            self.driver.get(link)
            return self.driver
        return self.start_driver(link)

    def close_driver(self):
        """Quit the browser (if any) and forget the driver instance."""
        if self.driver:
            try:
                self.driver.quit()
            finally:
                # Never keep a reference to a driver we tried to close.
                self.driver = None

Initialize the webdriver and click on the cookies button

In [None]:
# Shared driver manager for the whole notebook; `driver` is the module-level
# browser handle used by the stand-alone helper functions below.
driver_manager = WebDriverManager(download_path)
driver = driver_manager.get_driver(link=url_comune)

# Sections

In [None]:
def get_sections() -> dict:
    """List the top-level sections shown on the portal home page.

    Opens ``url_comune`` with the module-level ``driver`` and reads every
    direct ``div`` child of the ``lista_trasparenza_categorie`` element.

    Returns:
        dict: ``{section title: {'href': section link}}``.
    """
    driver.get(url_comune)
    container = driver.find_element(By.ID, 'lista_trasparenza_categorie')

    found = {}
    for block in container.find_elements(By.XPATH, './div'):
        heading = block.find_element(By.TAG_NAME, 'h3')
        title = heading.text
        anchor = heading.find_element(By.TAG_NAME, 'a')
        found[title] = {'href': anchor.get_attribute('href')}

    return found

# Subsections

In [None]:
def get_subsections(s_link: str) -> dict:
    """List the subsections linked from a section page.

    Args:
        s_link (str): Address of the section page.

    Returns:
        dict: ``{subsection title: {'href': subsection link}}``; whatever was
        collected before a missing element was hit (possibly nothing).
    """
    driver.get(s_link)
    found = {}
    try:
        container = driver.find_element(By.ID, 'lista_servizi_privati')
        for block in container.find_elements(By.XPATH, './div'):
            anchor = block.find_element(By.TAG_NAME, 'a')
            title, href = anchor.text, anchor.get_attribute('href')
            found[title] = {'href': href}
    except NoSuchElementException:
        # Pages without the list (or rows without a link) end the scan.
        pass
    return found

In [None]:
# for s, s_values in sections.items():
#     s_link = s_values['href']
#     print(f'{s}: {s_link}')
#     subsections = get_subsections(s_link=s_link)
#     if subsections:
#         for ss, ss_values in subsections.items():
#             ss_link = ss_values['href']
#             print(f'\t{ss}: {ss_link}')

# Microsections

In [None]:
def get_microsections(ss_link: str) -> dict:
    """List the microsection buttons found on a subsection page.

    Args:
        ss_link (str): Address of the subsection page.

    Returns:
        dict: ``{button text: {'href': button link}}``; empty when the page
        has no matching button.
    """
    driver.get(ss_link)
    microsections = {}
    # find_elements returns an empty list when nothing matches, so no
    # NoSuchElementException handling is needed here.
    for ms_e in driver.find_elements(By.CLASS_NAME, 'btn.btn-primary.my-3'):
        ms_name = ms_e.text
        ms_link = ms_e.get_attribute('href')

        microsections[ms_name] = {'href': ms_link}
    return microsections

In [None]:
# get_microsections(ss_link=ss_link)

# SectionsTableFileExtractor

In [None]:
class SectionsTableFileExtractor:
    """Collects the atti listed in a section and downloads their attachments.

    On construction the section is opened and every result page is walked to
    build ``dict_links_oggetti`` (``{atto link: {n_atto, start_date,
    end_date}}``). ``get_dict_objects`` then visits each atto and downloads
    the linked files into ``dir_download``.

    Args:
        driver_manager: Object exposing ``get_driver(link=...)``.
        link_section (str): Address of the section to scrape.
        dir_download (str): Folder where files are written (created if missing).
        last_date_update (datetime): Only atti dated after this value are
            collected; ``None`` collects everything.
        files_in_db (list[str]): File names already stored; they are not
            reported again. ``None`` is treated as an empty list.
    """

    # Layouts tried when parsing a date read from the page.
    # NOTE(review): the site's date format is not visible here; day/month/year
    # is assumed first - confirm against the live pages.
    _DATE_FORMATS = ('%d/%m/%Y', '%d-%m-%Y', '%Y-%m-%d')
    # Seconds to wait for an HTTP response before giving up.
    _REQUEST_TIMEOUT = 60

    def __init__(self,
                 driver_manager,
                 link_section: str,
                 dir_download: str,
                 last_date_update: datetime,
                 files_in_db: list[str]):
        self.driver             = driver_manager.get_driver(link=link_section)
        self.link_section       = link_section
        self.dir_download       = self.check_dir_download(dir_download)
        self.last_date_update   = last_date_update
        # Membership tests are run against this value, so it must be a container.
        self.files_in_db        = files_in_db if files_in_db is not None else []
        self.dict_links_oggetti = self.loop_all_pages()

    def check_dir_download(self, dir_download):
        """Create ``dir_download`` when missing and return it unchanged."""
        os.makedirs(dir_download, exist_ok=True)
        return dir_download

    @staticmethod
    def _labelled_text(ogg_e: WebElement, label: str) -> str:
        """Return the text that follows ``label`` inside the row ``ogg_e``.

        Raises:
            NoSuchElementException: When the row has no div holding that label.
        """
        div_e = ogg_e.find_element(By.XPATH, f'.//div[label[text()="{label}"]]')
        return div_e.text.split(label)[-1].strip()

    def get_n_atto(self, ogg_e: WebElement) -> str:
        """Return the 'Nr. Atto:' value of a row, or ``'Null'`` if unavailable."""
        try:
            return self._labelled_text(ogg_e, 'Nr. Atto:')
        except NoSuchElementException:
            return 'Null'
        except Exception as e:
            logging.warning(f'SectionsTable: Could not get n_atto: {e}')
            return 'Null'

    def get_start_date(self, ogg_e: WebElement) -> str:
        """Return the 'Data Atto:' (or, failing that, 'Data inizio:') text.

        Returns ``'Null'`` when neither label is present. Errors other than a
        missing element on the first lookup propagate to the caller.
        """
        try:
            return self._labelled_text(ogg_e, 'Data Atto:')
        except NoSuchElementException:
            pass

        try:
            return self._labelled_text(ogg_e, 'Data inizio:')
        except NoSuchElementException:
            return 'Null'
        except Exception as e:
            logging.warning(f'SectionsTable: Could not get start date: {e}')
            return 'Null'

    def get_end_date(self, ogg_e: WebElement) -> str:
        """Return the 'Data fine:' value of a row, or ``'Null'`` if unavailable."""
        try:
            return self._labelled_text(ogg_e, 'Data fine:')
        except NoSuchElementException:
            return 'Null'
        except Exception as e:
            logging.warning(f'SectionsTable: Could not get end date: {e}')
            return 'Null'

    @staticmethod
    def get_oggetto_link(ogg_e: WebElement) -> str:
        """Return the href of the first anchor in the row, or ``None`` on failure."""
        try:
            a_element = ogg_e.find_element(By.TAG_NAME, 'a')
            return a_element.get_attribute('href')
        except Exception as e:
            logging.warning(f'SectionsTable: Could not get link from oggetto {e}')
            return None

    @staticmethod
    def rename_file(file_name: str, link_ogg: str) -> str:
        """Prefix ``file_name`` with the first 8 hex digits of SHA-256(``link_ogg``)."""
        id = hashlib.sha256(link_ogg.encode()).hexdigest()[:8]

        return f'{id}_{file_name}'

    def get_main_info_oggetto(self, ogg_e: WebElement) -> dict:
        """Return ``{link: {n_atto, start_date, end_date}}`` for one row."""
        n_atto     = self.get_n_atto(ogg_e=ogg_e)
        start_date = self.get_start_date(ogg_e=ogg_e)
        end_date   = self.get_end_date(ogg_e=ogg_e)
        link       = self.get_oggetto_link(ogg_e=ogg_e)

        return {link: dict(n_atto=n_atto,
                           start_date=start_date,
                           end_date=end_date)}

    @classmethod
    def _parse_date(cls, text):
        """Parse a date string with the known layouts; ``None`` if none fits."""
        for fmt in cls._DATE_FORMATS:
            try:
                return datetime.strptime(text, fmt).date()
            except (TypeError, ValueError):
                continue
        return None

    def _is_after_last_update(self, start_date_text: str) -> bool:
        """Tell whether a row dated ``start_date_text`` still has to be collected.

        Rows are kept when no reference date is set or when their date cannot
        be parsed: re-processing an atto is preferred over silently missing it.
        """
        if self.last_date_update is None:
            return True
        row_date = self._parse_date(start_date_text)
        if row_date is None:
            return True
        reference = self.last_date_update
        # datetime and date cannot be compared directly: reduce to plain dates.
        if isinstance(reference, datetime):
            reference = reference.date()
        return reference < row_date

    def get_atti_from_page(self) -> dict:
        """Collect the atti shown on the page currently loaded in the driver.

        Returns:
            dict: ``{link: {n_atto, start_date, end_date}}`` for the rows of
            ``table_delibere`` newer than ``last_date_update``; whatever was
            gathered so far if the page cannot be read.
        """
        all_oggetti_page = {}
        try:
            # Waiting for the tabella element
            body_e = WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((By.ID, 'table_delibere'))
            )

            # One row for each atto
            rows_e = body_e.find_elements(By.CLASS_NAME, 'paginated_element')

            for r_e in rows_e:
                try:
                    start_date = self.get_start_date(r_e)
                    if not self._is_after_last_update(start_date):
                        continue  # atto already in DB, skipping it

                    dict_ogg = self.get_main_info_oggetto(ogg_e=r_e)
                    # A row whose link could not be read cannot be opened later.
                    dict_ogg.pop(None, None)
                    all_oggetti_page.update(dict_ogg)
                except StaleElementReferenceException:  # Skip this row if it became stale
                    continue
        except Exception as e:
            logging.warning(f'SectionsTable: Could not get the atti from this page: {e}')

        return all_oggetti_page

    def loop_all_pages(self) -> dict:
        """Walk every result page of the section and merge the atti found.

        The walk ends the first time the 'Successiva' button cannot be found
        and clicked (or any other error occurs).
        """
        all_oggetti_dict = {}
        self.driver.get(self.link_section)

        page_number = 1
        while True:
            try:
                logging.info(f'Getting atti from page: {page_number}')
                dict_page = self.get_atti_from_page()
                all_oggetti_dict.update(dict_page)

                # Finding next page button
                next_button = WebDriverWait(self.driver, 10).until(
                    EC.element_to_be_clickable((By.XPATH, "//a[@class='button' and @title='Successiva']"))
                )

                # Use JavaScript to click
                self.driver.execute_script("arguments[0].click();", next_button)
                page_number += 1

                # Waiting for the new content to load
                time.sleep(2)

            except Exception as e:
                logging.debug(f'SectionsTable: Stopping pagination at page {page_number}: {e}')
                break
        return all_oggetti_dict

    @staticmethod
    def get_filename_from_cd(cd):
        """Extract the ``filename=`` value from a Content-Disposition header.

        Returns ``None`` when the header is empty or carries no ``filename=``.
        """
        if not cd:
            return None
        fname = None
        if 'filename=' in cd:
            fname = cd.split('filename=')[1]
            if '"' in fname:
                fname = fname.split('"')[1]
            else:
                fname = fname.split(';')[0].strip()
        return fname

    def general_download(self, link_file: str) -> Tuple[str, str, Path]:
        """Download ``link_file`` into ``dir_download``.

        The file name comes from the Content-Disposition header when present,
        otherwise from the last segment of the link, and is prefixed through
        ``rename_file``.

        Returns:
            Tuple[str, str, Path]: File name, link and saved path, or
            ``(None, None, None)`` when the download fails.
        """
        try:
            response = requests.get(link_file, timeout=self._REQUEST_TIMEOUT)
            # Do not store the body of an error page as if it were the document.
            response.raise_for_status()

            b_name = self.get_filename_from_cd(response.headers.get('Content-Disposition'))
            if not b_name:
                b_name = link_file.split('/')[-1].replace('getDoc.php?f=', '')
            file_name = self.rename_file(b_name, link_file)

            # Construct the full path of the file
            file_path = Path(self.dir_download, file_name)

            with open(file_path, 'wb') as file:
                file.write(response.content)

            return file_name, link_file, file_path
        except Exception as e:
            logging.info(f'SectionsTable: Could not download file from {link_file}: {e}')
            return None, None, None

    def download_single_file_external_page(self, link_file_external: str) -> Tuple[str, str, Path]:
        """Open an intermediate page and download the file behind its button.

        The page is searched for a ``btn btn-primary`` element first and a
        ``btn btn-secondary`` one otherwise; the button's href, stripped of
        its ``&CSRF`` suffix, is then downloaded.

        Returns:
            Tuple[str, str, Path]: File name, link and saved path, or
            ``(None, None, None)`` when nothing could be downloaded.
        """
        scarica_button = None
        try:
            self.driver.get(link_file_external)
            for class_name in ('btn.btn-primary', 'btn.btn-secondary'):
                try:
                    scarica_button = self.driver.find_element(By.CLASS_NAME, class_name)
                    break
                except NoSuchElementException:
                    continue
        except Exception as e:
            logging.info(f'SectionsTable: No button found in external link: {e}')
            return None, None, None

        if scarica_button is None:
            logging.info(f'SectionsTable: No button found in external link: {link_file_external}')
            return None, None, None

        try:
            link_full = scarica_button.get_attribute('href')
            parts = link_full.split('&CSRF')
            # Without a CSRF suffix the href is already the file link.
            link_file = parts[-2] if len(parts) > 1 else link_full

            return self.general_download(link_file=link_file)
        except Exception as e:
            logging.info(f'SectionsTable: Could not get file from external link: {e}')
            return None, None, None

    def download_single_file_atto_page(self, link_ogg: str) -> Tuple[str, str, str]:
        """Download the file reachable from one link of an atto.

        A HEAD request inspects the content type: PDF and 7z links are
        downloaded directly, anything else is treated as an intermediate page
        and handed to ``download_single_file_external_page``.

        Args:
            link_ogg (str): Link found in the atto.

        Returns:
            Tuple[str, str, str]: Name of the file, link of the file and path
            where it has been saved, or ``(None, None, None)`` when nothing
            was downloaded.
        """
        try:
            # Send a HEAD request to check the content type
            response = requests.head(link_ogg, allow_redirects=True,
                                     timeout=self._REQUEST_TIMEOUT)

            content_type = response.headers.get('Content-Type')
            if content_type is None:
                return None, None, None

            # If the link has the file, download it
            if ('application/pdf' in content_type) or ('application/x-7z-compressed' in content_type):
                return self.general_download(link_file=link_ogg)
            # Else maybe it's an external link, try to download it
            return self.download_single_file_external_page(link_file_external=link_ogg)
        except Exception as e:
            logging.warning(f'SectionsTable: Could not download files from {link_ogg}: {e}')
            return None, None, None

    def _collect_files(self, list_link, dict_ogg: dict, dict_info: dict = None) -> None:
        """Download every link and record the new files into ``dict_ogg``.

        A failure on one link is logged and does not stop the others. Files
        whose name is already in ``files_in_db`` are not recorded.
        """
        extra_info = dict_info or {}
        for l in list_link:
            if not l:
                continue  # anchor without href
            try:
                f_name, f_link, f_path = self.download_single_file_atto_page(l)
            except Exception as e:
                logging.warning(f'SectionsTable: Could not download files: {e}')
                continue
            if f_name and f_link and f_path and (f_name not in self.files_in_db):
                dict_ogg[f_name] = dict(**extra_info,
                                        file_name     = f_name,
                                        link          = f_link,
                                        internal_path = f_path)

    @retry(max_attempts=5, delay=10, exceptions=(StaleElementReferenceException, InvalidSessionIdException))
    def get_single_object(self, link_ogg: str, dict_info: dict, dict_ogg: dict):
        """Open one atto and download every file linked from its attachments.

        Args:
            link_ogg (str): Address of the atto page.
            dict_info (dict): Metadata of the atto, copied into every record.
            dict_ogg (dict): Output mapping, updated in place with
                ``{file name: record}``.
        """
        self.driver.get(link_ogg)

        body_e = None
        try:
            body_e = self.driver.find_element(By.CLASS_NAME, 'card-body.pe-0.ps-4.ps-md-5')
        except NoSuchElementException:
            try:
                body_e = self.driver.find_element(By.ID, 'allegati')
            except NoSuchElementException as e:
                logging.warning(f'SectionsTable: Could not get allegati from {link_ogg}: {e}')
        if body_e is None:
            return

        try:
            list_link_e = body_e.find_elements(By.TAG_NAME, 'a')
            list_link = [l_e.get_attribute('href') for l_e in list_link_e]
        except (StaleElementReferenceException, InvalidSessionIdException):
            # Let the retry decorator handle the errors it was configured for.
            raise
        except Exception as e:
            logging.warning(f'SectionsTable: Could not download files: {e}')
            return

        self._collect_files(list_link, dict_ogg, dict_info)

    def get_files_no_object(self, dict_ogg):
        """Download the files linked from 'sezioni-automatiche' on the current page.

        Used when the section lists no atti. ``dict_ogg`` is updated in place.

        Raises:
            NoSuchElementException: When the page has no such element.
        """
        body_e = self.driver.find_element(By.ID, 'sezioni-automatiche')
        list_link_e = body_e.find_elements(By.TAG_NAME, 'a')
        list_link = [l_e.get_attribute('href') for l_e in list_link_e]

        self._collect_files(list_link, dict_ogg)

    def get_dict_objects(self):
        """Download the files of the section and describe them.

        Returns:
            dict: ``{file name: record}`` for every newly downloaded file.
        """
        dict_ogg = {}
        total_oggetti = len(self.dict_links_oggetti)
        logging.info(f'Found {total_oggetti} oggetti to process')

        if self.dict_links_oggetti:
            for index, (link, dict_info) in enumerate(self.dict_links_oggetti.items()):
                # Log progress every 20 items
                if index % 20 == 0 and index > 0:
                    logging.info(f'Processing oggetto {index} of {total_oggetti}')
                try:
                    self.get_single_object(link_ogg  = link,
                                           dict_info = dict_info,
                                           dict_ogg  = dict_ogg)
                except Exception as e:
                    # One broken atto must not stop the others, but leave a trace.
                    logging.warning(f'SectionsTable: Could not process oggetto {link}: {e}')

            logging.info(f'Completed processing all {total_oggetti} oggetti')
        else:
            try:
                self.get_files_no_object(dict_ogg = dict_ogg)
            except Exception as e:
                logging.info(f'SectionsTable: No files collected from the section page: {e}')
                logging.info('SectionsTable: All files up-to-date!')
        return dict_ogg

## Test Personale Section

In [None]:
# sections_test = SectionsTableFileExtractor(driver_manager=driver_manager,
#                            link_section='https://servizi.comune.brindisi.it/openweb/trasparenza/pagina.php?id=18',
#                            dir_download=f'{download_path}/Personale',
#                            last_date_update=None,
#                            files_in_db=[])

In [None]:
# sections_test.dict_links_oggetti

In [None]:
# dict_download = sections_test.get_dict_objects()

## Test Provvedimenti

In [None]:
# sections_test = SectionsTableFileExtractor(driver_manager=driver_manager,
#                            link_section='https://servizi.comune.brindisi.it/openweb/pratiche/registri.php?sezione=provvOrgani',
#                            dir_download=f'{download_path}/Provvedimenti',
#                            last_date_update=None,
#                            files_in_db=[])

In [None]:
# sections_test.dict_links_oggetti

In [None]:
# dict_download = sections_test.get_dict_objects()

# General scraper

In [None]:
class AmministrazioneTrasparenteScrapper:
    """Walks sections, subsections and microsections of the portal and
    downloads the files found in each of them.

    Args:
        driver_manager (WebDriverManager): Provides the browser.
        postgres_manager: Database connection wrapper; may be ``None``.
        minio_manager: Object-storage connection wrapper; may be ``None``.
        url_comune (str): Entry page of the transparency portal.
        dir_output (str): Root folder for the downloaded files.
    """

    def __init__(self,
                 driver_manager: WebDriverManager,
                 postgres_manager,
                 minio_manager,
                 url_comune:str,
                 dir_output:str,
                 ) -> None:
        self.driver_manager   = driver_manager
        self.driver           = driver_manager.get_driver(link=url_comune)
        self.postgres_manager = postgres_manager
        self.minio_manager    = minio_manager
        self.url_comune       = url_comune
        self.dir_download     = dir_output
        self.sections         = self.get_sections()

    def close_all_connections(self):
        """Close the browser and every backend connection that was provided."""
        logging.info('Closing all connections')
        self.driver_manager.close_driver()
        if self.postgres_manager is not None:
            self.postgres_manager.close_connection()
        if self.minio_manager is not None:
            self.minio_manager.close_connection()

    def get_sections(self) -> dict:
        """Return ``{section title: {'href': link}}`` read from the entry page."""
        sections = {}
        # Use the address given to this instance, not the notebook global.
        self.driver.get(self.url_comune)
        body_e = self.driver.find_element(By.ID, 'lista_trasparenza_categorie')
        div_e = body_e.find_elements(By.XPATH, './div')

        for d_e in div_e:
            s_e    = d_e.find_element(By.TAG_NAME, 'h3')
            s_name = s_e.text
            s_a_e  = s_e.find_element(By.TAG_NAME, 'a')
            s_link = s_a_e.get_attribute('href')

            sections[s_name] = {'href': s_link}
        return sections

    def get_subsections(self, s_link: str) -> dict:
        """Return ``{subsection title: {'href': link}}`` for a section page.

        The result is empty (or partial) when an expected element is missing.
        """
        self.driver.get(s_link)
        subsections = {}
        try:
            body_e = self.driver.find_element(By.ID, 'lista_servizi_privati')
            div_e = body_e.find_elements(By.XPATH, './div')
            for d_e in div_e:
                a_e = d_e.find_element(By.TAG_NAME, 'a')
                ss_name = a_e.text
                ss_link = a_e.get_attribute('href')

                subsections[ss_name] = {'href': ss_link}
        except NoSuchElementException:
            pass
        return subsections

    def get_microsections(self, ss_link: str) -> dict:
        """Return ``{button text: {'href': link}}`` for the microsection
        buttons of a page; empty when there is none."""
        self.driver.get(ss_link)
        microsections = {}
        # find_elements returns an empty list when nothing matches.
        for ms_e in self.driver.find_elements(By.CLASS_NAME, 'btn.btn-primary.my-3'):
            ms_name = ms_e.text
            ms_link = ms_e.get_attribute('href')

            microsections[ms_name] = {'href': ms_link}
        return microsections

    def get_files_microsection(self, s: str, s_link: str, ss: str = None, ss_link: str = None):
        """Download the files of a section, or of one of its subsections.

        When the page exposes microsection buttons each of them is scraped,
        otherwise the page itself is scraped.

        Args:
            s (str): Section name (first level of the download folder).
            s_link (str): Address of the section page.
            ss (str): Optional subsection name (second folder level).
            ss_link (str): Address of the subsection page, used when ``ss``
                is given.

        Returns:
            dict: ``{file name: record}`` merged over every scraped page.
        """
        # struttuta_t = StrutturaTable(postgres_manager = self.postgres_manager,
                                    #  sottosezione_lv1 = s,
                                    #  sottosezione_lv2 = ss)
        # id_struttura = struttuta_t.get_id_struttura
        # files_in_db = DocumentiTable(postgres_manager = self.postgres_manager,
        #                              id_struttura     = id_struttura).get_files_already_in_db()
        # last_date_update = struttuta_t.check_last_date_document()

        last_date_update = None #TODO
        files_in_db = [] #TODO
        if not ss:
            dir_download = f'{self.dir_download}/{s}'
            link_section = s_link
        else:
            dir_download = f'{self.dir_download}/{s}/{ss}'
            link_section = ss_link

        # Look for microsections on the page that is actually being scraped
        # (the section page itself when there is no subsection).
        microsections = self.get_microsections(ss_link = link_section)
        time.sleep(5)

        dict_oggetti = {}
        if microsections:
            for ms_values in microsections.values():
                ms_link = ms_values['href']

                try:
                    # The extractor opens ms_link itself.
                    dict_ms = SectionsTableFileExtractor(driver_manager   = self.driver_manager,
                                                         link_section     = ms_link,
                                                         dir_download     = dir_download,
                                                         last_date_update = last_date_update,
                                                         files_in_db      = files_in_db).get_dict_objects()
                    dict_oggetti.update(dict_ms)
                except Exception as e:
                    logging.warning(f'Could not download files for microsection: {ms_link}: {e}')
        else:
            logging.info(f'link_section: {link_section}')
            try:
                dict_oggetti = SectionsTableFileExtractor(driver_manager   = self.driver_manager,
                                                          link_section     = link_section,
                                                          dir_download     = dir_download,
                                                          last_date_update = last_date_update,
                                                          files_in_db      = files_in_db).get_dict_objects()

            except Exception as e:
                logging.warning(f'Could not download files for section: {s}: {e}')

        logging.info(f'Collected {len(dict_oggetti)} files from {link_section}')
        # WriteFiles(id_struttura     = id_struttura,
        #            dict_files       = dict_oggetti,
        #            dir_download     = dir_download,
        #            postgres_manager = self.postgres_manager,
        #            minio_manager    = self.minio_manager).write_all_documents_to_db()
        return dict_oggetti

    def get_all_atti(self):
        """Scrape every section, descending into subsections when present."""
        for s, s_values in self.sections.items():
            s_link = s_values['href']
            logging.info(f'Section: {s}')
            subsections = self.get_subsections(s_link = s_link)
            if subsections:
                for ss, ss_values in subsections.items():
                    logging.info(f'\tSubsection: {ss}')
                    ss_link = ss_values['href']

                    self.get_files_microsection(s       = s,
                                                s_link  = s_link,
                                                ss      = ss,
                                                ss_link = ss_link)
            else:
                self.get_files_microsection(s      = s,
                                            s_link = s_link)

        # self.close_all_connections()

In [None]:
# Scraper instance for manual testing: no database or object storage is
# attached, only the shared browser and the local download folder.
test_pt = AmministrazioneTrasparenteScrapper(driver_manager=driver_manager,
                                             postgres_manager=None,
                                             minio_manager=None,
                                             url_comune=url_comune,
                                             dir_output=download_path)

In [None]:
# Display the sections read from the portal entry page.
test_pt.get_sections()

In [None]:
# Manual run on a single subsection.
# NOTE(review): `s` is None here, and the method formats it into the download
# folder path, so files end up under a folder literally named "None" - confirm
# the intended section name.
test_pt.get_files_microsection(s       =  None,
                               s_link  = 'https://servizi.comune.brindisi.it/openweb/trasparenza/categoria.php?id=12',
                               ss      = 'Titolari di incarichi dirigenziali amministrativi di vertice',
                               ss_link = 'https://servizi.comune.brindisi.it/openweb/trasparenza/pagina.php?id=18') 