In [46]:
from datetime import datetime,date
from functools import wraps
from selenium.webdriver import Chrome
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException, NoSuchWindowException, ElementNotInteractableException, StaleElementReferenceException
from selenium.webdriver.remote.webelement import WebElement
from selenium.webdriver.support.ui import Select, WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from urllib.parse import urlparse, unquote

import csv
import hashlib
import logging
import os
import requests
import re
import time

from pathlib import Path
from typing import Tuple

In [33]:
# Configure the root logger with the default handler, then lower its
# threshold to INFO so the logging.info() calls in this notebook are emitted.
logging.basicConfig()
logging.getLogger().setLevel(logging.INFO)

In [34]:
# Scraper settings, read as module-level globals by the cells below.
comune = 'brindisi'  # municipality name; not referenced by the visible code
download_path = './ap'  # base directory for downloaded files
url_comune = 'https://servizi.comune.brindisi.it/openweb/albo/albo_pretorio.php'  # page opened by the driver

In [35]:
def retry(max_attempts=3, delay=10, exceptions=(Exception,)):
    """Build a decorator that retries a function when it raises.

    Args:
        max_attempts: Maximum number of calls before giving up.
        delay: Seconds to sleep between two consecutive attempts.
        exceptions: Exception types that trigger a retry; any other
            exception propagates immediately.

    Returns:
        A decorator. The decorated function returns the result of the first
        successful call.

    Raises:
        RuntimeError: Raised by the decorated function once ``max_attempts``
            calls have all failed; the last caught exception is attached as
            ``__cause__``.
    """
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            last_error = None
            for attempt in range(1, max_attempts + 1):
                try:
                    return func(*args, **kwargs)
                except exceptions as e:
                    last_error = e
                    # Lazy %-formatting: the exception must be a real format
                    # argument, otherwise logging reports a formatting error.
                    logging.info("Attempt %d failed: %s", attempt, e)
                    # No point in waiting when there is no attempt left.
                    if attempt < max_attempts:
                        time.sleep(delay)
            raise RuntimeError(
                f"Function {func.__name__} failed after {max_attempts} attempts"
            ) from last_error
        return wrapper
    return decorator

# Driver manager

In [36]:
class WebDriverManager:
    """Own a single Chrome WebDriver and reopen it when it stops responding.

    Args:
        DIR_DOWNLOAD: Directory configured as Chrome's default download
            folder.
        chromedriver_path: Path of the chromedriver executable handed to the
            Selenium ``Service``.
    """

    def __init__(self, DIR_DOWNLOAD, chromedriver_path=r".\chromedriver.exe"):
        # Keep the constructor argument instead of relying on a module-level
        # global, so each manager downloads where it was told to.
        self.dir_download = DIR_DOWNLOAD
        self.chromedriver_path = chromedriver_path
        self.driver = None

    def start_driver(self, url):
        """Start Chrome and load ``url`` unless a live driver already exists.

        Args:
            url: Page to open right after the browser starts.

        Returns:
            The managed driver instance.
        """
        if not self.is_driver_open():
            opts = Options()
            prefs = {
                # Resolved to an absolute path so the download location does
                # not depend on how Chrome interprets a relative directory.
                "download.default_directory": os.path.abspath(self.dir_download),
                "directory_upgrade": True,
                "profile.default_content_settings.popups": 0,
            }
            opts.add_experimental_option("prefs", prefs)
            # NOTE: for headless/containerised runs the original notebook
            # kept an alternative setup using the arguments --no-sandbox,
            # --headless, --disable-gpu, --disable-dev-shm-usage and the
            # pref "plugins.always_open_pdf_externally": True.
            service = Service(self.chromedriver_path)
            self.driver = Chrome(service=service, options=opts)

            self.driver.get(url)
            time.sleep(10)  # Waiting for main page to load
        return self.driver

    def is_driver_open(self):
        """Return True when a driver exists and still answers a request."""
        if not self.driver:
            return False
        try:
            # Any driver property works as a liveness probe.
            self.driver.current_url
            return True
        except NoSuchWindowException:
            return False
        except Exception as e:
            logging.info(f'Driver seems to be closed: {e}')
            return False

    def get_driver(self, link):
        """Return the live driver, (re)starting it on ``link`` if needed.

        Args:
            link: Page to open when the driver has to be started.

        Returns:
            The managed driver instance. An already open driver is returned
            as-is, without navigating to ``link``.
        """
        if not self.is_driver_open():
            logging.warning('Driver is not responding, reopening')
            self.start_driver(link)
        return self.driver

    def close_driver(self):
        """Quit the browser, if any, and forget the driver."""
        if self.driver:
            self.driver.quit()
            self.driver = None

In [37]:
# Single manager instance shared by the cells below.
driver_manager = WebDriverManager(download_path)

In [38]:
# Starts the browser on the albo page if no live driver exists yet.
driver = driver_manager.get_driver(url_comune)



# Loop through all pages of a section

In [39]:
# Locate the results table by id, then collect the row elements inside it.
body_e = driver.find_element(By.ID, 'tabella_albo')

rows_e = body_e.find_elements(By.CLASS_NAME, 'paginated_element')

In [40]:
# For every row, print the text of the <div> that holds the 'Data affissione:'
# label next to the text of the one that holds 'Fine Pubblicazione:'.
for r_e in rows_e:
    data_affissione_div, fine_pubblicazione_div = [
        r_e.find_element(By.XPATH, f".//div[label/text()='{label}']")
        for label in ('Data affissione:', 'Fine Pubblicazione:')
    ]

    print(data_affissione_div.text, fine_pubblicazione_div.text)
    

05/06/2024 06/06/2024
05/06/2024 06/06/2024
05/06/2024 06/06/2024
05/06/2024 06/06/2024
05/06/2024 06/06/2024
05/06/2024 06/06/2024
05/06/2024 06/06/2024
05/06/2024 06/06/2024
05/06/2024 06/06/2024
05/06/2024 06/06/2024
05/06/2024 06/06/2024
05/06/2024 05/07/2024
05/06/2024 20/06/2024
05/06/2024 20/06/2024
05/06/2024 20/06/2024


In [48]:
# Walk the result pages: print the text of the first <div> of every row, then
# click "Successiva" until a page has no rows or a wait/lookup fails.
while True:
    try:
        # Block (up to 10 s) until the results table is in the DOM
        body_e = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.ID, 'tabella_albo'))
        )
        rows_e = body_e.find_elements(By.CLASS_NAME, 'paginated_element')

        # A page without rows ends the walk
        if not rows_e:
            break

        for r_e in rows_e:
            try:
                progressivo = r_e.find_element(By.TAG_NAME, 'div').text
            except StaleElementReferenceException:
                # The row was replaced while reading it: skip it
                continue
            print(progressivo)

        # Wait for the next-page link, then click it through JavaScript
        next_locator = (By.XPATH, "//a[@class='button' and @title='Successiva']")
        next_button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable(next_locator)
        )
        driver.execute_script("arguments[0].click();", next_button)

        # Give the new content time to load
        time.sleep(2)

    except Exception as e:
        print("No more pages or an error occurred:", e)
        break


2024/0002730
2024/0002729
2024/0002728
2024/0002727
2024/0002726
2024/0002725
2024/0002724
2024/0002723
2024/0002722
2024/0002721
2024/0002720
2024/0002719
2024/0002718
2024/0002717
2024/0002716
2024/0002715
2024/0002714
2024/0002713
2024/0002712
2024/0002711
2024/0002710
2024/0002709
2024/0002708
2024/0002707
2024/0002706
2024/0002705
2024/0002704
2024/0002703
2024/0002702
2024/0002701
2024/0002700
2024/0002699
2024/0002698
2024/0002697
2024/0002696
2024/0002695
2024/0002694
2024/0002693
2024/0002692
2024/0002691
2024/0002690
2024/0002689
2024/0002688
2024/0002687
2024/0002686
2024/0002685
2024/0002684
2024/0002683
2024/0002682
2024/0002681
2024/0002680
2024/0002679
2024/0002678
2024/0002677
2024/0002676
2024/0002675
2024/0002674
2024/0002673
2024/0002672
2024/0002671
2024/0002670
2024/0002669
2024/0002668
2024/0002667
2024/0002666
2024/0002665
2024/0002664
2024/0002663
2024/0002662
2024/0002661
2024/0002660
2024/0002659
2024/0002658
2024/0002657
2024/0002656
2024/0002655
2024/0002654

In [None]:
# selector for page number
# Read every non-empty value offered by the 'range_pagina' drop-down
page_options = driver.find_element(By.ID, 'range_pagina').find_elements(By.TAG_NAME, 'option')
pages = [opt.get_attribute('value') for opt in page_options if opt.get_attribute('value') != '']

for p in pages:
    print(f'Getting elements: {p}')

    # The drop-down is looked up again on every iteration before selecting
    Select(driver.find_element(By.ID, 'range_pagina')).select_by_value(p)

    time.sleep(2)


# Getting the oggetti (list items) and their info

In [None]:
# Locate the list wrapper, then collect its direct <ul>/<li> children.
body_e = driver.find_element(By.CLASS_NAME, 'it-list-wrapper')

oggetti_e = body_e.find_elements(By.XPATH, './ul/li')

In [None]:
# Number of list items found on the current page.
len(oggetti_e)

In [None]:
# Print a separator followed by the 'link-sublist' text of every list item.
for o_e in oggetti_e:
    print('NEW_OGGETTO')
    print(o_e.find_element(By.CLASS_NAME, 'link-sublist').text)

In [None]:
# Parse the publication interval out of the first 'link-sublist' element.
info_e = driver.find_element(By.CLASS_NAME, 'link-sublist')
print(info_e.text)

# Regular expression pattern to match the date pattern
date_pattern = r"In pubblicazione dal (\d{2}/\d{2}/\d{4}) al (\d{2}/\d{2}/\d{4})"

# Search for the publication interval in the info text
date_matches = re.search(date_pattern, info_e.text)

# Always bind both names so that the next cell neither fails with a NameError
# nor shows stale values from a previous run when nothing matches.
start_date = end_date = None

if date_matches:
    start_date = datetime.strptime(date_matches.group(1), '%d/%m/%Y')
    end_date = datetime.strptime(date_matches.group(2), '%d/%m/%Y')
else:
    print('No dates were found')

In [None]:
# Display the dates parsed in the previous cell.
start_date, end_date

In [None]:
# Registry number of the atto; any four-digit year is accepted so the cell
# keeps working for registers of other years.
atto_pattern = r"Registro: \d{4}/(\d+)"

# Find matches in the string
match = re.search(atto_pattern, info_e.text)

# re.search returns None when nothing matches: do not call .group() on it
if match:
    print(match.group(1))
else:
    print('No atto number was found')

# Downloading files

In [None]:
# Locate the element with class 'documento' and print its text.
body_e = driver.find_element(By.CLASS_NAME, 'documento')
print(body_e.text)

In [None]:
# All anchors inside the 'documento' element.
a_elements = body_e.find_elements(By.TAG_NAME, 'a')

In [None]:
# Print the text node that follows each anchor.
for a_e in a_elements:
    # An anchor may have no next sibling, and a non-text sibling has a null
    # nodeValue: both cases come back as None and are printed as ''.
    raw_sibling_text = driver.execute_script("""
    var sibling = arguments[0].nextSibling;
    return sibling ? sibling.nodeValue : null;
    """, a_e)
    sibling_text = (raw_sibling_text or '').strip()

    print(sibling_text)

# Class for scraping the oggetti of a section

In [None]:
class SectionsAlbo:
    """Collect the atti of one albo section and download their attachments.

    On construction the section is selected in the search form and every
    result page is scanned; the links found are stored in
    ``dict_links_oggetti`` (link -> ``n_atto`` / ``start_date`` /
    ``end_date``). ``get_dict_objects`` then visits each link and downloads
    the files listed there.

    Args:
        driver_manager: Manager that provides the shared Selenium driver.
        section_otion: Value of the option to select in the
            'tendinaTipoPratiche' menu (the misspelt name is kept because
            callers pass it by keyword).
        dir_download: Directory where files are saved; created if missing.
        last_date_update: Only atti whose publication start is later than
            this are collected; ``None`` collects everything.
    """

    # Any four-digit year is accepted, not just 2024.
    _ATTO_PATTERN = re.compile(r"Registro: \d{4}/(\d+)")
    _DATE_PATTERN = re.compile(
        r"In pubblicazione dal (\d{2}/\d{2}/\d{4}) al (\d{2}/\d{2}/\d{4})"
    )
    _FILE_NAME_PATTERN = re.compile(r'- (.*?) -')
    _DATE_FORMAT = '%d/%m/%Y'
    # Returns null instead of throwing when the anchor has no next sibling.
    _SIBLING_TEXT_JS = """
        var sibling = arguments[0].nextSibling;
        return sibling ? sibling.nodeValue : null;
    """
    _DOWNLOAD_TIMEOUT = 30  # seconds allowed for each file download

    def __init__(self, driver_manager: "WebDriverManager", section_otion: str,
                 dir_download: str, last_date_update: datetime):
        self.driver_manager = driver_manager
        self.driver = self.get_section(section_option=section_otion)
        self.dir_download = self.check_dir_download(dir_download)
        self.last_date_update = last_date_update
        self.dict_links_oggetti = self.loop_all_pages()

    def get_section(self, section_option: str):
        """Select ``section_option`` in the search form and submit it.

        Returns:
            The driver, positioned on the search results.
        """
        driver = self.driver_manager.get_driver(url_comune)

        select_menu = Select(driver.find_element(By.ID, 'tendinaTipoPratiche'))
        select_menu.select_by_value(section_option)

        driver.find_element(By.ID, 'inforBottoneConfermaAlbo').click()
        return driver

    def check_dir_download(self, dir_download):
        """Create ``dir_download`` when it does not exist and return it."""
        os.makedirs(dir_download, exist_ok=True)
        return dir_download

    def get_n_atto(self, info_atto: str) -> str:
        """Extract the registry number from the info text.

        Returns:
            The digits following ``Registro: <year>/``, or ``None`` (with a
            warning) when the text does not contain them.
        """
        match = self._ATTO_PATTERN.search(info_atto or '')
        if match is None:
            logging.warning('SectionsAlbo: Could not get n_atto from: %r', info_atto)
            return None
        return match.group(1)

    def get_publish_dates(self, info_atto: str) -> Tuple[datetime, datetime]:
        """Extract the publication interval from the info text.

        Returns:
            ``(start_date, end_date)``, or ``(None, None)`` (with a warning)
            when the interval is missing or is not a valid date.
        """
        match = self._DATE_PATTERN.search(info_atto or '')
        if match is None:
            logging.warning('SectionsAlbo: It was not possible to get publish dates: no match')
            return None, None
        try:
            start_date = datetime.strptime(match.group(1), self._DATE_FORMAT)
            end_date = datetime.strptime(match.group(2), self._DATE_FORMAT)
        except ValueError as e:
            logging.warning('SectionsAlbo: It was not possible to get publish dates: %s', e)
            return None, None
        return start_date, end_date

    def get_oggetto_link(self, ogg_e: "WebElement") -> str:
        """Return the href of the first anchor in ``ogg_e``, or ``None``."""
        try:
            return ogg_e.find_element(By.TAG_NAME, 'a').get_attribute('href')
        except Exception as e:
            logging.warning('SectionsAlbo: Could not get link from oggetto %s', e)
            return None

    @staticmethod
    def rename_file(file_name: str, link_ogg: str) -> str:
        """Prefix ``file_name`` with the first 8 hex digits of sha256(link)."""
        short_hash = hashlib.sha256(link_ogg.encode()).hexdigest()[:8]
        return f'{short_hash}_{file_name}'

    def download_file(self, file_name: str, link_ogg: str) -> Path:
        """Download ``link_ogg`` into the download directory.

        Returns:
            The path of the saved file, or ``None`` (with a warning) when the
            request or the write fails.
        """
        try:
            # A timeout keeps one stuck server from blocking the whole run.
            response = requests.get(link_ogg, timeout=self._DOWNLOAD_TIMEOUT)
            # Do not store an HTTP error page as if it were the document.
            response.raise_for_status()

            file_path = Path(self.dir_download, file_name)
            with open(file_path, 'wb') as f:
                f.write(response.content)
            return file_path
        except Exception as e:
            logging.warning('SectionsAlbo: It was not possible to download file: %s', e)
            return None

    def get_atti_from_page(self) -> dict:
        """Collect the atti listed on the current result page.

        Returns:
            A dict link -> info for the atti that pass the date filter. It is
            always a dict: on failure, whatever was collected before the
            error is returned.
        """
        all_oggetti_section = {}
        try:
            body_e = self.driver.find_element(By.CLASS_NAME, 'it-list-wrapper')

            for ogg in body_e.find_elements(By.XPATH, './ul/li'):
                info_atto = ogg.find_element(By.CLASS_NAME, 'link-sublist').text

                if self.last_date_update is not None:
                    start_date, _ = self.get_publish_dates(info_atto=info_atto)
                    # Without a start date the filter cannot be evaluated, so
                    # the row is skipped rather than failing the comparison.
                    if start_date is None or start_date <= self.last_date_update:
                        continue

                all_oggetti_section.update(
                    self.get_main_info_oggetto(oggetto=ogg, info_atto=info_atto)
                )
        except Exception as e:
            logging.warning('SectionsAlbo: Could not get the atti from this page: %s', e)
        return all_oggetti_section

    def loop_all_pages(self) -> dict:
        """Visit every page offered by the page selector and merge the atti.

        Returns:
            A dict link -> info for all pages, or ``{}`` when the page
            selector is missing or an unexpected error occurs.
        """
        try:
            all_oggetti_dict = {}
            # selector for page number to get all options
            page_s_e = self.driver.find_element(By.ID, 'range_pagina')
            option_values = (
                opt.get_attribute('value')
                for opt in page_s_e.find_elements(By.TAG_NAME, 'option')
            )
            pages = [value for value in option_values if value]

            for p in pages:
                # The selector is looked up again on the instance's own
                # driver (not a module-level one) before each selection.
                page_select = Select(self.driver.find_element(By.ID, 'range_pagina'))
                page_select.select_by_value(p)

                time.sleep(2)

                all_oggetti_dict.update(self.get_atti_from_page())

            return all_oggetti_dict

        except NoSuchElementException:
            logging.info('SectionsAlbo: No page selector found, no atti collected')
            return {}
        except Exception as e:
            logging.warning('SectionsAlbo: Could not get all page numbers: %s', e)
            return {}

    def get_main_info_oggetto(self, oggetto: "WebElement", info_atto: str) -> dict:
        """Build the ``{link: info}`` entry for one oggetto.

        Returns:
            A one-item dict, or ``{}`` when the oggetto has no link (a
            ``None`` key could not be opened later).
        """
        link = self.get_oggetto_link(ogg_e=oggetto)
        if not link:
            return {}

        n_atto = self.get_n_atto(info_atto=info_atto)
        start_date, end_date = self.get_publish_dates(info_atto=info_atto)

        return {link: dict(n_atto=n_atto,
                           start_date=start_date,
                           end_date=end_date)}

    def get_dict_objects(self):
        """Open every collected link and download the files it lists.

        Returns:
            A dict keyed by the file name shown on the page; each value is
            the atto info plus ``file_name``, ``link`` and ``internal_path``.
            Files sharing the same displayed name overwrite each other's
            entry.
        """
        dict_ogg = {}
        logging.info(f'Found {len(self.dict_links_oggetti)} oggetti')
        for link, dict_info in self.dict_links_oggetti.items():
            self.driver.get(link)

            body_e = self.driver.find_element(By.CLASS_NAME, 'documento')

            for f_e in body_e.find_elements(By.TAG_NAME, 'a'):
                # The file name is in the text node that follows the anchor.
                sibling_text = (
                    self.driver.execute_script(self._SIBLING_TEXT_JS, f_e) or ''
                ).strip()

                match = self._FILE_NAME_PATTERN.search(sibling_text)
                f_link = f_e.get_attribute('href')
                if match is None or not f_link:
                    logging.warning(
                        'SectionsAlbo: Could not get file name or link for an attachment of %s',
                        link)
                    continue

                f_name = match.group(1)
                file_name = self.rename_file(f_name, f_link)

                # Copy so the per-atto info is not shared between files.
                updated_dict_info = dict_info.copy()
                updated_dict_info.update(dict(
                    file_name=file_name,
                    link=f_link,
                    internal_path=self.download_file(file_name=file_name,
                                                     link_ogg=f_link)))

                dict_ogg[f_name] = updated_dict_info

        return dict_ogg


In [None]:
# Scrape section 'MAT01'; last_date_update=None disables the date filter.
sections_manager = SectionsAlbo(driver_manager=driver_manager,
                                section_otion='MAT01',
                                dir_download=download_path,
                                last_date_update=None)

In [None]:
# Links collected while the instance was constructed.
sections_manager.dict_links_oggetti

In [None]:
# Visit every collected link and download its files.
sections_manager.get_dict_objects()

In [None]:
def get_all_sections(driver_manager: WebDriverManager,
                     url_comune: str) -> dict:
    """Map every section name of the albo to its option value.

    Args:
        driver_manager: Manager that provides the Selenium driver.
        url_comune: Page to open if the driver has to be started.

    Returns:
        A dict ``section name -> option value`` built from the
        'tendinaTipoPratiche' menu, without the 'Tutti' entry and without
        options that have an empty value.
    """
    driver = driver_manager.get_driver(url_comune)

    # find select menu
    select_menu_e = driver.find_element(By.ID, 'tendinaTipoPratiche')

    sections = {}
    for option_e in select_menu_e.find_elements(By.TAG_NAME, 'option'):
        value = option_e.get_attribute('value')
        name = option_e.text.strip()
        # Name and value are read from the same option and filtered together,
        # so a name can never be paired with another option's value.
        if not value or name == 'Tutti':
            continue
        sections[name] = value

    return sections
    

In [None]:
# Section name -> option value, read from the search form.
sections = get_all_sections(driver_manager=driver_manager,
                            url_comune=url_comune)

In [None]:
# Display the mapping built in the previous cell.
sections

In [None]:
# Scrape every section into its own sub-directory of download_path.
for section_name, section_value in sections.items():
    print(section_name)
    section_scraper = SectionsAlbo(
        driver_manager=driver_manager,
        section_otion=section_value,
        dir_download=f'{download_path}/{section_name}',
        last_date_update=None,
    )
    section_scraper.get_dict_objects()