In [37]:
import os
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import NoSuchElementException

def setup_driver(download_folder, url, starting_page=1):
    """Start Chrome configured for unattended downloads and open ``url``.

    Once the page has been requested, the text "Acta" is typed into the
    table filter input ``formResumen:dataTableResumen:j_idt20:filter``.

    Args:
        download_folder: Directory given to Chrome as its default download
            location.
        url: Address to open; it is also passed to Chrome's
            ``--unsafely-treat-insecure-origin-as-secure`` switch.
        starting_page: Not used by this function; kept so existing calls
            that pass it continue to work.

    Returns:
        The running ``webdriver.Chrome`` instance. The caller is responsible
        for calling ``quit()`` on it.

    Raises:
        Whatever ``driver.get``, ``find_element`` or ``send_keys`` raise.
        In that case the browser is closed before the exception propagates.
    """
    # Set Chrome options to download files automatically
    chrome_options = webdriver.ChromeOptions()
    prefs = {
        "download.default_directory": download_folder,  # Default download directory
        "plugins.always_open_pdf_externally": True,  # Disable PDF viewer
        # A real boolean: the original passed the string "false" here.
        "safebrowsing.enabled": False,
        "safebrowsing.disable_download_protection": True,
    }
    chrome_options.add_experimental_option("prefs", prefs)

    # Command-line switches, applied in the same order as before.
    chrome_arguments = (
        # Allow insecure downloads
        '--allow-insecure-localhost',
        '--no-sandbox',
        '--disable-dev-shm-usage',
        '--safebrowsing-disable-download-protection',  # Disable blocking of insecure downloads
        # Allow mixed content and disable SSL certificate validation
        '--allow-running-insecure-content',  # Allow loading insecure content
        '--ignore-certificate-errors',  # Ignore SSL certificate errors
        '--disable-web-security',  # Disable web security features
        f"--unsafely-treat-insecure-origin-as-secure={url}",
    )
    for argument in chrome_arguments:
        chrome_options.add_argument(argument)

    # Set up the driver with ChromeDriver
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
    try:
        driver.get(url)
        input_element = driver.find_element(By.ID, "formResumen:dataTableResumen:j_idt20:filter")
        input_element.send_keys("Acta")
    except BaseException:
        # The caller never receives the driver on failure, so close the
        # browser here instead of leaking the Chrome process, then re-raise.
        driver.quit()
        raise

    return driver

def create_info_dict(driver):
    """Collect the date and chamber text for each row of the results table.

    Rows are the ``<tr>`` elements under
    ``tbody#formResumen:dataTableResumen_data``. For each row the text of the
    labels whose ids contain ``j_idt12`` (id), ``j_idt17`` (chamber) and
    ``j_idt19`` (date) is read.

    Args:
        driver: Object exposing ``find_elements(by, value)``; the returned
            rows must expose ``find_element(by, value)``.

    Returns:
        dict mapping ``"gaceta_<id>"`` to a ``(date, chamber)`` tuple. When
        the same id appears more than once within this call, later
        occurrences are stored under ``"gaceta_<id> (1)"``,
        ``"gaceta_<id> (2)"`` and so on. Rows whose labels cannot be read are
        skipped.
    """
    info_dict = {}
    rows = driver.find_elements(By.XPATH, "//tbody[@id='formResumen:dataTableResumen_data']/tr")

    for row in rows:
        try:
            raw_id = row.find_element(By.XPATH, ".//label[contains(@id, 'j_idt12')]").text
            chamber = row.find_element(By.XPATH, ".//label[contains(@id, 'j_idt17')]").text
            date = row.find_element(By.XPATH, ".//label[contains(@id, 'j_idt19')]").text
        except Exception:
            # Best-effort scrape: a row without the expected labels is
            # skipped. Catching Exception rather than using a bare except
            # lets KeyboardInterrupt / SystemExit stop a long run.
            continue

        # Pick the first key not already used in this call.
        base_id = f"gaceta_{raw_id}"
        unique_id = base_id
        counter = 1
        while unique_id in info_dict:
            unique_id = f"{base_id} ({counter})"
            counter += 1
        info_dict[unique_id] = (date, chamber)
    return info_dict

def get_page(driver, starting_page=1):
    """Replace the contents of the ``j_idt11`` filter input with a page value.

    Values below 10 are sent as a zero-prefixed string (for example ``3``
    becomes ``"03"``); any other value is handed to ``send_keys`` unchanged.

    Args:
        driver: Object exposing ``find_element(by, value)``.
        starting_page: Value to type into the filter input.
    """
    needs_padding = starting_page < 10
    filter_value = "0" + str(starting_page) if needs_padding else starting_page

    filter_box = driver.find_element(By.ID, "formResumen:dataTableResumen:j_idt11:filter")
    filter_box.clear()
    filter_box.send_keys(filter_value)

def is_last_page(driver):
    """Report whether the paginator's "next" control is disabled.

    Looks for a ``<span>`` whose class contains both ``ui-paginator-next``
    and ``ui-state-disabled``.

    Args:
        driver: Object exposing ``find_element(by, value)``.

    Returns:
        True when such a span is found, False when the lookup raises
        ``NoSuchElementException``.
    """
    disabled_next_xpath = "//span[contains(@class, 'ui-paginator-next') and contains(@class, 'ui-state-disabled')]"
    try:
        driver.find_element(By.XPATH, disabled_next_xpath)
    except NoSuchElementException:
        return False
    else:
        return True


In [43]:
# Accumulator for scraped rows; starts out empty.
data_dict = dict()

In [44]:
# Scrape the results table for every page value and merge the rows into data_dict.
download_folder = os.path.join(os.getcwd(), "downloads")
url = "http://svrpubindc.imprenta.gov.co/senado/"
starting_page = 1
final_page = 1835
driver = setup_driver(download_folder, url, 1)
try:
    # range() stops before final_page, so the values visited are
    # starting_page .. final_page - 1.
    for i in range(starting_page, final_page):
        get_page(driver, i)
        # get_page only types into the filter box and returns. Pause before
        # reading the rows so the table has time to refresh (assumes the
        # filter is applied asynchronously; TODO confirm).
        time.sleep(3)
        data_dict.update(create_info_dict(driver))
finally:
    # Close the browser even when an iteration raises or is interrupted.
    driver.quit()

In [51]:
# One row per key of data_dict; each value is a (date, chamber) tuple.
# from_dict(orient="index") names the columns up front, so an empty
# data_dict yields an empty frame instead of a column-length ValueError.
data_df = pd.DataFrame.from_dict(data_dict, orient="index", columns=["date", "chamber"])
# Collapse the chamber text into two labels: "house" when it mentions
# "Cámara", "senate" for everything else.
data_df['chamber'] = data_df['chamber'].apply(lambda x: "house" if "Cámara" in x else "senate")
data_df.head()

Unnamed: 0,date,chamber
gaceta_2055,27/11/2024,senate
gaceta_2054,27/11/2024,senate
gaceta_2051,27/11/2024,senate
gaceta_2050,27/11/2024,house
gaceta_2049,26/11/2024,senate


In [53]:
# Destination of the CSV. Defaults to the original location; set the
# GACETAS_INFO_CSV environment variable to write somewhere else without
# editing the notebook.
output_path = os.environ.get(
    "GACETAS_INFO_CSV",
    r"C:\Users\asarr\Documents\MACSS\Thesis\results\gacetas_info.csv",
)
data_df.to_csv(output_path)