## Tasks
1. Extract session info (date, chamber, type of sessions (plenary/comission, etc.))
3. extract participating senators (can be used to build list of senators)
3. Cleaning of the text and organizing into speakers

# 1. Packages and helpers

In [29]:
import pandas as pd
import os
import re
import datetime
import pymupdf

In [47]:
def extract_text_with_bold(filename):
    doc = pymupdf.open(filename)
    extracted_text = ""

    for page in doc:
        blocks = page.get_text("dict")["blocks"]
        for block in blocks:
            if "lines" in block:
                for line in block["lines"]:
                    for span in line["spans"]:
                        text = span["text"]
                        # Check for bold text and annotate it
                        if span["font"].lower().find("bold") >= 0:
                            extracted_text += f"*b*{text}*b* "  # Annotate bold text
                        else:
                            extracted_text += text + " "
                    extracted_text += "\n"
            extracted_text += "\n"

    doc.close()
    return extracted_text


def clean_raw_text(text):
    """ 
    Clean the text of a PDF file by removing unnecessary metadata and repeated words. #TODO: detect bold and create "script
    """
    # Remove unnecessary metadata, like headers and footers
    clean_text = re.sub(r"IMPRENTA\s*NACIONAL\s*DE\s*COLOMBIA.*|www\.\w+\.gov\.co", "", text, flags=re.IGNORECASE)
    clean_text = re.sub(r"Página\s*\d+|Edición\s*de.*páginas", "", clean_text, flags=re.IGNORECASE)
    
    # Remove unnecessary repeated words (like "Año XXXIII - Nº 1114")
    clean_text = re.sub(r"Año.*Nº.*", "", clean_text, flags=re.IGNORECASE)

    # Remove rows that contain only uppercase letters and spaces
    clean_text = "\n".join([line for line in clean_text.splitlines() if re.search(r'[a-z]', line)])
    
    # Remove any extra blank lines
    clean_text = re.sub(r"\n\s*\n", "\n", clean_text).strip()
    
    return clean_text.strip()


def extract_session_info(raw_text):
    """
    Extracts the session information from the text of a Gaceta del Congreso
    """
    #date
    months_spanish = ["enero", "febrero", "marzo", "abril", "mayo", "junio", 
    "julio", "agosto", "septiembre", "octubre", "noviembre", "diciembre"]


    date_pattern = r"(\d{1,2})\s+de\s+(" + "|".join(months_spanish) + r")\s+de\s+(\d{4})"
    match = re.search(date_pattern, raw_text, re.IGNORECASE)
    if match:
        date = datetime.date(int(match.group(3)), months_spanish.index(match.group(2)) + 1, int(match.group(1)))

    #chamber and instance
    header = re.sub(r"\s", "", raw_text[:1000]).upper()

    chamber = "house" if "CÁMARADEREPRESENTES" in header else "senate"

    instance = "commitee" if "COMISIÓN" in header or "COMISION" in header else "plenary"

    return date, chamber, instance 

def process_pdf(file_path): 
    raw_text = extract_text_with_bold(file_path)
    clean_text = clean_raw_text(raw_text)
    info = extract_session_info(clean_text) 
    gaceta_id = os.path.basename(file_path)[:-4]

    return pd.DataFrame({
        "id": gaceta_id, 
        "date": info[0], 
        "chamber": info[1], 
        "type": info[2], 
        "raw_text": raw_text,
        "clean_text": clean_text
    }, index=[0])

## 2.1 test variables

In [50]:
folder = r"C:\Users\asarr\Documents\MACSS\Thesis\data\test"

test_df = pd.DataFrame(columns=["id", "date", "chamber", "type", "raw_text", "clean_text"])

for file_name in os.listdir(folder):
    file_path = os.path.join(folder, file_name)
    new_row = process_pdf(file_path)
    test_df = pd.concat([test_df, new_row], ignore_index=True)

test_df.head()


Unnamed: 0,id,date,chamber,type,raw_text,clean_text
0,gaceta_1114,2024-08-06,senate,plenary,A C TA S D E C O M I S I Ó N \n\nDIRECTORES:...,"(Artículo 36, Ley 5ª de 1992) \n Bogotá, D..."
1,gaceta_1115,2024-08-06,senate,plenary,A C TA S D E C O M I S I O N \n\nDIRECTORES:...,"(Artículo 36, Ley 5ª de 1992) \n Bogotá, D..."
2,gaceta_1124,2024-06-14,senate,commitee,COMISIÓN QUINTA CONSTITUCIONAL \n\nPERMANENTE...,*b*ACTA NÚMERO 044 DE 2024*b* \n(junio 14) \nL...
3,gaceta_1139,2024-08-15,senate,commitee,"DIRECTORES: \t \n\n(Artículo 36, Ley 5ª de 19...","(Artículo 36, Ley 5ª de 1992) \n Bogotá, D..."
4,gaceta_1140,2024-08-15,senate,commitee,"DIRECTORES: \t \n\n(Artículo 36, Ley 5ª de 19...","(Artículo 36, Ley 5ª de 1992) \n Bogotá, D..."


In [None]:
#TODO: fiz¿x extract_session_info to detect chamber and instance