In [97]:
import fitz  # PyMuPDF
import pytesseract
import cv2
import numpy as np
from PIL import Image
from pathlib import Path
import re
import json

In [17]:
# (Opcional) Define ruta a Tesseract si no está en el PATH
pytesseract.pytesseract.tesseract_cmd = r"C:/Program Files/Tesseract-OCR/tesseract.exe"

pdf_path = Path("../data/test_votes.pdf")

doc = fitz.open(pdf_path)

pages = []

# for everypage
for i, page in enumerate(doc):
    # Renderiza la página como imagen (aumentando resolución)
    pix = page.get_pixmap(dpi=300)
    img_data = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, pix.n)
    
    # Convierte a escala de grises
    gray = cv2.cvtColor(img_data, cv2.COLOR_RGB2GRAY)

    # (Opcional) Umbralizar para mejorar OCR
    _, thresh = cv2.threshold(gray, 180, 255, cv2.THRESH_BINARY)

    # Convierte imagen para OCR
    pil_img = Image.fromarray(thresh)

    # Ejecuta OCR con layout de columnas
    text = pytesseract.image_to_string(pil_img, lang="spa", config="--psm 6")  # psm 6: Assume a uniform block of text

    pages.append(text)

    print(f"\n--- Página {i+1} ---\n")
    print(text)



--- Página 1 ---

CONGRESO DE LA REPÚBLICA DEL PERÚ

Segunda Legislatura Ordinaria 2024-2025
Sesión del 20 de marzo de 2025
ASISTENCIA: — Fecha: 20/03/2025 Hora: 06:50 pm
APP ACUÑA PERALTA, MARÍA GRIMANEZA PRE BS FLORES RAMÍREZ, ALEX RANDU PRE 8M PAREDES GONZALES, ALEX ANTONIO PRE
HYD ACUÑA PERALTA, SEGUNDO HÉCTOR PRE FP FLORES RUIZ, VÍCTOR SEFERINO PRE BDP PAREDES PIQUÉ, SUSEL ANA MARÍA PRE
PL AGUERO GUTIÉRREZ, MARÍA ANTONIETA PRE APP GARCÍA CORREA, IDELSO MANUEL LE BS PARIONA SINCHE, ALFREDO PRE
FP AGUINAGA RECUENCO, ALEJANDRO PRE PL GONZA CASTILLO, AMÉRICO PRE sP PAZO NUNURA, JOSÉ BERNARDO PRE
PP ALCARRAZ AGUERO, YOREL KIRA LP AP-PIS GONZALES DELGADO, DIANA CAROLINA — PRE PP PICÓN QUEDO, LUIS RAÚL PRE
FP ALEGRÍA GARCÍA, ARTURO PRE EM GUTIÉRREZ TICONA, PAUL SILVIO LP PL PORTALATINO ÁVALOS, KELLY ROXANA — aus
NA ALVA PRIETO, MARÍA DEL CARMEN PRE APP HETDINGER BALLESTEROS, NELCY PRE AP PORTERO LÓPEZ, HILDA MARLENY LP
AP ALVA ROJAS, CARLOS ENRIQUE PRE RP HERRERA MEDINA, NOELIA ROSSVITH

In [None]:
vote_page = pages[1]

vote_page = vote_page.replace("\n", " ")

#print (vote_page)

#jaro-winckler

parties_abbv = [" AP ", " AP-PIS ", " APP ", " BM ", " BDP ", " BOP ", " BS ", " BSS ", " E "," FP ", " HYD ", " JP ", " IJPP-VP "," JPP-VP ", 
                " NA ", " NP ", " PL ", " PLG ", " PM ", " PP ", " SP ", " sP ", " RP "] #might need to review and make sure everyone is here

sorted_parties = sorted(parties_abbv, key=len, reverse=True)
pattern = r'(?=' + '|'.join(re.escape(party) for party in sorted_parties) + r')'

print (sorted_parties)

[' IJPP-VP ', ' AP-PIS ', ' JPP-VP ', ' APP ', ' BDP ', ' BOP ', ' BSS ', ' HYD ', ' PLG ', ' AP ', ' BM ', ' BS ', ' FP ', ' JP ', ' NA ', ' NP ', ' PL ', ' PM ', ' PP ', ' SP ', ' sP ', ' RP ', ' E ']


In [75]:
politician_list = re.split(pattern, vote_page)

string_list = []

for string in politician_list:
    if string[:4] or string[:5] in sorted_parties:
        string_list.append(string[1:])

string_list = string_list[1:131]

string_list[129] = string_list[129].split("¿")[0].strip()

print (string_list)

['APP ACUÑA PERALTA, MARÍA GRIMANEZA SI 4+++', 'BS FLORES RAMÍREZ, ALEX RANDU NO ---', 'BM PAREDES GONZALES, ALEX ANTONIO SI +++', 'HYD ACUÑA PERALTA, SEGUNDO HÉCTOR SI +++', 'FP FLORES RUIZ, VÍCTOR SEFERINO SI +++', 'BDP PAREDES PIQUÉ, SUSEL ANA MARÍA NO ---', 'PL AGUERO GUTIÉRREZ, MARÍA ANTONIETA SI +++', 'APP GARCÍA CORREA, IDELSO MANUEL LE', 'BS PARIONA SINCHE, ALFREDO NO ---', 'FP AGUINAGA RECUENCO, ALEJANDRO SI ++', 'PL GONZA CASTILLO, AMÉRICO SI +++', 'SP PAZO NUNURA, JOSÉ BERNARDO SI +++', 'PP ALCARRAZ AGUERO, YOREL KIRA LP', 'AP-PIS GONZALES DELGADO, DIANA CAROLINA — SI +++', 'PP PICÓN QUEDO, LUIS RAÚL NO ---', 'FP ALEGRÍA GARCÍA, ARTURO SI +++', 'BM GUTIÉRREZ TICONA, PAUL SILVIO LP', 'PL PORTALATINO ÁVALOS, KELLY ROXANA aus', 'NA ALVA PRIETO, MARÍA DEL CARMEN SI +++', 'APP HEIDINGER BALLESTEROS, NELCY SI +++', 'AP PORTERO LÓPEZ, HILDA MARLENY LP', 'AP ALVA ROJAS, CARLOS ENRIQUE SI +++', 'RP HERRERA MEDINA, NOELIA ROSSVITH — SI +4+', 'BM QUIROZ BARBOZA, SEGUNDO TEODOMIRO SI ++

In [98]:
vote_results = ["SI", "NO", "Abst.", "SinRes", "aus", "LO",
                "LE", "LP", "Com", "CEI", "JP", "Ban", "Sus", "F"]

final_list = []

for poltician_vote in string_list:
    return_dict = {}
    politician_name = ""
    end_vote = None
    vote_as_list = re.split(", | ", poltician_vote)
    party = vote_as_list[0]
    return_dict["party"] = party
    potential_name = vote_as_list[1:5]
    if len(potential_name[3]) > 3:
        first = potential_name[:2]
        last = potential_name[2:]
        return_name = last + first
        politician_name = " ".join(return_name)
    else:
        first = potential_name[:2]
        last = [potential_name[2]]
        return_name = last + first
        politician_name = " ".join(return_name)
    return_dict["name"] = politician_name
    for entry in vote_as_list[4:]:
        if entry in vote_results:
            return_dict["vote"] = entry
            break
    final_list.append(return_dict)

print (final_list)

[{'party': 'APP', 'name': 'MARÍA GRIMANEZA ACUÑA PERALTA', 'vote': 'SI'}, {'party': 'BS', 'name': 'ALEX RANDU FLORES RAMÍREZ', 'vote': 'NO'}, {'party': 'BM', 'name': 'ALEX ANTONIO PAREDES GONZALES', 'vote': 'SI'}, {'party': 'HYD', 'name': 'SEGUNDO HÉCTOR ACUÑA PERALTA', 'vote': 'SI'}, {'party': 'FP', 'name': 'VÍCTOR SEFERINO FLORES RUIZ', 'vote': 'SI'}, {'party': 'BDP', 'name': 'SUSEL PAREDES PIQUÉ', 'vote': 'NO'}, {'party': 'PL', 'name': 'MARÍA ANTONIETA AGUERO GUTIÉRREZ', 'vote': 'SI'}, {'party': 'APP', 'name': 'IDELSO MANUEL GARCÍA CORREA', 'vote': 'LE'}, {'party': 'BS', 'name': 'ALFREDO PARIONA SINCHE', 'vote': 'NO'}, {'party': 'FP', 'name': 'ALEJANDRO AGUINAGA RECUENCO', 'vote': 'SI'}, {'party': 'PL', 'name': 'AMÉRICO GONZA CASTILLO', 'vote': 'SI'}, {'party': 'SP', 'name': 'JOSÉ BERNARDO PAZO NUNURA', 'vote': 'SI'}, {'party': 'PP', 'name': 'YOREL KIRA ALCARRAZ AGUERO', 'vote': 'LP'}, {'party': 'AP-PIS', 'name': 'DIANA CAROLINA GONZALES DELGADO', 'vote': 'SI'}, {'party': 'PP', 'nam

In [99]:
with open("bill_vote.json", "w") as json_file:
    json.dump(final_list, json_file)