## Imports

In [1]:
from pathlib import Path
import re
import csv
from typing import Optional, List, Dict, Any
import pandas as pd

# from utilities.funcoes_scrape import 

In [2]:
from pypdf import PdfReader

In [11]:
# Folder whose digit-named sub-folders are scanned for PDFs.
ROOT_DIR = Path("../../docs_pecas")   # <-- change this


# 🔎 Flexible match for "especificações técnicas"
# Accepts the accented and the unaccented spelling, any letter case, and any
# amount of whitespace (including none) between the two words.
ESPEC_TEC_RX = re.compile(
    r"especifica(?:c|ç)(?:a|ã|o|õ)?es?\s*t[eé]cnic(?:a|as|o|os)?",
    re.IGNORECASE
)


# File-name pattern for the "caderno de encargos" document.
# Letters and digits count as word characters here; "_" does not, so
# "1_Caderno_de_Encargos.pdf" is matched by the first alternative already.
_CADERNO_RX = re.compile(
    r"(?:"
    r"(?<![A-Za-z0-9])caderno(?:s)?(?![A-Za-z0-9])"                        # standalone "caderno(s)"
    r"|"                                                                    # or
    r"(?<![A-Za-z0-9])caderno(?:s)?(?:[\s_]+(?:de[\s_]+)?encargo(?:s)?)(?![A-Za-z0-9])"  # caderno de encargos
    r"|"                                                                    # or CE between underscores / edges
    r"(?:(?:^|_)ce(?:_|$))"
    r")",
    re.IGNORECASE
)

# Matches:
# - "programa" or "programas" (standalone)
# - optional tail: "do procedimento" or "do concurso"
# Case-insensitive like the other patterns, so "Programa_do_Concurso.pdf"
# is recognised as well as "programa_procedimento.pdf".
_PROGRAMA_RX = re.compile(
    r"(?<![A-Za-z0-9])programa(?:s)?(?:[\s_]+(?:do[\s_]+)?(?:procedimento|concurso))?(?![A-Za-z0-9])",
    re.IGNORECASE
)

In [12]:
def is_identifier_folder(p: Path) -> bool:
    """
    Tell whether `p` is a directory whose name is a plain decimal number.

    Parameters
    ----------
    p : path to test.

    Returns
    -------
    True only when `p` is an existing directory and every character of its
    name is a decimal digit; False otherwise (including an empty name).
    """
    # isdecimal() instead of isdigit(): isdigit() also accepts characters such
    # as "²" that int() cannot parse, and the accepted names are later turned
    # into integers for sorting.
    return p.is_dir() and p.name.isdecimal()


In [13]:
def find_matching_pdfs(folder: Path, rx: re.Pattern) -> List[Path]:
    """
    List the PDF files directly inside `folder` whose file name matches `rx`.

    Only regular files with a ".pdf" suffix (any letter case) are considered;
    `rx` is applied with `search` to the full file name, extension included.
    The result is ordered by lower-cased file name.
    """
    hits = []
    for entry in folder.iterdir():
        if not entry.is_file():
            continue
        if entry.suffix.lower() != ".pdf":
            continue
        if rx.search(entry.name):
            hits.append(entry)

    hits.sort(key=lambda item: item.name.lower())
    return hits

In [14]:
def count_pdf_pages(pdf_path: Path) -> Optional[int]:
    """
    Return the number of pages in the PDF at `pdf_path`.

    Best-effort: any exception raised while opening the file or reading its
    page list is swallowed and reported as None.
    """
    try:
        page_total = len(PdfReader(str(pdf_path)).pages)
    except Exception:
        return None
    return page_total


In [15]:
def pdf_contains_pattern(pdf_path: Path, rx: re.Pattern) -> Optional[bool]:
    """
    Search the extracted text of a PDF, page by page, for `rx`.

    Parameters
    ----------
    pdf_path : location of the PDF file.
    rx       : compiled pattern, applied with `search` to each page's text.

    Returns
    -------
    True  -> pattern found on some page
    False -> text was extracted from at least one page, pattern not found
    None  -> no usable text: reading the file raised, or every page came back
             empty / whitespace-only (e.g. a scanned, image-only PDF)
    """
    try:
        reader = PdfReader(str(pdf_path))
        saw_text = False
        for page in reader.pages:
            text = page.extract_text() or ""
            if rx.search(text):
                return True
            if text.strip():
                saw_text = True
        # Without any extracted text, "not found" would be a false negative;
        # report it the same way as a failed extraction.
        return False if saw_text else None
    except Exception:
        return None


In [16]:
# One summary record per identifier folder; turned into a DataFrame afterwards.
results: List[Dict[str, Any]] = []

# Digit-named sub-folders of ROOT_DIR, visited in ascending numeric order.
id_folders = sorted(
    filter(is_identifier_folder, ROOT_DIR.iterdir()),
    key=lambda x: int(x.name),
)

for folder in id_folders:
    identifier = folder.name

    programa_pdfs = find_matching_pdfs(folder, _PROGRAMA_RX)
    caderno_pdfs = find_matching_pdfs(folder, _CADERNO_RX)

    # Only the first match of each kind is inspected; the number of matches
    # is recorded next to it so ambiguous folders can be spotted later.
    programa_pdf = next(iter(programa_pdfs), None)
    caderno_pdf = next(iter(caderno_pdfs), None)

    programa_pages = None if programa_pdf is None else count_pdf_pages(programa_pdf)
    caderno_pages = None if caderno_pdf is None else count_pdf_pages(caderno_pdf)

    # The text search is run on the caderno document only.
    espec_found = (
        None if caderno_pdf is None
        else pdf_contains_pattern(caderno_pdf, ESPEC_TEC_RX)
    )

    results.append({
        "identifier": identifier,

        "programa_pdf": "" if programa_pdf is None else programa_pdf.name,
        "programa_pages": programa_pages,
        "programa_match_count": len(programa_pdfs),

        "caderno_pdf": "" if caderno_pdf is None else caderno_pdf.name,
        "caderno_pages": caderno_pages,
        "caderno_match_count": len(caderno_pdfs),

        # "" = no caderno PDF; "EXTRACTION_FAILED" = search returned None;
        # otherwise the boolean search result.
        "caderno_has_especificacoes_tecnicas": (
            "" if caderno_pdf is None
            else "EXTRACTION_FAILED" if espec_found is None
            else espec_found
        ),
    })


Multiple definitions in dictionary at byte 0x1ecf79 for key /AcroForm
Multiple definitions in dictionary at byte 0x1ecf79 for key /AcroForm
parsing for Object Streams
parsing for Object Streams
Ignoring wrong pointing object 2 65536 (offset 0)
Ignoring wrong pointing object 18 65536 (offset 0)
Ignoring wrong pointing object 31 65536 (offset 0)
Ignoring wrong pointing object 37 65536 (offset 0)
Ignoring wrong pointing object 43 65536 (offset 0)
Ignoring wrong pointing object 49 65536 (offset 0)
Ignoring wrong pointing object 55 65536 (offset 0)
Ignoring wrong pointing object 61 65536 (offset 0)
Ignoring wrong pointing object 67 65536 (offset 0)
Ignoring wrong pointing object 73 65536 (offset 0)
Ignoring wrong pointing object 79 65536 (offset 0)
Ignoring wrong pointing object 85 65536 (offset 0)
Ignoring wrong pointing object 91 65536 (offset 0)
Ignoring wrong pointing object 97 65536 (offset 0)
Ignoring wrong pointing object 103 65536 (offset 0)
Ignoring wrong pointing object 109 65536 

In [17]:
# Gather the per-folder records into one table; the bare name on the last
# line makes the notebook render it as the cell output.
df = pd.DataFrame(data=results)
df

Unnamed: 0,identifier,programa_pdf,programa_pages,programa_match_count,caderno_pdf,caderno_pages,caderno_match_count,caderno_has_especificacoes_tecnicas
0,4508214,,,0,1_Caderno_de_Encargos.pdf,46.0,1,True
1,4981938,,,0,1_CE_DEGA_ENG_Fisc22.pdf,144.0,1,True
2,5102392,2_4_2021_douma_programa_procedimento.pdf,21.0,1,1_4_2021_douma_caderno_de_encargos.pdf,23.0,1,False
3,5373973,,,0,1_Caderno_Encargos_Seguran√ßa.pdf,27.0,2,False
4,5563634,,,0,1_3_Caderno_Encargos.pdf,21.0,1,True
...,...,...,...,...,...,...,...,...
5676,6832611,,,0,1_Caderno_encargos.pdf,20.0,1,False
5677,6834332,,,0,1_Caderno_de_Encargos_Eletricidade_final.pdf,24.0,1,True
5678,6834454,,,0,1_280_Caderno_de_Encargos.pdf,21.0,1,True
5679,6846134,,,0,1_P178_2023_Caderno_de_Encargos_ASSINADO.pdf,57.0,1,True


In [23]:
# Persist the summary table.
# index=False keeps the positional row index out of the file, so reading it
# back with pd.read_csv does not produce a stray "Unnamed: 0" column.
out_csv = Path("../../data/results_reading_pdfs.csv")
# Create the target folder if needed so a long scan is not lost to a missing directory.
out_csv.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(out_csv, index=False)