In [1]:
# Dependencies — install once into the kernel's environment if they are missing:
#   %pip install pikepdf pypdf

In [2]:
# Extraction option read by the PDF-loading cell: when True, each page's text is
# stored exactly as extracted; when False, empty and whitespace-only lines are
# removed from each page's text before it is stored.
keep_blank_chars = True

In [11]:
from pypdf import PdfReader

local_pdf = "1766296847_25749488007978009.pdf"

# Extracted text, one entry per PDF page, in page order.
all_text: list[str] = []
try:
    reader = PdfReader(str(local_pdf))

    for page in reader.pages:
        # A falsy extraction result is normalised to an empty string.
        text = page.extract_text() or ""

        # Either keep the page text untouched, or drop its empty /
        # whitespace-only lines, depending on keep_blank_chars.
        all_text.append(
            text
            if keep_blank_chars
            else "\n".join(filter(str.strip, text.splitlines()))
        )

except Exception as e:
    # Show the message in the cell output, then let the error propagate.
    print(e)
    raise

In [12]:
# Merge the per-page texts into one string, with a single newline between pages.
full_text = "\n".join(all_text)

In [13]:
# Issuer check: look for the bank's full legal name anywhere in the extracted text.
# NOTE(review): 244 is an unexplained magic number — presumably an internal
# identifier for this bank / statement layout; confirm and give it a named constant.
bbva_legal_name = "BBVA MEXICO, S.A., INSTITUCION DE BANCA MULTIPLE, GRUPO FINANCIERO BBVA MEXICO"
if bbva_legal_name in full_text:
    print(244)

244


In [5]:
import re

# Literal markers that delimit the movements section of the statement text.
start = r"Detalle de Movimientos Realizados"
end = r"Total de Movimientos"

# Capture, non-greedily, everything between the two markers; DOTALL lets the
# capture span line breaks and IGNORECASE tolerates case differences.
pattern = re.compile(start + r"(.*?)" + end, flags=re.IGNORECASE | re.DOTALL)

match = pattern.search(full_text)
if match is None:
    raise ValueError("No se encontró el bloque de movimientos en el texto.")

# Text between the markers, without surrounding whitespace.
movimientos_text = match.group(1).strip()

In [6]:
# Show the raw movements block for a visual check before parsing it.
movimientos_text

'FECHA SALDO\nOPER LIQ DESCRIPCION REFERENCIA CARGOS ABONOS OPERACION LIQUIDACION\n17/OCT 17/OCT SPEI ENVIADO TESORED 1,300.00 17,698.15 17,698.15\n   0109250global  Referencia 0060662858 703\n   00703458043342215634\n   MBAN01002510170060662858\n   global\n19/OCT 20/OCT SPEI ENVIADO TESORED 2,000.00 15,698.15 17,698.15\n   0810250global  Referencia 0072540662 703\n\n   00703458043342215634\n   MBAN01002510200072540662\n   global\n20/OCT 20/OCT SPEI ENVIADO BANAMEX 2,900.00 12,798.15 12,798.15\n   0810250asesorias  Referencia 0077461651 002\n   00002180701119314781\n   MBAN01002510200077461651\n   Asesorias\n21/OCT 21/OCT DEPOSITO EN EFECTIVO 800.00 13,598.15 13,598.15\n  Referencia 1487VOLUCION\n27/OCT 27/OCT PAGO CUENTA DE TERCERO 60.00 13,538.15 13,538.15\n   BNET 1562585698 Transf a ANEL  Referencia 0030983934\n30/OCT 30/OCT SPEI ENVIADO BANORTE 95.00 13,443.15 13,443.15\n   1810250agua  Referencia 0074739620 072\n   00072180005952164400\n   MBAN01002510300074739620\n   Urbanista\n

In [7]:
import re
from dataclasses import dataclass
from typing import List, Optional

@dataclass
class Movement:
    """One account movement parsed from the statement's movements block.

    Every field holds the raw string captured from the text; no date or
    numeric conversion is applied.
    """
    # First date on the header line, in DD/MON form (e.g. "17/OCT"); the OPER column.
    date_op: str
    # Second date on the header line, same form; the LIQ column.
    date_val: str
    # Free text between the two dates and the first monetary figure.
    concept: str
    # First monetary figure on the header line. The text alone does not say
    # whether it came from the CARGOS or the ABONOS column.
    amount: str
    # Second monetary figure — presumably the OPERACION balance; confirm.
    balance: str
    # Optional third monetary figure — presumably the LIQUIDACION balance; confirm.
    # None when the header line carries only two figures.
    balance2: Optional[str]
    # Lines that followed the header up to the next header, joined with "\n".
    details: str

# Spanish three-letter month abbreviations, as a regex alternation.
MONTHS = "ENE|FEB|MAR|ABR|MAY|JUN|JUL|AGO|SEP|OCT|NOV|DIC"

# Date token as printed on the statement, e.g. "20/OCT".
_DATE = rf"\d{{2}}/(?:{MONTHS})"

# Monetary figure: optional minus sign, comma-grouped thousands, two decimals.
# The sign is accepted on every figure (amount AND balances): a header whose
# balance is negative must still be recognised as a header, otherwise it would
# be silently absorbed into the previous movement's detail lines.
_AMOUNT = r"-?\d{1,3}(?:,\d{3})*\.\d{2}"

# Header line: 20/OCT 20/OCT CONCEPT .... 2,900.00 12,798.15 12,798.15
# Groups: 1 = first date, 2 = second date, 3 = concept (lazy, so the trailing
# figures are not swallowed), 4 = amount, 5 = balance, 6 = optional second balance.
HEADER_RE = re.compile(
    rf"^\s*({_DATE})\s+({_DATE})\s+(.+?)\s+"
    rf"({_AMOUNT})\s+"
    rf"({_AMOUNT})"
    rf"(?:\s+({_AMOUNT}))?\s*$"
)

def extract_movements(movimientos_text: str) -> List[Movement]:
    """Split the movements block into one Movement per header line.

    A line matching HEADER_RE starts a new movement; every following line,
    up to the next header, is collected as that movement's details. Lines
    that appear before the first header are ignored.

    Args:
        movimientos_text: Raw text of the movements block.

    Returns:
        The movements in the order their headers appear. ``details`` holds the
        right-stripped detail lines joined with "\\n" (blank lines included)
        and stripped of surrounding whitespace as a whole.
    """
    movements: List[Movement] = []
    pending_details: List[str] = []

    def _attach_details() -> None:
        # Finalise the most recent movement with the lines gathered since its header.
        if movements:
            movements[-1].details = "\n".join(pending_details).strip()
        pending_details.clear()

    for line in movimientos_text.splitlines():
        header = HEADER_RE.match(line)

        if header is None:
            # Detail line (blank ones included; filter here to discard them).
            # Nothing is collected until a first header has been seen.
            if movements:
                pending_details.append(line.rstrip())
            continue

        # Close the previous movement, then open a new one.
        _attach_details()
        date_op, date_val, concept, amount, balance, balance2 = header.groups()
        movements.append(Movement(
            date_op=date_op,
            date_val=date_val,
            concept=concept.strip(),
            amount=amount,
            balance=balance,
            balance2=balance2,
            details="",
        ))

    # Close the last movement.
    _attach_details()
    return movements

In [8]:
import pandas as pd

# Parse the movements block into Movement records.
movs = extract_movements(movimientos_text)

# One row per movement, one column per Movement field
# (vars() returns each instance's attribute dict).
df = pd.DataFrame([vars(mov) for mov in movs])
df.head(20)

Unnamed: 0,date_op,date_val,concept,amount,balance,balance2,details
0,17/OCT,17/OCT,SPEI ENVIADO TESORED,1300.0,17698.15,17698.15,0109250global Referencia 0060662858 703\n 0...
1,19/OCT,20/OCT,SPEI ENVIADO TESORED,2000.0,15698.15,17698.15,0810250global Referencia 0072540662 703\n\n ...
2,20/OCT,20/OCT,SPEI ENVIADO BANAMEX,2900.0,12798.15,12798.15,0810250asesorias Referencia 0077461651 002\n ...
3,21/OCT,21/OCT,DEPOSITO EN EFECTIVO,800.0,13598.15,13598.15,Referencia 1487VOLUCION
4,27/OCT,27/OCT,PAGO CUENTA DE TERCERO,60.0,13538.15,13538.15,BNET 1562585698 Transf a ANEL Referencia 0030...
5,30/OCT,30/OCT,SPEI ENVIADO BANORTE,95.0,13443.15,13443.15,1810250agua Referencia 0074739620 072\n 000...
6,03/NOV,03/NOV,COBRO AUTOMATICO RECIBO,2078.36,11364.79,11364.79,PREST. 9626713013 20251101 Referencia 9626713013
7,04/NOV,04/NOV,SPEI RECIBIDONU MEXICO,25771.65,37136.44,37136.44,0041125Transferencia Referencia 0174070378 63...
8,05/NOV,05/NOV,PAGO TARJETA DE CREDITO,34186.87,2949.57,2949.57,CUENTA: BMOV Referencia 1324316117
9,07/NOV,07/NOV,SPEI ENVIADO BANORTE,95.0,2854.57,2854.57,1810250agua Referencia 0066257292 072\n 000...
