In [None]:
import re
from datetime import date
from typing import Dict, Any, Optional

from .normalizer import clean_text, norm_money, norm_int, norm_combustible

# --- Identifier patterns -----------------------------------------------------

# 17 consecutive characters drawn from A-Z / 0-9 without I, O and Q, delimited
# by word boundaries. The leading negative lookahead only re-states what the
# character class already excludes.
VIN_RE = re.compile(r'\b(?![IOQ])[A-HJ-NPR-Z0-9]{17}\b')

# A standalone run of exactly 13 digits.
RUC_RE = re.compile(r'\b\d{13}\b')

# Three hyphen-separated digit groups of length 3, 3 and 9.
FACT_RE = re.compile(r'\b\d{3}-\d{3}-\d{9}\b')

# Year-first (groups 1-3: year, month, day) or day-first (groups 4-6: day,
# month, year) dates. Each separator ('-', '/' or '.') is optional, so a bare
# 8-digit token also matches (always through the year-first alternative).
DATE_RE = re.compile(r'\b(?:(\d{4})[-/\.]?(\d{2})[-/\.]?(\d{2})|(\d{2})[-/\.]?(\d{2})[-/\.]?(\d{4}))\b')

# Amount with exactly two decimals, optional '$' prefix and optional
# thousands groups; '.' and ',' are both accepted in either role.
# Group 1 holds the numeric text.
MONEY_RE = re.compile(r'(?<!\d)(?:\$?\s*)?(\d{1,3}(?:[.,]\d{3})*(?:[.,]\d{2})|\d+[.,]\d{2})(?!\d)')

# --- Keyword vocabularies (upper-case; list order is lookup priority) --------

BRANDS = [
    'TOYOTA',
    'CHEVROLET',
    'JAC',
    'HYUNDAI',
    'KIA',
    'NISSAN',
    'FORD',
    'VOLKSWAGEN',
    'MERCEDES',
    'ISUZU',
    'JEEP',
]

COLORS = [
    'NEGRO',
    'ROJO',
    'AZUL',
    'BLANCO',
    'PLATA',
    'PLOMO',
    'GRIS',
    'AMARILLO',
    'VERDE',
]

In [None]:
class FieldExtractor:
    """Heuristic, regex/keyword based extractor of vehicle-invoice fields.

    `extract` receives the text of a document's pages and returns a flat dict
    whose keys are listed in `_FIELDS`. Only a subset of the keys is filled by
    this class; every other key is returned as ``None``.

    The class relies on the module-level patterns (`VIN_RE`, `RUC_RE`,
    `FACT_RE`, `DATE_RE`), the `BRANDS` / `COLORS` vocabularies and the
    `clean_text` / `norm_money` helpers imported from `.normalizer`.
    """

    # Output schema; insertion order is the order of keys in the result.
    _FIELDS = (
        "MARCA", "MODELO",
        "MODELO_HOMOLOGADO_ANT", "MODELO_REGISTRADO_SRI",
        "MOTOR", "CLASE", "TIPO", "COLOR", "AÑO",
        "RAMV_CPN", "VIN_CHASIS",
        "COMBUSTIBLE", "PAIS_ORIGEN", "RUEDAS", "EJES",
        "CAPACIDAD", "TONELAJE", "CILINDRAJE",
        "CONCESIONARIA", "RUC",
        "NUMERO_FACTURA", "FECHA_DOCUMENTO",
        "DIRECCION", "NOMBRE_CLIENTE",
        "SUBTOTAL", "SUBSIDIO", "DESCUENTO", "IVA", "TOTAL",
    )

    # Dealer keywords, checked in this order; the first whole-word hit wins.
    _DEALERS = (
        "TOYOCOSTA", "CASABACA", "ECUA-AUTO", "IBAMOTORS",
        "ASSA", "ANETA", "FEHIERRO", "SERVIENTREGA",
    )

    # (pattern, stored value) pairs in priority order. EXTRA must be a whole
    # word, otherwise words such as EXTRANJERO or EXTRACTO would set the fuel.
    _FUEL_PATTERNS = (
        (re.compile(r'DIESEL'), "DIESEL"),
        (re.compile(r'\bEXTRA\b'), "EXTRA"),
        (re.compile(r'GASOLIN'), "GASOLINA"),
    )

    # Leading \b keeps the labels from matching inside longer words
    # (e.g. MOTOR inside AUTOMOTOR).
    _MODEL_RE = re.compile(r'\bMODELO[:\s]+([A-Z0-9 \-\/\.\*]+)')
    _MOTOR_RE = re.compile(r'\bMOTOR[:\s]+([A-Z0-9\*]+)')

    # A year that directly follows an "AÑO" label (tilde optional) is
    # preferred over the first year-like number found anywhere in the text.
    _LABELLED_YEAR_RE = re.compile(r'\bA[ÑN]O[:\s]+((?:19|20)\d{2})\b')
    _ANY_YEAR_RE = re.compile(r'\b(?:19|20)\d{2}\b')

    def __init__(self):
        """Create an extractor; the instance holds no state."""

    def _extract_date(self, text: str) -> Optional[str]:
        """Return the first valid calendar date in `text` as ``YYYY-MM-DD``.

        Candidates come from `DATE_RE`. A candidate that is not a real date
        (month 13, 31 February, ...) is skipped and the search continues.
        Returns ``None`` when no valid date is found.
        """
        for m in DATE_RE.finditer(text):
            if m.group(1):
                candidates = [(m.group(1), m.group(2), m.group(3))]
                token = m.group(0)
                if token.isdigit():
                    # A separator-less 8-digit token always matches the
                    # year-first alternative; if that reading is invalid,
                    # retry it as DDMMYYYY.
                    candidates.append((token[4:], token[2:4], token[:2]))
            else:
                candidates = [(m.group(6), m.group(5), m.group(4))]

            for year, month, day in candidates:
                try:
                    return date(int(year), int(month), int(day)).isoformat()
                except ValueError:
                    continue
        return None

    @staticmethod
    def _first_word(candidates, text: str) -> Optional[str]:
        """Return the first entry of `candidates` present in `text` as a whole word."""
        for word in candidates:
            if re.search(rf'\b{re.escape(word)}\b', text):
                return word
        return None

    @staticmethod
    def _find_money(text: str, label: str):
        """Return ``norm_money`` of the amount that follows `label`, or ``None``.

        * The leading ``\\b`` stops TOTAL from matching inside SUBTOTAL.
        * An optional percentage right after the label ("IVA 12% 120.00") is
          skipped so the amount, not the rate, is captured.
        * The captured text must start with a digit.
        """
        patt = re.compile(
            rf'\b{re.escape(label)}(?:\s*\d{{1,2}}(?:[.,]\d+)?\s*%)?[:\s\$]*(\d[\d., ]*)',
            re.IGNORECASE,
        )
        m = patt.search(text)
        return norm_money(m.group(1)) if m else None

    def extract(self, pages) -> Dict[str, Any]:
        """Extract invoice/vehicle fields from the text of a document.

        Args:
            pages: iterable of page texts. A single string is treated as one
                page; ``None`` (or ``None`` entries) is treated as no text.

        Returns:
            Dict with every key of `_FIELDS`. Keys for which nothing was
            found, and keys this method does not attempt to fill, are ``None``.
        """
        if pages is None:
            pages = []
        elif isinstance(pages, str):
            # ' '.join over a str would interleave spaces between characters.
            pages = [pages]

        # All matching below is done on the cleaned, upper-cased text.
        text = clean_text(' '.join(p for p in pages if p is not None)).upper()
        out: Dict[str, Any] = dict.fromkeys(self._FIELDS)

        # Identifiers: first match of each module-level pattern.
        for key, pattern in (
            ("VIN_CHASIS", VIN_RE),
            ("RUC", RUC_RE),
            ("NUMERO_FACTURA", FACT_RE),
        ):
            found = pattern.search(text)
            if found:
                out[key] = found.group(0)

        out["FECHA_DOCUMENTO"] = self._extract_date(text)

        dealer = self._first_word(self._DEALERS, text)
        if dealer:
            out["CONCESIONARIA"] = dealer + (" S.A." if "S.A" not in dealer else "")

        out["MARCA"] = self._first_word(BRANDS, text)
        out["COLOR"] = self._first_word(COLORS, text)

        for fuel_re, fuel in self._FUEL_PATTERNS:
            if fuel_re.search(text):
                out["COMBUSTIBLE"] = fuel
                break

        model = self._MODEL_RE.search(text)
        if model:
            # The capture may consist of spaces only; store None instead of "".
            out["MODELO"] = model.group(1).strip()[:80] or None

        motor = self._MOTOR_RE.search(text)
        if motor:
            out["MOTOR"] = motor.group(1)

        labelled_year = self._LABELLED_YEAR_RE.search(text)
        if labelled_year:
            out["AÑO"] = int(labelled_year.group(1))
        else:
            any_year = self._ANY_YEAR_RE.search(text)
            if any_year:
                out["AÑO"] = int(any_year.group(0))

        for label in ("SUBTOTAL", "IVA", "DESCUENTO", "TOTAL"):
            out[label] = self._find_money(text, label)

        return out