In [4]:
# %pip install pdf2image
# %pip install pytesseract
# %pip install Pillow

Vamos a hacer un análisis comparativo de distintos métodos para procesar documentos financieros, centrado en costes, eficiencia y tiempos de procesamiento. Compararemos el OCR tradicional con enfoques basados en IA: Gemini, OpenAI GPT-4o y un modelo Llama 3.1 ejecutado en local sobre el texto extraído por OCR.

### 1. Método tradicional de OCR:


In [5]:
import time
from pdf2image import convert_from_path
import pytesseract
from PIL import Image

def ocr_process(pdf_path):
    """Run Tesseract OCR over a document and time the whole operation.

    Args:
        pdf_path: Path to the input file. A path ending in ``.pdf``
            (case-insensitive) is rasterised at 300 dpi and every page is
            recognised; any other path is opened directly as an image.

    Returns:
        A ``(text, processing_time)`` tuple: the recognised text of all pages
        concatenated in page order (an empty string when the PDF yields no
        pages), and the elapsed wall-clock seconds, rasterisation included.
    """
    start_time = time.time()

    if pdf_path.lower().endswith(".pdf"):
        # OCR every page rather than only the first one, so multi-page
        # invoices are not silently truncated and an empty page list does not
        # raise IndexError.
        pages = convert_from_path(pdf_path, 300)
        text = "".join(pytesseract.image_to_string(page) for page in pages)
    else:
        # Plain image input (e.g. .jpg): there is nothing to rasterise.
        with Image.open(pdf_path) as image:
            text = pytesseract.image_to_string(image)

    processing_time = time.time() - start_time

    return text, processing_time

# Trial run on a single sample invoice: report timing, text length and the text itself
pdf_path = "data/invoice_0.pdf"
text, time_taken = ocr_process(pdf_path)
print(
    f"Tiempo de procesamiento OCR: {time_taken:.2f} segundos",
    f"Caracteres extraídos: {len(text)}",
    text,
    sep="\n",
)

Tiempo de procesamiento OCR: 2.53 segundos
Caracteres extraídos: 1317
omitsis

digital challenges

Omitsis Consulting SL
B64967979

Casp 162, 2 (Edificio Fain)
08013 Barcelona

Spain

Facturar a

Albert Gil Lopez
47799547G

Carrer Anoia, 45-47
08790 Gelida (Barcelona)

Articulo & Descripcién

FACTURA

N.° de factura 20240775

Saldo adeudado

0,00€
Fecha factura : 02/08/2024
Vencimiento : 01/09/2024

1 Solucié conflictes Mappacces - UAB

Precio Cantidad

2,00 60,00 120,00
Subtotal 120,00

IVA (21%) 25,20

Total 145,20€

Pago realizado (-) 145,20
Saldo adeudado 0,00€

Realizar ingreso a numero de cuenta: ES37 0182 6308 7002 0016 4749 Swift: BBVAESMM (indicando numero de factura)

For international payments send the transfer at IBAN: LT28 3250 0114 9657 4589 - BIC: REVOLT21 - BIC intermediario:

CHASDEFX ( Write the invoice number in the concept of the transfer )

Registro Mercantil de Barcelona. Tomo 40838, Folio 57, Hoja B 376893, Inscripcidn 1a.

Sus datos seran incorporados en un Fich

### 2. Método basado en Gemini:

In [6]:
import time
from llama_index.multi_modal_llms.gemini import GeminiMultiModal
from llama_index.core.program import MultiModalLLMCompletionProgram
from llama_index.core.output_parsers import PydanticOutputParser
from pydantic import BaseModel
from llama_index.core.schema import ImageDocument
from PIL import Image
import os
import base64
from io import BytesIO
from pdf2image import convert_from_path

# Fields requested from the multimodal model for each processed image; this
# class is handed to PydanticOutputParser inside gemini_process.
class InvoiceData(BaseModel):
    invoice_number: str  # invoice identifier, kept as text
    total_amount: float  # invoice total as a number

def image_to_base64(image):
    """Serialise an image to PNG in memory and return it as base64 text.

    Args:
        image: Object exposing ``save(fp, format=...)``; it is asked to write
            itself in PNG format to an in-memory buffer.

    Returns:
        The base64 encoding of the PNG bytes, as a ``str``.
    """
    with BytesIO() as png_stream:
        image.save(png_stream, format="PNG")
        png_bytes = png_stream.getvalue()
    return str(base64.b64encode(png_bytes), "utf-8")

def pdf_to_images(pdf_path):
    """Rasterise the PDF at ``pdf_path`` at 300 dpi and return what ``convert_from_path`` produces."""
    rendered_pages = convert_from_path(pdf_path, dpi=300)
    return rendered_pages

def gemini_process(file_path):
    """Extract ``InvoiceData`` from every page of a document with Gemini.

    Args:
        file_path: Path to the input file. A path ending in ``.pdf``
            (case-insensitive) is rasterised at 300 dpi, one image per page;
            any other path is opened as a single image.

    Returns:
        A ``(results, processing_time)`` tuple: one extraction result per
        image, in page order, and the elapsed wall-clock seconds (client
        construction and rasterisation included).

    Raises:
        RuntimeError: If the ``GOOGLE_API_KEY`` environment variable is unset
            or empty.
    """
    start_time = time.time()

    # Read the credential from the environment instead of embedding it in the
    # notebook, where it would end up in version control and shared copies.
    api_key = os.getenv("GOOGLE_API_KEY")
    if not api_key:
        raise RuntimeError(
            "GOOGLE_API_KEY is not set; define it in the environment before calling gemini_process."
        )

    gemini_llm = GeminiMultiModal(api_key=api_key, model_name="models/gemini-1.5-flash")

    if file_path.lower().endswith('.pdf'):
        # Rasterise explicitly at 300 dpi. A later cell redefines
        # pdf_to_images without a dpi argument, which would otherwise change
        # the resolution used here depending on which cells have been run.
        images = convert_from_path(file_path, 300)
    else:
        images = [Image.open(file_path)]

    results = []
    for image in images:
        # The page is sent as base64-encoded PNG data; image_path is set to the
        # source file path exactly as before.
        image_document = ImageDocument(
            image_path=file_path,
            image=image_to_base64(image)
        )

        # The program is built per image because the image documents are bound
        # when the program is constructed.
        extraction_program = MultiModalLLMCompletionProgram.from_defaults(
            output_parser=PydanticOutputParser(InvoiceData),
            image_documents=[image_document],
            prompt_template_str="Extract invoice details from this image:",
            multi_modal_llm=gemini_llm,
        )

        results.append(extraction_program())

    processing_time = time.time() - start_time

    return results, processing_time

# Trial run on a single sample invoice; the path is made absolute before use
file_path = os.path.abspath("data/invoice_0.pdf")

results, time_taken = gemini_process(file_path)
print(f"Tiempo de procesamiento Gemini: {time_taken:.2f} segundos")
# One line per processed image
for result in results:
    print(f"Datos extraídos: {result}")

Tiempo de procesamiento Gemini: 5.96 segundos
Datos extraídos: invoice_number='20240775' total_amount=145.2


### 3. Método basado en OpenAI GPT-4:


In [7]:
# %pip install fitz
# %pip install frontend

In [9]:
import time
from openai import OpenAI
import base64
from pdf2image import convert_from_path
import os
from PIL import Image
from io import BytesIO
import json

from dotenv import load_dotenv
load_dotenv()

# Shared OpenAI client used by get_invoice_info; no key is passed explicitly.
# NOTE(review): presumably the key is picked up from the environment populated
# by load_dotenv() above — confirm.
client = OpenAI()
# Model identifier sent with every chat-completions request in get_invoice_info
MODEL = "gpt-4o-2024-08-06"

# System prompt for the extraction request.
# NOTE(review): it asks for the steps of the reasoning process, while the
# strict JSON schema in get_invoice_info only permits invoice_number and
# total_amount — confirm which of the two is intended.
invoice_extractor_prompt = '''
    You are a helpful invoice information extractor. You will be provided with an image of an invoice,
    and your goal will be to extract the invoice number and total amount.
    Provide the extracted information in a structured format, including steps of your reasoning process.
'''

def encode_image(image):
    """Return ``image`` as base64 text of its PNG serialisation.

    The image is written to an in-memory buffer through its
    ``save(fp, format="PNG")`` method; the buffer contents are then
    base64-encoded and decoded to ``str``.
    """
    png_buffer = BytesIO()
    image.save(png_buffer, format="PNG")
    png_payload = png_buffer.getvalue()
    encoded_payload = base64.standard_b64encode(png_payload)
    return encoded_payload.decode('utf-8')

def pdf_to_images(pdf_path, dpi=200):
    """Rasterise a PDF into one image per page.

    This definition replaces the ``pdf_to_images`` defined in an earlier cell
    (which renders at 300 dpi), so every caller executed after this cell uses
    this version.

    Args:
        pdf_path: Path of the PDF to render.
        dpi: Rendering resolution. Defaults to 200, which is pdf2image's own
            default and therefore what the previous argument-less call used.

    Returns:
        Whatever ``convert_from_path`` returns for the document.
    """
    return convert_from_path(pdf_path, dpi=dpi)

def get_invoice_info(image_content):
    """Request the invoice number and total amount for one invoice image.

    Args:
        image_content: Base64-encoded PNG data without any prefix; it is
            embedded in a ``data:image/png;base64,`` URL.

    Returns:
        The ``message`` of the first choice in the chat-completions response.
        The request constrains the reply with a strict JSON schema holding
        ``invoice_number`` (string) and ``total_amount`` (number).
    """
    system_message = {
        "role": "system",
        "content": invoice_extractor_prompt,
    }
    user_message = {
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": "Extract the invoice number and total amount from this image.",
            },
            {
                "type": "image_url",
                "image_url": {"url": f"data:image/png;base64,{image_content}"},
            },
        ],
    }

    # Shape of the JSON object the model is allowed to answer with
    extraction_schema = {
        "type": "object",
        "properties": {
            "invoice_number": {
                "type": "string",
                "description": "The number of the invoce",
            },
            "total_amount": {
                "type": "number",
                "description": "The total amount of the invoice",
            },
        },
        "required": ["invoice_number", "total_amount"],
        "additionalProperties": False,
    }
    structured_output = {
        "type": "json_schema",
        "json_schema": {
            "name": "invoice_extraction",
            "schema": extraction_schema,
            "strict": True,
        },
    }

    completion = client.chat.completions.create(
        model=MODEL,
        messages=[system_message, user_message],
        response_format=structured_output,
    )

    return completion.choices[0].message

def gpt4_process(file_path):
    """Extract invoice data from every page of a document via ``get_invoice_info``.

    Args:
        file_path: Path to the input file. A path ending in ``.pdf``
            (case-insensitive) is rendered with ``pdf_to_images``; any other
            path is opened as a single image.

    Returns:
        A ``(results, processing_time)`` tuple: the list of values returned by
        ``get_invoice_info``, one per image in page order, and the elapsed
        wall-clock seconds.
    """
    started_at = time.time()

    is_pdf = file_path.lower().endswith('.pdf')
    page_images = pdf_to_images(file_path) if is_pdf else [Image.open(file_path)]

    # Encode each page and query the model for it, one page at a time
    extracted = [get_invoice_info(encode_image(page)) for page in page_images]

    elapsed = time.time() - started_at
    return extracted, elapsed

# Trial run: process one sample invoice through gpt4_process and show timing and output
file_path = "data/invoice_0.pdf"
result, time_taken = gpt4_process(file_path)
print(f"Tiempo de procesamiento GPT-4: {time_taken:.2f} segundos")
for invoice_info in result:
    # Each item is the message returned by get_invoice_info; print its content
    print(f"Datos extraídos: {invoice_info.content}")

Tiempo de procesamiento GPT-4: 6.06 segundos
Datos extraídos: {"invoice_number":"20240775","total_amount":145.2}


### 4. Método basado en un LLM open source en ejecución local

In [11]:
# %pip install llama-index-llms-ollama

In [10]:
from llama_index.llms.ollama import Ollama

# Model name passed to the Ollama client in local_llm_process
LLM_MODEL = "llama3.1"
# Base URL of the Ollama server the client talks to (localhost, port 11434)
OLLAMA_BASE_URL = "http://localhost:11434"

def image_to_text(image):
    """Return the text recognised in ``image`` by ``pytesseract.image_to_string``."""
    recognised_text = pytesseract.image_to_string(image)
    return recognised_text

def local_llm_process(file_path):
    """Extract invoice data with OCR followed by a local Ollama model.

    Args:
        file_path: Path to the input file. A path ending in ``.pdf``
            (case-insensitive) is rendered with ``pdf_to_images``; any other
            path is opened as a single image.

    Returns:
        A ``(results, elapsed_seconds)`` tuple: the stripped response text of
        the model for each image, in page order, and the elapsed wall-clock
        seconds (client construction included).
    """
    started_at = time.time()
    llm = Ollama(model=LLM_MODEL, base_url=OLLAMA_BASE_URL, temperature=0)

    is_pdf = file_path.lower().endswith('.pdf')
    page_images = pdf_to_images(file_path) if is_pdf else [Image.open(file_path)]

    answers = []
    for page_image in page_images:
        # The model only receives text, so each page goes through OCR first
        image_text = image_to_text(page_image)

        prompt = f"""
        Extract the invoice number and total amount from the following invoice text:
        {image_text}

        Your response must be ONLY a valid JSON object with the following structure:
        {{"invoice_number": "string", "total_amount": number}}

        Do not include any explanations or additional text. Only the JSON object is allowed in your response.
        If you cannot extract the information, return {{"invoice_number": "", "total_amount": 0}}.
        """

        completion = llm.complete(prompt)
        answers.append(completion.text.strip())

    elapsed = time.time() - started_at
    return answers, elapsed

# Usage example with one invoice from the data folder
file_path = "data/invoice_0.pdf"
result, time_taken = local_llm_process(file_path)
print(f"Tiempo de procesamiento LLM local: {time_taken:.2f} segundos")
for invoice_info in result:
    # Each item is the stripped response text returned by the local model
    print(f"Datos extraídos: {invoice_info}")

Tiempo de procesamiento LLM local: 21.96 segundos
Datos extraídos: {
  "invoice_number": "20240775",
  "total_amount": 145.20
}


In [12]:
import pandas as pd
import os
import logging
import json

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Load the invoices from data/invoices. os.listdir returns entries in arbitrary
# order, so the names are sorted to keep the result rows reproducible between
# runs and machines. Extensions are matched case-insensitively, the same way
# the processing functions decide whether a file is a PDF.
logging.info("Cargando las facturas de la carpeta data/invoices")
test_invoices = [
    os.path.join("data/invoices", f)
    for f in sorted(os.listdir("data/invoices"))
    if f.lower().endswith((".jpg", ".pdf"))
]

results = []

for invoice in test_invoices:
    logging.info(f"Procesando la factura: {invoice}")

    # Run all four pipelines on the same file. Only the timings and the OCR
    # text length are recorded; the extracted values themselves are not
    # compared in this cell.
    ocr_text, ocr_time = ocr_process(invoice)
    gemini_result, gemini_time = gemini_process(invoice)
    gpt4_result, gpt4_time = gpt4_process(invoice)
    local_llm_result, local_llm_time = local_llm_process(invoice)

    results.append({
        "Invoice": invoice,
        "OCR Time": ocr_time,
        "Gemini Time": gemini_time,
        "GPT-4 Time": gpt4_time,
        "Local LLM Time": local_llm_time,
        "OCR Characters": len(ocr_text),
    })

# One row per invoice, built in a single step from the collected records
logging.info("Creando DataFrame con los resultados")
df_results = pd.DataFrame(results)
print(df_results)

# Cost and time analysis. All costs below are printed with a "$" prefix.
logging.info("Iniciando análisis de costes y tiempos")
# Flat amounts assigned to every invoice row for the two OCR variants
ocr_cost = 0.001
google_ocr_cost = 0.0015
# Per-token rates; each is the sum of two prices.
# NOTE(review): presumably input + output token prices (per 1M tokens for
# Gemini, per 1K tokens for GPT-4) — confirm against current price lists.
# NOTE(review): the model actually called above is gpt-4o-2024-08-06 — confirm
# that these GPT-4 rates apply to it.
gemini_cost = (0.075 + 0.30) / 1000000
gpt4_cost = (0.01 + 0.03) / 1000
local_llm_cost = 0  # We assume that running locally costs nothing

df_results["OCR Cost"] = ocr_cost
df_results["Google OCR Cost"] = google_ocr_cost
# Token count is estimated as OCR characters / 4.
# NOTE(review): presumably a ~4 characters-per-token heuristic; the Gemini and
# GPT calls send images rather than this text, so this is only a rough proxy —
# confirm it is the intended estimate.
df_results["Gemini Cost"] = gemini_cost * (df_results["OCR Characters"] / 4)
df_results["GPT-4 Cost"] = gpt4_cost * (df_results["OCR Characters"] / 4)
df_results["Local LLM Cost"] = local_llm_cost

# Totals over all processed invoices
total_invoices = len(df_results)
total_ocr_time = df_results['OCR Time'].sum()
total_gemini_time = df_results['Gemini Time'].sum()
total_gpt4_time = df_results['GPT-4 Time'].sum()
total_local_llm_time = df_results['Local LLM Time'].sum()

print(f"\nTiempo total de procesamiento para {total_invoices} facturas:")
print(f"OCR: {total_ocr_time:.2f} segundos")
print(f"Gemini: {total_gemini_time:.2f} segundos")
print(f"GPT-4: {total_gpt4_time:.2f} segundos")
print(f"Local LLM: {total_local_llm_time:.2f} segundos")

# Extrapolation: average time per invoice scaled to 100 invoices.
# Divides by total_invoices, so at least one processed invoice is required.
print(f"\nTiempo estimado para 100 facturas:")
print(f"OCR: {(total_ocr_time / total_invoices) * 100:.2f} segundos")
print(f"Gemini: {(total_gemini_time / total_invoices) * 100:.2f} segundos")
print(f"GPT-4: {(total_gpt4_time / total_invoices) * 100:.2f} segundos")
print(f"Local LLM: {(total_local_llm_time / total_invoices) * 100:.2f} segundos")

print("\nCostes totales:")
print(f"OCR tradicional: ${df_results['OCR Cost'].sum():.4f}")
print(f"OCR (simulando Google): ${df_results['Google OCR Cost'].sum():.4f}")
print(f"Gemini: ${df_results['Gemini Cost'].sum():.4f}")
print(f"GPT-4: ${df_results['GPT-4 Cost'].sum():.4f}")
print(f"Local LLM: ${df_results['Local LLM Cost'].sum():.4f}")

# Same extrapolation for costs: average cost per invoice scaled to 100 invoices
print("\nCostes estimados para 100 facturas:")
print(f"OCR tradicional: ${(df_results['OCR Cost'].sum() / total_invoices) * 100:.4f}")
print(f"OCR (simulando Google): ${(df_results['Google OCR Cost'].sum() / total_invoices) * 100:.4f}")
print(f"Gemini: ${(df_results['Gemini Cost'].sum() / total_invoices) * 100:.4f}")
print(f"GPT-4: ${(df_results['GPT-4 Cost'].sum() / total_invoices) * 100:.4f}")
print(f"Local LLM: ${(df_results['Local LLM Cost'].sum() / total_invoices) * 100:.4f}")

2024-08-18 20:46:48,602 - INFO - Cargando las facturas de la carpeta data/invoices
2024-08-18 20:46:48,604 - INFO - Procesando la factura: data/invoices\invoice_0 (1)-1.pdf
2024-08-18 20:47:00,866 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-08-18 20:47:12,130 - INFO - HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"
2024-08-18 20:47:12,132 - INFO - Procesando la factura: data/invoices\invoice_0 (1)-10.pdf
2024-08-18 20:47:25,236 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-08-18 20:47:38,545 - INFO - HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"
2024-08-18 20:47:38,547 - INFO - Procesando la factura: data/invoices\invoice_0 (1)-11.pdf
2024-08-18 20:47:49,551 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-08-18 20:47:59,889 - INFO - HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"


                               Invoice  OCR Time  Gemini Time  GPT-4 Time  \
0    data/invoices\invoice_0 (1)-1.pdf  2.210750     4.257391    5.797495   
1   data/invoices\invoice_0 (1)-10.pdf  2.695220     5.615172    4.778575   
2   data/invoices\invoice_0 (1)-11.pdf  2.262222     4.034137    4.708068   
3   data/invoices\invoice_0 (1)-12.pdf  2.295756     4.234058    5.265335   
4   data/invoices\invoice_0 (1)-13.pdf  2.496379     4.229912    5.760017   
5   data/invoices\invoice_0 (1)-14.pdf  2.369840     3.657675    5.188378   
6   data/invoices\invoice_0 (1)-15.pdf  2.100128     4.074603    5.009198   
7   data/invoices\invoice_0 (1)-16.pdf  2.133572     3.856923    5.655701   
8   data/invoices\invoice_0 (1)-17.pdf  2.332016     3.893067    4.914279   
9   data/invoices\invoice_0 (1)-18.pdf  2.650588     4.107609    6.073032   
10  data/invoices\invoice_0 (1)-19.pdf  2.740752     3.615491    4.362728   
11   data/invoices\invoice_0 (1)-2.pdf  2.352780     3.898676    6.049724   