In [1]:
from pathlib import Path
from PIL import Image
import fitz  # PyMuPDF für PDF-Erstellung
import io
import torch
import csv
import time
from transformers import AutoProcessor, VisionEncoderDecoderModel, StoppingCriteria, StoppingCriteriaList
from evaluate_metrics_latex import MetricsEvaluatorLatex

# Load the Nougat processor and model
processor = AutoProcessor.from_pretrained("facebook/nougat-small")
model = VisionEncoderDecoderModel.from_pretrained("facebook/nougat-small")

# Run on the GPU when one is available, otherwise fall back to the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# Define input and output locations
base_directory = Path.cwd()
groundtruth_path = Path("/bachelor/nougat_ori/nougat/latex_data/ground_truth.json")
image_input_dir = Path("/bachelor/nougat_ori/nougat/latex_data/input_data")
output_pdfs_directory = Path("/bachelor/nougat_ori/nougat/latex_data/pdf_latex")
output_text_directory = Path("/bachelor/nougat_ori/nougat/latex_data/markdown_latex")
output_csv_path = Path("/bachelor/nougat_ori/nougat/latex_data/ocr_results.csv")

# Create every output location up front so that the PDF conversion, the
# markdown export and the CSV export do not fail on a fresh directory tree.
output_pdfs_directory.mkdir(parents=True, exist_ok=True)
output_text_directory.mkdir(parents=True, exist_ok=True)
output_csv_path.parent.mkdir(parents=True, exist_ok=True)

evaluator = MetricsEvaluatorLatex(groundtruth_path)

# Convert images to PDFs
def images_to_pdfs(image_dir, output_dir):
    """Convert every ``*.png`` file in *image_dir* into a PDF in *output_dir*.

    Each image ``<name>.png`` is written as ``<name>.pdf``. Files are handled
    in sorted order. A failure on one image is reported on stdout and does
    not stop the remaining conversions.

    Args:
        image_dir: Directory that is searched (non-recursively) for PNG files.
        output_dir: Directory that receives the PDFs; it is created,
            including missing parents, if it does not exist yet.
    """
    image_dir = Path(image_dir)
    output_dir = Path(output_dir)
    # Without this, every write below fails when the directory is missing.
    output_dir.mkdir(parents=True, exist_ok=True)

    for image_path in sorted(image_dir.glob("*.png")):
        pdf_path = output_dir / f"{image_path.stem}.pdf"
        try:
            # Render into memory first so that a failed conversion does not
            # leave a truncated PDF on disk.
            pdf_bytes = io.BytesIO()
            # The context manager releases the image's file handle promptly.
            with Image.open(image_path) as image:
                image.save(pdf_bytes, format="PDF")
            pdf_path.write_bytes(pdf_bytes.getvalue())
            print(f"✅ PDF erstellt: {pdf_path}")
        except Exception as e:
            # Best effort: report the problem and continue with the next image.
            print(f"❌ Fehler beim Konvertieren von {image_path}: {e}")

# Turn every input PNG into a PDF inside the PDF output directory
print("🔄 Konvertiere Eingabebilder zu PDFs...")
images_to_pdfs(
    image_dir=image_input_dir,
    output_dir=output_pdfs_directory,
)

# Convert a PDF into page images
def rasterize_paper(pdf: Path, dpi: int = 96, return_pil: bool = True, pages: "list[int] | range | None" = None):
    """Render pages of a PDF to PNG data.

    Args:
        pdf: Path of the PDF file to open.
        dpi: Resolution passed to the page renderer.
        return_pil: When true, each rendered page is collected as an
            ``io.BytesIO`` holding PNG bytes. When false, the pages are still
            rendered but nothing is collected, so the result is empty.
        pages: Zero-based page indices to render; ``None`` renders all pages.

    Returns:
        A list of ``io.BytesIO`` objects, one per rendered page. If an error
        occurs, it is reported on stdout and the pages collected so far
        (possibly none) are returned.
    """
    pillow_images = []
    try:
        # Keep the opened document in its own name so the error message below
        # still shows the path, and close the document when done.
        with fitz.open(pdf) as document:
            page_indices = range(len(document)) if pages is None else pages
            for i in page_indices:
                page_bytes = document[i].get_pixmap(dpi=dpi).pil_tobytes(format="PNG")
                if return_pil:
                    pillow_images.append(io.BytesIO(page_bytes))
    except Exception as e:
        print(f"Fehler beim Rasterisieren von {pdf}: {e}")
    return pillow_images

# Run OCR on every generated PDF and collect the results
ocr_results = []

# Cap the generation length at the number of positions the decoder is
# configured for. NOTE(review): generating past that limit is presumably out
# of range for the decoder's position embeddings; confirm against the
# model config.
max_generation_length = min(10000, model.decoder.config.max_position_embeddings)

# Sorted so that the rows of the result CSV come out in a reproducible order.
for pdf_path in sorted(output_pdfs_directory.glob("*.pdf")):
    print(f"Verarbeite Datei: {pdf_path}")

    images = rasterize_paper(pdf=pdf_path, return_pil=True)
    print(f"📸 Extrahierte Bilder aus {pdf_path}: {len(images)}")

    if not images:
        print(f"❌ Fehler: Keine Bilder aus {pdf_path} extrahiert.")
        continue

    try:
        # Only the first rendered page of each PDF is recognised.
        with Image.open(images[0]) as image:
            # Measure latency of preprocessing plus generation with a
            # monotonic, high-resolution clock.
            start_time = time.perf_counter()
            pixel_values = processor(images=image, return_tensors="pt").pixel_values
            outputs = model.generate(
                pixel_values.to(device),
                min_length=1,
                max_length=max_generation_length,
                bad_words_ids=[[processor.tokenizer.unk_token_id]],
                return_dict_in_generate=True,
            )
            inference_time = round(time.perf_counter() - start_time, 2)

        generated = processor.batch_decode(outputs.sequences, skip_special_tokens=True)[0]
        print(f"📝 OCR-Ergebnis für {pdf_path.stem}: {generated[:100]}...")

        # Ground truth entries are keyed by the original image file name.
        image_name = f"{pdf_path.stem}.png"
        ground_truth = evaluator.ground_truth_data.get(image_name, "")
        # Without a reference text the metrics cannot be computed.
        cer = evaluator.calculate_cer(generated, ground_truth) if ground_truth else "N/A"
        wer = evaluator.calculate_wer(generated, ground_truth) if ground_truth else "N/A"

        ocr_results.append({
            "file": image_name,
            "result": generated,
            "ground_truth": ground_truth,
            "cer": cer,
            "wer": wer,
            "inference_time": inference_time,
        })

        output_path = output_text_directory / f"{pdf_path.stem}.md"
        # The opening fence needs its own line; otherwise the first line of
        # the OCR text becomes the fence's info string.
        output_path.write_text(
            f"# Extrahierter Text aus PDF\n\n```\n{generated}\n```",
            encoding="utf-8",
        )
        print(f"✅ Markdown gespeichert: {output_path}")

    except Exception as e:
        # Best effort: report the failure and continue with the next PDF.
        print(f"❌ Fehler bei der Verarbeitung von {pdf_path}: {e}")

# Write the collected OCR results to a CSV file: one header row, then one row per file
csv_header = ["File", "OCR_Result", "Ground_Truth", "CER", "WER", "Latency (s)"]
with output_csv_path.open("w", newline="", encoding="utf-8") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(csv_header)
    # "file" and "result" are required keys; the other columns fall back to
    # a placeholder when an entry does not carry them.
    writer.writerows(
        [
            result["file"],
            result["result"],
            result.get("ground_truth", ""),
            result.get("cer", "N/A"),
            result.get("wer", "N/A"),
            result.get("inference_time", "N/A"),
        ]
        for result in ocr_results
    )

print(f"✅ Metriken gespeichert in: {output_csv_path}")


  from .autonotebook import tqdm as notebook_tqdm
Config of the encoder: <class 'transformers.models.donut.modeling_donut_swin.DonutSwinModel'> is overwritten by shared encoder config: DonutSwinConfig {
  "attention_probs_dropout_prob": 0.0,
  "depths": [
    2,
    2,
    14,
    2
  ],
  "drop_path_rate": 0.1,
  "embed_dim": 128,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 1024,
  "image_size": [
    896,
    672
  ],
  "initializer_range": 0.02,
  "layer_norm_eps": 1e-05,
  "mlp_ratio": 4.0,
  "model_type": "donut-swin",
  "num_channels": 3,
  "num_heads": [
    4,
    8,
    16,
    32
  ],
  "num_layers": 4,
  "patch_size": 4,
  "qkv_bias": true,
  "transformers_version": "4.46.3",
  "use_absolute_embeddings": false,
  "window_size": 7
}

Config of the decoder: <class 'transformers.models.mbart.modeling_mbart.MBartForCausalLM'> is overwritten by shared decoder config: MBartConfig {
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_cros

🔄 Konvertiere Eingabebilder zu PDFs...
✅ PDF erstellt: C:\Users\altin\PycharmProjects\OCR-Bachelor\bachelor\nougat\nougat\latex_data\pdf_latex\1.pdf
✅ PDF erstellt: C:\Users\altin\PycharmProjects\OCR-Bachelor\bachelor\nougat\nougat\latex_data\pdf_latex\2.pdf
✅ PDF erstellt: C:\Users\altin\PycharmProjects\OCR-Bachelor\bachelor\nougat\nougat\latex_data\pdf_latex\3.pdf
✅ PDF erstellt: C:\Users\altin\PycharmProjects\OCR-Bachelor\bachelor\nougat\nougat\latex_data\pdf_latex\4.pdf
Verarbeite Datei: C:\Users\altin\PycharmProjects\OCR-Bachelor\bachelor\nougat\nougat\latex_data\pdf_latex\1.pdf
📸 Extrahierte Bilder aus C:\Users\altin\PycharmProjects\OCR-Bachelor\bachelor\nougat\nougat\latex_data\pdf_latex\1.pdf: 1
📝 OCR-Ergebnis für 1: Then the results are that afterward:

For every value of \(\lambda\), there is a probability of \(|\...
✅ Markdown gespeichert: C:\Users\altin\PycharmProjects\OCR-Bachelor\bachelor\nougat\nougat\latex_data\markdown_latex\1.md
Verarbeite Datei: C:\Users\altin\Pychar