# setup

## dependencies

In [None]:
import base64
import difflib
import getpass
import io
import json
import os
from pathlib import Path
from typing import Callable

import anthropic
import Levenshtein
import openai
import pandas as pd
import plotly.graph_objects as go
import vertexai
from google.cloud import vision
from google.oauth2 import service_account
from jiwer import cer, wer
from PIL import Image
from vertexai.preview.generative_models import GenerativeModel, Part

In [None]:
# Do not truncate DataFrame output: display all rows.
pd.set_option("display.max_rows", None)

## files

In [None]:
!ls /data/

In [None]:
!ls /data/keys/

In [None]:
!ls /data/texts/

In [None]:
!ls /data/texts/images

In [None]:
# Parse the JSON file holding the API tokens that the client setup reads
# ("openai_token", "anthropic_token").
with open("/data/keys/tokens.json", "r") as tokens_file:
    tokens_dict = json.loads(tokens_file.read())

## global vars

In [None]:
# file paths
# Scanned pages, stored as <image_id>.jpg.
images_folder = Path("/data/texts/images")
# Pre-computed Transkribus output (<image_id>.txt), read by ocr_transkribus.
transkribus_inferenced_folder = Path("/data/texts/transkribus_inferenced")
# Corrected transcriptions (<image_id>.txt), used as ground truth by read_text.
transkribus_corrected_folder = Path("/data/texts/transkribus_corrected")
# One output folder per OCR variant (see get_ocr_output_folder).
openai_simple_folder = Path("/data/texts/openai_simple")
openai_extensive_folder = Path("/data/texts/openai_extensive")
google_vision_folder = Path("/data/texts/google_vision")
google_gemini_simple_folder = Path("/data/texts/google_gemini_simple")
google_gemini_extensive_folder = Path("/data/texts/google_gemini_extensive")
anthropic_simple_folder = Path("/data/texts/anthropic_simple")
anthropic_extensive_folder = Path("/data/texts/anthropic_extensive")
# Path used when exporting the plot of the averaged metrics.
plot_path = Path("/data/analysis/plot.png")
# Pickle file holding the results DataFrame.
df_path = Path("/data/analysis/df.pkl")

# openai
# Client authenticated with the token loaded from tokens.json.
openai_client = openai.OpenAI(api_key=tokens_dict["openai_token"])

# google
# A single service-account credential is shared by Cloud Vision and Vertex AI.
google_credentials = service_account.Credentials.from_service_account_file("/data/keys/google_key.json")
google_vision_client = vision.ImageAnnotatorClient(credentials=google_credentials)
# NOTE(review): presumably vertexai.init has to run before GenerativeModel is constructed - keep this order.
vertexai.init(project="project-pressmint-ocr", location="us-central1", credentials=google_credentials)
google_model = GenerativeModel("gemini-2.5-flash-lite")

# anthropic
# Client authenticated with the token loaded from tokens.json.
anthropic_client = anthropic.Anthropic(api_key=tokens_dict["anthropic_token"])

# text comparison metrics

## diff_levenshtein

In [None]:
def diff_levenshtein(text_a: str, text_b: str) -> float:
    """Return the Levenshtein distance of two texts, normalised by the longer one.

    Args:
        text_a: First text.
        text_b: Second text.

    Returns:
        The edit distance divided by ``max(len(text_a), len(text_b))``, or
        ``0.0`` when both texts are empty.
    """
    longest = max(len(text_a), len(text_b))
    if longest == 0:
        # Two empty texts are identical; returning early avoids a division by zero.
        return 0.0
    return Levenshtein.distance(text_a, text_b) / longest

## diff_wer

In [None]:
def diff_wer(text_a: str, text_b: str) -> float:
    """Return the word error rate computed by jiwer's ``wer``.

    ``text_a`` is passed as the first argument and ``text_b`` as the second.
    """
    word_error_rate = wer(text_a, text_b)
    return word_error_rate

## diff_cer

In [None]:
def diff_cer(text_a: str, text_b: str) -> float:
    """Return the character error rate computed by jiwer's ``cer``.

    ``text_a`` is passed as the first argument and ``text_b`` as the second.
    """
    char_error_rate = cer(text_a, text_b)
    return char_error_rate

## diff_difflib

In [None]:
def diff_difflib(text_a: str, text_b: str) -> float:
    """Return the ``difflib.SequenceMatcher`` ratio of the two texts.

    Note that this is a similarity score: identical texts yield ``1.0``.
    """
    matcher = difflib.SequenceMatcher(isjunk=None, a=text_a, b=text_b)
    similarity = matcher.ratio()
    return similarity

## diff_all

In [None]:
def diff_all(text_a: str, text_b: str) -> dict[str, float]:
    """Compute every comparison metric for a pair of texts.

    Args:
        text_a: First text; passed as the first argument to each metric.
        text_b: Second text; passed as the second argument to each metric.

    Returns:
        A dict keyed by metric name (``"diff_levenshtein"``, ``"diff_wer"``,
        ``"diff_cer"``, ``"diff_difflib"``) holding the value returned by the
        function of the same name.
    """
    return {
        "diff_levenshtein": diff_levenshtein(text_a, text_b),
        "diff_wer": diff_wer(text_a, text_b),
        "diff_cer": diff_cer(text_a, text_b),
        "diff_difflib": diff_difflib(text_a, text_b),
    }

## tests

In [None]:
# Reference text; the other sample texts are compared against it.
text_a = """
Lorem ipsum dolor sit amet, consectetur adipiscing elit.
Sed do eiusmod tempor incididunt ut labore et dolore.
"""

In [None]:
# Last word of each sentence changed:
# "elat" instead of "elit"
# "dalare" instead of "dolore"
text_b = """
Lorem ipsum dolor sit amet, consectetur adipiscing elat.
Sed do eiusmod tempor incididunt ut labore et dalare.
"""

In [None]:
# Character substitutions throughout the text:
# o replaced with a
# i replaced with e
# d replaced with x
text_c = """
Larem epsum xalar set amet, cansectetur axepesceng elet.
Sex xa eeusmax tempar encexexunt ut labare et xalare.
"""

In [None]:
# First and second sentence swapped.
# NOTE(review): the literal also differs from text_a in a few tokens
# ("at" instead of "ut", "et" missing, comma after "amet" missing).
text_d = """
Sed do eiusmod tempor incididunt at labore dolore.
Lorem ipsum dolor sit amet consectetur adipiscing elit.
"""

In [None]:
# Entirely new sentences
text_e = """
Commodo nulla facilisi nullam vehicula ipsum a arcu.
Pulvinar proin gravida hendrerit lectus.
"""

In [None]:
# Baseline: the reference text compared with itself.
diff_all(text_a, text_a)

In [None]:
# Reference vs. the text with the last word of each sentence changed.
diff_all(text_a, text_b)

In [None]:
# Reference vs. the text with character substitutions.
diff_all(text_a, text_c)

In [None]:
# Reference vs. the text with swapped sentences.
diff_all(text_a, text_d)

In [None]:
# Reference vs. entirely different sentences.
diff_all(text_a, text_e)

# reader functions

## get_image_path

In [None]:
def get_image_path(image_id: str) -> Path:
    """Return the path of the ``<image_id>.jpg`` scan inside ``images_folder``."""
    file_name = image_id + ".jpg"
    return images_folder.joinpath(file_name)

## read_text

In [None]:
def read_text(image_id: str) -> str:
    """Return the corrected transcription stored for an image.

    Args:
        image_id: Image identifier; the file read is
            ``transkribus_corrected_folder / "<image_id>.txt"``.

    Returns:
        The full file content as a string.
    """
    text_path = transkribus_corrected_folder / (image_id + ".txt")
    # Decode explicitly as UTF-8 so umlauts and other non-ASCII characters do
    # not depend on the platform's default encoding.
    return text_path.read_text(encoding="utf-8")

## read_image_as_string

In [None]:
def read_image_as_string(image_id: str) -> str:
    """Return the image file's bytes as a base64-encoded string."""
    with open(get_image_path(image_id), "rb") as image_file:
        raw_bytes = image_file.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")

## read_image_as_binary

In [None]:
def read_image_as_binary(image_id: str) -> bytes:
    """Return the raw bytes of the image file."""
    image_path = get_image_path(image_id)
    with open(image_path, "rb") as image_file:
        payload = image_file.read()
    return payload

## read_image_as_gemini_part

In [None]:
def read_image_as_gemini_part(image_id: str) -> Part:
    """Load an image and wrap it as a Vertex AI ``Part``.

    The image is opened with PIL, re-encoded as JPEG into an in-memory buffer
    and handed to ``Part.from_data`` with MIME type ``image/jpeg``.

    Args:
        image_id: Image identifier, resolved with ``get_image_path``.

    Returns:
        The ``Part`` built from the JPEG bytes.
    """
    buffer = io.BytesIO()
    # The context manager closes the underlying file once the image is re-encoded.
    with Image.open(get_image_path(image_id)) as img:
        img.save(buffer, format="JPEG")
    return Part.from_data(data=buffer.getvalue(), mime_type="image/jpeg")

## get_image_ids

In [None]:
def get_image_ids() -> list[str]:
    """Return the stems of all entries in ``images_folder``, in sorted path order.

    Entries whose full path string contains ``"test"`` are left out.
    """
    return [
        entry.stem
        for entry in sorted(images_folder.iterdir())
        if "test" not in str(entry)
    ]

## tests

In [None]:
# Smoke test: length of the base64 string for the image with id "test".
print(len(read_image_as_string("test")))

In [None]:
# Smoke test: number of raw bytes of the image with id "test".
print(len(read_image_as_binary("test")))

In [None]:
# Print every image id together with the length of its base64 encoding.
for image_id in get_image_ids():
    print(image_id)
    print(len(read_image_as_string(image_id)))

# OCR inferences

## prompts

In [None]:
# Minimal instruction (German): "Extract the text from this image."
prompt_simple = "Extrahiere den Text aus diesem Bild."
# Detailed instruction (German): describes the source (historical German
# newspaper, early 20th century, Fraktur type) and asks for a literal,
# letter-by-letter transcription without any meta description.
# Adjacent literals are concatenated, so every sentence except the last must
# end with a space.
prompt_extensive = (
    "Das ist ein Scan einer deutschen historischen Zeitung aus dem frühen 20. Jahrhundert. "
    "Bitte führe OCR darauf aus. "
    "Beachte dabei, dass die Schrift in Fraktur gehalten ist. "
    "Versuche keine Interpretationen zu machen bezüglich der Wörter, sondern transkribiere jeden Buchstaben wie du ihn siehst. "
    "Ohne irgendwelche Metabeschreibungen, nur den Text alleine."
)

## ocr_transkribus

In [None]:
def ocr_transkribus(image_id: str) -> str:
    """Return the stored Transkribus output for an image.

    No OCR is run here; the text is read from
    ``transkribus_inferenced_folder / "<image_id>.txt"``.

    Args:
        image_id: Image identifier.

    Returns:
        The full file content as a string.
    """
    text_path = transkribus_inferenced_folder / (image_id + ".txt")
    # Decode explicitly as UTF-8 so non-ASCII characters do not depend on the
    # platform's default encoding.
    return text_path.read_text(encoding="utf-8")

## ocr_openai_base

In [None]:
def ocr_openai_base(prompt: str, image_id: str) -> str:
    """Send a prompt plus one image to the OpenAI chat API and return the reply text.

    The image is embedded as a base64 ``data:`` URL with ``detail`` set to
    ``"high"``; the text part precedes the image part in the user message.
    """
    encoded_image = read_image_as_string(image_id)
    text_part = {
        "type": "text",
        "text": prompt,
    }
    image_part = {
        "type": "image_url",
        "image_url": {
            "url": f"data:image/jpeg;base64,{encoded_image}",
            "detail": "high",
        },
    }
    user_message = {"role": "user", "content": [text_part, image_part]}
    completion = openai_client.chat.completions.create(
        model="gpt-4o",
        messages=[user_message],
        max_tokens=2048,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

## ocr_openai_prompt_simple

In [None]:
def ocr_openai_prompt_simple(image_id: str) -> str:
    """Run the OpenAI OCR call with ``prompt_simple``."""
    return ocr_openai_base(prompt=prompt_simple, image_id=image_id)

## ocr_openai_prompt_extensive

In [None]:
def ocr_openai_prompt_extensive(image_id: str) -> str:
    """Run the OpenAI OCR call with ``prompt_extensive``."""
    return ocr_openai_base(prompt=prompt_extensive, image_id=image_id)

## ocr_openai_one_shot

In [None]:
def ocr_openai_one_shot(image_id_ground_truth: str, image_id_inference: str) -> str:
    """Run a one-shot OpenAI call: one example image with its transcription, then the target image.

    The user message contains, in order: an introductory text, the example
    image, the example's corrected transcription, the instruction for the
    target image, and the target image.

    Args:
        image_id_ground_truth: Id of the example image; its corrected
            transcription is loaded with ``read_text``.
        image_id_inference: Id of the image the model should process.

    Returns:
        The content of the first choice of the chat completion.
    """
    image_ground_truth = read_image_as_string(image_id_ground_truth)
    image_inference = read_image_as_string(image_id_inference)
    # The transcription must belong to the example image, so it is looked up
    # with the parameter and not with a module-level ``image_id``.
    text_ground_truth = read_text(image_id_ground_truth)
    response = openai_client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "Hier ist ein Beispielbild einer historischen Frakturschrift-Zeitung mit seiner korrekten Transkription:",
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{image_ground_truth}",
                            "detail": "high",
                        },
                    },
                    {"type": "text", "text": text_ground_truth},
                    # NOTE(review): this instruction asks the model to "describe"
                    # (beschreibe) the image rather than to transcribe it - confirm intent.
                    {"type": "text", "text": "Bitte beschreibe das folgende ähnliche Bild, so gut wie möglich."},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{image_inference}",
                            "detail": "high",
                        },
                    },
                ],
            }
        ],
        max_tokens=2048,
    )
    return response.choices[0].message.content

## ocr_google_vision

In [None]:
def ocr_google_vision(image_id: str) -> str:
    """Return the full text that Cloud Vision's document text detection reports for an image."""
    vision_image = vision.Image(content=read_image_as_binary(image_id))
    detection = google_vision_client.document_text_detection(image=vision_image)
    return detection.full_text_annotation.text

## ocr_google_gemini_base

In [None]:
def ocr_google_gemini_base(prompt: str, image_id: str) -> str:
    """Send the prompt followed by the image part to the Gemini model and return the reply text."""
    contents = [prompt, read_image_as_gemini_part(image_id)]
    reply = google_model.generate_content(contents)
    return reply.text

## ocr_google_gemini_simple

In [None]:
def ocr_google_gemini_simple(image_id: str) -> str:
    """Run the Gemini OCR call with ``prompt_simple``."""
    return ocr_google_gemini_base(prompt=prompt_simple, image_id=image_id)

## ocr_google_gemini_extensive

In [None]:
def ocr_google_gemini_extensive(image_id: str) -> str:
    """Run the Gemini OCR call with ``prompt_extensive``."""
    return ocr_google_gemini_base(prompt=prompt_extensive, image_id=image_id)

## ocr_anthropic_base

In [None]:
def ocr_anthropic_base(prompt: str, image_id: str) -> str:
    """Send one image plus a prompt to the Anthropic messages API and return the reply text.

    The base64 image block precedes the text block in the user message; the
    text of the first content block of the reply is returned.
    """
    image_block = {
        "type": "image",
        "source": {
            "type": "base64",
            "media_type": "image/jpeg",
            "data": read_image_as_string(image_id),
        },
    }
    text_block = {"type": "text", "text": prompt}
    user_message = {"role": "user", "content": [image_block, text_block]}
    reply = anthropic_client.messages.create(
        model="claude-sonnet-4-20250514",
        max_tokens=4000,
        messages=[user_message],
    )
    first_block = reply.content[0]
    return first_block.text

## ocr_anthropic_simple

In [None]:
def ocr_anthropic_simple(image_id: str) -> str:
    """Run the Anthropic OCR call with ``prompt_simple``."""
    return ocr_anthropic_base(prompt=prompt_simple, image_id=image_id)

## ocr_anthropic_extensive

In [None]:
def ocr_anthropic_extensive(image_id: str) -> str:
    """Run the Anthropic OCR call with ``prompt_extensive``."""
    return ocr_anthropic_base(prompt=prompt_extensive, image_id=image_id)

## get_all_ocr_func

In [None]:
def get_all_ocr_func() -> list[Callable]:
    """Return every OCR function to evaluate, grouped by provider.

    All entries take a single ``image_id`` except ``ocr_openai_one_shot``,
    which takes an example image id and an inference image id.
    """
    transkribus_funcs = [ocr_transkribus]
    openai_funcs = [
        ocr_openai_prompt_simple,
        ocr_openai_prompt_extensive,
        ocr_openai_one_shot,
    ]
    google_funcs = [
        ocr_google_vision,
        ocr_google_gemini_simple,
        ocr_google_gemini_extensive,
    ]
    anthropic_funcs = [ocr_anthropic_simple, ocr_anthropic_extensive]
    # For a quicker partial run, return a shorter list instead, e.g.
    # [ocr_transkribus] or [ocr_transkribus, ocr_anthropic_simple].
    return transkribus_funcs + openai_funcs + google_funcs + anthropic_funcs

## get_ocr_output_folder

In [None]:
def get_ocr_output_folder(ocr_func: Callable) -> "Path | None":
    """Return the folder in which the output of an OCR function is stored.

    Args:
        ocr_func: One of the OCR functions defined in this notebook.

    Returns:
        The output folder of the function, or ``None`` when none is assigned
        (``ocr_transkribus``, ``ocr_openai_one_shot`` and any unknown callable).
    """
    folder_by_func = {
        ocr_openai_prompt_simple: openai_simple_folder,
        ocr_openai_prompt_extensive: openai_extensive_folder,
        ocr_google_vision: google_vision_folder,
        ocr_google_gemini_simple: google_gemini_simple_folder,
        ocr_google_gemini_extensive: google_gemini_extensive_folder,
        ocr_anthropic_simple: anthropic_simple_folder,
        # This entry was previously keyed on ocr_openai_prompt_extensive a
        # second time, so the Anthropic extensive folder was never returned.
        ocr_anthropic_extensive: anthropic_extensive_folder,
    }
    return folder_by_func.get(ocr_func)

## tests

In [None]:
# for ocr_func in get_all_ocr_func():
#     print("\n-----------------------------------------------------------------------")
#     print(ocr_func.__name__, "\n")
#     text = ocr_func("1915-12-28-1", "1915-12-28-2")
#     print(text[:500])

# aggregated analysis

## compare all

In [None]:
diff_name_list = ["diff_levenshtein", "diff_wer", "diff_cer", "diff_difflib"]
diff_data_columns = ["function", "image_id"] + diff_name_list
diff_data_all = []

# One-shot example pairing by the last character of the image id: 1 <-> 2 and 3 <-> 4.
one_shot_partner_suffix = {"1": "2", "2": "1", "3": "4", "4": "3"}

# The image list does not depend on the OCR function, so it is read once.
image_id_list = get_image_ids()

# iterate over functions
for ocr_func in get_all_ocr_func():
    print(f"ocr_func: {ocr_func.__name__}")

    # running sum per metric, used for the "average" row of this function
    diff_sum_dict = {diff_name: 0 for diff_name in diff_name_list}

    # the output folder depends only on the function (None: nothing is written)
    ocr_output_folder = get_ocr_output_folder(ocr_func)
    if ocr_output_folder is not None:
        ocr_output_folder.mkdir(parents=True, exist_ok=True)

    # iterate over images
    for image_id in image_id_list:
        print(f"image_id: {image_id}")

        # compare ground truth with inferenced text
        text_ground_truth = read_text(image_id)
        if ocr_func is ocr_openai_one_shot:
            partner_suffix = one_shot_partner_suffix.get(image_id[-1:])
            if partner_suffix is None:
                raise ValueError(f"no one-shot example image defined for image_id {image_id!r}")
            # The partner image is only the worked example. image_id itself must
            # stay untouched: it is the image being transcribed, the one whose
            # ground truth was read above, and the id stored in the result row.
            image_id_ground_truth = image_id[:-1] + partner_suffix
            text_inferenced = ocr_func(image_id_ground_truth, image_id)
        else:
            text_inferenced = ocr_func(image_id)
        diff_dict = diff_all(text_ground_truth, text_inferenced)

        # append results
        row_data = [ocr_func.__name__, image_id]
        for diff_name in diff_name_list:
            diff_sum_dict[diff_name] = diff_sum_dict[diff_name] + diff_dict[diff_name]
            row_data.append(diff_dict[diff_name])
        diff_data_all.append(row_data)

        # write inferenced text
        if ocr_output_folder is not None:
            with open(ocr_output_folder / Path(image_id + ".txt"), "w", encoding="utf-8") as f:
                f.write(text_inferenced)

    # append average (skipped when there are no images, to avoid dividing by zero)
    if image_id_list:
        row_data = [ocr_func.__name__, "average"]
        for diff_name in diff_name_list:
            row_data.append(diff_sum_dict[diff_name] / len(image_id_list))
        diff_data_all.append(row_data)

# persist all rows (per image and per-function averages)
df = pd.DataFrame(diff_data_all, columns=diff_data_columns)
df_path.parent.mkdir(parents=True, exist_ok=True)
df.to_pickle(df_path)

## results

In [None]:
# Reload the persisted results from df_path and display them.
# NOTE(review): presumably this lets the cell run without repeating the comparison above.
df = pd.read_pickle(df_path)
df

## plot

In [None]:
# Keep only the per-function "average" rows; copy so a column can be added.
df_avg = df[df["image_id"] == "average"].copy()

# Encode each function name as an integer so it can serve as an axis;
# tickvals/ticktext map the codes back to the names.
function_labels = df_avg["function"].unique()
function_map = {name: i for i, name in enumerate(function_labels)}
df_avg["function_code"] = df_avg["function"].map(function_map)

# One parallel-coordinates axis per metric, preceded by the function axis.
metric_columns = ["diff_levenshtein", "diff_wer", "diff_cer", "diff_difflib"]
dim_list = [{"label": col, "values": df_avg[col]} for col in metric_columns]
dim_list.insert(
    0,
    {
        "label": "function",
        "values": df_avg["function_code"],
        "tickvals": list(function_map.values()),
        "ticktext": list(function_map.keys()),
    },
)
fig = go.Figure(
    data=go.Parcoords(
        line={
            # lines are coloured by the DataFrame index of the average rows
            "color": list(df_avg.index),
            "colorscale": "Rainbow",
        },
        dimensions=dim_list,
    )
)
fig.update_layout(
    height=700,
    font=dict(size=14),
    margin=dict(l=200),
)
# write_html emits an HTML document, so it is stored next to plot_path with
# an .html suffix instead of under the .png name.
fig.write_html(plot_path.with_suffix(".html"))
fig.show()