# Exploratory Data Analysis of the OCR Berrutti Dataset

In [1]:
import json
from pathlib import Path
from typing import Any, Dict, List, Union

import numpy as np
import pandas as pd
from PIL import Image


class OCRBerruttiDataset:
    """
    OCR Berrutti Dataset loader.

    The dataset directory is expected to contain three folders:
        - images: document images (PNG files)
        - bounding_boxes_lines: a JSON file with line bounding boxes
        - full_text_transcription: text files with full transcriptions
    """

    # Column order of the DataFrame returned by ``get_bounding_boxes``. Passing
    # it explicitly keeps the schema stable even when a document has no regions.
    _BOX_COLUMNS = ("x", "y", "width", "height", "text", "crop")

    def __init__(self, path: Union[str, Path]) -> None:
        """
        Initialize the dataset loader.

        Only paths are computed here; nothing is read from disk.

        Args:
            path (Union[str, Path]): Path to the dataset directory. Plain
                strings are accepted and converted to ``Path``.
        """
        self.path: Path = Path(path)
        self.images_dir: Path = self.path / "images"
        self.boxes_json: Path = (
            self.path / "bounding_boxes_lines" / "bounding_boxes_lines_via2.json"
        )
        self.transcriptions_dir: Path = self.path / "full_text_transcription"

    def get_docs_list(self) -> List[str]:
        """
        Return a list of document names (without extension).

        Only regular files whose suffix is ``.png`` (case-insensitive) are
        considered. The order is the directory listing order, which is not
        guaranteed to be sorted.

        Returns:
            List[str]: Document stems.
        """
        return [
            p.stem
            for p in self.images_dir.iterdir()
            if p.is_file() and p.suffix.lower() == ".png"
        ]

    def _load_regions(self, doc_name: str) -> List[Dict[str, Any]]:
        """
        Read the annotation JSON and return the regions listed for a document.

        Args:
            doc_name (str): Document name (without extension), used as the
                top-level key of the JSON file.

        Returns:
            List[Dict[str, Any]]: The ``"regions"`` entries of that document.

        Raises:
            KeyError: If ``doc_name`` is not present in the JSON file.
        """
        # The transcriptions contain accented characters; decode explicitly as
        # UTF-8 instead of relying on the platform's default locale encoding.
        with open(self.boxes_json, "r", encoding="utf-8") as f:
            data: Dict[str, Any] = json.load(f)
        return data[doc_name]["regions"]

    def get_bounding_boxes(self, doc_name: str) -> pd.DataFrame:
        """
        Load bounding boxes for a document, including the image crop as a NumPy array.

        Args:
            doc_name (str): Document name (without extension).

        Returns:
            pd.DataFrame: DataFrame with columns x, y, width, height, text, crop
                (one row per region). The columns are present even when the
                document has no regions.

        Raises:
            KeyError: If ``doc_name`` is not present in the annotation JSON.
        """
        regions = self._load_regions(doc_name)

        image_path = self.images_dir / f"{doc_name}.png"
        # Copy the pixels into an array inside the context manager so the
        # underlying file handle is released as soon as the data is in memory.
        with Image.open(image_path) as img:
            img_array: np.ndarray = np.array(img)

        records: List[Dict[str, Any]] = []
        for region in regions:
            shape = region["shape_attributes"]
            x = shape["x"]
            y = shape["y"]
            w = shape["width"]
            h = shape["height"]
            text = region["region_attributes"]["text"]
            # Rows are indexed by y and columns by x in the image array.
            crop = img_array[y : y + h, x : x + w]
            records.append(
                {"x": x, "y": y, "width": w, "height": h, "text": text, "crop": crop}
            )

        return pd.DataFrame(records, columns=list(self._BOX_COLUMNS))

    def plot_document(self, doc_name: str, show_boxes: bool = True) -> None:
        """
        Display the document image with optional bounding boxes overlay.

        Args:
            doc_name (str): Document name (without extension).
            show_boxes (bool): If True, draw bounding boxes with hover text.
        """
        # Imported lazily so plotly is only required when plotting.
        import plotly.graph_objects as go

        image_path = self.images_dir / f"{doc_name}.png"

        fig = go.Figure()
        # NOTE(review): plotly is expected to serialise the PIL image when the
        # layout image is added, which is why the file can be closed right
        # after this block — confirm against the installed plotly version.
        with Image.open(image_path) as img:
            width, height = img.size
            fig.add_layout_image(
                dict(
                    source=img,
                    x=0,
                    y=0,
                    sizex=width,
                    sizey=height,
                    xref="x",
                    yref="y",
                    sizing="stretch",
                    layer="below",
                )
            )

        if show_boxes:
            df = self.get_bounding_boxes(doc_name)
            for _, row in df.iterrows():
                left = row["x"]
                top = row["y"]
                right = row["x"] + row["width"]
                bottom = row["y"] + row["height"]

                # Visible red outline of the box.
                fig.add_shape(
                    type="rect",
                    x0=left,
                    y0=top,
                    x1=right,
                    y1=bottom,
                    xref="x",
                    yref="y",
                    line=dict(color="red", width=2),
                )
                # Invisible, transparently filled polygon over the same area;
                # it carries the transcription shown on hover.
                fig.add_trace(
                    go.Scatter(
                        x=[left, right, right, left, left],
                        y=[top, top, bottom, bottom, top],
                        mode="none",
                        fill="toself",
                        fillcolor="rgba(0,0,0,0)",
                        hoverinfo="text",
                        text=row["text"],
                        showlegend=False,
                    )
                )

        fig.update_xaxes(visible=False, range=[0, width], constrain="domain")
        # Reversed y range puts the origin at the top-left, matching the pixel
        # coordinates of the boxes; scaleanchor keeps the aspect ratio.
        fig.update_yaxes(visible=False, range=[height, 0], scaleanchor="x")
        fig.update_layout(
            title=dict(text=f"Detections: {doc_name}", x=0.5),
            margin=dict(l=0, r=0, t=30, b=0),
            dragmode="zoom",
            showlegend=False,
        )
        fig.show()


# Create the loader for the "berrutti_dataset" folder; the path is relative,
# so it is resolved against the notebook's working directory.
ocr_berrutti_dataset = OCRBerruttiDataset(Path("berrutti_dataset"))

### Show the list of documents

In [2]:
from pprint import pprint

# Pretty-print the stem of every PNG file found in the images folder.
pprint(ocr_berrutti_dataset.get_docs_list())

['medium_r0547_1444',
 'high_r0602_2316',
 'high_r0814_1006',
 'high_r1060-071',
 'medium_r0463_0408',
 'high_r0814_0325',
 'high_r0602_2317',
 'high_r0795_0638',
 'high_r1291_0465',
 'high_r0795_0148',
 'low_r1060-155',
 'high_r0814_1005',
 'medium_r0463_0393',
 'medium_r0114_0208',
 'low_r0606_2299',
 'medium_r0114_0209',
 'medium_r0463_0392',
 'low_r1060-168',
 'medium_r1060-190',
 'high_r0791_1063',
 'high_r0126_0317',
 'high_r0795_1938',
 'high_r0828_2229',
 'medium_r1060-180',
 'medium_r0114_0231',
 'high_r0798_0329',
 'high_r0952_0233',
 'medium_r0114_0218',
 'high_r0955_2440',
 'low_r1060-145',
 'low_r0726_0260',
 'low_r1060-147',
 'medium_r1060-020',
 'high_r0828_2410',
 'low_r1060-184',
 'medium_r0114_0226',
 'medium_r0114_0232',
 'low_r0661_1680',
 'low_r0438_2571',
 'low_r0341_0080',
 'medium_r0139_0955',
 'medium_r0463_0394',
 'high_r0813_0153',
 'medium_r0547_1427',
 'high_r0792_1086',
 'high_r0791_2047',
 'medium_r0580_2249',
 'low_r629-1710.1975',
 'low_r0863_0808',
 'l

### Bounding boxes of one document

In [3]:
# Use the first listed document as the running example.
doc = ocr_berrutti_dataset.get_docs_list()[0]
print(doc)
# One row per annotated region: box geometry, transcribed text and pixel crop.
bounding_boxes = ocr_berrutti_dataset.get_bounding_boxes(doc)
bounding_boxes  # last expression of the cell, so the notebook renders the table

medium_r0547_1444


Unnamed: 0,x,y,width,height,text,crop
0,1572,736,459,57,R E S E R V A D O,"[[255, 255, 255, 255, 255, 255, 255, 255, 255,..."
1,1675,804,183,62,27 / I,"[[255, 255, 255, 255, 255, 255, 255, 255, 255,..."
2,1177,933,1655,60,"- Por otra parte, se han producido hechos de s...","[[255, 255, 255, 255, 255, 255, 255, 255, 255,..."
3,1173,989,1669,60,escenario político y social que podrían tener ...,"[[255, 255, 255, 255, 255, 255, 255, 255, 255,..."
4,1175,1038,394,45,en el futuro.,"[[255, 255, 255, 255, 255, 255, 255, 255, 255,..."
5,1177,1091,1691,56,"En el campo opositor, Rodo fo SEGUEL, líder de...","[[255, 255, 255, 255, 255, 255, 255, 255, 255,..."
6,1178,1139,1657,55,que están dispuestos a retomar la movilización...,"[[255, 255, 255, 255, 255, 255, 255, 255, 255,..."
7,1171,1191,1667,52,los analistas chilenos que la organización lab...,"[[255, 255, 255, 255, 255, 255, 255, 255, 255,..."
8,1171,1242,1669,57,dora de las masivas protestas nacionales del a...,"[[255, 255, 255, 255, 255, 255, 255, 255, 255,..."
9,1170,1294,1499,47,"capacidad para aunar a la disidencia es ""incue...","[[255, 255, 255, 255, 255, 255, 255, 255, 255,..."


### Plotting the bounding boxes

In [4]:
# Show the example document with its boxes overlaid (show_boxes defaults to True).
ocr_berrutti_dataset.plot_document(doc)