# Create a JSON for a doc to import into Label Studio

In [78]:
import json
import os
from typing import Any, Dict, List, Optional, Union

import cv2
import fitz  # PyMuPDF
import numpy as np
import pandas as pd
from pathlib import Path
from PIL import Image

from mozilla_sec_eia.utils import GCSArchive

In [2]:
# Instantiate the project's GCS archive helper and fetch its metadata table.
# `md` is used as a pandas DataFrame by the cells below.
# NOTE(review): presumably this needs GCS credentials/network access — confirm
# against mozilla_sec_eia.utils before running on a fresh machine.
archive = GCSArchive()
md = archive.get_metadata()

In [30]:
# Add a normalized copy of the company name: lower-cased with leading/trailing
# whitespace stripped. The original "Company Name" column is left untouched.
md["company_name"] = md["Company Name"].str.lower().str.strip()

In [62]:
train_df = pd.read_csv("../labeled_data_tracking.csv")

In [66]:
# Left-join the archive metadata onto the labeling tracker, matching rows on
# both "Filename" and "CIK"; tracker rows without a metadata match are kept.
# NOTE(review): this rebinds train_df to its own merge result, so the cell is not
# idempotent. Running it twice merges md onto the already-merged frame (pandas
# then suffixes the overlapping columns with _x/_y). Re-run the read_csv cell
# before re-running this one.
train_df = train_df.merge(md, on=["Filename", "CIK"], how="left")

In [77]:
cache_dir = Path("../sec10k_filings")

In [72]:
train_filings = archive.get_filings(train_df, cache_directory=cache_dir)

In [98]:
pdfs_dir = cache_dir / "pdfs"
pdfs_dir.mkdir(parents=True, exist_ok=True)

In [82]:
# create PDFs
# One PDF per training filing, written into pdfs_dir and named after the part
# of the filing's filename that precedes its first ".".
for sec_filing in train_filings:
    pdf_stem = sec_filing.filename.split(".")[0]
    filename = pdfs_dir / f"{pdf_stem}.pdf"
    with filename.open("wb") as file:
        sec_filing.ex_21.save_as_pdf(file)

<table> is empty
'<c> The Southwest Companies Nevada PriMerit Bank Federally chartered stock savings bank Paiute Pipeline Company Nevada Carson Water Company Nevada Southwest Gas Transmission Company Partnership between Southwest Gas Corporation and Utility Financial Corp. Utility Financial Corp. Nevada Southwest Gas Corporation of Arizona Nevada PRIMERIT BANK SUBSIDIARIES AT DECEMBER 31, 1993'
<table> is empty
'<c> Airgas Canada, Inc. Canada Airgas Carbonic, Inc. DE Airgas Data, LLC DE Airgas East, Inc. DE Airgas Gaspro, Inc. DE Airgas Great Lakes, Inc. DE Airgas Gulf States, Inc. DE Airgas Intermountain, Inc. CO Airgas International, Inc. VI Airgas Investments, Inc. DE Airgas Mid America, Inc. DE Airgas Mid'
<table> is empty
'<c> Airgas Canada, Inc. Canada Airgas Carbonic, Inc. DE Airgas Data, LLC DE Airgas East, Inc. DE Airgas Great Lakes, Inc. DE Airgas Gulf States, Inc. DE Airgas Intermountain, Inc. CO Airgas International, Inc. VI Airgas Mid America, Inc. DE Airgas Mid South, Inc

# PDF text extraction utility functions

In [74]:
# copied from well gas project wellgas/features/extract_text.py
def extract_pdf_data_from_page(page: fitz.Page) -> dict[str, pd.DataFrame]:
    """Build word, image and page-metadata tables for one PDF page.

    Args:
        page: the PyMuPDF page to parse.

    Returns:
        The tables returned by ``_parse_page_contents`` (``pdf_text`` and
        ``image``) plus a one-row ``page`` table describing the page itself.
        Every non-empty content table gains a ``page_num`` column.
    """
    contents = _parse_page_contents(page)
    image_df = contents["image"]
    has_images = not image_df.empty
    has_text = not contents["pdf_text"].empty

    rect = page.rect
    page_width = rect[2] - rect[0]
    page_height = rect[3] - rect[1]

    # Summed area of all image bounding boxes (boxes that overlap are counted
    # once per box).
    if has_images:
        box_widths = image_df["bottom_right_x_pdf"] - image_df["top_left_x_pdf"]
        box_heights = image_df["bottom_right_y_pdf"] - image_df["top_left_y_pdf"]
        img_area = (box_widths * box_heights).sum()
    else:
        img_area = 0
    total_area = page_width * page_height

    page_values = {
        "rotation_degrees": page.rotation,
        "origin_x_pdf_coord": rect[0],
        "origin_y_pdf_coord": rect[1],
        "width_pdf_coord": page_width,
        "height_pdf_coord": page_height,
        "has_images": has_images,
        "has_text": has_text,
        "page_num": page.number,
        "image_area_frac": np.float32(img_area / total_area),
    }
    page_schema = {
        "rotation_degrees": np.int16,
        "origin_x_pdf_coord": np.float32,
        "origin_y_pdf_coord": np.float32,
        "width_pdf_coord": np.float32,
        "height_pdf_coord": np.float32,
        "has_images": "boolean",
        "has_text": "boolean",
        "page_num": np.int16,
        "image_area_frac": np.float32,
    }
    # One row: each value is wrapped in a single-element list.
    meta_df = pd.DataFrame(
        {column: [value] for column, value in page_values.items()}
    ).astype(page_schema)

    # Tag content rows with their page so tables from several pages can be
    # combined later; empty tables are left without the column.
    for table in contents.values():
        if not table.empty:
            table["page_num"] = np.int16(page.number)
    return {**contents, "page": meta_df}


def _parse_page_contents(page: fitz.Page) -> dict[str, pd.DataFrame]:
    """Split a page into a word table and an image table via fitz.TextPage.

    Text is taken word-by-word from ``extractWORDS``; the block dictionary from
    ``extractDICT`` is only used to find image blocks.

    Args:
        page: the PyMuPDF page to parse.

    Returns:
        ``{"pdf_text": <one row per word>, "image": <one row per image>}``.
        Either table is an empty DataFrame when nothing of that kind is found.

    Raises:
        ValueError: if a block is neither text (type 0) nor image (type 1).
    """
    textpage = page.get_textpage(flags=fitz.TEXTFLAGS_DICT)
    block_dict = textpage.extractDICT()
    word_tuples = textpage.extractWORDS()

    image_frames = []
    for block in block_dict["blocks"]:
        block_type = block["type"]
        if block_type == 1:
            image_frames.append(_parse_image_block(block))
        elif block_type != 0:
            raise ValueError(f"Unknown block type: {block['type']}")
        # type 0 (text) blocks are ignored here; text is parsed per word below

    word_frames = [
        frame for frame in map(_parse_word_block, word_tuples) if not frame.empty
    ]

    if word_frames:
        text_df = pd.concat(word_frames, axis=0, ignore_index=True)
    else:
        text_df = pd.DataFrame()
    if image_frames:
        image_df = pd.concat(image_frames, axis=0, ignore_index=True)
    else:
        image_df = pd.DataFrame()

    return {"pdf_text": text_df, "image": image_df}


def _parse_image_block(img_block: dict[str, Any]) -> pd.DataFrame:
    """Parse an image block from a fitz.TextPage.extractDICT() output."""
    top_left_x_pdf, top_left_y_pdf, bottom_right_x_pdf, bottom_right_y_pdf = img_block[
        "bbox"
    ]
    dpi = min(
        img_block["xres"], img_block["yres"]
    )  # should be equal; min() just in case
    out = pd.DataFrame(
        {
            "img_num": [img_block["number"]],
            "dpi": [dpi],
            "top_left_x_pdf": [top_left_x_pdf],
            "top_left_y_pdf": [top_left_y_pdf],
            "bottom_right_x_pdf": [bottom_right_x_pdf],
            "bottom_right_y_pdf": [bottom_right_y_pdf],
        }
    ).astype(
        {
            "img_num": np.int16,
            "dpi": np.int16,
            "top_left_x_pdf": np.float32,
            "top_left_y_pdf": np.float32,
            "bottom_right_x_pdf": np.float32,
            "bottom_right_y_pdf": np.float32,
        }
    )
    return out

def _parse_word_block(word_block: tuple) -> pd.DataFrame:
    """Parse a word block from a fitz.TextPage.extractWORDS() output."""
    out = {
        "top_left_x_pdf": [word_block[0]],
        "top_left_y_pdf": [word_block[1]],
        "bottom_right_x_pdf": [word_block[2]],
        "bottom_right_y_pdf": [word_block[3]],
        "text": [word_block[4]],
        "block_num": [word_block[5]],
        "line_num": [word_block[6]],
        "word_num": [word_block[7]]
    }
    out = pd.DataFrame(out).astype(
        {
            "block_num": np.int16,
            "line_num": np.int16,
            "word_num": np.int16,
            "text": "string",
            "top_left_x_pdf": np.float32,
            "top_left_y_pdf": np.float32,
            "bottom_right_x_pdf": np.float32,
            "bottom_right_y_pdf": np.float32,
        }
    )
    return out

def _frac_normal_ascii(text: Union[str, bytes]) -> float:
    """Fraction of characters that are normal ASCII characters."""
    # normal characters, from space to tilde, plus whitespace
    # see https://www.asciitable.com/
    sum_ = 0
    if isinstance(text, bytes):
        text = text.decode("utf-8")
    for char in text:
        if (32 <= ord(char) <= 126) or char in "\t\n":
            sum_ += 1
    return sum_ / len(text)


In [75]:

def _render_page(
    pg: fitz.Page, dpi=150, clip: Optional[fitz.Rect] = None
) -> Image.Image:
    """Rasterize a PDF page (or a clipped region of it) into a PIL image.

    Args:
        pg (fitz.Page): the page to render.
        dpi (int, optional): output resolution in pixels per inch. Defaults to
            150; tesseract recommends 300, but PaddleOCR seems to do fine with
            half that.
        clip (Optional[fitz.Rect], optional): region to render, given in PDF
            coordinates. Defaults to None, meaning the whole page.

    Returns:
        Image.Image: the rendered page as a PIL.Image object.
    """
    pixmap: fitz.Pixmap = pg.get_pixmap(dpi=dpi, clip=clip)  # type: ignore
    return _pil_img_from_pixmap(pixmap)


def _pil_img_from_pixmap(pix: fitz.Pixmap) -> Image.Image:
    """Build a PIL.Image from the raw samples of a pyMuPDF Pixmap.

    pyMuPDF (aka fitz) can save a Pixmap through PIL but exposes no function
    that returns the PIL object itself, so the mode selection below is adapted
    from ``Pixmap.pil_save`` in the installed fitz package (with "self"
    replaced by "pix").

    Args:
        pix (fitz.Pixmap): a rendered Pixmap

    Returns:
        Image: a PIL.Image object
    """
    # (mode without alpha, mode with alpha), keyed by the colorspace's
    # component count
    modes_by_components = {1: ("L", "LA"), 3: ("RGB", "RGBA")}

    colorspace = pix.colorspace
    if colorspace is None:
        mode = "L"
    else:
        candidates = modes_by_components.get(colorspace.n)
        if candidates is None:
            # any other component count is treated as CMYK
            mode = "CMYK"
        elif pix.alpha == 0:
            mode = candidates[0]
        else:
            mode = candidates[1]

    size = (pix.width, pix.height)
    return Image.frombytes(mode, size, pix.samples)

In [76]:
PDF_POINTS_PER_INCH = 72  # I believe this is standard for all PDFs

def pil_to_cv2(image: Image.Image) -> np.ndarray:  # noqa: C901
    """Convert a PIL Image to an OpenCV image (numpy uint8 array).

    Colour images come back in OpenCV's BGR / BGRA channel order. Adapted from
    https://gist.github.com/panzi/1ceac1cb30bb6b3450aa5227c02eedd3 ; it covers
    the common PIL modes but is not exhaustive.

    Raises:
        ValueError: if the image mode is not one of the handled modes.
    """
    mode = image.mode

    # Single-channel modes need no colour conversion.
    if mode in ("1", "L"):
        pixels = np.array(image, dtype=np.uint8)
        if mode == "1":
            pixels *= 255
        return pixels

    # Modes that are first converted by PIL to RGBA, then reordered to BGRA.
    if mode in ("LA", "La", "PA", "Pa"):
        rgba = np.array(image.convert("RGBA"), dtype=np.uint8)
        return cv2.cvtColor(rgba, cv2.COLOR_RGBA2BGRA)

    # Modes that are first converted by PIL to RGB, then reordered to BGR.
    if mode in ("P", "CMYK"):
        rgb = np.array(image.convert("RGB"), dtype=np.uint8)
        return cv2.cvtColor(rgb, cv2.COLOR_RGB2BGR)

    # Remaining modes are handed to OpenCV as-is with a mode-specific code.
    if mode == "RGB":
        conversion = cv2.COLOR_RGB2BGR
    elif mode == "RGBA":
        conversion = cv2.COLOR_RGBA2BGRA
    elif mode == "LAB":
        conversion = cv2.COLOR_LAB2BGR
    elif mode == "HSV":
        conversion = cv2.COLOR_HSV2BGR
    elif mode == "YCbCr":
        # XXX: not sure if YCbCr == YCrCb
        conversion = cv2.COLOR_YCrCb2BGR
    else:
        raise ValueError(f"unhandled image color mode: {mode}")

    return cv2.cvtColor(np.array(image, dtype=np.uint8), conversion)


def cv2_to_pil(img: np.ndarray) -> Image.Image:
    """Create a PIL Image from a numpy pixel array.

    A 2-D array is treated as single-channel (grayscale) and passed through
    unchanged. Anything else is assumed to be BGR and is reordered to RGB;
    no other channel layouts are handled for now.
    """
    is_grayscale = len(img.shape) == 2
    pixels = img if is_grayscale else cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    return Image.fromarray(pixels)


def display_img_array(img: np.ndarray, figsize=(5, 5), **kwargs):
    """Plot an image array in a jupyter session and return the imshow result.

    A 2-D array is drawn as grayscale on a fixed 0-255 scale; any other array
    is assumed to be BGR and is reordered to RGB before plotting. Extra keyword
    arguments are forwarded to ``plt.imshow``.
    """
    # NOTE(review): `plt` is not imported in the visible import cell; this
    # presumably expects `import matplotlib.pyplot as plt` — confirm and add it
    # to the imports before calling this helper.
    plt.figure(figsize=figsize)
    if len(img.shape) != 2:  # colour
        rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        return plt.imshow(rgb, **kwargs)
    # grayscale
    return plt.imshow(img, cmap="gray", vmin=0, vmax=255, **kwargs)


def overlay_bboxes(
    img: np.ndarray, bboxes: np.ndarray, color=(255, 0, 0)
) -> np.ndarray:
    """Draw N x 4 bounding boxes (x0, y0, x1, y1) on a copy of an image.

    The input image is not modified. Box coordinates are rounded to the nearest
    integer before drawing, and each box is outlined with a 1-pixel line in
    ``color``.
    """
    canvas = img.copy()
    # round then cast, in case the coordinates arrive as floats
    pixel_boxes = np.round(bboxes, 0).astype(np.int32)
    for x0, y0, x1, y1 in pixel_boxes:
        cv2.rectangle(canvas, (x0, y0), (x1, y1), color=color, thickness=1)
    return canvas


def pdf_coords_to_pixel_coords(coords: np.ndarray, dpi: int) -> np.ndarray:
    """Scale PDF coordinates to pixel coordinates at the given dpi.

    No origin offset is applied: for arbitrary PDFs the page origin (in PDF
    coordinates) would have to be subtracted first, but the PDFs used here are
    generated by this project with their origin at (0, 0).
    """
    scaled_by_dpi = coords * dpi
    return scaled_by_dpi / PDF_POINTS_PER_INCH

# Read in one doc and create a JSON

In [98]:
# UPDATE THIS
pdf_filename = "wisconsin_electric.pdf"

In [99]:
src_path = Path(pdf_filename)
assert src_path.exists()

In [100]:
# from file
doc = fitz.Document(str(src_path))
doc.is_pdf

True

In [101]:
# from bytes
# Alternative to opening by path: read the PDF into memory and hand PyMuPDF an
# in-memory stream. `filetype` is passed because a stream carries no file
# extension.
from io import BytesIO

_bytes = src_path.read_bytes()
pdf_stream = BytesIO(_bytes)
doc = fitz.open(stream=pdf_stream, filetype="pdf")
doc.is_pdf

True

### Extract Text Bboxes

In [102]:
pg = doc[0]
extracted = extract_pdf_data_from_page(pg)
extracted.keys()

dict_keys(['pdf_text', 'image', 'page'])

In [103]:
txt = extracted['pdf_text']
img_info = extracted['image']
pg_meta = extracted['page']
txt.shape, img_info.shape, pg_meta.shape

((106, 9), (0, 0), (1, 9))

In [104]:
txt

Unnamed: 0,top_left_x_pdf,top_left_y_pdf,bottom_right_x_pdf,bottom_right_y_pdf,text,block_num,line_num,word_num,page_num
0,504.791168,83.474442,541.465210,98.621445,Exhibit,0,0,0,0
1,544.523193,83.474442,565.929260,98.621445,21.1,0,0,1,0
2,184.882263,127.074478,248.440277,142.221481,WISCONSIN,1,0,0,0
3,251.498276,127.074478,306.498260,142.221481,ELECTRIC,1,0,1,0
4,309.556274,127.074478,351.114288,142.221481,POWER,1,0,2,0
...,...,...,...,...,...,...,...,...,...
101,167.466446,432.646484,212.476440,446.386475,subsidiary,10,2,5,0
102,215.256439,432.646484,223.596436,446.386475,of,10,2,6,0
103,226.376434,432.646484,249.706451,446.386475,WEC,10,2,7,0
104,252.486450,432.646484,284.166443,446.386475,Energy,10,2,8,0


In [105]:
full_pg_img = _render_page(pg)

In [103]:
image_filename = "wisconsin_electric.jpg"

In [107]:
full_pg_img.save(image_filename)

## Define page variables and JSON dict

In [35]:
# Pixel dimensions of the rendered page image. PIL's Image.size is
# (width, height), i.e. the same numbers as shape[1] / shape[0] of the cv2
# array, without converting the whole image to an array twice.
original_width, original_height = full_pg_img.size

In [37]:
# Scale factors that turn a PDF x / y coordinate into a percentage (0-100) of
# the page width / height; get_bbox_dicts multiplies every bbox value by them.
# NOTE(review): the page origin is not subtracted before scaling, so this
# assumes the page rectangle starts at (0, 0) — confirm for PDFs not generated
# by this project.
x_norm = 100/pg_meta.width_pdf_coord.iloc[0]
y_norm = 100/pg_meta.height_pdf_coord.iloc[0]
x_norm, y_norm

(0.16798942273629794, 0.11878039560889525)

In [104]:
annotation_json = {
    "data": {
        "ocr": f"gs://labeled-ex21-filings/{image_filename}"  # how do we get the image name?
    },
    "annotations": [],
    "predictions": [{"model_version": "v1.0", "result": []}],
}

## Create a bounding box result entry for each word

In [159]:
def get_bbox_dicts(
    bbox: pd.Series,
    ind,
    *,
    x_scale: Optional[float] = None,
    y_scale: Optional[float] = None,
    img_width: Optional[int] = None,
    img_height: Optional[int] = None,
) -> List[Dict]:
    """Build the two prediction entries (rectangle + transcription) for one word.

    Both entries share the same ``id`` and the same region geometry; the
    second one additionally carries the word text.

    Args:
        bbox: one row of the word table; the fields read are the four
            ``*_pdf`` bounding-box coordinates and ``text``.
        ind: identifier appended to ``"bbox_"`` to form the region id.
        x_scale: factor converting PDF x values to percent of page width.
            Defaults to the notebook-level ``x_norm``.
        y_scale: factor converting PDF y values to percent of page height.
            Defaults to the notebook-level ``y_norm``.
        img_width: pixel width of the rendered page image. Defaults to the
            notebook-level ``original_width``.
        img_height: pixel height of the rendered page image. Defaults to the
            notebook-level ``original_height``.

    Returns:
        ``[box_dict, word_dict]``: the ``rectangle`` entry followed by the
        ``textarea`` entry.
    """
    # Any value not passed explicitly falls back to the notebook global of the
    # same meaning, so existing two-argument calls keep working.
    x_scale = x_norm if x_scale is None else x_scale
    y_scale = y_norm if y_scale is None else y_scale
    img_width = original_width if img_width is None else img_width
    img_height = original_height if img_height is None else img_height

    # Cast to built-in float: NumPy scalars such as np.float32 are rejected by
    # json.dump, and the scale factors are derived from float32 columns.
    region = {
        "x": float(bbox["top_left_x_pdf"] * x_scale),
        "y": float(bbox["top_left_y_pdf"] * y_scale),
        "width": float(
            (bbox["bottom_right_x_pdf"] - bbox["top_left_x_pdf"]) * x_scale
        ),
        "height": float(
            (bbox["bottom_right_y_pdf"] - bbox["top_left_y_pdf"]) * y_scale
        ),
        "rotation": 0,
    }
    image_info = {
        "original_width": img_width,
        "original_height": img_height,
        "image_rotation": 0,
    }
    bbox_id = f"bbox_{ind}"

    box_dict = {
        **image_info,
        "value": dict(region),
        "id": bbox_id,
        "from_name": "bbox",
        "to_name": "image",
        "type": "rectangle",
        "origin": "manual",
    }
    word_dict = {
        **image_info,
        "value": {**region, "text": [bbox["text"]]},
        "id": bbox_id,
        "from_name": "transcription",
        "to_name": "image",
        "type": "textarea",
        "origin": "manual",
    }
    return [box_dict, word_dict]

In [122]:
# Flat list of prediction entries: two per word (rectangle + transcription),
# with region ids derived from the word table's index labels.
result = []
# change to using an apply?
for i, row in txt.iterrows():
    result.extend(get_bbox_dicts(row, i))

In [123]:
annotation_json["predictions"][0]["result"] = result

In [124]:
json_filename = "wisconsin_electric_full.json"

In [125]:
with open(json_filename, 'w') as fp:
    json.dump(annotation_json, fp)

# Create JSONs and images for entire training set

In [89]:
image_dir = cache_dir / "images"
image_dir.mkdir(parents=True, exist_ok=True)

In [90]:
json_dir = cache_dir / "jsons"
json_dir.mkdir(parents=True, exist_ok=True)

In [118]:
# For every PDF in pdfs_dir: parse the first page, save a rendered PNG of it in
# image_dir, and write a Label Studio task JSON with one pre-annotated region
# pair per word in json_dir.
for pdf_filename in os.listdir(pdfs_dir):
    # Only PDFs are valid input; skip any other entry in the directory
    # (e.g. hidden OS metadata files).
    if not pdf_filename.lower().endswith(".pdf"):
        continue
    print(f"Creating JSON for {pdf_filename}")
    src_path = pdfs_dir / pdf_filename
    assert src_path.exists()
    filename_no_ext = pdf_filename.split(".")[0]
    image_filename = filename_no_ext + ".png"

    # from file
    doc = fitz.Document(str(src_path))
    try:
        assert doc.is_pdf
        # Only the first page is processed.
        # what happens when there are multiple pages?
        # might need to use util function
        pg = doc[0]
        extracted = extract_pdf_data_from_page(pg)
        # render an image of the page
        full_pg_img = _render_page(pg)
    finally:
        # Release the file handle even if parsing or rendering fails; the
        # tables and the rendered image no longer depend on the open document.
        doc.close()
    txt = extracted['pdf_text']
    img_info = extracted['image']
    pg_meta = extracted['page']
    full_pg_img.save(image_dir / image_filename)

    # fill in some basic variables
    # get_bbox_dicts reads these four notebook-level names, so they must be
    # rebound for every document before it is called.
    # PIL's Image.size is (width, height) in pixels.
    original_width, original_height = full_pg_img.size
    x_norm = 100/pg_meta.width_pdf_coord.iloc[0]
    y_norm = 100/pg_meta.height_pdf_coord.iloc[0]

    result = []
    # change to using an apply?
    for i, row in txt.iterrows():
        result += get_bbox_dicts(row, i)

    # base annotation JSON template, filled with this document's predictions
    annotation_json = {
        "id": f"{filename_no_ext}",
        "data": {
            "ocr": f"gs://labeled-ex21-filings/unlabeled/{image_filename}"
        },
        "annotations": [],
        "predictions": [{"model_version": "v1.0", "result": result}],
    }
    json_filename = json_dir / f"{filename_no_ext}.json"
    with open(json_filename, 'w') as fp:
        json.dump(annotation_json, fp)

Creating JSON for 107815-2017q1-107815-0000107815-17-000106.pdf
Creating JSON for 1158059-2005q2-1158059-0000893220-05-001394.pdf
Creating JSON for 92487-2021q1-92487-0000004904-21-000010.pdf
Creating JSON for 38725-2017q1-38725-0000038725-17-000042.pdf
Creating JSON for 18647-2008q1-18647-0001169232-08-000603.pdf
Creating JSON for 916529-2003q1-916529-0001144204-03-001333.pdf
Creating JSON for 1170154-2011q1-1170154-0001193125-11-062378.pdf
Creating JSON for 59527-2020q1-59527-0000059527-20-000007.pdf
Creating JSON for 1012493-1999q4-1012493-0000922358-99-000021.pdf
Creating JSON for 4904-2009q1-4904-0000004904-09-000040.pdf
Creating JSON for 1223037-2009q4-1223037-0001193125-09-249998.pdf
Creating JSON for 1166847-2012q1-1166847-0001117768-12-000118.pdf
Creating JSON for 922237-1999q4-922237-0000950005-99-000915.pdf
Creating JSON for 60549-1998q1-60549-0001047469-98-012481.pdf
Creating JSON for 922358-2006q4-922358-0000950134-06-018966.pdf
Creating JSON for 100122-2003q1-100122-00009

# Format LS output JSON into pandas dataframe

In [212]:
# Path of the Label Studio output JSON to load.
json_file_path = "test.json"

# Parse the file straight from the open handle.
with open(json_file_path, 'r') as j:
    doc_dict = json.load(j)

In [233]:
# do something like get the filename of the result, then read in the data for that doc
doc_dict["task"]["data"]["ocr"]

'gs://labeled-ex21-filings/unlabeled/107815-2017q1-107815-0000107815-17-000106.png'

In [122]:
pdf_filename = "107815-2017q1-107815-0000107815-17-000106.pdf"

In [267]:
# Re-extract the word table and page geometry for the labeled document so the
# exported labels can be joined back onto the words.
src_path = pdfs_dir / pdf_filename
assert src_path.exists()
# from file
doc = fitz.Document(str(src_path))
assert doc.is_pdf
# only the first page is parsed
pg = doc[0]
extracted = extract_pdf_data_from_page(pg)
txt = extracted['pdf_text']
img_info = extracted['image']
pg_meta = extracted['page']
full_pg_img = _render_page(pg)
# PIL's Image.size is (width, height) in pixels: the same numbers as shape[1]
# / shape[0] of the cv2 array, without converting the image to an array twice.
original_width, original_height = full_pg_img.size
x_norm = 100/pg_meta.width_pdf_coord.iloc[0]
y_norm = 100/pg_meta.height_pdf_coord.iloc[0]

In [263]:
# merge the labels onto the txt dataframe
# do this by creating a dataframe with a row for each word
# Each result item becomes a one-row frame indexed by the integer suffix of its
# region id ("bbox_<n>" -> n). The frames are collected in a list and
# concatenated once, instead of re-copying a growing frame on every iteration.
region_frames = []
for item in doc_dict["result"]:
    value = item["value"]
    ind = int(item["id"].split("_")[-1])
    region_frames.append(pd.DataFrame(value, index=[ind]))
# pd.concat raises on an empty list, so fall back to an empty frame.
doc_df = pd.concat(region_frames) if region_frames else pd.DataFrame()
# Several items can share one id; keep the first non-null value of each column
# per id so every word ends up on a single row.
doc_df = doc_df.groupby(level=0).first()

In [268]:
# Attach the labels to the word table. The column-wise concat aligns rows on
# the index: doc_df is indexed by the integer suffix of each region id, which
# get_bbox_dicts built from the word table's index label. Words without a
# label get a missing value in "labels".
# NOTE(review): this only lines up if txt was extracted from the same PDF page
# that the exported task was generated from — confirm pdf_filename matches the
# task's "ocr" path.
output_df = pd.concat([txt, doc_df[["labels"]]], axis=1)

In [271]:
output_df[output_df.labels.notna()]

Unnamed: 0,top_left_x_pdf,top_left_y_pdf,bottom_right_x_pdf,bottom_right_y_pdf,text,block_num,line_num,word_num,page_num,labels
62,31.34646,349.576477,51.346462,363.316467,ATC,7,0,0,0,B-Subsidiary
63,54.126465,349.576477,115.266464,363.316467,"Management,",7,0,1,0,I-Subsidiary
64,118.046463,349.576477,134.166458,363.316467,Inc.,7,0,2,0,I-Subsidiary
66,369.109741,349.576477,414.669739,363.316467,Wisconsin,7,2,0,0,B-Loc
67,493.422943,349.576477,527.332947,363.316467,26.24%,7,4,0,0,B-Own_Per
68,31.34646,365.576477,73.576462,379.316467,American,8,0,0,0,B-Subsidiary
69,76.356461,365.576477,135.806473,379.316467,Transmission,8,0,1,0,I-Subsidiary
70,138.586472,365.576477,181.376465,379.316467,Company,8,0,2,0,I-Subsidiary
71,184.156464,365.576477,202.49646,379.316467,LLC,8,0,3,0,I-Subsidiary
73,369.109741,365.576477,414.669739,379.316467,Wisconsin,8,2,0,0,B-Loc


In [272]:
# See notebook 5 for the remaining steps that convert this output into the format used to fine-tune LayoutLM.