# Create a JSON for a doc to import into Label Studio

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import json
import os
from pathlib import Path

import fitz  # PyMuPDF
import numpy as np
import pandas as pd

from mozilla_sec_eia.utils.cloud import GCSArchive
from mozilla_sec_eia.utils.pdf import (
    combine_doc_pages,
    extract_pdf_data_from_page,
    get_label_studio_bbox_dicts,
    pil_to_cv2,
    render_page,
)

In [None]:
archive = GCSArchive()
md = archive.get_metadata()

In [None]:
train_df = pd.read_csv("../labeled_data_tracking.csv")

In [None]:
keywords = ["utility", "electric", "gas", "energy", "transmission"]
reg = "|".join(keywords)

In [None]:
# Sample 30 filings from energy-related companies that have an Exhibit 21.
# na=False makes rows with a missing company name count as non-matches;
# otherwise NaN ends up in the boolean mask and pandas refuses to index with it.
is_energy_company = md["company_name"].str.lower().str.contains(reg, regex=True, na=False)
has_ex21 = md["exhibit_21_version"].notna()
new_energy_docs = md[is_energy_company & has_ex21].sample(30)

In [None]:
new_docs = md[~md.exhibit_21_version.isnull()].sample(25)

In [None]:
# Drop candidates whose CIK is already in the tracking sheet. The membership
# test has to compare CIK with CIK: filenames never appear in the "CIK"
# column, so testing `filename` against it filtered nothing out.
new_energy_docs = new_energy_docs[~new_energy_docs["cik"].isin(train_df["CIK"])]

In [None]:
# Keep only documents that are neither tracked already nor part of the
# energy sample. Both checks compare CIKs with CIKs; comparing `filename`
# against CIK values never matches, so nothing was being excluded.
already_tracked = new_docs["cik"].isin(train_df["CIK"])
in_energy_sample = new_docs["cik"].isin(new_energy_docs["cik"])
new_docs = new_docs[~already_tracked & ~in_energy_sample]

In [None]:
len(new_docs)

In [None]:
len(new_energy_docs)

In [None]:
new_docs = pd.concat([new_docs, new_energy_docs])

In [None]:
new_docs[["cik", "filename"]].to_csv("../labeled_data_tracking.csv", mode="a", index=False, header=False)

In [None]:
train_df = pd.read_csv("../labeled_data_tracking.csv")

In [None]:
train_df[train_df.CIK.duplicated(keep=False)].sort_values(by="CIK")

In [None]:
train_df = train_df.merge(md, left_on=["Filename", "CIK"], right_on=["filename", "cik"], how="left")

In [None]:
cache_dir = Path("../sec10k_filings")

In [None]:
unlabeled = train_df[train_df["Initials"].isnull()]

In [None]:
train_filings = archive.get_filings(unlabeled, cache_directory=cache_dir / "htmls")

In [None]:
pdfs_dir = cache_dir / "new_pdfs"
pdfs_dir.mkdir(parents=True, exist_ok=True)

In [None]:
# Save the Exhibit 21 of every downloaded filing as a PDF under pdfs_dir
for sec_filing in train_filings:
    full_filename = sec_filing.filename
    # first metadata row in train_df that matches this filing's filename
    filing_metadata = train_df.loc[train_df["filename"] == full_filename].iloc[0]
    pdf_filename = archive.get_local_filename(
        cache_directory=pdfs_dir,
        filing=filing_metadata,
        extension=".pdf",
    )
    with open(pdf_filename, "wb") as file:
        sec_filing.ex_21.save_as_pdf(file)

# Read in one doc and create a JSON

In [None]:
# UPDATE THIS
# pdf_filename = "wisconsin_electric.pdf"
# pdf_filename = "../sec10k_filings/pdfs/922358-2006q4-922358-0000950134-06-018966.pdf"
pdf_filename = "../sec10k_filings/pdfs/4904-2009q1-4904-0000004904-09-000040.pdf"
# pdf_filename = "../sec10k_filings/pdfs/3146-2006q1-3146-0001193125-06-055140.pdf"

In [None]:
src_path = Path(pdf_filename)
assert src_path.exists()

In [None]:
# from file
doc = fitz.Document(str(src_path))
doc.is_pdf

In [None]:
# from bytes
_bytes = src_path.read_bytes()
from io import BytesIO

doc = fitz.open(stream=BytesIO(_bytes), filetype="pdf")
doc.is_pdf

### Extract Text Bboxes

In [None]:
len(doc)

In [None]:
# pg = doc[1]
pg = combine_doc_pages(doc)
extracted = extract_pdf_data_from_page(pg)
extracted.keys()

In [None]:
txt = extracted["pdf_text"]
img_info = extracted["image"]
pg_meta = extracted["page"]
txt.shape, img_info.shape, pg_meta.shape

In [None]:
full_pg_img = render_page(pg)

In [None]:
full_pg_img

In [None]:
image_filename = "wisconsin_electric.png"

In [None]:
full_pg_img.save(image_filename)

## Define page variables and JSON dict

In [None]:
# Convert the rendered page once and read both dimensions from the array
# shape (axis 0 is the height, axis 1 the width).
original_height, original_width = pil_to_cv2(full_pg_img).shape[:2]

In [None]:
x_norm = 100/pg_meta.width_pdf_coord.iloc[0]
y_norm = 100/pg_meta.height_pdf_coord.iloc[0]
x_norm, y_norm

In [None]:
image_filename = "922358-2006q4-922358-0000950134-06-018966.png"

In [None]:
annotation_json = {
    "data": {
        "ocr": f"gs://labeled-ex21-filings/{image_filename}"
    },
    "annotations": [],
    "predictions": [{"model_version": "v1.0", "result": []}],
}

## Create a bounding box result entry for each word

In [None]:
# Gather the Label Studio result dicts produced for each row of extracted text
result = []
for i, row in txt.iterrows():
    row_entries = get_label_studio_bbox_dicts(row, i, x_norm, y_norm, original_width, original_height)
    result.extend(row_entries)

In [None]:
annotation_json["predictions"][0]["result"] = result

In [None]:
json_filename = "wisconsin_electric_full.json"

In [None]:
with open(json_filename, "w") as fp:
    json.dump(annotation_json, fp)

# Create JSONs and images for entire training set

In [None]:
image_dir = cache_dir / "images"
image_dir.mkdir(parents=True, exist_ok=True)

In [None]:
json_dir = cache_dir / "jsons"
json_dir.mkdir(parents=True, exist_ok=True)

In [None]:
# For every cached PDF: render the first page to a PNG in image_dir and write
# a Label Studio task JSON with one pre-annotation entry per extracted word
# to json_dir.
for pdf_filename in os.listdir(pdfs_dir):
    src_path = pdfs_dir / pdf_filename
    # only PDFs are converted; ignore anything else in the cache directory
    if src_path.suffix != ".pdf":
        continue
    print(f"Creating JSON for {pdf_filename}")
    filename_no_ext = src_path.stem
    image_filename = filename_no_ext + ".png"
    # Close each document as soon as its text and image have been extracted so
    # open file handles do not pile up over the loop.
    with fitz.Document(str(src_path)) as doc:
        assert doc.is_pdf
        # NOTE: only the first page is used. Multi-page documents may need
        # combine_doc_pages, as in the single-document example.
        pg = doc[0]
        extracted = extract_pdf_data_from_page(pg)
        # render an image of the page
        full_pg_img = render_page(pg)
    txt = extracted["pdf_text"]
    pg_meta = extracted["page"]
    full_pg_img.save(image_dir / image_filename)
    # fill in some basic variables; convert the image once and read both
    # dimensions from its shape (axis 0 is height, axis 1 is width)
    original_height, original_width = pil_to_cv2(full_pg_img).shape[:2]
    # factors that turn PDF coordinates into percentages of the page size
    x_norm = 100/pg_meta.width_pdf_coord.iloc[0]
    y_norm = 100/pg_meta.height_pdf_coord.iloc[0]
    # base annotation JSON template
    annotation_json = {
        "id": f"{filename_no_ext}",
        "data": {
            "ocr": f"gs://labeled-ex21-filings/unlabeled/{image_filename}"
        },
        "annotations": [],
        "predictions": [{"model_version": "v1.0", "result": []}],
    }
    result = []
    for i, row in txt.iterrows():
        result += get_label_studio_bbox_dicts(row, i, x_norm, y_norm, original_width, original_height)

    annotation_json["predictions"][0]["result"] = result
    json_filename = json_dir / (filename_no_ext + ".json")
    with open(json_filename, "w") as fp:
        json.dump(annotation_json, fp)

# Format LS output JSON into pandas dataframe

In [None]:
labeled_json_dir = Path("../sec10k_filings/labeled_jsons")

In [None]:
def is_cik_in_training_data(labeled_json_filename):
    """Return whether the CIK encoded in a filename is present in ``train_df``.

    The CIK is taken from the part of the base name (everything after the
    last ``/``) that precedes the first ``-``, and is parsed as an integer
    because the ``CIK`` column of ``train_df`` currently holds ints.
    """
    base_name = labeled_json_filename.rsplit("/", 1)[-1]
    cik_text, _, _ = base_name.partition("-")
    cik = int(cik_text)
    return cik in train_df.CIK.unique()

In [None]:
# Build one long dataframe with a row per extracted word of every labeled
# document (bboxes scaled to 0-1000, plus the NER tag assigned in Label
# Studio), and keep each document's rendered page image in image_dict.
labeled_frames = []
image_dict = {}
for json_filename in os.listdir(labeled_json_dir):
    json_file_path = labeled_json_dir / json_filename
    # read the export, then release the file handle before the PDF work
    with open(json_file_path) as j:
        doc_dict = json.load(j)
    # base name of the task's image, without its extension
    filename = doc_dict["task"]["data"]["ocr"].split("/")[-1].split(".")[0]
    print(filename)
    if not is_cik_in_training_data(filename):
        continue
    pdf_filename = filename + ".pdf"
    src_path = pdfs_dir / pdf_filename
    assert src_path.exists()
    # Close each document once its text and image are extracted so open file
    # handles do not pile up over the loop.
    with fitz.Document(str(src_path)) as doc:
        assert doc.is_pdf
        pg = doc[0]
        extracted = extract_pdf_data_from_page(pg)
        full_pg_img = render_page(pg)
    txt = extracted["pdf_text"]
    pg_meta = extracted["page"]
    # normalize bboxes between 0 and 1000 for Hugging Face
    page_width = pg_meta.width_pdf_coord.iloc[0]
    page_height = pg_meta.height_pdf_coord.iloc[0]
    for bbox_col, page_extent in (
        ("top_left_x_pdf", page_width),
        ("top_left_y_pdf", page_height),
        ("bottom_right_x_pdf", page_width),
        ("bottom_right_y_pdf", page_height),
    ):
        txt[bbox_col] = txt[bbox_col] / page_extent * 1000
    # one single-row frame per Label Studio result item, indexed by the word
    # number encoded at the end of the item id
    item_frames = []
    for item in doc_dict["result"]:
        value = item["value"]
        # sometimes Label Studio will fill in an empty list as a label
        # when there is really no label
        if value.get("labels") == []:
            value = {k: v for k, v in value.items() if k != "labels"}
        ind = int(item["id"].split("_")[-1])
        item_frames.append(pd.DataFrame(value, index=[ind]))
    if item_frames:
        # several items can share a word index; keep the first non-null
        # value of each column per word
        doc_df = pd.concat(item_frames).groupby(level=0).first()
    else:
        doc_df = pd.DataFrame(index=txt.index[:0])
    # a document in which nothing was labeled has no "labels" column at all;
    # reindex yields an all-NaN column instead of raising a KeyError
    doc_df = doc_df.reindex(columns=["labels"])
    txt["id"] = filename
    labeled_frames.append(pd.concat([txt, doc_df], axis=1))
    image_dict[filename] = full_pg_img
# concatenate once at the end rather than growing the frame on every iteration
labeled_df = pd.concat(labeled_frames) if labeled_frames else pd.DataFrame(columns=["id", "labels"])
# words without a label are outside any entity
labeled_df["labels"] = labeled_df["labels"].fillna("O")
labeled_df = labeled_df.rename(columns={"labels": "ner_tag"})

In [None]:
# reorganize columns in labeled_df
non_id_columns = [col for col in labeled_df.columns if col != "id"]
labeled_df = labeled_df.loc[:, ["id"] + non_id_columns]

In [None]:
# sanity check the bboxes
labeled_df.top_left_x_pdf.max(), labeled_df.top_left_y_pdf.max(), labeled_df.bottom_right_x_pdf.max(), labeled_df.bottom_right_y_pdf.max()

In [None]:
# sanity check
labeled_df.top_left_x_pdf.min(), labeled_df.top_left_y_pdf.min(), labeled_df.bottom_right_x_pdf.min(), labeled_df.bottom_right_y_pdf.min()

In [None]:
labeled_df.to_parquet("labeled_df.parquet")

In [None]:
def get_image_dict(labeled_df):
    """Placeholder for rebuilding the id -> page image mapping.

    Currently walks the unique values of the ``id`` column without doing
    anything with them and always returns an empty dict.

    TODO: read in each image from the cached images as PIL and store it
    with the id as key.
    """
    image_dict = {}
    for _doc_id in labeled_df["id"].unique():
        pass
    return image_dict

# Fine-tune LayoutLM on the labeled data

In [None]:
import torch
from datasets import (
    Array2D,
    Array3D,
    ClassLabel,
    Dataset,
    Features,
    Sequence,
    Value,
    load_metric,
)
from transformers import AutoProcessor

In [None]:
bbox_cols = ["top_left_x_pdf", "top_left_y_pdf", "bottom_right_x_pdf", "bottom_right_y_pdf"]

In [None]:
# convert dataframe/dictionary into NER format
# document_annotation_to_ner https://github.com/butlerlabs/docai/blob/main/docai/annotations/ner_utils.py
# complete dataset is a list of dicts, with one dict for each doc
doc_filenames = labeled_df["id"].unique()
# Group the rows once instead of re-running two full groupbys per document.
# sort=False yields the documents in order of first appearance, the same
# order as doc_filenames, and rows keep their original order within a group.
ner_annotations = [
    {
        "id": filename,
        "tokens": doc_rows["text"].tolist(),
        "ner_tags": doc_rows["ner_tag"].tolist(),
        "bboxes": doc_rows[bbox_cols].values.tolist(),
        "image": image_dict[filename],
    }
    for filename, doc_rows in labeled_df.groupby("id", sort=False)
]

In [None]:
len(ner_annotations)

In [None]:
# use the DocAI normalizer or is everything okay as is?

In [None]:
dataset = Dataset.from_list(ner_annotations)

In [None]:
dataset

In [None]:
# NER tag vocabulary and the two lookup tables between tag names and ids
# NOTE(review): there is no "I-Own_Per" tag -- confirm this is intentional
label_list = ["O", "B-Subsidiary", "I-Subsidiary", "B-Loc", "I-Loc", "B-Own_Per"]
id2label = dict(enumerate(label_list))
label2id = {tag: tag_id for tag_id, tag in id2label.items()}
# columns of the raw dataset, dropped again when the dataset is mapped
column_names = dataset.column_names
label_list

In [None]:
class_label = ClassLabel(names=label_list)

In [None]:
dataset = dataset.train_test_split(test_size=0.15)

In [None]:
# processor = LayoutLMv3Processor.from_pretrained("microsoft/layoutlmv3-base-uncased")
# we'll use the Auto API here - it will load LayoutLMv3Processor behind the scenes,
# based on the checkpoint we provide from the hub
processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)

def convert_ner_tags_to_id(ner_tags):
    """Map each NER tag name to its integer id using ``label2id``."""
    tag_ids = []
    for tag_name in ner_tags:
        tag_ids.append(int(label2id[tag_name]))
    return tag_ids

# Puts a batch of the Dataset into its final format for training LayoutLM
def prepare_dataset(annotations):
    """Encode a batch of annotated documents with ``processor``.

    Reads the ``image``, ``tokens``, ``bboxes`` and ``ner_tags`` columns of
    the batch, converts every document's tag names to numeric ids, and
    returns the processor's encoding, truncated and padded to max length.
    """
    label_ids = [convert_ner_tags_to_id(doc_tags) for doc_tags in annotations["ner_tags"]]
    return processor(
        annotations["image"],
        annotations["tokens"],
        boxes=annotations["bboxes"],
        word_labels=label_ids,
        truncation=True,
        padding="max_length",
    )

In [None]:
# Define features for use training the model
features = Features({
    "pixel_values": Array3D(dtype="float32", shape=(3, 224, 224)),
    "input_ids": Sequence(feature=Value(dtype="int64")),
    "attention_mask": Sequence(Value(dtype="int64")),
    "bbox": Array2D(dtype="int64", shape=(512, 4)),
    "labels": Sequence(feature=Value(dtype="int64")),
})

# Prepare our train & eval dataset

train_dataset = dataset["train"].map(
    prepare_dataset,
    batched=True,
    remove_columns=column_names,
    features=features,
)

eval_dataset = dataset["test"].map(
    prepare_dataset,
    batched=True,
    remove_columns=column_names,
    features=features,
)

In [None]:
example = train_dataset[0]
processor.tokenizer.decode(example["input_ids"])

In [None]:
train_dataset.set_format("torch")

In [None]:
example = train_dataset[0]
for k,v in example.items():
    print(k,v.shape)

In [None]:
processor.tokenizer.decode(eval_dataset[0]["input_ids"])

In [None]:
!pip install -q datasets seqeval

In [None]:
metric = load_metric("seqeval")

In [None]:
return_entity_level_metrics = False

def compute_metrics(p):
    """Score token-classification predictions with ``metric``.

    ``p`` unpacks into ``(predictions, labels)``. Predictions are reduced
    with an argmax over axis 2, and positions whose label is -100 (the
    ignored index) are dropped before ids are mapped to tag names through
    ``label_list``.

    Returns the overall precision, recall, f1 and accuracy. When
    ``return_entity_level_metrics`` is true, returns every entry of the
    metric result instead, with nested dicts flattened to ``{key}_{name}``.
    """
    raw_predictions, label_rows = p
    predicted_rows = np.argmax(raw_predictions, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, label_row in zip(predicted_rows, label_rows):
        # keep only the positions that carry a real label
        kept_pairs = [
            (pred_id, label_id)
            for pred_id, label_id in zip(predicted_row, label_row)
            if label_id != -100
        ]
        true_predictions.append([label_list[pred_id] for pred_id, _ in kept_pairs])
        true_labels.append([label_list[label_id] for _, label_id in kept_pairs])

    results = metric.compute(predictions=true_predictions, references=true_labels)
    if not return_entity_level_metrics:
        return {
            "precision": results["overall_precision"],
            "recall": results["overall_recall"],
            "f1": results["overall_f1"],
            "accuracy": results["overall_accuracy"],
        }

    # Unpack nested dictionaries
    final_results = {}
    for key, value in results.items():
        if isinstance(value, dict):
            final_results.update({f"{key}_{n}": v for n, v in value.items()})
        else:
            final_results[key] = value
    return final_results

In [None]:
from transformers import LayoutLMv3ForTokenClassification

model = LayoutLMv3ForTokenClassification.from_pretrained("microsoft/layoutlmv3-base",
                                                         id2label=id2label,
                                                         label2id=label2id)

In [None]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(output_dir="test",
                                  max_steps=1000,
                                  per_device_train_batch_size=2,
                                  per_device_eval_batch_size=2,
                                  learning_rate=1e-5,
                                  evaluation_strategy="steps",
                                  eval_steps=100,
                                  load_best_model_at_end=True,
                                  metric_for_best_model="f1")

In [None]:
from transformers.data.data_collator import default_data_collator

# Initialize our Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=processor,
    data_collator=default_data_collator,
    compute_metrics=compute_metrics,
)

In [None]:
trainer.train()

In [None]:
model_path = "../models/layoutlm_v1_50_labeled_docs"

In [None]:
# use model = LayoutLMv3ForTokenClassification.from_pretrained({path}) to load
trainer.save_model(model_path)

In [None]:
trainer.evaluate()

# Perform Inference on a Test Example

In [None]:
def unnormalize_box(bbox, width, height):
    """Rescale a box from the 0-1000 range to ``width`` x ``height`` units.

    The first and third entries of ``bbox`` are scaled by ``width``, the
    second and fourth by ``height``. Returns the four values as a list.
    """
    extents = (width, height, width, height)
    return [extent * (bbox[i] / 1000) for i, extent in enumerate(extents)]

In [None]:
"""
Some simple utilities for drawing bboxes on images of Exhibit 21 pages
"""
from PIL import ImageDraw, ImageFont

font = ImageFont.load_default()

def iob_to_label(label):
    """Strip the two-character IOB prefix from a tag.

    Returns ``"other"`` when nothing is left after the prefix, as is the
    case for the ``"O"`` tag.
    """
    return label[2:] or "other"

def draw_boxes_on_img(
    preds_or_labels,
    boxes,
    draw,
    image,
    unnormalize = False
):
    """Outline and caption every non-"other" box on ``draw``.

    Args:
        preds_or_labels: IOB tags, one per box.
        boxes: Boxes matching ``preds_or_labels`` one-to-one.
        draw: Drawing object that provides ``rectangle`` and ``text``.
        image: Image being drawn on. Its ``size`` is used to rescale boxes
            when ``unnormalize`` is true.
        unnormalize: When true, boxes are passed through ``unnormalize_box``
            before drawing.

    Raises:
        KeyError: If a tag maps to a label with no entry in the color lookup.
    """
    label_color_lookup = {
        "subsidiary": "green",
        "loc": "red",
        "own_per": "orange",
    }
    for pred_or_label, box in zip(preds_or_labels, boxes):
        label = iob_to_label(pred_or_label).lower()
        if label == "other":
            continue
        if unnormalize:
            # Take the dimensions from the image argument rather than from
            # global `width`/`height`, which may be undefined or belong to a
            # different image.
            width, height = image.size
            box = unnormalize_box(box, width, height)
        color = label_color_lookup[label]
        draw.rectangle(box, outline=color)
        # caption slightly right of and above the box's top-left corner
        draw.text((box[0] + 10, box[1] - 10), text=label, fill=color, font=font)

In [None]:
model = LayoutLMv3ForTokenClassification.from_pretrained(model_path,
                                                         id2label=id2label,
                                                         label2id=label2id)

In [None]:
example = dataset["test"][0]
example.keys()

In [None]:
image = example["image"]
words = example["tokens"]
boxes = example["bboxes"]
ner_tags = convert_ner_tags_to_id(example["ner_tags"])

# truncation=True mirrors prepare_dataset, so the example is encoded the same
# way as the training data.
# NOTE(review): presumably an untruncated long document would exceed the
# model's maximum input length -- confirm.
encoding = processor(image, words, boxes=boxes, word_labels=ner_tags, truncation=True, return_tensors="pt")
for k,v in encoding.items():
    print(k,v.shape)

In [None]:
encoding["input_ids"] = encoding["input_ids"].to(torch.int64)
encoding["attention_mask"] = encoding["attention_mask"].to(torch.int64)
encoding["labels"] = encoding["labels"].to(torch.int64)
encoding["bbox"] = encoding["bbox"].to(torch.int64)

In [None]:
if torch.cuda.is_available():
    encoding.to("cuda")
    model.to("cuda")

Next, we do a forward pass. We use torch.no_grad() as we don't require gradient computation.

In [None]:
with torch.no_grad():
    outputs = model(**encoding)

In [None]:
logits = outputs.logits
predictions = logits.argmax(-1).squeeze().tolist()
labels = encoding.labels.squeeze().tolist()

In [None]:
token_boxes = encoding.bbox.squeeze().tolist()
width, height = image.size

# positions labelled -100 are excluded from predictions, labels and boxes alike
is_labeled = [label != -100 for label in labels]
id2tag = model.config.id2label
true_predictions = [id2tag[pred] for pred, keep in zip(predictions, is_labeled) if keep]
true_labels = [id2tag[label] for label, keep in zip(labels, is_labeled) if keep]
true_boxes = [unnormalize_box(box, width, height) for box, keep in zip(token_boxes, is_labeled) if keep]

In [None]:

"""
Draw predictions
"""
image = example["image"]
image = image.convert("RGB")

draw = ImageDraw.Draw(image)

draw_boxes_on_img(true_predictions, true_boxes, draw, image)
image