In [None]:
%load_ext autoreload
%autoreload 3

In [None]:
from pathlib import Path

import pandas as pd
from transformers import AutoProcessor, LayoutLMv3ForTokenClassification
import torch
import numpy as np
from scipy.stats import mode
 
from mozilla_sec_eia.ex_21.inference import (
    clean_extracted_df,
    create_inference_dataset,
    perform_inference
)
from mozilla_sec_eia.utils.layoutlm import (
    iob_to_label,
    draw_boxes_on_img,
    unnormalize_box
)

# Extract into a table

In [None]:
# Whether the documents come with ground-truth labels; selects the labelled
# code paths further down (generator choice and visualisation branch).
has_labels = False

In [None]:
# IOB tag vocabulary: a tag's position in this list is its integer class id.
label_list = [
    'O',
    'B-Subsidiary', 'I-Subsidiary',
    'B-Loc', 'I-Loc',
    'B-Own_Per', 'I-Own_Per',
]
id2label = dict(enumerate(label_list))
label2id = {tag: idx for idx, tag in id2label.items()}

In [None]:
# load_model function loads a model with mlflow
# NOTE(review): no load_model call is visible in this notebook; the checkpoint at
# model_path is loaded with LayoutLMv3ForTokenClassification.from_pretrained instead.
model_path = Path("../models/layoutlm_v1_50_labeled_docs")

In [None]:
# Load the token-classification checkpoint at model_path and attach the label maps.
model = LayoutLMv3ForTokenClassification.from_pretrained(
    model_path,
    id2label=id2label,
    label2id=label2id,
)

In [None]:
# apply_ocr=False: words and boxes are passed to the processor explicitly below.
processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)

In [None]:
# Directory of PDFs handed to create_inference_dataset and perform_inference.
pdf_dir = Path("../sec10k_filings/pdfs")

In [None]:
# only necessary if using data with labels
# (it is still passed to create_inference_dataset / perform_inference when has_labels is False)
labeled_json_dir = Path("../sec10k_filings/labeled_jsons_v0.1")

In [None]:
# Build the dataset that the visualisation and table-extraction cells index into.
dataset = create_inference_dataset(
    pdfs_dir=pdf_dir,
    labeled_json_dir=labeled_json_dir,
    has_labels=has_labels,
)

In [None]:
# Restrict inference to the first three examples.
dataset_index = list(range(3))

In [None]:
# NOTE: this call is slow. Unverified guess: most of the time goes into checking
# that the PDFs and JSONs are cached.
logit_list, pred_list, output_dfs = perform_inference(
    pdf_dir,
    model,
    processor,
    dataset_index,
    labeled_json_dir,
    has_labels,
)

In [None]:
# Passed as `stride` to the processor when a document overflows max_length.
doc_stride = 128

def convert_ner_tags_to_id(ner_tags):
    """Translate a sequence of IOB tag strings into integer class ids via ``label2id``."""
    tag_ids = []
    for tag in ner_tags:
        tag_ids.append(int(label2id[tag]))
    return tag_ids

def visual_inputs_with_labels():
    """Yield ``(predictions, encoding, image)`` for each entry of ``pred_list``.

    The example's ``ner_tags`` are converted to ids and passed to the processor as
    ``word_labels``, so the returned encoding carries labels. Long documents are
    split into overlapping windows (``max_length=512``, ``stride=doc_stride``).

    Yields:
        tuple: the predictions for the document, the processor encoding, and the
        example's image.
    """
    # assumes pred_list[i] belongs to dataset[i] — TODO confirm against
    # perform_inference / dataset_index
    for i, predictions in enumerate(pred_list):
        example = dataset[i]
        image = example["image"]
        words = example["tokens"]
        boxes = example["bboxes"]
        ner_tags = convert_ner_tags_to_id(example["ner_tags"])
        encoding = processor(
            image,
            words,
            boxes=boxes,
            word_labels=ner_tags,
            return_tensors="pt",
            truncation=True,
            padding="max_length",
            return_offsets_mapping=True,
            return_overflowing_tokens=True,
            max_length=512,  # this is the maximum max_length
            stride=doc_stride,
        )
        # the original yielded the undefined name `images`, raising NameError
        yield predictions, encoding, image

def visual_inputs():
    """Yield ``(predictions, encoding, image)`` for each entry of ``pred_list``.

    Unlabelled variant: the processor is called without ``word_labels``.
    """
    for doc_idx, doc_predictions in enumerate(pred_list):
        sample = dataset[doc_idx]
        page_image = sample["image"]
        doc_encoding = processor(
            page_image,
            sample["tokens"],
            boxes=sample["bboxes"],
            return_tensors="pt",
            truncation=True,
            return_offsets_mapping=True,
            padding="max_length",
            return_overflowing_tokens=True,
            max_length=512,
            stride=doc_stride,
        )
        yield doc_predictions, doc_encoding, page_image

# Pick the generator matching the labelled / unlabelled mode.
gen = visual_inputs_with_labels() if has_labels else visual_inputs()

In [None]:
# TODO: also add visualization for wrong predictions
predictions, encoding, image = next(gen)
width, height = image.size
token_boxes = encoding.bbox.flatten(start_dim=0, end_dim=1).tolist()
boxes = [unnormalize_box(box, width, height) for box in token_boxes]
# flatten the predictions into a single list, one entry per token box
predictions = torch.tensor(predictions).view(-1).tolist()
if not has_labels:
    true_predictions = [model.config.id2label[pred] for pred in predictions]
    draw_boxes_on_img(true_predictions, boxes, image, width, height)
else:
    labels = encoding.labels.flatten(start_dim=0, end_dim=1).tolist()
    # positions labelled -100 are excluded from the drawing
    true_predictions = [
        model.config.id2label[pred]
        for pred, label in zip(predictions, labels)
        if label != -100
    ]
    true_labels = [
        model.config.id2label[label]
        for prediction, label in zip(predictions, labels)
        if label != -100
    ]
    true_boxes = [
        unnormalize_box(box, width, height)
        for box, label in zip(token_boxes, labels)
        if label != -100
    ]
    draw_boxes_on_img(true_predictions, true_boxes, image, width, height)
image

In [None]:
# Stack the per-window pixel values into a single tensor.
# NOTE(review): this cell mutates `encoding` in place (stack + two pops), so it can
# only be run once per encoding; re-running raises on the missing keys.
encoding["pixel_values"] = torch.stack(encoding["pixel_values"])
# Since one example might give us several features if it has a long context, we need a map from a feature to
# its corresponding example. This key gives us just that.
sample_mapping = encoding.pop("overflow_to_sample_mapping")
# The offset mappings give, for each token, its character span in the original word.
# Below they are used to tell subword tokens apart (non-zero start offset).
offset_mapping = encoding.pop("offset_mapping")

# Get mode predictions:
- [x] finish vectorizing below cell to get mode predictions
- [ ] add in context to break ties
- [x] flatten out boxes and preds
- [x] check out whether to filter out subwords

See the code snippet below for handling out-of-span documents later, and this issue: https://github.com/huggingface/transformers/issues/19190

See this notebook for a snippet on handling subwords:
https://github.com/NielsRogge/Transformers-Tutorials/blob/master/LayoutLMv2/FUNSD/True_inference_with_LayoutLMv2ForTokenClassification_%2B_Gradio_demo.ipynb

In [None]:
def sort_by_order(target_array):
    """Return the label ids in ``target_array`` as a list ordered by tie-break priority.

    I- tags rank first, then B- tags, then "O". Ids that are not in the priority
    list sort last.
    """
    ranked_labels = ["I-Subsidiary", "I-Loc", "I-Own_Per", "B-Subsidiary", "B-Loc", "B-Own_Per", "O"]
    # label id -> rank (0 is the highest priority)
    rank_of = {label2id[name]: rank for rank, name in enumerate(ranked_labels)}

    def _rank(label_id):
        return rank_of.get(label_id, float('inf'))

    return sorted(target_array, key=_rank)

def get_flattened_mode_predictions(token_boxes_tensor, predictions_tensor):
    """Replace every token's prediction with the most common prediction for its box.

    Tokens that share an identical bounding box are grouped, and the most frequent
    label id in the group is assigned to all of them. Ties are broken with
    ``sort_by_order``. Boxes are not filtered: rows equal to ``[0, 0, 0, 0]`` form
    one group like any other box.

    Args:
        token_boxes_tensor: torch tensor of boxes; reshaped to ``(-1, 4)``.
        predictions_tensor: torch tensor of non-negative integer label ids;
            reshaped to ``(-1,)``. Must hold exactly one id per box.

    Returns:
        1-D numpy array holding the mode label id of each flattened token, in
        input order (empty if the input is empty).

    Raises:
        ValueError: if the number of boxes and predictions differ, or if a
            prediction is negative.
    """
    # reshape (not view) accepts non-contiguous inputs; detach/cpu lets tensors that
    # track gradients or live on an accelerator be converted to numpy
    boxes = token_boxes_tensor.reshape(-1, 4).detach().cpu().numpy()
    preds = predictions_tensor.reshape(-1).detach().cpu().numpy()

    if len(boxes) != len(preds):
        raise ValueError(
            f"got {len(boxes)} boxes but {len(preds)} predictions; expected one prediction per box"
        )
    if preds.size == 0:
        return np.array([], dtype=np.int64)
    if preds.min() < 0:
        raise ValueError("predictions must be non-negative label ids")

    # inverse_indices[k] is the row of unique_boxes that token k belongs to
    unique_boxes, inverse_indices = np.unique(boxes, axis=0, return_inverse=True)
    # NumPy 2.0.0 returns the inverse with an extra axis when `axis` is given
    inverse_indices = inverse_indices.reshape(-1)

    # counts[b, c] = how many tokens in box b were predicted as label id c;
    # built in one pass instead of rescanning all tokens once per unique box
    counts = np.zeros((len(unique_boxes), int(preds.max()) + 1), dtype=np.int64)
    np.add.at(counts, (inverse_indices, preds), 1)

    # most frequent id per box; sort_by_order decides between equally frequent ids
    modes = np.array(
        [sort_by_order(np.flatnonzero(row == row.max()))[0] for row in counts]
    )
    return modes[inverse_indices]

In [None]:
# Two windows of four boxes each. Boxes [1, 2, 3, 4] and [2, 3, 4, 5] occur in both
# windows; [1, 2, 3, 4] gets two different labels, which exercises the tie-break.
test_boxes = torch.tensor(
    [
        [[0, 0, 0, 0], [0, 1, 1, 1], [1, 2, 3, 4], [2, 3, 4, 5]],
        [[1, 2, 3, 4], [2, 3, 4, 5], [2, 3, 4, 6], [0, 0, 0, 0]],
    ]
)
test_labels = [
    "O", "B-Subsidiary", "I-Subsidiary", "B-Loc",
    "B-Subsidiary", "B-Loc", "I-Loc", "O",
]
test_preds = torch.tensor([label2id[tag] for tag in test_labels]).unsqueeze(1)
expected = [
    "O", "B-Subsidiary", "I-Subsidiary", "B-Loc",
    "I-Subsidiary", "B-Loc", "I-Loc", "O",
]
test_boxes.shape, test_preds.shape

In [None]:
test_boxes

In [None]:
test_preds

In [None]:
# Run the per-box mode computation on the hand-built fixture.
modes = get_flattened_mode_predictions(test_boxes, test_preds)

In [None]:
modes

In [None]:
# True when every computed mode matches the expected label id.
expected_ids = [label2id[tag] for tag in expected]
(expected_ids == modes).all()

In [None]:
# NOTE(review): mode_predictions and token_boxes_tensor are only assigned in a later
# cell (under "Return to Looking at Tables"), so this fails on a fresh top-to-bottom run.
mode_predictions.shape, token_boxes_tensor.shape

In [None]:
# A token whose offset mapping starts at a non-zero position is flagged as a subword.
token_offsets = offset_mapping.flatten(start_dim=0, end_dim=1).tolist()
is_subword = np.array(token_offsets)[:, 0] != 0
keep_token = ~is_subword

true_predictions = [
    id2label[pred]
    for idx, pred in enumerate(mode_predictions)
    if keep_token[idx]
]
true_boxes = [
    unnormalize_box(box, width, height)
    for idx, box in enumerate(token_boxes_tensor)
    if keep_token[idx]
]

In [None]:
len(true_predictions), len(true_boxes)

In [None]:
# Print the decoded text of the first two windows of the encoding.
for window_ids in encoding["input_ids"][:2]:
    decoded_window = processor.decode(window_ids)
    print(decoded_window)

# Return to Looking at Tables

In [None]:
# make sure this lines up with the generator index
# NOTE(review): next(gen) is called once above, which yields dataset[0] on a fresh run;
# index 2 only matches after that cell has been run three times — confirm.
example = dataset[2]

In [None]:
example["id"]

In [None]:
# Collapse the per-token predictions to one label per bounding box, then map
# ids -> IOB tags -> simplified lower-cased labels.
token_boxes_tensor = encoding.bbox.flatten(start_dim=0, end_dim=1)
predictions_tensor = torch.tensor(predictions)
mode_predictions = get_flattened_mode_predictions(token_boxes_tensor, predictions_tensor)
# reuse the already-flattened tensor instead of flattening encoding.bbox a second time
token_boxes = token_boxes_tensor.tolist()
predicted_labels = [model.config.id2label[pred] for pred in mode_predictions]
# the notebook imports `iob_to_label`; the previous `_iob_to_label` was undefined here
simple_preds = [iob_to_label(pred).lower() for pred in predicted_labels]

In [None]:
len(predicted_labels), len(token_boxes), len(simple_preds)

In [None]:
# Column names for the four box coordinates, in the order the boxes store them.
bbox_cols = [
    "top_left_x",
    "top_left_y",
    "bottom_right_x",
    "bottom_right_y",
]

In [None]:
# One row per token box, with its IOB prediction and simplified label.
df = pd.DataFrame(token_boxes, columns=bbox_cols).assign(
    iob_pred=predicted_labels,
    pred=simple_preds,
)

# Drop rows whose four coordinates are all zero.
all_zero_box = (df[bbox_cols] == 0).all(axis=1)
df = df[~all_zero_box]

In [None]:
# Note: using tokens rather than words seems more correct.
# BUT if the B and I labels are finicky and can't be fixed, merging on full words
# and then dropping duplicates serves as a fix.

# Put the full words on the dataframe, not just the subwords that correspond to tokens.
words_df = pd.DataFrame(example["bboxes"], columns=bbox_cols).assign(
    word=example["tokens"]
)

In [None]:
df

In [None]:
# Attach the full word for each box, then collapse repeated (box, pred, word) rows.
merged_df = df.merge(words_df, on=bbox_cols, how="left")
df = merged_df.drop_duplicates(subset=[*bbox_cols, "pred", "word"])

In [None]:
# Order by y then x coordinate, and keep only rows predicted as something other than "other".
sorted_df = df.sort_values(["top_left_y", "top_left_x"])
entities_df = sorted_df.loc[sorted_df["pred"].ne("other")]

In [None]:
entities_df

In [None]:
# Start a new group at every B- tag. assign() builds a new frame, so pandas does not
# warn about writing a column into the filtered entities_df.
entities_df = entities_df.assign(
    group=entities_df["iob_pred"].str.startswith("B-").cumsum()
)

In [None]:
# Words are joined with a space. If this were run on tokens instead of full words,
# the join should use "" instead:
# grouped_df = entities_df.groupby(["group", "pred"])["word"].apply("".join).reset_index()[["pred", "word"]]
grouped_df = (
    entities_df.groupby(["group", "pred"])["word"]
    .apply(" ".join)
    .reset_index()
    .loc[:, ["pred", "word"]]
)

In [None]:
grouped_df

In [None]:
# Start a new output row at every subsidiary entity. assign() builds a new frame
# instead of writing into the column-subset produced by the previous cell.
grouped_df = grouped_df.assign(
    row=grouped_df["pred"].str.startswith("subsidiary").cumsum()
)

In [None]:
grouped_df

In [None]:
# One row per `row` id, one column per predicted label; multiple words are space-joined.
final_df = grouped_df.pivot_table(
    index='row',
    columns='pred',
    values='word',
    aggfunc=lambda parts: ' '.join(parts),
).reset_index()

In [None]:
final_df

In [None]:
clean_extracted_df(final_df)

In [None]:
image

In [None]:
# iterate through the pages in the doc, len(token_boxes) is n pages
# NOTE(review): this indexes token_boxes[i][j] as windows x tokens, but the most recent
# assignment above flattens token_boxes into a single list of boxes — confirm which
# shape is intended before relying on this cell.
box_token_dict = {}
for i, window_boxes in enumerate(token_boxes):
    # skip first 128 tokens except in the first window
    initial_j = 0 if i == 0 else (doc_stride + 1)
    for j in range(initial_j, len(window_boxes)):
        box = window_boxes[j]
        # validate before unnormalizing so malformed or all-zero entries are skipped
        # instead of being handed to unnormalize_box
        if np.asarray(box).shape != (4,) or box == [0, 0, 0, 0]:
            continue
        bbox = tuple(unnormalize_box(box, width, height))  # tuple so it can be a dict key
        token = processor.tokenizer.decode(encoding["input_ids"][i][j])
        box_token_dict.setdefault(bbox, []).append(token)
# join once after all windows are visited (previously re-joined inside the outer loop)
box_token_dict = {bbox: ["".join(words)] for bbox, words in box_token_dict.items()}

In [None]:
# TODO: make faster, vectorize
# maybe turn token_boxes and predictions into tensors so this can be vectorized
# Maps each non-zero box (as a tuple) to the list of predictions made for it.
box_prediction_dict = {}
for i, window_boxes in enumerate(token_boxes):
    for j, box in enumerate(window_boxes):
        if np.asarray(box).shape != (4,):
            continue
        if box == [0, 0, 0, 0] or box == 0:
            continue
        # tuple so the box can be used as a dict key
        box_prediction_dict.setdefault(tuple(box), []).append(predictions[i][j])

# Begin Old Work

In [None]:
len(words), len(boxes), len(ner_tags)

In [None]:
# true boxes is the unnormalized version of boxes
len(true_predictions), len(true_labels), len(true_boxes)

In [None]:
true_predictions[100], true_labels[100]

In [None]:
len(labels), len(predictions)

In [None]:
# Simplified lower-cased label for every kept prediction.
# The notebook imports `iob_to_label`; the previous `_iob_to_label` was undefined here.
simple_labels = [iob_to_label(label).lower() for label in true_predictions]

In [None]:
# This dataframe could also be generated from the PDF text extraction functions
# and then merged on the label.
doc_df = pd.DataFrame(
    boxes,
    columns=["top_left_x", "top_left_y", "bottom_right_x", "bottom_right_y"],
)

In [None]:
# NOTE(review): `words` is not assigned at notebook level in the cells visible here
# (only inside the generator functions) — confirm where it should come from.
doc_df["word"] = words
doc_df["label"] = simple_labels

In [None]:
doc_df.head(1)

In [None]:
# Words sharing exactly the same top_left_y and label are joined left to right.
# probably need some tolerance here
rows_df = (
    doc_df.sort_values(by="top_left_x")
    .groupby(["top_left_y", "label"])["word"]
    .apply(" ".join)
    .reset_index()
)

In [None]:
rows_df

In [None]:
# Keep only rows whose label is not "other".
rows_df = rows_df.loc[rows_df["label"] != "other"]

In [None]:
# One row per top_left_y value, one column per label; multiple words are space-joined.
extracted_df = rows_df.pivot_table(
    index='top_left_y',
    columns='label',
    values='word',
    aggfunc=lambda parts: ' '.join(parts),
).reset_index()

In [None]:
# The y coordinate was only needed for grouping; remove it from the table.
extracted_df = extracted_df.drop("top_left_y", axis=1)

In [None]:
extracted_df

In [None]:
# Strip every character that is not a word character or "." from the own_per column.
# Raw string: "\w" in a normal string literal is an invalid escape sequence, which
# newer Python versions warn about.
extracted_df["own_per"] = extracted_df["own_per"].str.replace(r"[^\w.]", "", regex=True)

In [None]:
extracted_df