# Exhibit 21 extraction

This notebook implements a model built on top of [layoutlmv3](https://huggingface.co/microsoft/layoutlmv3-base/tree/main)
that extracts structured data from Exhibit 21 attachments to SEC 10-K filings. These documents contain a list of all subsidiary
companies owned by a filing company.

In [1]:
import dagstermill

# Notebook configuration, read later through ``context.op_config``:
#   uri: mlflow URI of an already fine-tuned model. The training cell only
#        fine-tunes when this is None; otherwise this URI is the model that gets
#        loaded for inference.
#   training_set: passed as ``gcs_folder_name`` when caching labeled training data.
context = dagstermill.get_context(op_config={
    "uri": "runs:/c363159de2f5439c93dd972d51247370/layoutlm_extractor",
    "training_set": "labeledv0.2",
})

## Train Layoutlmv3

### Setup training/test sets

Download training data and convert it to NER annotations. This involves converting Exhibit 21 filings into PDFs, then using labels generated by Label Studio to produce the annotations. These annotations are then used to create a Hugging Face dataset that will be used for training.

First define several helper functions to do the conversion.

In [2]:
import json
import os
from pathlib import Path
from tempfile import TemporaryDirectory

import numpy as np
import pandas as pd

from mozilla_sec_eia.library import validation_helpers
from mozilla_sec_eia.models.sec10k.utils.cloud import GCSArchive, get_metadata_filename
from mozilla_sec_eia.models.sec10k.utils.layoutlm import normalize_bboxes
from mozilla_sec_eia.models.sec10k.utils.pdf import (
    get_pdf_data_from_path,
    render_page,
)

# Set some constants
# NER classes in IOB format. The list position of each label is its integer
# class id (see id2label/label2id below), so the order must not change once a
# model has been trained against it.
LABELS = [
    "O",
    "B-Subsidiary",
    "I-Subsidiary",
    "B-Loc",
    "I-Loc",
    "B-Own_Per",
    "I-Own_Per",
]
# NOTE(review): not referenced anywhere else in the visible notebook; presumably
# an ordering used to resolve conflicting labels -- confirm it is still needed.
LABEL_PRIORITY = [
    "I-Subsidiary",
    "I-Loc",
    "I-Own_Per",
    "B-Subsidiary",
    "B-Loc",
    "B-Own_Per",
    "O",
]

# Bounding box column names used for the inference dataframes
BBOX_COLS = ["top_left_x", "top_left_y", "bottom_right_x", "bottom_right_y"]
# Bounding box column names read from the labeled dataframe when building the
# training annotations
BBOX_COLS_PDF = [
    "top_left_x_pdf",
    "top_left_y_pdf",
    "bottom_right_x_pdf",
    "bottom_right_y_pdf",
]

# Map back and forth between ids and labels:
# id2label is {class id: label}, label2id is {label: class id}
id2label = dict(enumerate(LABELS))
label2id = {v: k for k, v in enumerate(LABELS)}

def _is_cik_in_training_data(labeled_json_filename, tracking_df):
    # TODO: for now CIK is stored as an int, update when fixed
    cik = int(labeled_json_filename.split("/")[-1].split("-")[0])
    return cik in tracking_df.CIK.unique()


def format_label_studio_output(
    labeled_json_dir: Path,
    pdfs_dir: Path,
) -> pd.DataFrame:
    """Format Label Studio output JSONs into a single word-level dataframe.

    For every labeled task file whose CIK is present in the training tracking
    sheet, the words of the matching PDF are extracted, their bounding boxes are
    normalized, and the Label Studio labels are joined on by word index.

    Args:
        labeled_json_dir: directory containing the Label Studio task outputs.
        pdfs_dir: directory containing the corresponding ``<filename>.pdf`` files.

    Returns:
        One row per extracted word across all documents, with ``id`` (the filing
        filename) as the first column and the label in ``ner_tag`` (``"O"`` for
        unlabeled words).

    Raises:
        KeyError: if no document was processed, because the resulting empty
            dataframe has no ``labels`` column.
    """
    # TODO: make this path stuff less janky?
    tracking_df = validation_helpers.load_training_data("ex21_labels.csv")
    # Collect per-document frames and concatenate once at the end; growing a
    # dataframe with pd.concat inside the loop is quadratic in the number of docs.
    doc_frames = []
    for json_filename in os.listdir(labeled_json_dir):
        # NOTE(review): this skips names ending in ".json" and keeps only names
        # starting with a digit -- presumably Label Studio task outputs have no
        # extension; confirm the condition is not inverted.
        if not json_filename[0].isdigit() or json_filename.endswith(".json"):
            continue
        json_file_path = labeled_json_dir / json_filename
        with json_file_path.open() as j:
            doc_dict = json.load(j)

        filename = doc_dict["task"]["data"]["ocr"].split("/")[-1].split(".")[0]
        # check if old local naming schema is being used
        if len(filename.split("-")) == 6:
            filename = "-".join(filename.split("-")[2:])
        if not _is_cik_in_training_data(filename, tracking_df=tracking_df):
            continue

        pdf_filename = filename + ".pdf"
        src_path = pdfs_dir / pdf_filename
        # the second return value (the page object) is not needed here
        extracted, _ = get_pdf_data_from_path(src_path)
        txt = extracted["pdf_text"]
        pg_meta = extracted["page"]
        # normalize bboxes between 0 and 1000 for Hugging Face
        txt = normalize_bboxes(txt_df=txt, pg_meta_df=pg_meta)
        # parse the output dictionary of labeled bounding boxes from Label Studio
        label_frames = []
        for item in doc_dict["result"]:
            value = item["value"]
            # sometimes Label Studio will fill in an empty list as a label
            # when there is really no label
            # TODO: do this without dict comprehension?
            if ("labels" in value) and value["labels"] == []:
                value = {k: v for k, v in value.items() if k != "labels"}
            # the trailing integer of the result id is used as the word index
            ind = int(item["id"].split("_")[-1])
            label_frames.append(pd.DataFrame(value, index=[ind]))
        # pd.concat raises on an empty list, so fall back to an empty frame
        doc_df = pd.concat(label_frames) if label_frames else pd.DataFrame()

        # combine the bounding boxes for each word
        doc_df = doc_df.groupby(level=0).first()
        txt.loc[:, "id"] = filename
        # TODO: probably want to filter out these empty Ex. 21 docs
        # the doc might not have any labels in it if it was an empty Ex. 21
        if "labels" not in doc_df:
            doc_df.loc[:, "labels"] = pd.Series()

        doc_frames.append(pd.concat([txt, doc_df[["labels"]]], axis=1))

    labeled_df = pd.concat(doc_frames) if doc_frames else pd.DataFrame()

    # fill in unlabeled words and clean up labeled dataframe
    labeled_df["labels"] = labeled_df["labels"].fillna("O")
    labeled_df = labeled_df.rename(columns={"labels": "ner_tag"})
    non_id_columns = [col for col in labeled_df.columns if col != "id"]
    labeled_df = labeled_df.loc[:, ["id"] + non_id_columns]

    # TODO: add in sanity checks on labeled_df bounding boxes to make sure
    # that no value is above 1000 or below 0

    return labeled_df


def get_image_dict(pdfs_dir):
    """Render the Ex. 21 page image of every PDF in ``pdfs_dir``.

    Returns a dict keyed by the part of each PDF's filename before the first
    ``.``; entries in the directory whose last extension is not ``pdf`` are ignored.
    """
    pdf_names = [name for name in os.listdir(pdfs_dir) if name.split(".")[-1] == "pdf"]
    images_by_name = {}
    for name in pdf_names:
        _, page = get_pdf_data_from_path(pdfs_dir / name)
        images_by_name[name.split(".")[0]] = render_page(page)
    return images_by_name


def format_as_ner_annotations(
    labeled_json_path: Path,
    pdfs_path: Path,
    gcs_folder_name: Path,
) -> list[dict]:
    """Format Label Studio output JSONs as NER annotations.

    Caches the labeled training data locally, converts it to a word-level
    dataframe and then regroups it into one annotation dict per document.
    # TODO: say more about this format

    Args:
        labeled_json_path: local directory used to cache the labeled JSONs.
        pdfs_path: local directory used to cache the PDFs.
        gcs_folder_name: name of the training data folder in GCS, forwarded to
            ``GCSArchive.cache_training_data``.

    Returns:
        ner_annotations: a list of dicts, with one dict for each doc. Each dict
        has the keys ``id``, ``tokens``, ``ner_tags``, ``bboxes`` and ``image``.
    """
    GCSArchive().cache_training_data(
        json_cache_path=labeled_json_path,
        pdf_cache_path=pdfs_path,
        gcs_folder_name=gcs_folder_name
    )

    labeled_df = format_label_studio_output(
        labeled_json_dir=labeled_json_path, pdfs_dir=pdfs_path
    )
    image_dict = get_image_dict(pdfs_dir=pdfs_path)

    # convert dataframe/dictionary into NER format
    # document_annotation_to_ner https://github.com/butlerlabs/docai/blob/main/docai/annotations/ner_utils.py
    # complete dataset is a list of dicts, with one dict for each doc
    #
    # Group once up front: previously both groupby/apply calls were re-run for
    # every document inside the loop.
    docs = labeled_df.groupby("id")
    tokens_by_doc = docs["text"].apply(list)
    ner_tags_by_doc = docs["ner_tag"].apply(list)

    ner_annotations = []
    # iterate in order of first appearance, as unique() preserves it
    for filename in labeled_df["id"].unique():
        annotation = {
            "id": filename,
            "tokens": tokens_by_doc.loc[filename],
            "ner_tags": ner_tags_by_doc.loc[filename],
            "bboxes": docs.get_group(filename)[BBOX_COLS_PDF].to_numpy().tolist(),
            "image": image_dict[filename],
        }
        ner_annotations.append(annotation)

    return ner_annotations

def _prepare_dataset(annotations, processor, label2id):
    """Put the dataset in its final format for training LayoutLM."""

    def _convert_ner_tags_to_id(ner_tags, label2id):
        return [int(label2id[ner_tag]) for ner_tag in ner_tags]

    images = annotations["image"]
    words = annotations["tokens"]
    boxes = annotations["bboxes"]
    # Map over labels and convert to numeric id for each ner_tag
    ner_tags = [
        _convert_ner_tags_to_id(ner_tags, label2id)
        for ner_tags in annotations["ner_tags"]
    ]

    encoding = processor(
        images,
        words,
        boxes=boxes,
        word_labels=ner_tags,
        truncation=True,
        padding="max_length",
    )

    return encoding

def compute_metrics(p, metric, label_list, return_entity_level_metrics=False):
    """Compute metrics to train and evaluate the model on.

    Args:
        p: pair of (prediction scores, label ids); the class is chosen by
            taking the argmax over axis 2 of the scores.
        metric: object whose ``compute(predictions=..., references=...)``
            returns a dict of results.
        label_list: label names, indexed by class id.
        return_entity_level_metrics: when True, return every result with nested
            dicts flattened to ``<key>_<subkey>``; otherwise return only the
            overall precision, recall, f1 and accuracy.
    """
    scores, label_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    # Keep only positions whose label is not the ignored index (special tokens)
    true_predictions = []
    true_labels = []
    for pred_row, label_row in zip(predicted_ids, label_ids):
        kept = [(pred, lab) for pred, lab in zip(pred_row, label_row) if lab != -100]
        true_predictions.append([label_list[pred] for pred, _ in kept])
        true_labels.append([label_list[lab] for _, lab in kept])

    results = metric.compute(predictions=true_predictions, references=true_labels)

    if not return_entity_level_metrics:
        return {
            "precision": results["overall_precision"],
            "recall": results["overall_recall"],
            "f1": results["overall_f1"],
            "accuracy": results["overall_accuracy"],
        }

    # Unpack nested dictionaries into a single flat dictionary
    flat_results = {}
    for key, value in results.items():
        if not isinstance(value, dict):
            flat_results[key] = value
            continue
        for sub_key, sub_value in value.items():
            flat_results[f"{key}_{sub_key}"] = sub_value
    return flat_results

#### Finetune Model
The next cell will use the functions defined in the previous section to actually construct a huggingface dataset from labeled data and finetune the `layoutlm` model. Model finetuning will only be run if configured to do so, otherwise a pretrained version will be used from the `mlflow` tracking server.

Model training contains several steps implemented below:
1. Use a temporary directory to cache the filings (converted to PDFs) and their labels
2. Convert the PDFs and labels to NER annotations
3. Construct huggingface dataset from NER annotations and split into train and test sets
4. Load pretrained model from huggingface
5. Finetune model on training data and evaluate on test data

In [3]:
import mlflow
from datasets import (
    Array2D,
    Array3D,
    Dataset,
    Features,
    Sequence,
    Value,
    load_metric,
)
from dotenv import load_dotenv
from transformers import (
    AutoProcessor,
    LayoutLMv3ForTokenClassification,
    Trainer,
    TrainingArguments,
)
from transformers.data.data_collator import default_data_collator

from mozilla_sec_eia.library.mlflow import configure_mlflow

load_dotenv()


configure_mlflow()
mlflow.set_experiment("exhibit21_extraction_test")

# Fixed seed for the train/test split so that the evaluation set (and therefore
# the metrics used to pick the best checkpoint) is the same on every run
SPLIT_SEED = 42

# Only finetune if configured to do so
training_run_id = None
if context.op_config["uri"] is None:
    # Change temp_dir to save training data locally for inspection
    with TemporaryDirectory() as temp_dir:
        ner_annotations = format_as_ner_annotations(
            labeled_json_path=Path(temp_dir) / "sec10k_filings" / "labeled_jsons",
            pdfs_path=Path(temp_dir) / "sec10k_filings" / "pdfs",
            gcs_folder_name=context.op_config["training_set"],
        )

    # Cache/prepare training data
    dataset = Dataset.from_list(ner_annotations)

    # Load pretrained model
    model = LayoutLMv3ForTokenClassification.from_pretrained(
        "microsoft/layoutlmv3-base", id2label=id2label, label2id=label2id
    )
    # apply_ocr=False because words and boxes are supplied from the PDF text
    processor = AutoProcessor.from_pretrained(
        "microsoft/layoutlmv3-base", apply_ocr=False
    )

    # Prepare our train & eval dataset
    column_names = dataset.column_names
    features = Features(
        {
            "pixel_values": Array3D(dtype="float32", shape=(3, 224, 224)),
            "input_ids": Sequence(feature=Value(dtype="int64")),
            "attention_mask": Sequence(Value(dtype="int64")),
            "bbox": Array2D(dtype="int64", shape=(512, 4)),
            "labels": Sequence(feature=Value(dtype="int64")),
        }
    )
    dataset = dataset.map(
        lambda annotations: _prepare_dataset(annotations, processor, label2id),
        batched=True,
        remove_columns=column_names,
        features=features,
    )
    dataset.set_format("torch")
    split_dataset = dataset.train_test_split(test_size=0.2, seed=SPLIT_SEED)
    train_dataset, eval_dataset = split_dataset["train"], split_dataset["test"]

    # Initialize our Trainer
    # NOTE(review): `load_metric` and `evaluation_strategy` are deprecated in
    # recent `datasets`/`transformers` releases -- revisit when upgrading.
    metric = load_metric("seqeval")
    training_args = TrainingArguments(
        max_steps=1000,
        per_device_train_batch_size=1,
        per_device_eval_batch_size=1,
        learning_rate=1e-5,
        evaluation_strategy="steps",
        eval_steps=100,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        output_dir="./layoutlm",
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=processor,
        data_collator=default_data_collator,
        compute_metrics=lambda p: compute_metrics(p, metric=metric, label_list=LABELS),
    )

    with mlflow.start_run() as training_run:
        # Train inside mlflow run. Mlflow will automatically handle logging training metrics
        trainer.train()

        # Log finetuned model with mlflow. Use a separate name for the logged
        # components so `model` keeps referring to the LayoutLM model itself.
        logged_components = {"model": trainer.model, "tokenizer": trainer.tokenizer}
        mlflow.transformers.log_model(
            logged_components,
            artifact_path="layoutlm_extractor",
            task="token-classification",
        )
        training_run_id = training_run.info.run_id

## Model inference
Use the finetuned model to perform inference and evaluate on labeled validation data. First create a Huggingface `Pipeline` which wraps layoutlm with some custom pre/post processing steps.

In [4]:
import torch
from transformers import Pipeline, pipeline
from transformers.tokenization_utils_base import BatchEncoding

from mozilla_sec_eia.models.sec10k.inference import get_flattened_mode_predictions
from mozilla_sec_eia.models.sec10k.utils.layoutlm import (
    iob_to_label,
)


class LayoutLMInferencePipeline(Pipeline):
    """Pipeline for performing inference with fine-tuned LayoutLM.

    Each input is a dict with the keys ``id``, ``image``, ``tokens`` and
    ``bboxes``. The pipeline returns a tuple of (logits, predictions, extracted
    table dataframe) for every input document.
    """

    def __init__(self, *args, **kwargs):
        """Initialize LayoutLMInferencePipeline."""
        super().__init__(*args, **kwargs)

    def _sanitize_parameters(self, **kwargs):
        """Split call kwargs into (preprocess, forward, postprocess) kwargs."""
        preprocess_kwargs = {}
        if "maybe_arg" in kwargs:
            preprocess_kwargs["maybe_arg"] = kwargs["maybe_arg"]
        return preprocess_kwargs, {}, {}

    def preprocess(self, doc_dict):
        """Encode and tokenize model inputs.

        Returns a dict holding the model-ready ``encoding``, an untouched copy
        of the full encoding (``raw_encoding``), the original ``doc_dict``, and
        the offset/sample mappings popped off the encoding.
        """
        image = doc_dict["image"]
        words = doc_dict["tokens"]
        boxes = doc_dict["bboxes"]
        encoding = self.tokenizer(
            image,
            words,
            boxes=boxes,
            return_tensors="pt",
            truncation=True,
            padding="max_length",
            max_length=512,  # this is the maximum max_length
            stride=128,
            return_offsets_mapping=True,
            return_overflowing_tokens=True,
        )
        model_inputs = {}
        # copy before popping so the raw encoding keeps every key
        model_inputs["raw_encoding"] = encoding.copy()
        model_inputs["doc_dict"] = doc_dict
        # these two entries are not model inputs, so remove them from the encoding
        model_inputs["offset_mapping"] = encoding.pop("offset_mapping")
        model_inputs["sample_mapping"] = encoding.pop("overflow_to_sample_mapping")
        # TODO: do we actually need to make these into ints?
        encoding["input_ids"] = encoding["input_ids"].to(torch.int64)
        encoding["attention_mask"] = encoding["attention_mask"].to(torch.int64)
        encoding["bbox"] = encoding["bbox"].to(torch.int64)
        encoding["pixel_values"] = torch.stack(encoding["pixel_values"])
        model_inputs["encoding"] = encoding
        return model_inputs

    def _forward(self, model_inputs):
        """Run the model on an encoded document and collect the predictions."""
        # encoding is passed as a UserDict in the model_inputs dictionary
        # turn it back into a BatchEncoding
        encoding = BatchEncoding(model_inputs["encoding"])
        if torch.cuda.is_available():
            encoding.to("cuda")
            self.model.to("cuda")
        # since we're doing inference, we don't need gradient computation
        with torch.no_grad():
            output = self.model(**encoding)
            return {
                "logits": output.logits,
                "predictions": output.logits.argmax(-1).squeeze().tolist(),
                "raw_encoding": model_inputs["raw_encoding"],
                "doc_dict": model_inputs["doc_dict"],
            }

    def postprocess(self, all_outputs):
        """Return logits, model predictions, and the extracted dataframe."""
        logits = all_outputs["logits"]
        # reuse the predictions computed in _forward (the same ones that
        # extract_table consumes) instead of recomputing the argmax
        predictions = all_outputs["predictions"]
        output_df = self.extract_table(all_outputs)
        return logits, predictions, output_df

    def extract_table(self, all_outputs):
        """Extract a structured table from a set of inference predictions.

        This function essentially works by stacking bounding boxes and predictions
        into a dataframe and going from left to right and top to bottom. Then,
        every time a new subsidiary entity is encountered, it assigns a new group or
        "row" to that subsidiary. Next, location and ownership percentage words/labeled
        entities in between these subsidiary groups are assigned to a subsidiary row/group.
        Finally, this is all formatted into a dataframe with an ID column taken from
        the ``id`` of the input document.
        """
        # TODO: when model more mature, break this into sub functions to make it
        # clearer what's going on
        predictions = all_outputs["predictions"]
        encoding = all_outputs["raw_encoding"]
        doc_dict = all_outputs["doc_dict"]

        token_boxes_tensor = encoding["bbox"].flatten(start_dim=0, end_dim=1)
        predictions_tensor = torch.tensor(predictions)
        mode_predictions = get_flattened_mode_predictions(
            token_boxes_tensor, predictions_tensor
        )
        token_boxes = encoding["bbox"].flatten(start_dim=0, end_dim=1).tolist()
        predicted_labels = [
            self.model.config.id2label[pred] for pred in mode_predictions
        ]
        simple_preds = [iob_to_label(pred).lower() for pred in predicted_labels]

        df = pd.DataFrame(data=token_boxes, columns=BBOX_COLS)
        df.loc[:, "iob_pred"] = predicted_labels
        df.loc[:, "pred"] = simple_preds
        # drop tokens whose bounding box is all zeros
        invalid_mask = (
            (df["top_left_x"] == 0)
            & (df["top_left_y"] == 0)
            & (df["bottom_right_x"] == 0)
            & (df["bottom_right_y"] == 0)
        )
        df = df[~invalid_mask]
        # we want to get actual words on the dataframe, not just subwords that correspond to tokens
        # subwords from the same word share the same bounding box coordinates
        # so we merge the original words onto our dataframe on bbox coordinates
        words_df = pd.DataFrame(data=doc_dict["bboxes"], columns=BBOX_COLS)
        words_df.loc[:, "word"] = doc_dict["tokens"]
        df = df.merge(words_df, how="left", on=BBOX_COLS).drop_duplicates(
            subset=BBOX_COLS + ["pred", "word"]
        )
        # rows that are the first occurrence in a new group (subsidiary, loc, own_per)
        # should always have a B entity label. Manually override labels so this is true.
        # Take an explicit copy: assigning into a filtered slice of ``df`` raises
        # SettingWithCopyWarning. The changes are written back by df.update below.
        first_in_group_df = df[
            (df["pred"].ne(df["pred"].shift())) & (df["pred"] != "other")
        ].copy()
        first_in_group_df.loc[:, "iob_pred"] = (
            "B" + first_in_group_df["iob_pred"].str[1:]
        )
        df.update(first_in_group_df)
        # filter for just words that were labeled with non "other" entities
        entities_df = df.sort_values(by=["top_left_y", "top_left_x"])
        entities_df = entities_df[entities_df["pred"] != "other"]
        # words are labeled with IOB format which stands for inside, outside, beginning
        # merge B and I entities to form one entity group
        # (i.e. "B-Subsidiary" and "I-Subsidiary" become just "subsidiary"), assign a group ID
        entities_df["group"] = (entities_df["iob_pred"].str.startswith("B-")).cumsum()
        grouped_df = (
            entities_df.groupby(["group", "pred"])["word"]
            .apply(" ".join)
            .reset_index()[["pred", "word"]]
        )
        # assign a new row every time there's a new subsidiary
        grouped_df["row"] = (grouped_df["pred"].str.startswith("subsidiary")).cumsum()
        output_df = grouped_df.pivot_table(
            index="row", columns="pred", values="word", aggfunc=lambda x: " ".join(x)
        ).reset_index()
        # nothing was extracted: return the empty frame without an id column
        if output_df.empty:
            return output_df
        output_df.loc[:, "id"] = doc_dict["id"]
        return output_df

Next, wrap the `LayoutLMInferencePipeline` in an `mlflow` `pyfunc` model, which handles loading the pretrained model and managing inputs/outputs.

In [5]:
from PIL import Image

from mozilla_sec_eia.models.sec10k.entities import (
    Ex21CompanyOwnership,
    Sec10kExtractionMetadata,
)


def clean_extracted_df(extracted_df):
    """Perform basic cleaning on a dataframe extracted from an Ex. 21.

    The input dataframe is never modified; a cleaned copy is returned (an empty
    input is returned as is).

    Cleaning steps:
      * drop the helper ``row`` column if present
      * ``subsidiary``: strip, lowercase, and remove leading/trailing special
        characters
      * ``loc`` (if present): strip, lowercase, and keep only letters, ``&``,
        ``,`` and whitespace
      * ``own_per`` (if present): keep only digits and ``.``, truncate values
        with more than one decimal point, and convert to float where possible
      * drop rows with a null ``subsidiary``

    Args:
        extracted_df: dataframe with at least a ``subsidiary`` column.

    Returns:
        The cleaned dataframe.
    """
    if extracted_df.empty:
        return extracted_df
    # Always work on a new frame so the caller's dataframe is not mutated
    if "row" in extracted_df.columns:
        cleaned_df = extracted_df.drop(columns=["row"])
    else:
        cleaned_df = extracted_df.copy()
    cleaned_df["subsidiary"] = cleaned_df["subsidiary"].str.strip().str.lower()
    # strip special chars from the start and end of the string
    cleaned_df["subsidiary"] = cleaned_df["subsidiary"].str.replace(
        r"^[^\w&\s]+|[^\w&\s]+$", "", regex=True
    )
    if "loc" in cleaned_df.columns:
        cleaned_df["loc"] = cleaned_df["loc"].str.strip().str.lower()
        cleaned_df["loc"] = cleaned_df["loc"].str.replace(
            r"[^a-zA-Z&,\s]", "", regex=True
        )
    if "own_per" in cleaned_df.columns:
        # remove special chars and letters
        cleaned_df["own_per"] = cleaned_df["own_per"].str.replace(
            r"[^\d.]", "", regex=True
        )
        # Find values with multiple decimal points and keep only the part up to
        # (not including) the second decimal point
        cleaned_df["own_per"] = cleaned_df["own_per"].str.replace(
            r"(\d*\.\d+)\..*", r"\1", regex=True
        )
        cleaned_df["own_per"] = cleaned_df["own_per"].replace("", np.nan)
        # errors="ignore" leaves the column unchanged if it can't be cast
        cleaned_df["own_per"] = cleaned_df["own_per"].astype(
            "float64", errors="ignore"
        )
    # drop rows that have a null subsidiary value
    cleaned_df = cleaned_df.dropna(subset="subsidiary")
    return cleaned_df

# If a model was trained in this notebook, use it. Otherwise, fall back to the
# pretrained model URI given in the notebook config (op_config["uri"]).
if training_run_id is not None:
    model_uri = f"runs:/{training_run_id}/layoutlm_extractor"
else:
    model_uri = context.op_config["uri"]

# Model metadata; its run_id is used later as the parent of the evaluation run
model_info = mlflow.models.get_model_info(model_uri)

def _get_data(dataset):
    yield from dataset

class Ex21Extractor(mlflow.pyfunc.PythonModel):
    """Create an mlflow pyfunc model to perform full EX21 extraction."""

    def load_context(self, context):
        """Load pretrained model.

        Loads the transformers components from the ``model_components``
        artifact and sets ``PYTORCH_CUDA_ALLOC_CONF`` for the process.
        """
        os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
        self.model_components = mlflow.transformers.load_model(
            context.artifacts["model_components"], return_type="components"
        )

    def predict(self, context, model_input: pd.DataFrame, params=None):
        """Use pretrained model and inference pipeline to perform inference.

        Args:
            context: mlflow model context (unused here).
            model_input: one row per document. The ``image`` column holds raw
                image bytes that are decoded using the ``mode``, ``width`` and
                ``height`` columns; the remaining columns are passed to the
                inference pipeline. The dataframe is not modified.
            params: unused.

        Returns:
            Tuple of (extraction metadata dataframe, cleaned dataframe of
            extracted subsidiaries with the columns ``id``, ``subsidiary``,
            ``loc`` and ``own_per``).
        """
        # Decode images on a copy so the caller's dataframe keeps its raw bytes
        model_input = model_input.copy()
        model_input["image"] = model_input.apply(
            lambda row: Image.frombytes(
                row["mode"], (row["width"], row["height"]), row["image"]
            ),
            axis=1,
        )
        # Convert dataframe to a Dataset of per-document records
        dataset = Dataset.from_list(
            model_input.drop(["mode", "width", "height"], axis=1).to_dict("records")
        )

        # TODO: figure out device argument
        pipe = pipeline(
            "token-classification",
            model=self.model_components["model"],
            tokenizer=self.model_components["tokenizer"],
            pipeline_class=LayoutLMInferencePipeline,
        )

        extraction_metadata = Sec10kExtractionMetadata.example(size=0)
        # Start from the empty schema example so the expected columns exist,
        # then concatenate all per-document tables once after the loop.
        output_frames = [Ex21CompanyOwnership.example(size=0)]
        # Logits and predictions are not part of the return value, so they are
        # deliberately not accumulated (holding every logits tensor wastes memory).
        for _logits, _predictions, output_df in pipe(_get_data(dataset)):
            if not output_df.empty:
                filename = get_metadata_filename(output_df["id"].iloc[0])
                extraction_metadata.loc[filename, ["success"]] = True
            output_frames.append(output_df)
        all_output_df = pd.concat(output_frames)
        all_output_df.columns.name = None
        all_output_df = clean_extracted_df(all_output_df)
        all_output_df = all_output_df[["id", "subsidiary", "loc", "own_per"]]
        all_output_df = all_output_df.reset_index(drop=True)
        return extraction_metadata, all_output_df

# Save model to local temp dir with artifacts, then reload for evaluation
with TemporaryDirectory() as tmp_dir:
    mlflow.pyfunc.save_model(
        path=tmp_dir,
        python_model=Ex21Extractor(),
        # made available to Ex21Extractor.load_context as
        # context.artifacts["model_components"]
        artifacts={"model_components": model_uri},
    )
    # load inside the `with` block, while the saved model directory still exists
    ex21_extraction_model = mlflow.pyfunc.load_model(tmp_dir)

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

2024/09/24 20:17:30 INFO mlflow.types.utils: Unsupported type hint: <class 'pandas.core.frame.DataFrame'>, skipping schema inference


Downloading artifacts:   0%|          | 0/17 [00:00<?, ?it/s]



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

### Model Evaluation
Now the full extraction model can be evaluated using labeled validation data and logged to `mlflow`. The `mlflow` run used to evaluate and log the inference model will be created as a nested child run to the run used to train `layoutlm`. This setup allows multiple versions/configurations of inference to be associated with a single version of `layoutlm`, creating a clean organizational structure for testing the base model and inference logic separately.

In [6]:
def clean_ex21_validation_set(validation_df: pd.DataFrame):
    """Clean Ex. 21 validation data to match extracted format.

    Renames the labeled columns to the extracted column names, casts ``own_per``
    to string, adds a ``filename`` column derived from ``id``, and applies the
    same cleaning used on extracted tables.
    """
    column_map = {
        "Filename": "id",
        "Subsidiary": "subsidiary",
        "Location of Incorporation": "loc",
        "Ownership Percentage": "own_per",
    }
    prepared_df = validation_df.rename(columns=column_map).assign(
        own_per=lambda df: df["own_per"].astype(str),
        filename=lambda df: df["id"].apply(get_metadata_filename),
    )
    return clean_extracted_df(prepared_df)

# Load labeled validation set and clean it to match the extracted table format
validation_set = clean_ex21_validation_set(
    validation_helpers.load_validation_data("ex21_labels.csv")
)

# Get filing metadata for filings in validation set
cloud_interface = GCSArchive()
filing_metadata = cloud_interface.get_metadata()
# keep only the metadata rows whose index matches a validation filename
ex21_validation_filing_metadata = filing_metadata[
    filing_metadata.index.isin(validation_set["filename"].unique())
]

Next, define a function that evaluates the model output, then run extraction on the validation set and log the results in a child run.

In [7]:
from mlflow.models import infer_signature

from mozilla_sec_eia.models.sec10k.ex_21.inference import create_inference_dataset


def ex21_validation_metrics(computed_df: pd.DataFrame, validation_df: pd.DataFrame):
    """Compute validation metrics for Ex. 21 extraction.

    Each file in the validation set is compared against the rows extracted for
    the same ``id``: once for exact table equality, and per column
    (``subsidiary``, ``loc``, ``own_per``) for precision/recall and Jaccard
    similarity.

    Returns:
        Tuple of (per-file Jaccard dataframe, per-file precision/recall
        dataframe, dataframe of filenames whose tables were not exactly equal,
        dict of summary metrics averaged over the validation files).
    """
    value_cols = ["subsidiary", "loc", "own_per"]

    # Cast the validation columns to the dtypes of the matching computed columns
    common_cols = validation_df.columns.intersection(computed_df.columns)
    validation_df = validation_df.astype(computed_df[common_cols].dtypes)

    filenames = validation_df["id"].unique()
    n_files = len(filenames)
    n_equal = 0
    incorrect_files = []
    table_metrics_dict = {}
    jaccard_dict = {}

    # iterate through each file and check each extracted table
    for filename in filenames:
        extracted_table_df = computed_df.loc[
            computed_df["id"] == filename
        ].reset_index(drop=True)
        validation_table_df = validation_df.loc[
            validation_df["id"] == filename
        ].reset_index(drop=True)

        # check if the tables are exactly equal
        # TODO: strip llc and other company strings before comparison
        if not extracted_table_df.equals(validation_table_df):
            incorrect_files.append(filename)
        else:
            n_equal += 1

        # per-column precision/recall and jaccard similarity
        file_metrics = {}
        file_jaccard = {}
        for col in value_cols:
            prec_recall = validation_helpers.pandas_compute_precision_recall(
                extracted_table_df, validation_table_df, value_col=col
            )
            file_metrics[f"{col}_precision"] = prec_recall["precision"]
            file_metrics[f"{col}_recall"] = prec_recall["recall"]
            file_jaccard[col] = validation_helpers.jaccard_similarity(
                computed_df=extracted_table_df,
                validation_df=validation_table_df,
                value_col=col,
            )
        table_metrics_dict[filename] = file_metrics
        jaccard_dict[filename] = file_jaccard

    jaccard_df = pd.DataFrame.from_dict(jaccard_dict, orient="index").reset_index()
    prec_recall_df = pd.DataFrame.from_dict(
        table_metrics_dict, orient="index"
    ).reset_index()

    # column name -> name used in the summary metric keys
    metric_names = {"subsidiary": "subsidiary", "loc": "location", "own_per": "own_per"}

    summary = {"table_accuracy": n_equal / n_files}
    for col, name in metric_names.items():
        summary[f"avg_{name}_jaccard_sim"] = jaccard_df[col].sum() / n_files
    for kind in ("precision", "recall"):
        for col, name in metric_names.items():
            summary[f"avg_{name}_{kind}"] = (
                prec_recall_df[f"{col}_{kind}"].sum() / n_files
            )

    incorrect_df = pd.DataFrame({"filename": incorrect_files})
    return jaccard_df, prec_recall_df, incorrect_df, summary


# Log evaluation in a child run of the run that produced the layoutlm model
with mlflow.start_run(parent_run_id=model_info.run_id, nested=True):
    # Build the inference inputs for the validation filings
    failed_metadata, dataset = create_inference_dataset(
        filing_metadata=ex21_validation_filing_metadata,
        cloud_interface=cloud_interface,
        has_labels=False,
    )
    # pass a copy so `dataset` is unchanged when used for the signature below
    metadata, extracted = ex21_extraction_model.predict(dataset.copy())
    metadata = pd.concat([failed_metadata, metadata])

    # Compare extracted tables against the labeled validation set and log the summary metrics
    jaccard_df, prec_recall_df, incorrect_filenames, metrics = ex21_validation_metrics(extracted, validation_set)
    mlflow.log_metrics(metrics)
    # Log the full extraction model (pyfunc wrapper + layoutlm artifacts)
    mlflow.pyfunc.log_model(
        "exhibit21_extractor",
        python_model=Ex21Extractor(),
        artifacts={"model_components": model_uri},
        signature=infer_signature(dataset, extracted), # NOTE: model returns a second dataframe with metadata, but mlflow only supports one in signature
    )

2024/09/24 20:18:01 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
  padded_validation_set = pd.concat(
  padded_compute_set = pd.concat(


Downloading artifacts:   0%|          | 0/17 [00:00<?, ?it/s]

2024/09/24 20:19:41 INFO mlflow.tracking._tracking_service.client: 🏃 View run fortunate-finch-744 at: https://mlflow-ned2up6sra-uc.a.run.app/#/experiments/13/runs/b959cfa0ba3c4b91a0f8fe158cd0109f.
2024/09/24 20:19:41 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://mlflow-ned2up6sra-uc.a.run.app/#/experiments/13.
2024/09/24 20:19:41 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...
2024/09/24 20:19:42 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!
