# Compare an Ex. 21 extracted table to a validation table

In [1]:
# Enable IPython's autoreload extension so that edits to imported modules are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 3

In [31]:
import pandas as pd
from pathlib import Path

from mozilla_sec_eia.models.sec10k import defs
from mozilla_sec_eia.models.sec10k.utils.layoutlm import _load_pretrained_layoutlm
from mozilla_sec_eia.models.sec10k.ex_21.inference import create_inference_dataset
from mozilla_sec_eia.models.sec10k.utils.cloud import GCSArchive
from mozilla_sec_eia.models.sec10k.ex_21 import Exhibit21Extractor
from mozilla_sec_eia.models.sec10k.ex_21.inference import extract_filings
from mozilla_sec_eia.models.sec10k.ex_21.ex21_validation_helpers import (
    clean_ex21_validation_set,
)
from mozilla_sec_eia.library.mlflow.mlflow_resource import _configure_mlflow
from dotenv import load_dotenv
load_dotenv()

True

In [135]:
# Per-filing layout classification table; later cells rely on its `Filename`
# and `Layout Type` columns.
layout_hist = pd.read_csv("validation_layout_histogram.csv")

In [132]:
# Count how many rows of the layout table fall under each layout type.
layout_hist["Layout Type"].value_counts()

Layout Type
Generic Table                             27
Paragraph                                 12
List with Indented Nested Subsidiaries    10
Subsidiary List                            9
List with Sentences                        5
Blue & White Table (2 Column)              5
Blue & White Table (3 Column)              3
Table with 2 Subsidiary Name Columns       1
Name: count, dtype: int64

In [147]:
# Turn a "<CIK>-<accession>" Filename into the archive path
# "edgar/data/<CIK>/<accession>.txt". Splitting once on the first hyphen yields
# both pieces: column 0 is the text before it, column 1 everything after it.
filename_parts = layout_hist["Filename"].str.split("-", n=1, expand=True)
layout_hist.loc[:, "full_filename"] = (
    "edgar/data/" + filename_parts[0] + "/" + filename_parts[1] + ".txt"
)

In [830]:
# Labeled Ex. 21 validation rows shipped with the package; filtered by
# `Filename` further down to select a single filing.
validation_df = pd.read_csv(
    "../src/mozilla_sec_eia/package_data/validation_data/ex21_labels.csv"
)

In [331]:
# Instantiate the filing archive client and fetch its metadata table. Later
# cells look rows up in `md` by archive path (e.g. "edgar/data/<CIK>/<id>.txt").
archive = GCSArchive()
md = archive.get_metadata()

In [824]:
# Extracted Ex. 21 ownership table for the validation filings, stored as an
# artifact of a specific MLflow run. Filtered by its `id` column below.
df = pd.read_parquet(
    "gs://mlflow-artifacts-mozilla/8/f8ad133ee4cb48f5883452e558d97d21/artifacts/ex21_company_ownership_info_validation.parquet"
)

In [825]:
# Artifact from the same MLflow run listing the filings flagged as incorrect;
# its `filename` column holds "<CIK>-<accession>" style ids.
incorrect_df = pd.read_parquet(
    "gs://mlflow-artifacts-mozilla/8/f8ad133ee4cb48f5883452e558d97d21/artifacts/ex21_incorrect_filenames.parquet"
)

In [826]:
# Extraction metadata artifact from the same MLflow run. Its index is used
# below as the set of archive filenames covered by the validation run.
validation_md = pd.read_parquet(
    "gs://mlflow-artifacts-mozilla/8/f8ad133ee4cb48f5883452e558d97d21/artifacts/ex21_extraction_metadata_validation.parquet"
)

In [827]:
# Per-table Jaccard scores artifact from the same MLflow run; joined to the
# layout table on its `index` column below.
jaccard_df = pd.read_parquet(
    "gs://mlflow-artifacts-mozilla/8/f8ad133ee4cb48f5883452e558d97d21/artifacts/ex21_jaccard_per_table.parquet"
)

In [828]:
# Keep only the layout rows whose archive path appears in the validation
# metadata index.
validated_filenames = validation_md.reset_index(names="filename")["filename"]
hist = layout_hist[layout_hist.full_filename.isin(validated_filenames)]
print(len(hist))

# Rows of incorrect_df per layout type, divided by the number of validated
# filings of that layout type.
incorrect_with_layout = incorrect_df.merge(
    hist[["Filename", "Layout Type"]],
    left_on="filename",
    right_on="Filename",
)
incorrect_with_layout["Layout Type"].value_counts() / hist["Layout Type"].value_counts()

60


Layout Type
Blue & White Table (2 Column)             0.200000
Blue & White Table (3 Column)             0.333333
Generic Table                             0.370370
List with Indented Nested Subsidiaries    0.500000
List with Sentences                       0.600000
Subsidiary List                           0.444444
Table with 2 Subsidiary Name Columns      1.000000
Name: count, dtype: float64

In [829]:
# Keep only the layout rows whose archive path appears in the validation
# metadata index.
validated_filenames = validation_md.reset_index(names="filename")["filename"]
hist = layout_hist[layout_hist.full_filename.isin(validated_filenames)]

# Attach the layout type to every per-table Jaccard row, then average the
# `subsidiary` score within each layout type.
jaccard_with_layout = jaccard_df.merge(
    hist, how="left", left_on="index", right_on="Filename"
)
jaccard_with_layout.groupby("Layout Type")["subsidiary"].mean()

Layout Type
Blue & White Table (2 Column)             1.000000
Blue & White Table (3 Column)             0.923077
Generic Table                             0.957857
List with Indented Nested Subsidiaries    0.947781
List with Sentences                       0.750000
Subsidiary List                           0.832634
Table with 2 Subsidiary Name Columns      1.000000
Name: subsidiary, dtype: float64

In [956]:
# Inspect the first filing listed in incorrect_df.
filename = incorrect_df["filename"].iloc[0]

In [957]:
# Display the id of the filing selected for inspection.
filename

'61339-0001161728-17-000004'

In [958]:
# Select the labeled rows for the filing under inspection and pass them
# through the project's validation-set cleaner.
is_selected_filing = validation_df["Filename"] == filename
validation_table = clean_ex21_validation_set(validation_df[is_selected_filing])

  extracted_df["own_per"] = extracted_df["own_per"].replace("", np.nan)


In [959]:
# Peek at the first cleaned validation row to check the column layout.
validation_table.head(1)

Unnamed: 0,id,subsidiary,loc,own_per,filename
126,61339-0001161728-17-000004,madison gas and electric company,,,edgar/data/61339/0001161728-17-000004.txt


In [960]:
# Rows of the extracted table that belong to the filing under inspection.
extracted_table = df.loc[df["id"] == filename]

In [948]:
# Stack the two tables and discard every row whose (subsidiary, loc, own_per)
# triple occurs more than once (keep=False drops all copies). What remains are
# rows without a counterpart. Note that a row duplicated within a single table
# is discarded as well.
compare_cols = ["subsidiary", "loc", "own_per"]
stacked_tables = pd.concat([validation_table, extracted_table])
stacked_tables.drop_duplicates(subset=compare_cols, keep=False)

Unnamed: 0,id,subsidiary,loc,own_per,filename
2343,891014-0000891014-11-000007,app china specialty minerals pte ltd,singapore,,edgar/data/891014/0000891014-11-000007.txt
2344,891014-0000891014-11-000007,asmas agir sanayi malzemeleri imal ve tic. a.s,turkey,,edgar/data/891014/0000891014-11-000007.txt
2345,891014-0000891014-11-000007,barretts minerals inc,delaware,,edgar/data/891014/0000891014-11-000007.txt
2347,891014-0000891014-11-000007,gold lun chemicals (zhenjiang,china,,edgar/data/891014/0000891014-11-000007.txt
2348,891014-0000891014-11-000007,"gold sheng chemicals (zhenjiang) co., ltd",china,,edgar/data/891014/0000891014-11-000007.txt
...,...,...,...,...,...
1409,891014-0000891014-11-000007,sr.o. specialty minerals south africa (pty,south africa,,
1410,891014-0000891014-11-000007,limited specialty minerals (thailand,thailand,,
1411,891014-0000891014-11-000007,limited specialty minerals uk limited,united kingdom,,
1412,891014-0000891014-11-000007,"tecnologias minerales de mexico, s.a. de",mexico,,


In [912]:
# Fetch a single filing from the archive, caching it locally.
# `cache_pdf=True` is an argument of `get_filings`; it was previously passed to
# `Path(...)` by mistake because of a misplaced closing parenthesis.
filing = archive.get_filings(
    md.loc[["edgar/data/56679/0001193125-16-634657.txt"]].reset_index(),
    cache_directory=Path("../sec10k_filings/"),
    cache_pdf=True,
)

## See what the entity labels are for the extracted doc

In [28]:
# Read the .env entries into a dict; the keys used below are
# MLFLOW_TRACKING_URI and GCS_PROJECT.
from dotenv import dotenv_values
values = dotenv_values()

In [29]:
# Configure MLflow with the tracking URI and GCS project read from the .env
# values.
tracking_uri = values["MLFLOW_TRACKING_URI"]
gcs_project = values["GCS_PROJECT"]
_configure_mlflow(tracking_uri, gcs_project)

In [67]:
import os

# Location of the locally cached LayoutLM model. The default is the original
# author's machine-specific path; set LAYOUTLM_CACHE_PATH (in the shell or in
# .env, which load_dotenv() reads at the top of the notebook) to run this
# notebook elsewhere without editing the cell.
cache_path = os.environ.get(
    "LAYOUTLM_CACHE_PATH",
    "/Users/katielamb/CatalystCoop/dagster_home/model_cache/layoutlm",
)

In [None]:
# Load the pretrained LayoutLM model from the local cache directory.
# NOTE(review): this cell has no execution count, so it was not run in the
# saved notebook state.
model = _load_pretrained_layoutlm(cache_path)

In [36]:
# Same archive/metadata setup as the earlier cell in this notebook; presumably
# repeated so this section can be run without the cells above. Confirm before
# removing.
archive = GCSArchive()
md = archive.get_metadata()

In [46]:
# Tracking sheet of labeled filings. CIK is read as a string, and lines
# starting with "#" are treated as comments and skipped.
tracking_df = pd.read_csv(
    "../labeled_data_tracking.csv",
    dtype={"CIK": str},
    comment="#",
)

In [47]:
# Display the tracking sheet (pandas truncates the rendered rows).
tracking_df

Unnamed: 0.1,Unnamed: 0,CIK,Filename,Initials,Notes
0,1,354707,edgar/data/354707/0000354707-19-000043.txt,KL,
1,2,61339,edgar/data/61339/0001161728-17-000004.txt,KL,
2,3,1317577,edgar/data/1317577/0001193125-13-356794.txt,KL,
3,4,59527,edgar/data/59527/0000059527-20-000007.txt,KL,
4,5,40545,edgar/data/40545/0000040545-04-000013.txt,KL,
...,...,...,...,...,...
155,159,1555177,edgar/data/1555177/0001555177-17-000011.txt,KL,
156,160,1142129,edgar/data/1142129/0001493152-17-005793.txt,KL,
157,161,1059025,edgar/data/1059025/0000934665-99-000002.txt,KL,
158,162,318996,edgar/data/318996/0000318996-18-000007.txt,KL,


In [49]:
# Look up one filing's metadata row; indexing with a list keeps the result a
# one-row DataFrame rather than a Series.
filing_key = "edgar/data/38079/0001558370-16-004332.txt"
filing_of_interest = md.loc[[filing_key]]

In [None]:
# Run the Ex. 21 extractor on the filing selected above.
# The selection is named `filing_of_interest` (singular); the previous
# `filings_of_interest` was never defined and raised NameError.
# NOTE(review): this rebinds `md` from the archive metadata to the extraction
# metadata returned here, so cells that index the archive metadata via
# `md.loc[...]` must be re-run after `md = archive.get_metadata()`.
extractor = Exhibit21Extractor(cloud_interface=archive)
md, extracted = extract_filings(extractor, filing_of_interest, model)