In [1]:
%load_ext autoreload
%autoreload 3

In [92]:
from importlib import resources
import os
from pathlib import Path

import numpy as np
import pandas as pd

from mozilla_sec_eia.models.sec10k.utils.cloud import GCSArchive
from mozilla_sec_eia.models.sec10k.utils.layoutlm import normalize_bboxes
from mozilla_sec_eia.models.sec10k.utils.pdf import get_pdf_data_from_path

Create a directory with all of the validation docs

In [93]:
hist_filename = "ex21_layout_histogram.csv"
hist_df = pd.read_csv(
        resources.files("mozilla_sec_eia.package_data.validation_data")
        / hist_filename,
        comment="#",
    )

In [94]:
hist_df.loc[:, "full_filename"] = "edgar/data/" + hist_df["Filename"].str.split("-", expand=True)[0] + "/" + hist_df["Filename"].str.split("-", expand=True, n=1)[1] + ".txt"

In [95]:
archive = GCSArchive()
md = archive.get_metadata()

In [96]:
pdfs_dir = Path("../sec10k_filings/validation_filings")

In [97]:
archive.get_filings(md.loc[hist_df["full_filename"]], cache_directory=pdfs_dir, cache_pdf=True)

<table> is empty
'<c> 900 West Park Drive LLC Delaware Astrum Software Corporation Delaware Australian Outsourcing Pty Limited Australia Avalon Consulting Group California Boxcar Software, Inc. Delaware Datagen, Inc. Delaware Data General Argentina S.A. Argentina Data General Australia Pty. Ltd. Australia Data General BVI, Ltd. British Virgin Islands Data General Computers Sdn Bhd'
<table> is empty
'<table> </table>'
<table> is empty
'<table> </table>'
<table> is empty
'<c> Mid Ocean Limited -- Cayman Islands Mid Ocean Holdings Ltd. 100 Bermuda Mid Ocean Reinsurance Company Ltd. 100 Bermuda Baltusrol Holdings Limited 51 Bermuda Mid Ocean Reinsurance Consulting GmbH 100 Germany Ridgewood Holdings Ltd. 100 Bermuda The Brockbank Group plc 100 United Kingdom Brockbank Holdings Limited 100 United Kingdom'
<table> is empty
'<c> HNC Software Inc., U.K. United Kingdom 100% HNC Software Inc., Japan Japan 100% Neil Thall Associates, Inc. * Georgia 100% Retek Information Systems, Inc. * Nevada 10

Get the documents in "OCR'ed" format, with bounding boxes and text. Then, create features for each document that we can train a model on.

Add ons:
* We could run this post LayoutLM inference, and filter for just bounding boxes that have entity labels, then create features.

In [154]:
def calculate_features(df):
    features = {}
    features['n_bboxes'] = len(df)
    
    # block density wasn't a very useful feature, maybe rework?
    # Calculate the bounding box density of the area of the page with text
    # x_width = df["bottom_right_x_pdf"].max() - df["top_left_x_pdf"].min()
    # y_height = df["bottom_right_y_pdf"].max() - df["top_left_y_pdf"].min()
    # text_area = x_width * y_height
    # features["block_density"] = features["n_bboxes"] / text_area
    
    # Calculate average y-distance between bounding boxes for a given document
    df = df.sort_values(by=['top_left_y_pdf', 'top_left_x_pdf'])
    y_diffs = df['top_left_y_pdf'].diff().dropna()
    features['avg_y_distance'] = y_diffs.mean()
    features['std_y_distance'] = y_diffs.std()

    # Calculate x-distance to assess horizontal alignment
    x_diffs = df.groupby('top_left_y_pdf')['top_left_x_pdf'].apply(lambda x: x.diff().dropna())
    features['avg_x_distance'] = x_diffs.mean()
    features['std_x_distance'] = x_diffs.std()

    # Define a small threshold to group bounding boxes that are on the same line
    y_threshold = 0.1 
    df.loc[:, 'line_group'] = (df['top_left_y_pdf'].diff().fillna(0).abs() > y_threshold).cumsum()
    boxes_per_line = df.groupby('line_group').size()
    features['median_boxes_per_line'] = boxes_per_line.median()
    return features

In [155]:
features_df = pd.DataFrame()
for pdf_filename in os.listdir(pdfs_dir):
    if pdf_filename.split(".")[-1] != "pdf":
        continue
    src_path = pdfs_dir / pdf_filename
    extracted, pg = get_pdf_data_from_path(src_path)
    txt = extracted["pdf_text"]
    pg_meta = extracted["page"]
    txt_df = normalize_bboxes(txt_df=txt, pg_meta_df=pg_meta)
    if txt_df.empty:
        continue
    features = calculate_features(txt_df)
    features_df = pd.concat([features_df, pd.DataFrame(index=[pdf_filename.split(".")[0]], data=features)])

In [156]:
features_df.head(1)

Unnamed: 0,n_bboxes,avg_y_distance,std_y_distance,avg_x_distance,std_x_distance,median_boxes_per_line
1739566-0001739566-21-000088,147,3.012425,8.213566,101.908806,156.981918,5.0


Create y labels from the layout histogram

In [145]:
hist_df["Layout Type"].unique()

array(['Subsidiary List', 'Blue & White Table (3 Column)',
       'List with Sentences', 'Generic Table',
       'List with Indented Nested Subsidiaries', 'Paragraph',
       'Blue & White Table (2 Column)',
       'Table with 2 Subsidiary Name Columns'], dtype=object)

In [146]:
hist_df.loc[:, 'label'] = np.where(hist_df['Layout Type'] == 'Paragraph', 1, 0)

In [147]:
hist_df.head(1)

Unnamed: 0,Filename,Layout Type,Your Initials,full_filename,label
0,38079-0001558370-16-004332,Subsidiary List,KL,edgar/data/38079/0001558370-16-004332.txt,0


In [157]:
input_df = features_df.merge(hist_df[["Filename", "label", "full_filename"]], how="left", left_index=True, right_on="Filename").reset_index(drop=True)

In [149]:
input_df.columns

Index(['n_bboxes', 'block_density', 'avg_y_distance', 'std_y_distance',
       'avg_x_distance', 'std_x_distance', 'median_boxes_per_line', 'Filename',
       'label', 'full_filename'],
      dtype='object')

Train a classifier

In [150]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [158]:
X = input_df.drop(columns=["label", "full_filename", "Filename"])
y = input_df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=16)
model = LogisticRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.95      0.95      0.95        21
           1       0.92      0.92      0.92        12

    accuracy                           0.94        33
   macro avg       0.93      0.93      0.93        33
weighted avg       0.94      0.94      0.94        33



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [159]:
coefficients = model.coef_[0]

# Create a dataframe to display feature importance
feature_importance = pd.DataFrame({
    'Feature': X.columns,
    'Coefficient': coefficients
}).sort_values(by='Coefficient', ascending=False)

In [160]:
feature_importance

Unnamed: 0,Feature,Coefficient
5,median_boxes_per_line,0.724693
4,std_x_distance,0.014897
0,n_bboxes,-0.001819
3,avg_x_distance,-0.086826
1,avg_y_distance,-0.264036
2,std_y_distance,-0.574559


In [129]:
y_pred

array([0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0])

In [131]:
input_df.head(2)

Unnamed: 0,n_bboxes,block_density,avg_y_distance,std_y_distance,avg_x_distance,std_x_distance,Filename,label,full_filename
0,147,0.000358,3.012425,8.213566,101.908806,156.981918,1739566-0001739566-21-000088,0,edgar/data/1739566/0001739566-21-000088.txt
1,137,0.001447,0.687791,2.9635,48.664894,27.802301,61986-0000061986-99-000003,1,edgar/data/61986/0000061986-99-000003.txt


In [134]:
results_df = X_test.merge(input_df[["Filename", "label"]], how="left", left_index=True, right_index=True)
results_df.loc[:, "pred_label"] = y_pred

In [135]:
results_df

Unnamed: 0,n_bboxes,block_density,avg_y_distance,std_y_distance,avg_x_distance,std_x_distance,Filename,label,pred_label
9,1032,0.001123,0.95982,2.231205,127.400566,207.930161,4962-0001193125-10-041232,0,0
50,99,0.001405,0.681775,2.955458,49.789227,40.322922,81033-0000950117-06-000927,1,1
45,440,0.000505,2.18092,5.329875,124.02813,160.601257,40545-0000040545-04-000013,0,0
39,71,9.5e-05,11.282914,22.625301,102.355408,88.687737,716646-0000950135-06-004150,0,0
7,37,0.000199,5.475447,11.809794,104.781906,107.117111,320340-0000950123-10-027168,0,0
85,964,0.001198,1.022415,2.039834,162.664932,197.020065,315858-0000315858-19-000023,0,0
51,840,0.000941,1.177622,2.447244,28.503,12.000216,860546-0001104659-07-015618,0,1
66,96,0.00081,1.290955,5.511497,28.687408,12.497664,320575-0001193125-07-117419,0,1
38,104,0.00146,0.648679,2.885869,43.733025,21.256653,205402-0000950114-99-000043,1,1
62,100,0.000389,2.792539,6.367428,142.057449,263.204254,70145-0001193125-11-321222,0,0
