In [1]:
%load_ext autoreload
%autoreload 3

In [75]:
import os
from pathlib import Path

import numpy as np
import pandas as pd

from mozilla_sec_eia.models.sec10k.utils.cloud import GCSArchive
from mozilla_sec_eia.models.sec10k.utils.layoutlm import normalize_bboxes
from mozilla_sec_eia.models.sec10k.utils.pdf import get_pdf_data_from_path

Create a directory with all of the validation docs

In [3]:
hist_df = pd.read_csv("validation_layout_histogram.csv")

In [5]:
hist_df.loc[:, "full_filename"] = "edgar/data/" + hist_df["Filename"].str.split("-", expand=True)[0] + "/" + hist_df["Filename"].str.split("-", expand=True, n=1)[1] + ".txt"

In [6]:
archive = GCSArchive()
md = archive.get_metadata()

In [11]:
pdfs_dir = Path("../sec10k_filings/validation_filings")

In [None]:
archive.get_filings(md.loc[hist_df["full_filename"]], cache_directory=pdfs_dir, cache_pdf=True)

Get the documents in "OCR'ed" format, with bounding boxes and text. Then, create features for each document that we can train a model on.

Add ons:
* We could run this post LayoutLM inference, and filter for just bounding boxes that have entity labels, then create features.

In [61]:
def calculate_features(df):
    features = {}
    features['n_bboxes'] = len(df)
    
    # Calculate the bounding box density of the area of the page with text
    x_width = df["top_left_x_pdf"].max() - df["top_left_x_pdf"].min()
    y_height = df["top_left_y_pdf"].max() - df["top_left_y_pdf"].min()
    text_area = x_width * y_height
    features["block_density"] = features["n_bboxes"] / text_area
    
    # Calculate average y-distance between bounding boxes for a given document
    df = df.sort_values(by=['top_left_y_pdf', 'top_left_x_pdf'])
    y_diffs = df['top_left_y_pdf'].diff().dropna()
    features['avg_y_distance'] = y_diffs.mean()
    features['std_y_distance'] = y_diffs.std()

    # Calculate x-distance to assess horizontal alignment
    x_diffs = df.groupby('top_left_y_pdf')['top_left_x_pdf'].apply(lambda x: x.diff().dropna())
    features['avg_x_distance'] = x_diffs.mean()
    features['std_x_distance'] = x_diffs.std()
    
    return features

In [71]:
features_df = pd.DataFrame()
for pdf_filename in os.listdir(pdfs_dir):
    if pdf_filename.split(".")[-1] != "pdf":
        continue
    src_path = pdfs_dir / pdf_filename
    extracted, pg = get_pdf_data_from_path(src_path)
    txt = extracted["pdf_text"]
    pg_meta = extracted["page"]
    txt_df = normalize_bboxes(txt_df=txt, pg_meta_df=pg_meta)
    features = calculate_features(txt_df)
    features_df = pd.concat([features_df, pd.DataFrame(index=[pdf_filename.split(".")[0]], data=features)])

In [72]:
features_df.head(1)

Unnamed: 0,n_bboxes,block_density,avg_y_distance,std_y_distance,avg_x_distance,std_x_distance
1739566-0001739566-21-000088,147,0.000383,3.012425,8.213566,101.908806,156.981918


Create y labels from the layout histogram

In [73]:
hist_df["Layout Type"].unique()

array(['Subsidiary List', 'Blue & White Table (3 Column)',
       'List with Sentences', 'Generic Table',
       'List with Indented Nested Subsidiaries', 'Paragraph',
       'Blue & White Table (2 Column)',
       'Table with 2 Subsidiary Name Columns'], dtype=object)

In [76]:
hist_df.loc[:, 'label'] = np.where(hist_df['Layout Type'] == 'Paragraph', 1, 0)

In [77]:
hist_df.head(1)

Unnamed: 0,Filename,Layout Type,Your Initials,full_filename,label
0,38079-0001558370-16-004332,Subsidiary List,KL,edgar/data/38079/0001558370-16-004332.txt,0


In [87]:
input_df = features_df.merge(hist_df[["Filename", "label", "full_filename"]], how="left", left_index=True, right_on="Filename").reset_index(drop=True)

In [88]:
input_df.columns

Index(['n_bboxes', 'block_density', 'avg_y_distance', 'std_y_distance',
       'avg_x_distance', 'std_x_distance', 'Filename', 'label',
       'full_filename'],
      dtype='object')

Train a classifier

In [85]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [89]:
X = input_df.drop(columns=["label", "full_filename", "Filename"])
y = input_df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=16)
model = LogisticRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       1.00      1.00      1.00         9
           1       1.00      1.00      1.00         6

    accuracy                           1.00        15
   macro avg       1.00      1.00      1.00        15
weighted avg       1.00      1.00      1.00        15

