In [1]:
%load_ext autoreload
%autoreload 3

In [92]:
from importlib import resources
import os
from pathlib import Path

import numpy as np
import pandas as pd

from mozilla_sec_eia.models.sec10k.utils.cloud import GCSArchive
from mozilla_sec_eia.models.sec10k.utils.layoutlm import normalize_bboxes
from mozilla_sec_eia.models.sec10k.utils.pdf import get_pdf_data_from_path

Create a directory with all of the validation docs

In [93]:
hist_filename = "ex21_layout_histogram.csv"
hist_df = pd.read_csv(
        resources.files("mozilla_sec_eia.package_data.validation_data")
        / hist_filename,
        comment="#",
    )

In [94]:
hist_df.loc[:, "full_filename"] = "edgar/data/" + hist_df["Filename"].str.split("-", expand=True)[0] + "/" + hist_df["Filename"].str.split("-", expand=True, n=1)[1] + ".txt"

In [95]:
archive = GCSArchive()
md = archive.get_metadata()

In [96]:
pdfs_dir = Path("../sec10k_filings/validation_filings")

In [97]:
archive.get_filings(md.loc[hist_df["full_filename"]], cache_directory=pdfs_dir, cache_pdf=True)

<table> is empty
'<c> 900 West Park Drive LLC Delaware Astrum Software Corporation Delaware Australian Outsourcing Pty Limited Australia Avalon Consulting Group California Boxcar Software, Inc. Delaware Datagen, Inc. Delaware Data General Argentina S.A. Argentina Data General Australia Pty. Ltd. Australia Data General BVI, Ltd. British Virgin Islands Data General Computers Sdn Bhd'
<table> is empty
'<table> </table>'
<table> is empty
'<table> </table>'
<table> is empty
'<c> Mid Ocean Limited -- Cayman Islands Mid Ocean Holdings Ltd. 100 Bermuda Mid Ocean Reinsurance Company Ltd. 100 Bermuda Baltusrol Holdings Limited 51 Bermuda Mid Ocean Reinsurance Consulting GmbH 100 Germany Ridgewood Holdings Ltd. 100 Bermuda The Brockbank Group plc 100 United Kingdom Brockbank Holdings Limited 100 United Kingdom'
<table> is empty
'<c> HNC Software Inc., U.K. United Kingdom 100% HNC Software Inc., Japan Japan 100% Neil Thall Associates, Inc. * Georgia 100% Retek Information Systems, Inc. * Nevada 10

Get the documents in "OCR'ed" format, with bounding boxes and text. Then, create features for each document that we can train a model on.

Add ons:
* We could run this post LayoutLM inference, and filter for just bounding boxes that have entity labels, then create features.

In [262]:
def calculate_features(df):
    features = {}
    # features['n_bboxes'] = len(df)
    
    # block density wasn't a very useful feature, maybe rework?
    # Calculate the bounding box density of the area of the page with text
    # x_width = df["bottom_right_x_pdf"].max() - df["top_left_x_pdf"].min()
    y_height = df["bottom_right_y_pdf"].max() - df["top_left_y_pdf"].min()
    # text_area = x_width * y_height
    features["block_y_density"] = len(df) / y_height
    
    # Calculate average y-distance between bounding boxes for a given document
    df = df.sort_values(by=['top_left_y_pdf', 'top_left_x_pdf'])
    y_diffs = df['top_left_y_pdf'].diff().dropna()
    features['avg_y_distance'] = y_diffs.mean()
    features['std_y_distance'] = y_diffs.std()

    # Define a small threshold to group bounding boxes that are on the same line
    y_threshold = 0.5
    df.loc[:, 'line_group'] = (df['top_left_y_pdf'].diff().fillna(0).abs() > y_threshold).cumsum()
    
    # Calculate x-distance to assess horizontal alignment
    x_diffs = df.groupby('line_group')['top_left_x_pdf'].apply(lambda x: x.diff().dropna())
    # x_diffs = df["top_left_x_pdf"].diff().dropna()
    features['avg_x_distance'] = x_diffs.mean()
    # features['std_x_distance'] = x_diffs.std()
    
    boxes_per_line = df.groupby('line_group').size()
    features['median_boxes_per_line'] = boxes_per_line.median()
    
    #df['line_width'] = df.groupby('line_group')['bottom_right_x_pdf'].transform(lambda x: x.max() - x.min())
    #features['mean_line_width'] = df['line_width'].mean()

    return features

In [263]:
features_df = pd.DataFrame()
for pdf_filename in os.listdir(pdfs_dir):
    if pdf_filename.split(".")[-1] != "pdf":
        continue
    src_path = pdfs_dir / pdf_filename
    extracted, pg = get_pdf_data_from_path(src_path)
    txt = extracted["pdf_text"]
    pg_meta = extracted["page"]
    txt_df = normalize_bboxes(txt_df=txt, pg_meta_df=pg_meta)
    if txt_df.empty:
        continue
    features = calculate_features(txt_df)
    features_df = pd.concat([features_df, pd.DataFrame(index=[pdf_filename.split(".")[0]], data=features)])

In [264]:
features_df.head(1)

Unnamed: 0,block_y_density,avg_y_distance,std_y_distance,avg_x_distance,median_boxes_per_line
1739566-0001739566-21-000088,0.322609,3.012425,8.213566,101.908806,5.0


Create y labels from the layout histogram

In [226]:
hist_df["Layout Type"].unique()

array(['Subsidiary List', 'Blue & White Table (3 Column)',
       'List with Sentences', 'Generic Table',
       'List with Indented Nested Subsidiaries', 'Paragraph',
       'Blue & White Table (2 Column)',
       'Table with 2 Subsidiary Name Columns'], dtype=object)

In [254]:
hist_df.loc[:, 'label'] = np.where(hist_df['Layout Type'] == 'Paragraph', 1, 0)

In [255]:
hist_df.head(1)

Unnamed: 0,Filename,Layout Type,Your Initials,full_filename,label
0,38079-0001558370-16-004332,Subsidiary List,KL,edgar/data/38079/0001558370-16-004332.txt,0


In [265]:
input_df = features_df.merge(hist_df[["Filename", "label", "full_filename"]], how="left", left_index=True, right_on="Filename").reset_index(drop=True)

In [209]:
input_df.columns

Index(['block_y_density', 'avg_y_distance', 'std_y_distance', 'avg_x_distance',
       'std_x_distance', 'median_boxes_per_line', 'Filename', 'label',
       'full_filename'],
      dtype='object')

Train a classifier

In [249]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

In [279]:
X = input_df.drop(columns=["label", "full_filename", "Filename"])
y = input_df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=16)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_train)
model = SVC(kernel='linear')
model.fit(X_scaled, y_train)

X_test_scaled = scaler.transform(X_test)
y_pred = model.predict(X_test_scaled)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.95      0.95      0.95        21
           1       0.92      0.92      0.92        12

    accuracy                           0.94        33
   macro avg       0.93      0.93      0.93        33
weighted avg       0.94      0.94      0.94        33



# Scratch: feature selection and cross validation of models

In [278]:
results_df = X_test.merge(input_df[["Filename", "label"]], how="left", left_index=True, right_index=True)
results_df.loc[:, "pred_label"] = y_pred

In [280]:
results_df[results_df.label != results_df.pred_label]

Unnamed: 0,block_y_density,avg_y_distance,std_y_distance,avg_x_distance,median_boxes_per_line,Filename,label,pred_label
66,0.72974,1.290955,5.511497,28.687408,16.5,320575-0001193125-07-117419,0,1
26,0.229182,3.953266,12.485551,41.435265,3.0,1093672-0001654954-23-003112,1,0


In [266]:
from sklearn.linear_model import LogisticRegression

X = input_df.drop(columns=["label", "full_filename", "Filename"])
y = input_df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=16)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_train)
model = LogisticRegression()
model.fit(X_scaled, y_train)

X_test_scaled = scaler.transform(X_test)
y_pred = model.predict(X_test_scaled)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.95      0.95      0.95        21
           1       0.92      0.92      0.92        12

    accuracy                           0.94        33
   macro avg       0.93      0.93      0.93        33
weighted avg       0.94      0.94      0.94        33



In [267]:
coefficients = model.coef_[0]

# Create a dataframe to display feature importance
feature_importance = pd.DataFrame({
    'Feature': X.columns,
    'Coefficient': coefficients
}).sort_values(by='Coefficient', ascending=False)

In [268]:
feature_importance

Unnamed: 0,Feature,Coefficient
4,median_boxes_per_line,2.087307
0,block_y_density,0.484606
2,std_y_distance,-0.692503
1,avg_y_distance,-0.810105
3,avg_x_distance,-0.88044


Random Forest

In [269]:
from sklearn.ensemble import RandomForestClassifier

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train a Random Forest classifier
rf = RandomForestClassifier(n_estimators=100, random_state=16)
rf.fit(X_scaled, y)

# Get feature importances
feature_importances = pd.Series(rf.feature_importances_, index=X_train.columns)

# Sort and display feature importances
important_features = feature_importances.sort_values(ascending=False)
important_features

median_boxes_per_line    0.396481
avg_y_distance           0.250036
block_y_density          0.193412
std_y_distance           0.119810
avg_x_distance           0.040261
dtype: float64

RFE with Logistic Regression

In [270]:
from sklearn.feature_selection import RFE

# Create a Logistic Regression model
lr = LogisticRegression()
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# Initialize Recursive Feature Elimination (RFE)
rfe = RFE(lr, n_features_to_select=4) 
rfe.fit(X_scaled, y)

# Get the ranking of the features
feature_ranking = pd.Series(rfe.ranking_, index=X.columns)

# Features selected by RFE
selected_features = feature_ranking[feature_ranking == 1].index.tolist()
print("Selected features:", selected_features)


Selected features: ['block_y_density', 'avg_y_distance', 'avg_x_distance', 'median_boxes_per_line']


Cross Validate different classifiers

In [272]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import make_scorer, accuracy_score, f1_score

# Define the classifiers
classifiers = {
    'Logistic Regression': LogisticRegression(max_iter=500),
    'Random Forest': RandomForestClassifier(n_estimators=100),
    'SVM': SVC(kernel='linear')
}

# Create a scoring function (use accuracy, F1-score, etc.)
scorer = make_scorer(f1_score)

# Dictionary to store cross-validation results
cv_results = {}

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# Perform cross-validation for each classifier
for name, clf in classifiers.items():
    # Perform 5-fold cross-validation
    scores = cross_val_score(clf, X_scaled, y, cv=5, scoring=scorer)
    
    # Store the mean score and standard deviation
    cv_results[name] = {
        'mean_f1': np.mean(scores),
        'std_f1': np.std(scores)
    }

# Print comparison of classifiers
for clf_name, scores in cv_results.items():
    print(f"{clf_name}: Mean F1-Score = {scores['mean_f1']:.4f}, Std Dev = {scores['std_f1']:.4f}")


Logistic Regression: Mean F1-Score = 0.9616, Std Dev = 0.0315
Random Forest: Mean F1-Score = 0.9499, Std Dev = 0.0469
SVM: Mean F1-Score = 0.9749, Std Dev = 0.0308
