In [1]:
# Enable IPython's autoreload extension so edited project modules are re-imported
# automatically instead of requiring a kernel restart.
%load_ext autoreload
# NOTE(review): mode 3 is the "complete" mode in recent IPython releases — confirm the
# installed IPython version supports it (older releases only accept 0/1/2).
%autoreload 3

In [2]:
import os
from pathlib import Path

import pandas as pd

from mozilla_sec_eia.models.sec10k.utils.cloud import GCSArchive

In [3]:
# Build the project's GCS archive helper and fetch the filing metadata table from it.
# NOTE(review): presumably requires GCS credentials/configuration in the environment — confirm.
archive = GCSArchive()
md = archive.get_metadata()

In [4]:
# Inspect the metadata frame (indexed by filing filename; pandas truncates the middle rows).
md

Unnamed: 0_level_0,cik,company_name,form_type,date_filed,exhibit_21_version,year_quarter
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
edgar/data/17206/0000017206-94-000007.txt,17206,CAPITAL HOLDING CORP,10-K/A,1993-12-22,,1993q4
edgar/data/29082/0000950131-94-000021.txt,29082,DISNEY WALT CO,10-K,1993-12-22,21,1993q4
edgar/data/32377/0000032377-94-000001.txt,32377,ELIZABETHTOWN GAS CO,10-K,1993-12-13,21,1993q4
edgar/data/353944/0000353944-94-000005.txt,353944,INTERNATIONAL GAME TECHNOLOGY,10-K,1993-12-23,21,1993q4
edgar/data/60512/0000060512-94-000006.txt,60512,LOUISIANA LAND & EXPLORATION CO,10-K/A,1993-10-07,,1993q4
...,...,...,...,...,...,...
edgar/data/932021/0001493152-23-046428.txt,932021,GLOBAL TECHNOLOGIES LTD,10-K,2023-12-29,21.1,2023q4
edgar/data/933974/0001558370-23-019262.txt,933974,"Azenta, Inc.",10-K,2023-11-21,21.0,2023q4
edgar/data/935419/0001628280-23-041580.txt,935419,"RCI HOSPITALITY HOLDINGS, INC.",10-K,2023-12-14,21.1,2023q4
edgar/data/936395/0000936395-23-000044.txt,936395,CIENA CORP,10-K,2023-12-15,21.1,2023q4


In [5]:
# Load every parquet file in `paragraph_layout_md` into a single frame `df`.
dir_name = Path("paragraph_layout_md")
# Sort the paths so the resulting row order is reproducible; directory listing
# order is otherwise arbitrary. A missing directory raises FileNotFoundError.
parquet_paths = sorted(path for path in dir_name.iterdir() if path.suffix == ".parquet")
# Read all files first and concatenate once: calling pd.concat inside the loop
# re-copies the accumulated rows on every iteration (quadratic in total rows).
yq_dfs = [pd.read_parquet(path) for path in parquet_paths]
# pd.concat raises ValueError on an empty list, so a directory without parquet
# files yields an empty frame instead of an error.
df = pd.concat(yq_dfs) if yq_dfs else pd.DataFrame()

In [6]:
# Inspect the combined labels: one boolean `paragraph` flag per filing id in the index.
df

Unnamed: 0,paragraph
1011174-0001193125-10-030674,False
1010612-0000950123-10-019499,False
1003410-0001193125-10-046549,True
1011308-0000921895-10-000357,True
1009672-0000950123-10-018301,True
...,...
898293-0000950144-04-010550,False
894490-0001193125-04-212822,False
930803-0000950136-04-004585,False
893430-0001193125-04-212647,False


In [7]:
# Rebuild each filing's archive path from its "<cik>-<accession>" index label so it can
# be matched against the metadata filenames. Only the first hyphen becomes a slash
# (n=1), so the hyphens inside the accession number are preserved.
df.loc[:, "full_filename"] = (
    "edgar/data/"
    + df.index.str.replace("-", "/", n=1)
    + ".txt"
)

In [8]:
# Cast the filing date column to pandas datetimes (nanosecond resolution), overwriting
# the column on `md`.
md["date_filed"] = md["date_filed"].astype("datetime64[ns]")

In [9]:
# Spot-check that the derived `full_filename` column has the expected path format.
df.head(2)

Unnamed: 0,paragraph,full_filename
1011174-0001193125-10-030674,False,edgar/data/1011174/0001193125-10-030674.txt
1010612-0000950123-10-019499,False,edgar/data/1010612/0000950123-10-019499.txt


In [10]:
# What share of the labelled filings use a paragraph layout?
# (The displayed value is a fraction between 0 and 1, not a percentage.)
# The left join keeps every metadata row; rows that received no paragraph label are
# then dropped. validate="1:1" makes the merge fail if either key is duplicated.
md_merged = (
    md.reset_index()
    .merge(df, how="left", left_on="filename", right_on="full_filename", validate="1:1")
    .dropna(subset="paragraph")
)
len(md_merged[md_merged["paragraph"]]) / len(md_merged)

0.27785882162249775

In [11]:
# Spot-check the merged frame: metadata columns plus the `paragraph` flag and join key.
md_merged.head(2)

Unnamed: 0,filename,cik,company_name,form_type,date_filed,exhibit_21_version,year_quarter,paragraph,full_filename
6,edgar/data/100240/0000950144-94-000787.txt,100240,TURNER BROADCASTING SYSTEM INC,10-K,1994-03-31,21,1994q1,False,edgar/data/100240/0000950144-94-000787.txt
11,edgar/data/100885/0000100885-94-000006.txt,100885,UNION PACIFIC CORP,10-K,1994-03-29,21,1994q1,False,edgar/data/100885/0000100885-94-000006.txt


In [12]:
# What share of CIKs is covered *only* by paragraph-layout filings?
# (The displayed value is a fraction between 0 and 1, not a percentage.)

# Every CIK that has at least one labelled filing.
all_ciks = set(md_merged["cik"])
# CIKs that have at least one filing that is NOT paragraph layout.
no_paragraph_ciks = set(md_merged.loc[md_merged["paragraph"].eq(False), "cik"])
# Whatever remains has nothing but paragraph-layout filings.
only_paragraph_ciks = all_ciks.difference(no_paragraph_ciks)
# Express that as a share of all CIKs.
len(only_paragraph_ciks) / len(all_ciks)

0.10292571287189956

In [13]:
# Absolute number of CIKs whose labelled filings are all paragraph layout.
len(only_paragraph_ciks)

1664

In [None]:
# TODO: what percentage of (CIK, year-quarter) coverage do we retain if we exclude all paragraph-layout filings?