In [1]:
import os
import pandas as pd

# File paths as plain strings
input_path = "../data/raw/lm_dictionaries/Loughran-McDonald_10X_DocumentDictionaries_1993-2024.txt"
output_path = "../temp/sample_docdict.csv"

# Ensure output folder exists
os.makedirs("temp", exist_ok=True)


### Creating a sample

In [2]:
# CIKs of 5 companies: Apple, Microsoft, Amazon, Google, Johnson & Johnson
target_ciks = {"320193", "789019", "1018724", "1652044", "200406"}

# Target filing years
start_year = 2012
end_year = 2016

In [3]:
def parse_docdict_line(line):
    header_str, wordcount_str = line.strip().split('|', maxsplit=1)
    header_parts = header_str.split(',')
    word_counts = wordcount_str.split(',')

    header = {
        "cik": header_parts[0],
        "filing_date": header_parts[1],
        "accession_number": header_parts[2],
        "report_date": header_parts[3],
        "form_type": header_parts[4],
        "company_name": header_parts[5]
    }

    word_dict = {}
    for pair in word_counts:
        if ':' in pair:
            idx, count = pair.split(':')
            word_dict[f'word_{int(idx)}'] = int(count)

    return header, word_dict


def is_target_filing(header):
    year = int(header["filing_date"][:4])
    return header["cik"] in target_ciks and start_year <= year <= end_year


#### Build sample dataframe

In [5]:
from tqdm import tqdm

records = []

with open(input_path, "r", encoding="utf-8") as f:
    for line in tqdm(f, desc="Reading filings"):
        header, word_data = parse_docdict_line(line)
        if is_target_filing(header):
            row = {**header, **word_data}
            records.append(row)

df = pd.DataFrame(records)
df.to_csv(output_path, index=False)
df.shape


Reading filings: 1224508it [18:14, 1118.42it/s]


(87, 8438)

In [None]:
df
