This file is for parsing the txt file into a csv file for analysis.

In [1]:
import os
import pandas as pd
from tqdm.notebook import tqdm
import shutil

# File paths
input_path = "../data/raw/lm_dictionaries/Loughran-McDonald_10X_DocumentDictionaries_1993-2024.txt"
csv_output_path = "../data/processed/full_docdict.csv"
zip_output_path = "../data/processed/full_docdict"  # no extension here for make_archive

# Make sure output folder exists
os.makedirs("../data/processed", exist_ok=True)


This function is to decode the txt file structure for the parsing loop.

In [2]:
def parse_docdict_line(line):
    header_str, wordcount_str = line.strip().split('|', maxsplit=1)
    header_parts = header_str.split(',')
    word_counts = wordcount_str.split(',')

    header = {
        "cik": header_parts[0],
        "filing_date": header_parts[1],
        "accession_number": header_parts[2],
        "report_date": header_parts[3],
        "form_type": header_parts[4],
        "company_name": header_parts[5]
    }

    word_dict = {}
    for pair in word_counts:
        if ':' in pair:
            idx, count = pair.split(':')
            word_dict[f'word_{int(idx)}'] = int(count)

    return header, word_dict


This is the loop that processes the full dictionary into a csv file. It writes processed data in chinks to not overload the memory and crash the kernel.

In [None]:
batch_size = 10000
records = []
header_written = False
total_rows = 0

with open(input_path, "r", encoding="utf-8") as f_in:
    for line in tqdm(f_in, desc="Parsing and writing full filings"):
        header, word_data = parse_docdict_line(line)
        row = {**header, **word_data}
        records.append(row)

        if len(records) >= batch_size:
            df_batch = pd.DataFrame(records)

            if not header_written:
                df_batch.to_csv(csv_output_path, index=False, mode='w')
                header_written = True
            else:
                df_batch.to_csv(csv_output_path, index=False, mode='a', header=False)

            total_rows += df_batch.shape[0]
            print(f"Wrote batch: {df_batch.shape[0]} rows, {df_batch.shape[1]} columns")
            print(f"Total rows written so far: {total_rows}")

            records = []  # clear memory

            
# Write any remaining rows
if records:
    df_batch = pd.DataFrame(records)
    if not header_written:
        df_batch.to_csv(csv_output_path, index=False, mode='w')
    else:
        df_batch.to_csv(csv_output_path, index=False, mode='a', header=False)

    total_rows += df_batch.shape[0]
    print(f"Wrote final batch: {df_batch.shape[0]} rows, {df_batch.shape[1]} columns")
    print(f"Total rows written after final batch: {total_rows}")

print("Parsing and saving complete.")


Parsing and writing full filings: 0it [00:00, ?it/s]

Wrote batch: 10000 rows, 33960 columns
Total rows written so far: 10000
Wrote batch: 10000 rows, 35354 columns
Total rows written so far: 20000
Wrote batch: 10000 rows, 30662 columns
Total rows written so far: 30000
Wrote batch: 10000 rows, 38845 columns
Total rows written so far: 40000
Wrote batch: 10000 rows, 32015 columns
Total rows written so far: 50000
Wrote batch: 10000 rows, 31070 columns
Total rows written so far: 60000
Wrote batch: 10000 rows, 34647 columns
Total rows written so far: 70000
Wrote batch: 10000 rows, 34580 columns
Total rows written so far: 80000
Wrote batch: 10000 rows, 42536 columns
Total rows written so far: 90000
Wrote batch: 10000 rows, 33583 columns
Total rows written so far: 100000


Zip the final .csv for faster processing in the future

In [None]:
# Zip the final CSV
if os.path.exists(csv_output_path) and not os.path.exists(zip_output_path + ".zip"):
    shutil.make_archive(zip_output_path, 'zip', root_dir="../data/processed", base_dir="full_docdict.csv")
    print("CSV zipped successfully.")
