In [71]:
import os
import re
import time as t
import pandas as pd
import xml.etree.ElementTree as ET


In [73]:
# Helper function to safely get text from an XML element
def get_text(element, path, namespaces):
    """Return the text of the first child of ``element`` matching ``path``.

    ``None`` is returned when ``element`` itself is ``None`` or when no
    element matches ``path`` (resolved with the ``namespaces`` prefix map).
    """
    if element is None:
        return None
    node = element.find(path, namespaces)
    # Compare against None explicitly: an Element without children is falsy.
    return node.text if node is not None else None

# Helper function to get all texts from elements matching a path
def get_all_texts(element, path, namespaces):
    """Return the text of every element under ``element`` that matches ``path``.

    Elements whose text is ``None`` are left out. An empty list is returned
    when ``element`` is ``None`` or nothing matches.
    """
    if element is None:
        return []
    return [
        node.text
        for node in element.findall(path, namespaces)
        if node is not None and node.text is not None
    ]


In [75]:
# Function to extract form type from the SEC-HEADER part of the .txt file
def extract_form_type_from_header(full_txt_content):
    """
    Extracts the CONFORMED SUBMISSION TYPE or FORM TYPE from the SEC-HEADER.

    'CONFORMED SUBMISSION TYPE:' is tried first and 'FORM TYPE:' second; the
    first label that carries a value on its own line wins.

    Args:
        full_txt_content: Full text of the .txt filing.

    Returns:
        The form type string (e.g. 'MA-W', 'MA-I/A'), or None when neither
        label is found with a value.
    """
    for label in ('CONFORMED SUBMISSION TYPE', 'FORM TYPE'):
        # '/' is part of the token so amended types such as 'MA-I/A' are not
        # truncated to 'MA-I'. Only spaces/tabs may separate label and value,
        # so an empty value cannot swallow the first word of the next line.
        match = re.search(label + r':[ \t]*([A-Z0-9/-]+)', full_txt_content)
        if match:
            return match.group(1)

    return None

# Helper function to extract the "FILED AS OF DATE" value from the SEC-HEADER
def extract_filed_as_of_date_from_header(full_txt_content):
    """
    Pulls the eight-digit value that follows 'FILED AS OF DATE:' in the filing text.

    Returns the digits as a string (e.g. '20170925'), or None when the label
    is absent or is not followed by eight digits.
    """
    date_match = re.search(r'FILED AS OF DATE:\s*(\d{8})', full_txt_content)
    return date_match.group(1) if date_match else None


In [77]:
# Function to extract the XML content from a SEC .txt filing
def extract_xml_from_sec_txt_filing(full_txt_content, form_type):
    """
    Extracts the XML content from an SEC .txt filing based on common patterns.

    The text between the first '<XML>' marker and the first '</XML>' marker
    that follows it is taken. Anything ahead of the '<?xml' declaration is
    dropped; without a declaration, anything ahead of the first opening tag
    is dropped.

    Args:
        full_txt_content: Full text of the .txt filing.
        form_type: Accepted for interface compatibility; not used here.

    Returns:
        The extracted XML string, or None when no '<XML>...</XML>' section
        exists or the section contains only whitespace.
    """
    xml_start_tag = "<XML>"
    xml_end_tag = "</XML>"

    start_idx = full_txt_content.find(xml_start_tag)
    if start_idx == -1:
        return None

    # Search for the closing marker only AFTER the opening one, so a stray
    # '</XML>' earlier in the file cannot hide a valid section.
    body_start = start_idx + len(xml_start_tag)
    end_idx = full_txt_content.find(xml_end_tag, body_start)
    if end_idx == -1:
        return None

    extracted_xml = full_txt_content[body_start:end_idx]

    xml_declaration_start = extracted_xml.find("<?xml")
    if xml_declaration_start != -1:
        extracted_xml = extracted_xml[xml_declaration_start:]
    else:
        # No declaration: start at the first thing that looks like an opening tag.
        root_tag_match = re.search(r'<\s*([a-zA-Z0-9_.-]+)(?:\s|>|\n)', extracted_xml)
        if root_tag_match:
            extracted_xml = extracted_xml[root_tag_match.start():]

    return extracted_xml if extracted_xml.strip() else None


In [None]:

# Extract MA-W fields from the parsed XML, plus the filing date from the SEC-HEADER text
def extract_data_from_xml(root, namespaces, file_name='', folder_name='', full_txt_content=''):
    """
    Extracts structured data specifically for Form MA-W from a parsed XML ElementTree root.
    Includes filing dates from both header and XML content.

    Args:
        root: Root element of the parsed filing XML.
        namespaces: Optional prefix -> URI mapping. Its entries override the
            built-in MA-W defaults defined below; None or {} keeps the defaults.
        file_name: Source file name; used in messages and stored in 'file_name'.
        folder_name: Stored in the 'folder_name' column.
        full_txt_content: Raw text of the .txt filing, searched for
            'FILED AS OF DATE'.

    Returns:
        A one-row DataFrame. An empty DataFrame is returned when headerData or
        formData cannot be found, when the submission type is not 'MA-W', or
        when any exception is raised during extraction (the error is printed,
        not re-raised).
    """
    try:
        ns = {
            'mafiler': 'http://www.sec.gov/edgar/mafiler',
            'com': 'http://www.sec.gov/edgar/common_ma',
            'com1': 'http://www.sec.gov/edgar/common',
            'ma': 'http://www.sec.gov/edgar/ma_drp',
            'ma1': 'http://www.sec.gov/edgar/ma_common_drp',
            'com2': 'http://www.sec.gov/edgar/common_drp',
            'mawfiler': 'http://www.sec.gov/edgar/mawfiler'
        }
        # Honour the caller's mapping instead of silently discarding it;
        # the defaults above fill in any prefix the caller did not supply.
        if namespaces:
            ns.update(namespaces)

        # headerData is tried with the mawfiler prefix first, then unprefixed.
        header_info = root.find('mawfiler:headerData', ns)
        if header_info is None:
            header_info = root.find('headerData', ns)
        if header_info is None:
            print(f"Warning: Could not find headerData in file {file_name}. Cannot extract common header info.")
            return pd.DataFrame()

        submission_type = (
            get_text(header_info, 'mawfiler:submissionType', ns)
            or get_text(header_info, 'submissionType', ns)
        )
        if submission_type != 'MA-W':
            print(f"Skipping {file_name}: Form type is '{submission_type}', but this function is configured for MA-W only.")
            return pd.DataFrame()

        # Missing containers are replaced by an empty placeholder element so
        # the lookups below simply yield None / [] rather than failing.
        filer_info = header_info.find('mawfiler:filerInfo', ns)
        if filer_info is None:
            print(f"Warning: Could not find filerInfo in headerData for MA-W in file {file_name}.")
            filer_info = ET.Element("dummy")

        filer = filer_info.find('com:filer', ns)
        if filer is None:
            print(f"Warning: Could not find filer in filerInfo for MA-W in file {file_name}.")
            filer = ET.Element("dummy")

        contact = filer_info.find('com:contact', ns)
        if contact is None:
            print(f"Warning: Could not find contact in filerInfo for MA-W in file {file_name}.")
            contact = ET.Element("dummy")

        # Single formData lookup, reused for every form-level field.
        form_data_root = root.find('mawfiler:formData', ns)
        if form_data_root is None:
            print(f"Warning: Could not find formData for MA-W in file {file_name}.")
            return pd.DataFrame()

        def form_field(path):
            """Text of the first formData descendant matching ``path``, or None."""
            return get_text(form_data_root, path, ns)

        # Shared XPath prefixes (relative to formData).
        contact_base = 'mawfiler:contactPersonInfo/mawfiler:nameAddressPhone'
        contact_addr = contact_base + '/mawfiler:addressInfo/com:address'
        person_base = 'mawfiler:booksAndRecords/mawfiler:personLocation/mawfiler:personInfo'
        person_addr = person_base + '/mawfiler:addressInfo/com:address'
        location_base = 'mawfiler:booksAndRecords/mawfiler:personLocation/mawfiler:locationInfo'
        location_nap = location_base + '/mawfiler:nameAddressPhone'
        location_addr = location_nap + '/mawfiler:addressInfo/com:address'
        # The 'muncipal' spelling is kept verbatim from the working query path.
        execution_base = 'mawfiler:execution/mawfiler:muncipalAdvisoryFirm'

        # Key order defines the column order of the resulting DataFrame.
        data = {
            # --- headerData / filerInfo ---
            'submissionType': submission_type,
            'filerId': get_text(filer, 'com1:filerId', ns),
            'filerCcc': get_text(filer, 'com1:filerCcc', ns),
            'filerFileNumber': get_text(filer, 'com1:filerFileNumber', ns),
            'contactName': get_text(contact, 'com1:name', ns),
            'contactPhoneNumber': get_text(contact, 'com1:phoneNumber', ns),
            'contactEmail': get_text(filer_info, 'com:contactEmail', ns),
            'notificationEmails': get_all_texts(filer_info, 'com:notifications/com1:internetNotificationAddress', ns),
            # --- filing dates: XML execution date and SEC-HEADER date ---
            'filing_date_xml': form_field(execution_base + '/com1:date'),
            'filed_as_of_date_header': extract_filed_as_of_date_from_header(full_txt_content),
            # --- formData ---
            'fullLegalName': form_field('mawfiler:fullLegalName'),
            'fileNumber_formData': form_field('mawfiler:fileNumber'),
            'contactPerson_firstName': form_field(contact_base + '/mawfiler:individualName/com:firstName'),
            'contactPerson_middleName': form_field(contact_base + '/mawfiler:individualName/com:middleName'),
            'contactPerson_lastName': form_field(contact_base + '/mawfiler:individualName/com:lastName'),
            'contactPerson_street1': form_field(contact_addr + '/com1:street1'),
            'contactPerson_city': form_field(contact_addr + '/com1:city'),
            'contactPerson_stateOrCountry': form_field(contact_addr + '/com1:stateOrCountry'),
            'contactPerson_zipCode': form_field(contact_addr + '/com1:zipCode'),
            'contactPerson_phoneNumber': form_field(contact_base + '/mawfiler:phoneNumber'),
            'contactPerson_title': form_field('mawfiler:contactPersonInfo/mawfiler:title'),
            'contactPerson_email': form_field('mawfiler:contactPersonInfo/mawfiler:email'),
            'isAdvisoryContract': form_field('mawfiler:isAdvisoryContract'),
            'isUnsatisfiedJudgementsOrLiens': form_field('mawfiler:isUnsatisfiedJudgementsOrLiens'),
            'isReceivedAnyPrepaidFee': form_field('mawfiler:isReceivedAnyPrepaidFee'),
            'isBorrowedNotRepaid': form_field('mawfiler:isBorrowedNotRepaid'),
            'booksAndRecords_personName': form_field(person_base + '/mawfiler:name'),
            'booksAndRecords_personStreet1': form_field(person_addr + '/com1:street1'),
            'booksAndRecords_personCity': form_field(person_addr + '/com1:city'),
            'booksAndRecords_personStateOrCountry': form_field(person_addr + '/com1:stateOrCountry'),
            'booksAndRecords_personZipCode': form_field(person_addr + '/com1:zipCode'),
            'booksAndRecords_personPhoneNumber': form_field(person_base + '/mawfiler:phoneNumber'),
            'booksAndRecords_locationName': form_field(location_nap + '/mawfiler:name'),
            'booksAndRecords_locationStreet1': form_field(location_addr + '/com1:street1'),
            'booksAndRecords_locationCity': form_field(location_addr + '/com1:city'),
            'booksAndRecords_locationStateOrCountry': form_field(location_addr + '/com1:stateOrCountry'),
            'booksAndRecords_locationZipCode': form_field(location_addr + '/com1:zipCode'),
            'booksAndRecords_locationPhoneNumber': form_field(location_nap + '/mawfiler:phoneNumber'),
            'booksAndRecords_description': form_field(location_base + '/mawfiler:description'),
            'execution_signature': form_field(execution_base + '/com1:signature'),
            'execution_signerName': form_field(execution_base + '/com1:signerName'),
            'execution_title': form_field(execution_base + '/com1:title'),
            # --- provenance ---
            'file_name': file_name,
            'folder_name': folder_name,
        }

        return pd.json_normalize(data, sep='_')

    except Exception as e:
        # Best-effort by design: one bad filing must not abort a batch run.
        print(f'Error during XML parsing in file {file_name}: {str(e)}')
        return pd.DataFrame()


In [None]:
# Main execution block
if __name__ == "__main__":
    # Namespace prefix map handed to extract_data_from_xml for every filing.
    namespaces = {
        'mafiler': 'http://www.sec.gov/edgar/mafiler',
        'com': 'http://www.sec.gov/edgar/common_ma',
        'com1': 'http://www.sec.gov/edgar/common',
        'ma': 'http://www.sec.gov/edgar/ma_drp',
        'ma1': 'http://www.sec.gov/edgar/ma_common_drp',
        'com2': 'http://www.sec.gov/edgar/common_drp',
        'mawfiler': 'http://www.sec.gov/edgar/mawfiler'
    }

    # Input tree to scan; the combined CSV is written into the same folder.
    root_folder_to_scan = r"D:\PhD_Fin\Muni_adv_PBF\Data2"
    output_csv_path = os.path.join(root_folder_to_scan, "sec_form_ma_w.csv")

    all_dfs = []
    processed_files_count = 0

    # Walk the tree once: the same list provides the total for the progress
    # messages and the work queue for the processing loop.
    txt_files = [
        (dirpath, filename)
        for dirpath, _dirnames, filenames in os.walk(root_folder_to_scan)
        for filename in filenames
        if filename.endswith(".txt")
    ]
    total_files_to_process = len(txt_files)

    print(f"Starting to process {total_files_to_process} .txt files in '{root_folder_to_scan}'...")
    start_time = t.time()

    for dirpath, filename in txt_files:
        processed_files_count += 1
        txt_file_path = os.path.join(dirpath, filename)
        print(f"Processing file: {txt_file_path} ({processed_files_count}/{total_files_to_process})")

        try:
            # Undecodable bytes are dropped rather than aborting the file.
            with open(txt_file_path, 'r', encoding='utf-8', errors='ignore') as f:
                full_txt_content = f.read()

            form_type = extract_form_type_from_header(full_txt_content)

            if not form_type:
                print(f"Skipping {filename}: Could not determine FORM TYPE from header.")
                continue

            if form_type != 'MA-W':
                print(f"Skipping {filename}: Form type '{form_type}' is not MA-W, this script processes MA-W only.")
                continue

            extracted_xml_content = extract_xml_from_sec_txt_filing(full_txt_content, form_type)

            if not extracted_xml_content:
                print(f"No valid XML content (starting with <?xml?> and containing <edgarSubmission>...</edgarSubmission>) extracted from {filename} for form type '{form_type}'.")
                continue

            # Parse in memory: no temp file is written into the data folder,
            # so nothing needs cleaning up and nothing is left behind if the
            # run is interrupted.
            root1 = ET.fromstring(extracted_xml_content)

            extracted_data = extract_data_from_xml(
                root1,
                namespaces,
                file_name=filename,
                folder_name=os.path.basename(dirpath),
                full_txt_content=full_txt_content,
            )

            if not extracted_data.empty:
                all_dfs.append(extracted_data)
                print(f"Successfully extracted data from {filename} (Form Type: {form_type}). DataFrame shape: {extracted_data.shape}")
            else:
                print(f"No structured data extracted from {filename} (Form Type: {form_type}) despite XML content being found.")

        except ET.ParseError as e:
            print(f"XML Parse Error in {txt_file_path} (after extraction): {str(e)}")
        except Exception as e:
            # Best-effort batch run: report the failure and move on to the next file.
            print(f"An unexpected error occurred while processing {txt_file_path}: {str(e)}")

    print("\nFinished processing all files.")
    print(f"Number of DataFrames collected: {len(all_dfs)}")

    if all_dfs:
        start_concat = t.time()
        combined_df = pd.concat(all_dfs, ignore_index=True)
        end_concat = t.time()
        print(f"Time to concatenate DataFrames: {end_concat - start_concat:.2f} seconds")

        total_processing_time = end_concat - start_time
        print(f"Total script execution time: {total_processing_time:.2f} seconds")
        print(f"Combined DataFrame shape: {combined_df.shape}")

        combined_df.to_csv(output_csv_path, index=False)
        print(f"Combined data saved to {output_csv_path}")

    else:
        total_processing_time = t.time() - start_time
        print(f"Total script execution time: {total_processing_time:.2f} seconds")
        print("No data extracted from any files to form a combined DataFrame.")

Starting to process 31399 .txt files in 'D:\PhD_Fin\Muni_adv_PBF\Data2'...
Processing file: D:\PhD_Fin\Muni_adv_PBF\Data2\0000009211\MA_0000009211_14_000018.txt (1/31399)
Skipping MA_0000009211_14_000018.txt: Form type 'MA' is not MA-W, this script processes MA-W only.
Processing file: D:\PhD_Fin\Muni_adv_PBF\Data2\0000009211\MA_0000009211_14_000019.txt (2/31399)
Skipping MA_0000009211_14_000019.txt: Form type 'MA-I' is not MA-W, this script processes MA-W only.
Processing file: D:\PhD_Fin\Muni_adv_PBF\Data2\0000009211\MA_0000009211_14_000020.txt (3/31399)
Skipping MA_0000009211_14_000020.txt: Form type 'MA-I' is not MA-W, this script processes MA-W only.
Processing file: D:\PhD_Fin\Muni_adv_PBF\Data2\0000009211\MA_0000009211_14_000021.txt (4/31399)
Skipping MA_0000009211_14_000021.txt: Form type 'MA-I' is not MA-W, this script processes MA-W only.
Processing file: D:\PhD_Fin\Muni_adv_PBF\Data2\0000009211\MA_0000009211_14_000022.txt (5/31399)
Skipping MA_0000009211_14_000022.txt: Form 

In [67]:
# Show the combined MA-W DataFrame assembled by the main execution block.
combined_df

Unnamed: 0,submissionType,filerId,filerCcc,filerFileNumber,contactName,contactPhoneNumber,contactEmail,notificationEmails,filing_date_xml,filed_as_of_date_header,...,booksAndRecords_locationCity,booksAndRecords_locationStateOrCountry,booksAndRecords_locationZipCode,booksAndRecords_locationPhoneNumber,booksAndRecords_description,execution_signature,execution_signerName,execution_title,file_name,folder_name
0,MA-W,0000012518,XXXXXXXX,867-00476,Michael E Robbins,3123645319,mrobbins@williamblair.com,"[eyeomans@williamblair.com, kwagner@williambla...",09-25-2017,20170925,...,Chicago,IL,60606,3122361600,All of William Blair's municipal advisory book...,Kenneth L. Wagner,Kenneth L. Wagner,Partner & Chief Compliance Officer,MA_0000012518_17_000049.txt,0000012518
1,MA-W,0000012933,XXXXXXXX,867-01153,Glenn Essert,6108325309,gessert@boenninginc.com,[gessert@boenninginc.com],06-17-2022,20220617,...,West Conshohocken,PA,19428,61083212112,"Invoices, engagement letters and Municipal Adv...",Glenn Edward Essert,Glenn Edward Essert,Chief Compliance Officer,MA_0000012933_22_000012.txt,0000012933
2,MA-W,0000025142,XXXXXXXX,867-01669,Peter Mirsky,3038631900,peter@coughlinandcompany.com,[peter@coughlinandcompany.com],08-08-2024,20240808,...,Denver,CO,80203,3038631900,"Old tax returns, entity documents, files for a...",Peter Mirsky,Peter Mirsky,CFO,MA_0000025142_24_000003.txt,0000025142
3,MA-W,0000036528,XXXXXXXX,867-00349,Paul D. Brown,3098293311,paul@firstmidstate.com,[pat@firstmidstate.com],07-05-2023,20230705,...,Bloomington,IL,61701,3098306697,all records,Paul D. Brown,Paul D. Brown,President,MA_0000036528_23_000004.txt,0000036528
4,MA-W,0000049709,XXXXXXXX,867-01097,Jacob Bongard,612-671-0527,jacob.x.bongard@ampf.com,[jacob.x.bongard@ampf.com],08-29-2017,20170829,...,EAGAN,MN,55121-2203,651-452-6515,IRON MOUNTAIN STORES CERTAIN BOOKS AND RECORDS...,Elizabeth Hansen,Elizabeth Hansen,Vice President & Chief Compliance Officer,MA_0000049709_17_000039.txt,0000049709
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
445,MA-W,0001956887,XXXXXXXX,867-02652,NICOLAS NORBOGE,9842915825,nnorboge@daneconsultingpartners.com,[nnorboge@daneconsultingpartners.com],01-03-2025,20250103,...,Durham,NC,27713,979-739-6463,All records and books are kept at the home off...,NICOLAS D NORBOGE,NICOLAS D NORBOGE,Managing Principal and Chief Compliance Officer,MA_0001956887_25_000001.txt,0001956887
446,MA-W,0001973850,XXXXXXXX,867-02640,Joe Crowley,213-700-2757,joecrowley@pacificpfa.com,[joecrowley@pacificpfa.com],10-06-2023,20231006,...,ALTADENA,CA,91001,213-700-2757,Files,Joseph Crowley,Joseph Crowley,Principal,MA_0001973850_23_000005.txt,0001973850
447,MA-W,0001977510,XXXXXXXX,867-02644,Katherine L Clupper,2158503942,klclupper@gmail.com,[klclupper@gmail.com],04-26-2024,20240426,...,Philadelphia,PA,19130,2158503942,"LLC Incorporation documents, bank documents, a...",Katherine L Clupper,Katherine L Clupper,Owner,MA_0001977510_24_000003.txt,0001977510
448,MA-W,0002006358,XXXXXXXX,867-02678,Smiley Arthur Chester III,2819740467,acs716@gmail.com,[],,20241121,...,Humble,TX,77396,2819740467,"Formation documents, bank account records",,,,MA_0002006358_24_000009.txt,0002006358


In [69]:
# Missing values per column: absolute counts first, then as a percentage of all rows.
missing_counts = combined_df.isna().sum()
missing_pct = missing_counts / len(combined_df) * 100

print("Missings = \n", missing_counts)
print("Missing Pct = \n", missing_pct)

Missings = 
 submissionType                              0
filerId                                     0
filerCcc                                    0
filerFileNumber                             0
contactName                                 0
contactPhoneNumber                          0
contactEmail                                0
notificationEmails                          0
filing_date_xml                            66
filed_as_of_date_header                     0
fullLegalName                               0
fileNumber_formData                         0
contactPerson_firstName                     0
contactPerson_middleName                    0
contactPerson_lastName                      0
contactPerson_street1                       0
contactPerson_city                          0
contactPerson_stateOrCountry                0
contactPerson_zipCode                       0
contactPerson_phoneNumber                   0
contactPerson_title                        46
contactPerson_email  

In [79]:
# List every column name of the combined DataFrame.
print(combined_df.columns)

Index(['submissionType', 'filerId', 'filerCcc', 'filerFileNumber',
       'contactName', 'contactPhoneNumber', 'contactEmail',
       'notificationEmails', 'filing_date_xml', 'filed_as_of_date_header',
       'fullLegalName', 'fileNumber_formData', 'contactPerson_firstName',
       'contactPerson_middleName', 'contactPerson_lastName',
       'contactPerson_street1', 'contactPerson_city',
       'contactPerson_stateOrCountry', 'contactPerson_zipCode',
       'contactPerson_phoneNumber', 'contactPerson_title',
       'contactPerson_email', 'isAdvisoryContract',
       'isUnsatisfiedJudgementsOrLiens', 'isReceivedAnyPrepaidFee',
       'isBorrowedNotRepaid', 'booksAndRecords_personName',
       'booksAndRecords_personStreet1', 'booksAndRecords_personCity',
       'booksAndRecords_personStateOrCountry', 'booksAndRecords_personZipCode',
       'booksAndRecords_personPhoneNumber', 'booksAndRecords_locationName',
       'booksAndRecords_locationStreet1', 'booksAndRecords_locationCity',
      