In [11]:
import pandas as pd
import xml.etree.ElementTree as ET
import os
import time as t
import re
from typing import Union


In [13]:
# Helper functions (get_text, get_all_texts) remain the same
def get_text(element, path, namespaces=None):
    """Safely gets text from an XML element found by path."""
    if element is None:
        return ''
    found = element.find(path, namespaces)
    return found.text if found is not None else ''

def get_all_texts(element, path, namespaces=None):
    """Safely gets all texts from multiple XML elements found by path."""
    if element is None:
        return []
    return [e.text for e in element.findall(path, namespaces) if e is not None and e.text]


In [15]:
def extract_xml_from_sec_txt_filing(txt_content: str, form_type: str) -> Union[str, None]:
    """
    Extracts the pure XML content (starting with <?xml...?> and ending with </edgarSubmission>)
    for a specific form type from an SEC .txt filing's full text content.
    """
    # First, find the block containing the desired XML within the <DOCUMENT><TEXT> section.
    # The pattern is made robust to handle variations like missing </TYPE> closing tag.
    text_block_pattern = r"<DOCUMENT>.*?<TYPE>\s*" + re.escape(form_type) + r".*?<TEXT>(.*?)</TEXT>\s*</DOCUMENT>"
    
    text_block_match = re.search(text_block_pattern, txt_content, re.DOTALL | re.IGNORECASE)
    
    if text_block_match:
        raw_content_within_text_tag = text_block_match.group(1)
        
        # Now, within this raw content, precisely find the actual XML string
        # from the <?xml declaration up to the closing </edgarSubmission> tag.
        # This will correctly exclude any outer <XML> or other non-XML wrappers.
        # Added a non-greedy match for the root element content to handle cases where
        # there might be other tags or whitespace after the root element.
        final_xml_pattern = r"<\?xml(?:[^\"'>]|\"[^\"]*\"|'[^']*')*?\?>\s*(<edgarSubmission.*?</edgarSubmission>)"
        
        xml_content_match = re.search(final_xml_pattern, raw_content_within_text_tag, re.DOTALL | re.IGNORECASE)
        
        if xml_content_match:
            # Return the matched XML content, removing any leading/trailing whitespace
            return xml_content_match.group(0).strip()
        else:
            # print(f"Warning: No complete XML document (starting with <?xml?> and containing <edgarSubmission>...</edgarSubmission>) found within the <TEXT> block for form type '{form_type}'.")
            return None
    else:
        # print(f"Warning: <DOCUMENT> block for form type '{form_type}' not found in the provided text content.")
        return None


In [17]:
def extract_form_type_from_header(txt_content: str) -> Union[str, None]:
    """
    Extracts the FORM TYPE from the SEC-HEADER section of a .txt filing.
    """
    # Regex to find FORM TYPE in SEC-HEADER
    # It looks for "FORM TYPE:" followed by optional whitespace and then captures the type.
    # Uses re.MULTILINE to match '^' at the start of each line.
    form_type_pattern = r"^\s*FORM TYPE:\s*([A-Za-z0-9\.-]+)"
    
    sec_header_match = re.search(r"<SEC-HEADER>(.*?)</SEC-HEADER>", txt_content, re.DOTALL | re.IGNORECASE)
    
    if sec_header_match:
        header_content = sec_header_match.group(1)
        type_match = re.search(form_type_pattern, header_content, re.MULTILINE)
        if type_match:
            return type_match.group(1).strip()
    return None


In [19]:
def extract_data_from_xml(root, namespaces, file_name='', folder_name=''):
    """
    Extracts structured data from a parsed XML ElementTree root.
    """
    try:
        # Define all namespace prefixes used in the XML
        ns = {
            'mafiler': 'http://www.sec.gov/edgar/mafiler',
            'com': 'http://www.sec.gov/edgar/common_ma',
            'com1': 'http://www.sec.gov/edgar/common',
            'ma': 'http://www.sec.gov/edgar/ma_drp',
            'ma1': 'http://www.sec.gov/edgar/ma_common_drp',
            'com2': 'http://www.sec.gov/edgar/common_drp'
        }
        
        header_info = root.find('mafiler:headerData', ns)
        filer_info = header_info.find('mafiler:filerInfo', ns) if header_info is not None else None
        filer = filer_info.find('com:filer', ns) if filer_info is not None else None
        contact = filer_info.find('com:contact', ns) if filer_info is not None else None

        header_data = {
            'submissionType': get_text(header_info, 'mafiler:submissionType', ns),
            'filerId': get_text(filer, 'com1:filerId', ns),
            'filerCcc': get_text(filer, 'com1:filerCcc', ns),
            'contactName': get_text(contact, 'com1:name', ns),
            'contactPhoneNumber': get_text(contact, 'com1:phoneNumber', ns),
            'contactEmail': get_text(filer_info, 'com:contactEmail', ns),
            'notificationEmails': get_all_texts(filer_info, 'com:notifications/com1:internetNotificationAddress', ns)
        }
        
        form_data = root.find('mafiler:formData', ns)
        # Filing date is usually in the maExecutionPage/signature section
        filing_date_element = root.find('.//mafiler:maExecutionPage/mafiler:signature/com1:date', ns)
        filing_date = filing_date_element.text if filing_date_element is not None else ''

        # --- New: Extract Firm Name ---
        firm_name = get_text(form_data, 'mafiler:firmName', ns)


        office_locations = []
        principal = form_data.find('mafiler:principalOfficeAddress/mafiler:addressInfo/com:address', ns) if form_data is not None else None
        if principal is not None:
            office_locations.append({
                'type': 'principal',
                'street1': get_text(principal, 'com1:street1', ns),
                'street2': get_text(principal, 'com1:street2', ns),
                'city': get_text(principal, 'com1:city', ns),
                'stateOrCountry': get_text(principal, 'com1:stateOrCountry', ns),
                'zipCode': get_text(principal, 'com1:zipCode', ns),
                'phoneNumber': get_text(form_data, 'mafiler:principalOfficeAddress/mafiler:phoneNumber', ns),
                'faxNumber': get_text(form_data, 'mafiler:principalOfficeAddress/mafiler:faxNumber', ns)                
            })
            
        for office in form_data.findall('mafiler:additionalOffices/mafiler:additionalOffice', ns) if form_data is not None else []:
            address = office.find('mafiler:officeInfo/mafiler:addressInfo/com:address', ns)
            if address is not None:
                office_locations.append({
                    'type': 'additional',
                    'street1': get_text(address, 'com1:street1', ns),
                    'street2': get_text(address, 'com1:street2', ns),
                    'city': get_text(address, 'com1:city', ns),
                    'stateOrCountry': get_text(address, 'com1:stateOrCountry', ns),
                    'zipCode': get_text(address, 'com1:zipCode', ns),
                    'phoneNumber': get_text(office, 'mafiler:officeInfo/mafiler:phoneNumber', ns),
                    'faxNumber': get_text(office, 'mafiler:officeInfo/mafiler:faxNumber', ns)                
                })

        registration_info = {}
        registrations = form_data.find('mafiler:registrations', ns) if form_data is not None else None
        if registrations is not None:
            registration_info = {
                'maTregistration': get_text(registrations, 'mafiler:maTregistration/com:fileNumber', ns),
                'maRegistration': get_text(registrations, 'mafiler:baseRegistrations/mafiler:maRegistration/com:fileNumber', ns),
                'otherRegistration': {
                    'description': get_text(registrations, 'mafiler:baseRegistrations/mafiler:baseRegistrations/com:anotherRegistration/com:description', ns),
                    'id': get_text(registrations, 'mafiler:baseRegistrations/mafiler:baseRegistrations/com:anotherRegistration/com:registrationId', ns)
                }
            }

        number_of_employees = {
            'total': get_text(form_data, 'mafiler:numberOfEmployees', ns),
            'engagedInMAA': get_text(form_data, 'mafiler:employeesEngagedInMAA', ns),
            'maaEmployeesRegBD': get_text(form_data, 'mafiler:maaEmployeesRegBD', ns),
            'maaRegIA': get_text(form_data, 'mafiler:maaRegIA', ns)
        }

        form_org_types_element = form_data.find('mafiler:formOfOrganization/mafiler:formOrgType/mafiler:formOrgTypes', ns)
        form_org_type_name = ''
        if form_org_types_element is not None and len(form_org_types_element) > 0:
            form_org_type_name = form_org_types_element[0].tag.split('}')[-1]

        form_of_organization = {
            'type': form_org_type_name,
            'monthOfFiscalYearEnd': get_text(form_data, 'mafiler:monthOfFiscalYearEnd', ns),
            'organizedJurisdiction': get_text(form_data, 'mafiler:organizedJurisdiction/com1:stateOrCountry', ns),
            'dateOfOrganization': get_text(form_data, 'mafiler:dateOfOrganization', ns)
        }

        clients_info = {
            'numberOfClients': get_text(form_data, 'mafiler:clientsServedAsMA', ns),
            'typesOfClients': get_all_texts(form_data, 'mafiler:typesOfClients/mafiler:typesOfClients/mafiler:clientTypes', ns)
        }

        business_size = {
            'hasAnnualReceiptsLessThan7Million': get_text(form_data, 'mafiler:hasAnnualReceiptsLessThan7Million', ns),
            'isAffiliatedWithReceiptsMoreThan7Million': get_text(form_data, 'mafiler:isAffiliatedWithReceiptsMoreThan7Million', ns)
        }

        other_business_activities = get_all_texts(form_data, 'mafiler:engagedActivities/mafiler:engagedActivityTypes/mafiler:engagedActivityType', ns)
        compensation_agreements = get_all_texts(form_data, 'mafiler:meOrOPCompensationTypes/mafiler:compensationTypes/mafiler:compensationTypes', ns)
        solicitation_activities = get_all_texts(form_data, 'mafiler:typesOfSolicitedPersons/mafiler:solicitationPersonTypes/mafiler:solicitationPersonTypes', ns)

        regulatory_disclosures = {
            'criminalDisclosure': {elem.tag.split('}')[-1]: elem.text for elem in form_data.find('mafiler:disclosureAnswers/mafiler:criminalDisclosure', ns) or []},
            'regulatoryDisclosure': {elem.tag.split('}')[-1]: elem.text for elem in form_data.find('mafiler:disclosureAnswers/mafiler:regulatoryDisclosure', ns) or []},
            'civilDisclosure': {elem.tag.split('}')[-1]: elem.text for elem in form_data.find('mafiler:disclosureAnswers/mafiler:civilDisclosure', ns) or []}
        }

        data = {
            'header_data': header_data,
            'firm_name': firm_name, # --- Added firm name here ---
            'office_locations': office_locations,
            'registration_info': registration_info,
            'number_of_employees': number_of_employees,
            'form_of_organization': form_of_organization,
            'clients_info': clients_info,
            'business_size': business_size,
            'other_business_activities': other_business_activities,
            'compensation_agreements': compensation_agreements,
            'solicitation_activities': solicitation_activities,
            'regulatory_disclosures': regulatory_disclosures,
            'filing_date': filing_date,
            'file_name': file_name,
            'folder_name': folder_name
        }

        return pd.json_normalize(data, sep='_')

    except Exception as e:
        print(f'Error during XML parsing in file {file_name}: {str(e)}')
        return pd.DataFrame()


In [None]:

if __name__ == "__main__":
    # Corrected namespace definitions
    namespaces = {
        'mafiler': 'http://www.sec.gov/edgar/mafiler',
        'com': 'http://www.sec.gov/edgar/common_ma',
        'com1': 'http://www.sec.gov/edgar/common',
        'ma': 'http://www.sec.gov/edgar/ma_drp',
        'ma1': 'http://www.sec.gov/edgar/ma_common_drp',
        'com2': 'http://www.sec.gov/edgar/common_drp'
    }

    # --- Configuration for folder scanning ---
    # root_folder_to_scan = r"C:\Users\barid\Documents\Papers\Muni_adv_PBF\ma_filings\0002020463" # Example path
    root_folder_to_scan = r"D:\PhD_Fin\Muni_adv_PBF\Data2" # Example path
    output_csv_path = os.path.join(root_folder_to_scan, "combined_ma_filings_3k_3.csv")

    all_dfs = []
    processed_files_count = 0
    total_files_to_process = 0

    # First pass to count relevant .txt files for progress tracking
    for dirpath, dirnames, filenames in os.walk(root_folder_to_scan):
        for filename in filenames:
            if filename.endswith(".txt"):
                total_files_to_process += 1

    print(f"Starting to process {total_files_to_process} .txt files in '{root_folder_to_scan}'...")
    start_time = t.time()

    # Second pass to actually process files
    for dirpath, dirnames, filenames in os.walk(root_folder_to_scan):
        for filename in filenames:
            if filename.endswith(".txt"):
                processed_files_count += 1
                txt_file_path = os.path.join(dirpath, filename)
                print(f"Processing file: {txt_file_path} ({processed_files_count}/{total_files_to_process})")

                extracted_xml_content = None # Initialize for each file

                try:
                    with open(txt_file_path, 'r', encoding='utf-8', errors='ignore') as f:
                        full_txt_content = f.read()

                    # 1. Extract the form type from the SEC-HEADER of the .txt file
                    form_type = extract_form_type_from_header(full_txt_content)

                    if not form_type:
                        print(f"Skipping {filename}: Could not determine FORM TYPE from header.")
                        continue # Skip to the next file if form type isn't found

                    # 2. Extract the pure XML content for the identified form type
                    extracted_xml_content = extract_xml_from_sec_txt_filing(full_txt_content, form_type)

                    if extracted_xml_content:
                        # 3. Save the extracted pure XML to a temporary file
                        # This is crucial for ElementTree.parse to work correctly.
                        temp_xml_file_path = os.path.join(dirpath, os.path.splitext(filename)[0] + "_temp_parsed.xml")
                        with open(temp_xml_file_path, 'w', encoding='utf-8') as f_out:
                            f_out.write(extracted_xml_content)
                        # print(f"Temporarily saved extracted pure XML to: {temp_xml_file_path}")

                        # 4. Parse the temporary XML file
                        tree = ET.parse(temp_xml_file_path, parser=ET.XMLParser(encoding='utf-8'))
                        root1 = tree.getroot()

                        # 5. Extract data into a DataFrame
                        # Pass the firm_name to the function
                        extracted_data = extract_data_from_xml(root1, namespaces, file_name=filename, folder_name=os.path.basename(dirpath))
                        
                        if not extracted_data.empty:
                            all_dfs.append(extracted_data)
                            print(f"Successfully extracted data from {filename}. DataFrame shape: {extracted_data.shape}")
                        else:
                            print(f"No structured data extracted from {filename} despite XML content being found.")

                    else:
                        print(f"No valid XML content (starting with <?xml?> and containing <edgarSubmission>...</edgarSubmission>) extracted from {filename} for form type '{form_type}'.")

                except ET.ParseError as e:
                    print(f"XML Parse Error in {txt_file_path} (after extraction): {str(e)}")
                except Exception as e:
                    print(f"An unexpected error occurred while processing {txt_file_path}: {str(e)}")
                finally:
                    # Clean up the temporary XML file if it was created
                    if 'temp_xml_file_path' in locals() and os.path.exists(temp_xml_file_path):
                        os.remove(temp_xml_file_path)
                        # print(f"Removed temporary file: {temp_xml_file_path}")

    print(f"\nFinished processing all files.")
    print(f"Number of DataFrames collected: {len(all_dfs)}")

    if all_dfs:
        start_concat = t.time()
        combined_df = pd.concat(all_dfs, ignore_index=True)
        end_concat = t.time()
        print(f"Time to concatenate DataFrames: {end_concat - start_concat:.2f} seconds")
        
        total_processing_time = end_concat - start_time
        print(f"Total script execution time: {total_processing_time:.2f} seconds")
        print(f"Combined DataFrame shape: {combined_df.shape}")
        
        combined_df.to_csv(output_csv_path, index=False)
        print(f"Combined data saved to {output_csv_path}")
        
    else:
        total_processing_time = t.time() - start_time
        print(f"Total script execution time: {total_processing_time:.2f} seconds")
        print("No data extracted from any files to form a combined DataFrame.")

Starting to process 31399 .txt files in 'D:\PhD_Fin\Muni_adv_PBF\Data2'...
Processing file: D:\PhD_Fin\Muni_adv_PBF\Data2\0000009211\MA_0000009211_14_000018.txt (1/31399)
Successfully extracted data from MA_0000009211_14_000018.txt. DataFrame shape: (1, 55)
Processing file: D:\PhD_Fin\Muni_adv_PBF\Data2\0000009211\MA_0000009211_14_000019.txt (2/31399)
Error during XML parsing in file MA_0000009211_14_000019.txt: 'NoneType' object has no attribute 'find'
No structured data extracted from MA_0000009211_14_000019.txt despite XML content being found.
Processing file: D:\PhD_Fin\Muni_adv_PBF\Data2\0000009211\MA_0000009211_14_000020.txt (3/31399)
Error during XML parsing in file MA_0000009211_14_000020.txt: 'NoneType' object has no attribute 'find'
No structured data extracted from MA_0000009211_14_000020.txt despite XML content being found.
Processing file: D:\PhD_Fin\Muni_adv_PBF\Data2\0000009211\MA_0000009211_14_000021.txt (4/31399)
Error during XML parsing in file MA_0000009211_14_000021

In [27]:
combined_df

Unnamed: 0,firm_name,office_locations,other_business_activities,compensation_agreements,solicitation_activities,filing_date,file_name,folder_name,header_data_submissionType,header_data_filerId,...,regulatory_disclosures_regulatoryDisclosure_isFoundMadeFalseStatement,regulatory_disclosures_regulatoryDisclosure_isFoundInViolationOfRules,regulatory_disclosures_regulatoryDisclosure_isFoundInCauseOfSuspension,regulatory_disclosures_regulatoryDisclosure_isDiscipliend,regulatory_disclosures_regulatoryDisclosure_isAuthorizedToActAttorney,regulatory_disclosures_regulatoryDisclosure_isRegulatoryComplaint,regulatory_disclosures_civilDisclosure_isEnjoined,regulatory_disclosures_civilDisclosure_isFoundInViolationOfRegulation,regulatory_disclosures_civilDisclosure_isDismissed,regulatory_disclosures_civilDisclosure_isNamedInCivilProceeding
0,ROBERT W. BAIRD & CO. INCORPORATED,"[{'type': 'principal', 'street1': '777 E. WISC...","[Advice Insurance Of MS, Advice Investment Of ...","[Hourly Charges, Fixed Fees, Contingent Fees]",[Not Applicable],07-11-2014,MA_0000009211_14_000018.txt,0000009211,MA,0000009211,...,N,Y,N,N,N,N,N,N,N,N
1,ROBERT W. BAIRD & CO. Inc,"[{'type': 'principal', 'street1': '777 E. WISC...","[Advice Insurance Of MS, Advice Investment Of ...","[Hourly Charges, Fixed Fees, Contingent Fees]",[Not Applicable],03-23-2015,MA_0000009211_15_000011.txt,0000009211,MA-A,0000009211,...,N,Y,N,N,N,N,N,N,N,N
2,ROBERT W. BAIRD & CO. INCORPORATED,"[{'type': 'principal', 'street1': '777 E. WISC...","[Advice Insurance Of MS, Advice Investment Of ...","[Hourly Charges, Fixed Fees, Contingent Fees]",[Not Applicable],06-25-2015,MA_0000009211_15_000026.txt,0000009211,MA/A,0000009211,...,N,Y,N,N,N,N,N,N,N,N
3,ROBERT W. BAIRD & CO. INCORPORATED,"[{'type': 'principal', 'street1': '777 E. WISC...","[Advice Insurance Of MS, Advice Investment Of ...","[Hourly Charges, Fixed Fees, Contingent Fees]",[Not Applicable],03-28-2016,MA_0000009211_16_000043.txt,0000009211,MA-A,0000009211,...,N,Y,N,N,N,N,N,N,N,N
4,ROBERT W. BAIRD & CO. INCORPORATED,"[{'type': 'principal', 'street1': '777 E. WISC...","[Advice Insurance Of MS, Advice Investment Of ...","[Hourly Charges, Fixed Fees, Contingent Fees]",[Not Applicable],05-04-2016,MA_0000009211_16_000048.txt,0000009211,MA/A,0000009211,...,N,Y,N,N,N,N,N,N,N,N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7858,"First Security Municipal Advisors, Inc.","[{'type': 'principal', 'street1': '501 PRESIDE...","[Advice Insurance Of MS, Advice Investment Of ...","[Hourly Charges, Fixed Fees, Contingent Fees]",[Not Applicable],12-31-2024,MA_0002041221_24_000006.txt,0002041221,MA,0002041221,...,N,N,N,N,N,N,N,N,N,N
7859,"First Security Municipal Advisors, Inc.","[{'type': 'principal', 'street1': '501 PRESIDE...","[Advice Insurance Of MS, Advice Investment Of ...","[Hourly Charges, Fixed Fees, Contingent Fees]",[Not Applicable],01-16-2025,MA_0002041221_25_000015.txt,0002041221,MA/A,0002041221,...,N,N,N,N,N,N,N,N,N,N
7860,"First Security Municipal Advisors, Inc.","[{'type': 'principal', 'street1': '501 PRESIDE...","[Advice Insurance Of MS, Advice Investment Of ...","[Hourly Charges, Fixed Fees, Contingent Fees]",[Not Applicable],02-06-2025,MA_0002041221_25_000017.txt,0002041221,MA/A,0002041221,...,N,N,N,N,N,N,N,N,N,N
7861,Patriot Advisors LLC,[],"[Advice Insurance Of MS, Advice Investment Of ...",[Hourly Charges],[Not Applicable],02-13-2025,MA_0002053271_25_000007.txt,0002053271,MA,0002053271,...,N,N,N,N,N,N,N,N,N,N


In [29]:
combined_df.to_csv(r"C:\Users\barid\Documents\Papers\Muni_adv_PBF\Data\output_step_3k.csv", index=False)

In [31]:
#check missings

missing_counts = combined_df.isnull().sum()
print("Missings = \n", missing_counts)

missing_pct = (missing_counts / len(combined_df))*100
print("Missing Pct = \n", missing_pct)


Missings = 
 firm_name                                                                       0
office_locations                                                                0
other_business_activities                                                       0
compensation_agreements                                                         0
solicitation_activities                                                         0
filing_date                                                                     0
file_name                                                                       0
folder_name                                                                     0
header_data_submissionType                                                      0
header_data_filerId                                                             0
header_data_filerCcc                                                            0
header_data_contactName                                                         0
hea