In [2]:
import xml.etree.ElementTree as ET
import pandas as pd
import os
import re
import time as t
from typing import Union


In [4]:
def get_text(element, path, namespaces=None):
    """Safely gets text from an XML element found by path."""
    if element is None:
        return ''
    found = element.find(path, namespaces)
    return found.text.strip() if found is not None and found.text is not None else ''

def get_all_texts(element, path, namespaces=None):
    """Safely gets all texts from multiple XML elements found by path."""
    if element is None:
        return []
    return [e.text.strip() for e in element.findall(path, namespaces) if e is not None and e.text]

def extract_xml_from_sec_txt_filing(txt_content: str, form_type: str) -> Union[str, None]:
    """
    Extracts the pure XML content (starting with <?xml...?> and ending with </edgarSubmission>)
    for a specific form type from an SEC .txt filing's full text content.
    """
    # First, find the block containing the desired XML within the <DOCUMENT><TEXT> section.
    # The pattern is made robust to handle variations like missing </TYPE> closing tag.
    # Using re.escape(form_type) to handle form types with special regex characters if any.
    text_block_pattern = r"<DOCUMENT>.*?<TYPE>\s*" + re.escape(form_type) + r".*?<TEXT>(.*?)</TEXT>\s*</DOCUMENT>"

    text_block_match = re.search(text_block_pattern, txt_content, re.DOTALL | re.IGNORECASE)

    if text_block_match:
        raw_content_within_text_tag = text_block_match.group(1)

        # Now, within this raw content, precisely find the actual XML string
        # from the <?xml declaration up to the closing </edgarSubmission> tag.
        # This will correctly exclude any outer <XML> or other non-XML wrappers.
        # Added a non-greedy match for the root element content to handle cases where
        # there might be other tags or whitespace after the root element.
        # The pattern looks for the XML declaration and then the <edgarSubmission> root element.
        final_xml_pattern = r"<\?xml(?:[^\"'>]|\"[^\"]*\"|'[^']*')*?\?>\s*(<edgarSubmission.*?</edgarSubmission>)"

        xml_content_match = re.search(final_xml_pattern, raw_content_within_text_tag, re.DOTALL | re.IGNORECASE)

        if xml_content_match:
            # Return the matched XML content, removing any leading/trailing whitespace
            return xml_content_match.group(0).strip()
        else:
            return None
    else:
        return None


In [6]:
def extract_form_type_from_header(txt_content: str) -> Union[str, None]:
    """
    Extracts the FORM TYPE from the SEC-HEADER section of a .txt filing.
    """
    # Regex to find FORM TYPE in SEC-HEADER
    form_type_pattern = r"^\s*FORM TYPE:\s*([A-Za-z0-9\.-]+)"

    sec_header_match = re.search(r"<SEC-HEADER>(.*?)</SEC-HEADER>", txt_content, re.DOTALL | re.IGNORECASE)

    if sec_header_match:
        header_content = sec_header_match.group(1)
        type_match = re.search(form_type_pattern, header_content, re.MULTILINE)
        if type_match:
            return type_match.group(1).strip()
    return None


In [8]:
def parse_ma_i_filing(root: ET.Element, namespaces: dict, file_name: str, folder_name: str) -> Union[dict, None]:
    """
    Parses an already-parsed ElementTree root object of a Form MA-I XML and extracts data.

    Args:
        root (ET.Element): The root element of the parsed XML tree.
        namespaces (dict): A dictionary of XML namespaces.
        file_name (str): The name of the file (e.g., "MA_0001614240_14_000002.txt").
        folder_name (str): The name of the subfolder the file is considered to be in.

    Returns:
        dict: A dictionary containing extracted data for one filing, or None if parsing fails.
    """
    try:
        # Initialize data dictionary for this filing
        filing_data = {
            'FileName': file_name,
            'FolderName': folder_name,
        }

        # --- Header Data Extraction (using maifiler namespace for headerData) ---
        header_info = root.find('maifiler:headerData', namespaces=namespaces)
        if header_info is None:
            print(f"Warning: Could not find headerData for MA-I in file {file_name}.")
            return None # Return None if headerData is critical and missing

        filing_data['SubmissionType_XML'] = get_text(header_info, 'maifiler:submissionType', namespaces)
        filing_data['FilerId'] = get_text(header_info, 'maifiler:filerInfo/com:filer/com1:filerId', namespaces)
        filing_data['FilerCcc'] = get_text(header_info, 'maifiler:filerInfo/com:filer/com1:filerCcc', namespaces)
        filing_data['FilerFileNumber'] = get_text(header_info, 'maifiler:filerInfo/com:filer/com1:filerFileNumber', namespaces)

        # --- MODIFIED LINES BELOW ---
        filing_data['ContactName'] = get_text(header_info, 'maifiler:filerInfo/com:contact/com1:name', namespaces)
        filing_data['ContactPhoneNumber'] = get_text(header_info, 'maifiler:filerInfo/com:contact/com1:phoneNumber', namespaces)
        filing_data['ContactEmail'] = get_text(header_info, 'maifiler:filerInfo/com:contactEmail', namespaces)
        filing_data['NotificationEmails'] = "; ".join(get_all_texts(header_info, 'maifiler:filerInfo/com:notifications/com1:internetNotificationAddress', namespaces))
        # --- END MODIFIED LINES ---

        # ... (rest of your code remains the same) ...

        # --- Form Data Root ---
        form_data_root = root.find('maifiler:formData', namespaces=namespaces)
        if form_data_root is None:
            print(f"Warning: Could not find formData for MA-I in file {file_name}.")
            return None

        filing_data['IsAmendment'] = get_text(form_data_root, 'maifiler:isAmendment', namespaces)
        filing_data['HasMoreThanOneAdvisoryFirms'] = get_text(form_data_root, 'maifiler:hasMoreThanOneAdvisoryFirms', namespaces)
        filing_data['NoOfAdvisoryFirms'] = get_text(form_data_root, 'maifiler:noOfAdvisoryFirms', namespaces)

        # --- Applicant Name (Individual Employee) ---
        applicant_name_node = form_data_root.find('maifiler:applicantName', namespaces=namespaces)
        filing_data['Applicant_FirstName'] = get_text(applicant_name_node, 'com:firstName', namespaces)
        filing_data['Applicant_MiddleName'] = get_text(applicant_name_node, 'com:middleName', namespaces)
        filing_data['Applicant_LastName'] = get_text(applicant_name_node, 'com:lastName', namespaces)

        # --- Municipal Advisor Offices (Repeating Section - Firm Name and associated offices) ---
        # Collect details for each municipalAdvisorOffice
        ma_office_details = []
        for office_node in form_data_root.findall('maifiler:municipalAdvisorOffices/maifiler:municipalAdvisorOffice', namespaces=namespaces):
            firm_name = get_text(office_node, 'maifiler:municipalFirm/maifiler:municipalFirmName', namespaces)
            commenced_date = get_text(office_node, 'maifiler:municipalFirm/maifiler:recentEmploymentCommencedDate', namespaces)
            is_independent = get_text(office_node, 'maifiler:municipalFirm/maifiler:isIndependentRelatioship', namespaces)

            # Advisor Office Addresses for this municipalAdvisorOffice
            office_addresses = []
            for addr_node in office_node.findall('maifiler:advisorOffices/maifiler:advisorOffice/maifiler:locationInfo/maifiler:addressInfo/com:address', namespaces=namespaces):
                street1 = get_text(addr_node, 'com1:street1', namespaces)
                street2 = get_text(addr_node, 'com1:street2', namespaces)
                city = get_text(addr_node, 'com1:city', namespaces)
                state = get_text(addr_node, 'com1:stateOrCountry', namespaces)
                zip_code = get_text(addr_node, 'com1:zipCode', namespaces)

                address_parts = [p for p in [street1, street2, city, state, zip_code] if p]
                office_addresses.append(", ".join(address_parts))

            ma_office_details.append({
                'FirmName': firm_name,
                'EmploymentCommencedDate': commenced_date,
                'IsIndependentRelationship': is_independent,
                'OfficeAddresses': "; ".join(office_addresses) if office_addresses else None
            })
        filing_data['MunicipalAdvisorOffices'] = ma_office_details

        # --- Residential History (if present) ---
        # Only capture if the section exists, indicating presence rather than deep parsing unless needed.
        filing_data['ResidentialHistoryPresent'] = "Y" if form_data_root.find('maifiler:residentialHistory', namespaces=namespaces) is not None else "N"

        # --- Employment History (Current and Prior) ---
        current_employer_node = form_data_root.find('maifiler:employmentHistory/maifiler:currentEmployer', namespaces=namespaces)
        filing_data['CurrentEmployer_Name'] = get_text(current_employer_node, 'maifiler:name', namespaces)
        filing_data['CurrentEmployer_StartDate'] = get_text(current_employer_node, 'maifiler:startDate', namespaces)
        filing_data['CurrentEmployer_EndDate'] = get_text(current_employer_node, 'maifiler:endDate', namespaces) # Added, though often empty
        filing_data['CurrentEmployer_Position'] = get_text(current_employer_node, 'maifiler:positionDescription', namespaces)
        filing_data['CurrentEmployer_IsRelatedToMunicipalAdvisor'] = get_text(current_employer_node, 'maifiler:isRelatedToMunicipalAdvisor', namespaces)
        filing_data['CurrentEmployer_IsRelatedToInvestment'] = get_text(current_employer_node, 'maifiler:isRelatedToInvestment', namespaces)

        prior_employers = []
        for prior_node in form_data_root.findall('maifiler:employmentHistory/maifiler:priorEmployers/maifiler:priorEmployer', namespaces=namespaces):
            prior_employers.append({
                'Name': get_text(prior_node, 'maifiler:name', namespaces),
                'StartDate': get_text(prior_node, 'maifiler:startDate', namespaces),
                'EndDate': get_text(prior_node, 'maifiler:endDate', namespaces),
                'Position': get_text(prior_node, 'maifiler:positionDescription', namespaces)
            })
        filing_data['PriorEmployers'] = prior_employers

        # --- Other Business Activities (Repeating Section) ---
        other_businesses = []
        for ob_node in form_data_root.findall('maifiler:otherBusinesses/maifiler:otherBusiness', namespaces=namespaces):
            other_businesses.append({
                'Name': get_text(ob_node, 'maifiler:name', namespaces),
                'Nature': get_text(ob_node, 'maifiler:natureOfBusiness', namespaces),
                'Position': get_text(ob_node, 'maifiler:positionDescription', namespaces),
                'StartDate': get_text(ob_node, 'maifiler:startDate', namespaces),
                'ApproxHoursOrMonths': get_text(ob_node, 'maifiler:approximateHoursOrMonths', namespaces)
            })
        filing_data['OtherBusinesses'] = other_businesses

        # --- Disclosure Questions (Items 1-6 and sub-parts) ---
        disclosure_questions_node = form_data_root.find('maifiler:disclosureQuestions', namespaces=namespaces)
        if disclosure_questions_node:
            # Item 1: Criminal Disclosure
            criminal_disclosure = disclosure_questions_node.find('maifiler:criminalDisclosure', namespaces=namespaces)
            if criminal_disclosure:
                filing_data['Disclosure_Crim_ConvictedOfFelony'] = get_text(criminal_disclosure, 'com:isConvictedOfFelony', namespaces)
                filing_data['Disclosure_Crim_ChargedWithFelony'] = get_text(criminal_disclosure, 'com:isChargedWithFelony', namespaces)
                filing_data['Disclosure_Crim_ConvictedOfMisdemeanor'] = get_text(criminal_disclosure, 'maifiler:isConvictedOfMisdemeanor', namespaces)
                filing_data['Disclosure_Crim_ChargedWithMisdemeanor'] = get_text(criminal_disclosure, 'maifiler:isChargedWithMisdemeanor', namespaces)

            # Item 2: Regulatory Disclosure
            regulatory_disclosure = disclosure_questions_node.find('maifiler:regulatoryDisclosure', namespaces=namespaces)
            if regulatory_disclosure:
                filing_data['Disclosure_Reg_MadeFalseStatement'] = get_text(regulatory_disclosure, 'com:isMadeFalseStatement', namespaces)
                filing_data['Disclosure_Reg_ViolatedRegulation'] = get_text(regulatory_disclosure, 'com:isViolatedRegulation', namespaces)
                filing_data['Disclosure_Reg_OrderAgainst'] = get_text(regulatory_disclosure, 'com:isOrderAgainst', namespaces)
                filing_data['Disclosure_Reg_DeniedLicense'] = get_text(regulatory_disclosure, 'com:isDeniedLicense', namespaces)
                filing_data['Disclosure_Reg_ViolatedSecurityAct'] = get_text(regulatory_disclosure, 'maifiler:isViolatedSecurityAct', namespaces)
                filing_data['Disclosure_Reg_AssociationBared'] = get_text(regulatory_disclosure, 'maifiler:isAssociationBared', namespaces)

            # Item 3: Investigation Disclosure
            investigation_disclosure = disclosure_questions_node.find('maifiler:investigationDisclosure', namespaces=namespaces)
            if investigation_disclosure:
                filing_data['Disclosure_Inv_IsInvestigated'] = get_text(investigation_disclosure, 'maifiler:isInvestigated', namespaces)

            # Item 4: Civil Disclosure
            civil_disclosure = disclosure_questions_node.find('maifiler:civilDisclosure', namespaces=namespaces)
            if civil_disclosure:
                filing_data['Disclosure_Civil_IsEnjoined'] = get_text(civil_disclosure, 'com:isEnjoined', namespaces)
                filing_data['Disclosure_Civil_FoundInViolationOfRegulation'] = get_text(civil_disclosure, 'com:isFoundInViolationOfRegulation', namespaces)
                filing_data['Disclosure_Civil_IsDismissed'] = get_text(civil_disclosure, 'com:isDismissed', namespaces)
                filing_data['Disclosure_Civil_NamedInCivilProceeding'] = get_text(civil_disclosure, 'com:isNamedInCivilProceeding', namespaces)


            # Item 5: Complaint Disclosure
            complaint_disclosure = disclosure_questions_node.find('maifiler:complaintDisclosure', namespaces=namespaces)
            if complaint_disclosure:
                filing_data['Disclosure_Comp_IsComplaintPending'] = get_text(complaint_disclosure, 'maifiler:isComplaintPending', namespaces)
                filing_data['Disclosure_Comp_IsComplaintSettled'] = get_text(complaint_disclosure, 'maifiler:isComplaintSettled', namespaces)
                filing_data['Disclosure_Comp_IsFraudCasePending'] = get_text(complaint_disclosure, 'maifiler:isFraudCasePending', namespaces)
                filing_data['Disclosure_Comp_IsFraudCaseResultedAward'] = get_text(complaint_disclosure, 'maifiler:isFraudCaseResultedAward', namespaces)
                filing_data['Disclosure_Comp_IsFraudCaseSettled'] = get_text(complaint_disclosure, 'maifiler:isFraudCaseSettled', namespaces)

            # Item 6: Termination Disclosure (Added as per sample structure, though not explicitly requested as an 'Item' by user, but part of disclosures)
            termination_disclosure = disclosure_questions_node.find('maifiler:terminationDisclosure', namespaces=namespaces)
            if termination_disclosure:
                filing_data['Disclosure_Term_IsViolatedIndustryStandard'] = get_text(termination_disclosure, 'maifiler:isViloatedIndustryStandard', namespaces) # Note: 'Viloated' typo in XML, matched as is.
                filing_data['Disclosure_Term_IsInvolvedInFraud'] = get_text(termination_disclosure, 'maifiler:isInvolvedInFraud', namespaces)
                filing_data['Disclosure_Term_IsFailedToSupervise'] = get_text(termination_disclosure, 'maifiler:isFailedToSupervise', namespaces)

            # Item 7: Financial Disclosure
            financial_disclosure = disclosure_questions_node.find('maifiler:financialDisclosure', namespaces=namespaces)
            if financial_disclosure:
                filing_data['Disclosure_Fin_IsCompromised'] = get_text(financial_disclosure, 'maifiler:isCompromised', namespaces)
                filing_data['Disclosure_Fin_IsBankruptcyPetition'] = get_text(financial_disclosure, 'maifiler:isBankruptcyPetition', namespaces)
                filing_data['Disclosure_Fin_IsTrusteeApointed'] = get_text(financial_disclosure, 'maifiler:isTrusteeApointed', namespaces)
                filing_data['Disclosure_Fin_IsBondRevoked'] = get_text(financial_disclosure, 'maifiler:isBondRevoked', namespaces)

            # Item 8: Judgment/Lien Disclosure (Added as per sample structure, though not explicitly requested as an 'Item' by user, but part of disclosures)
            judgment_lien_disclosure = disclosure_questions_node.find('maifiler:judgmentLienDisclosure', namespaces=namespaces)
            if judgment_lien_disclosure:
                filing_data['Disclosure_JudgLien_IsLienAgainst'] = get_text(judgment_lien_disclosure, 'maifiler:isLienAgainst', namespaces)

        # --- Signature Info ---
        signature_info_node = form_data_root.find('maifiler:signatureInfo', namespaces=namespaces)
        filing_data['Signature_DateSigned'] = get_text(signature_info_node, 'com:signature/com:dateSigned', namespaces)
        filing_data['Signature_Signature'] = get_text(signature_info_node, 'com:signature/com:signature', namespaces)
        filing_data['Signature_Title'] = get_text(signature_info_node, 'com:signature/com:title', namespaces)

        return filing_data

    except Exception as e:
        print(f"An error occurred while extracting data from XML for {file_name}: {e}")
        return None


In [10]:
def process_ma_i_filings_in_folder(root_folder_to_scan: str) -> pd.DataFrame:
    """
    Processes all Form MA-I .txt filings within a root folder and its subfolders,
    extracting relevant information into a pandas DataFrame.
    """
    all_filings_data = []
    processed_files_count = 0
    total_files_to_process = 0

    # Namespace definitions for parsing XML - IMPORTANT: added maifiler namespace
    namespaces = {
        'maifiler': 'http://www.sec.gov/edgar/maifiler',
        'com': 'http://www.sec.gov/edgar/common_ma',
        'com1': 'http://www.sec.gov/edgar/common'
    }

    # First pass to count relevant .txt files for progress tracking
    for dirpath, dirnames, filenames in os.walk(root_folder_to_scan):
        for filename in filenames:
            if filename.endswith(".txt"):
                total_files_to_process += 1

    print(f"Starting to process {total_files_to_process} .txt files in '{root_folder_to_scan}'...")
    start_time = t.time()

    # Second pass to actually process files
    for dirpath, dirnames, filenames in os.walk(root_folder_to_scan):
        folder_name = os.path.basename(dirpath) # Get the current subfolder name
        for filename in filenames:
            if filename.endswith(".txt"):
                processed_files_count += 1
                txt_file_path = os.path.join(dirpath, filename)
                print(f"Processing file: {txt_file_path} ({processed_files_count}/{total_files_to_process})")

                try:
                    with open(txt_file_path, 'r', encoding='utf-8', errors='ignore') as f:
                        full_txt_content = f.read()

                    # 1. Extract the form type from the SEC-HEADER of the .txt file
                    form_type = extract_form_type_from_header(full_txt_content)

                    if not form_type:
                        print(f"Skipping {filename}: Could not determine FORM TYPE from header.")
                        continue

                    # *** IMPORTANT CHANGE: Filter to ONLY process MA-I forms ***
                    if form_type != 'MA-I':
                        print(f"Skipping {filename}: Form type '{form_type}' is not MA-I, this script processes MA-I only.")
                        continue

                    # 2. Extract the pure XML content for the identified form type (which must be MA-I)
                    extracted_xml_content = extract_xml_from_sec_txt_filing(full_txt_content, form_type)

                    if extracted_xml_content:
                        # 3. Parse the XML string directly (no temporary file needed)
                        root = ET.fromstring(extracted_xml_content)

                        # 4. Extract data into a dictionary using the refined parse_ma_i_filing
                        extracted_data_dict = parse_ma_i_filing(root, namespaces, file_name=filename, folder_name=folder_name)

                        # Add header data extracted from the raw .txt content (if not already in XML)
                        # These are often consistent and easy to get from the text header.
                        accession_number_match = re.search(r'ACCESSION NUMBER:\s*(\S+)', full_txt_content)
                        conformed_submission_type_match = re.search(r'CONFORMED SUBMISSION TYPE:\s*(\S+)', full_txt_content)
                        filed_as_of_date_match = re.search(r'FILED AS OF DATE:\s*(\S+)', full_txt_content)

                        if extracted_data_dict: # Ensure dict is not None
                            extracted_data_dict['AccessionNumber_Header'] = accession_number_match.group(1) if accession_number_match else None
                            extracted_data_dict['SubmissionType_Header'] = conformed_submission_type_match.group(1) if conformed_submission_type_match else None
                            extracted_data_dict['FiledAsOfDate_Header'] = filed_as_of_date_match.group(1) if filed_as_of_date_match else None
                            all_filings_data.append(extracted_data_dict)
                            print(f"Successfully extracted data from {filename} (Form Type: {form_type}).")
                        else:
                            print(f"No structured data extracted from {filename} (Form Type: {form_type}) despite XML content being found.")

                    else:
                        print(f"No valid XML content (starting with <?xml?> and containing <edgarSubmission>...</edgarSubmission>) extracted from {filename} for form type '{form_type}'.")

                except ET.ParseError as e:
                    print(f"XML Parse Error in {txt_file_path}: {str(e)}")
                except Exception as e:
                    print(f"An unexpected error occurred while processing {txt_file_path}: {str(e)}")

    print(f"\nFinished processing all files.")
    print(f"Number of filings processed: {len(all_filings_data)}")

    if all_filings_data:
        start_concat = t.time()
        combined_df = pd.DataFrame(all_filings_data)
        end_concat = t.time()
        print(f"Time to convert to DataFrame: {end_concat - start_concat:.2f} seconds")

        total_processing_time = end_concat - start_time
        print(f"Total script execution time: {total_processing_time:.2f} seconds")
        print(f"Combined DataFrame shape: {combined_df.shape}")

        # Flatten 'MunicipalAdvisorOffices', 'PriorEmployers', 'OtherBusinesses' for better DataFrame representation
        if 'MunicipalAdvisorOffices' in combined_df.columns:
            combined_df['MunicipalAdvisorOffices_Details'] = combined_df['MunicipalAdvisorOffices'].apply(
                lambda x: "; ".join([f"Firm: {d.get('FirmName', 'N/A')}, Commenced: {d.get('EmploymentCommencedDate', 'N/A')}, Independent: {d.get('IsIndependentRelationship', 'N/A')}, Offices: [{d.get('OfficeAddresses', 'N/A')}]" for d in x]) if x else None
            )
            combined_df = combined_df.drop(columns=['MunicipalAdvisorOffices'])

        if 'PriorEmployers' in combined_df.columns:
            combined_df['PriorEmployers_Details'] = combined_df['PriorEmployers'].apply(
                lambda x: "; ".join([f"Name: {d.get('Name', 'N/A')}, Start: {d.get('StartDate', 'N/A')}, End: {d.get('EndDate', 'N/A')}, Position: {d.get('Position', 'N/A')}" for d in x]) if x else None
            )
            combined_df = combined_df.drop(columns=['PriorEmployers'])

        if 'OtherBusinesses' in combined_df.columns:
            combined_df['OtherBusinesses_Details'] = combined_df['OtherBusinesses'].apply(
                lambda x: "; ".join([f"Name: {d.get('Name', 'N/A')}, Nature: {d.get('Nature', 'N/A')}, Position: {d.get('Position', 'N/A')}, Start: {d.get('StartDate', 'N/A')}, Hours/Months: {d.get('ApproxHoursOrMonths', 'N/A')}" for d in x]) if x else None
            )
            combined_df = combined_df.drop(columns=['OtherBusinesses'])

        return combined_df

    else:
        total_processing_time = t.time() - start_time
        print(f"Total script execution time: {total_processing_time:.2f} seconds")
        print("No data extracted or DataFrame is empty.")
        return pd.DataFrame() # Return empty DataFrame if no data


In [None]:

# --- Main execution block ---
if __name__ == "__main__":
    # --- Configuration for folder scanning ---
    # root_directory_for_filings = r"C:\Users\barid\Documents\Papers\Muni_adv_PBF\Data\0001614240"
    root_directory_for_filings = r"D:\PhD_Fin\Muni_adv_PBF\Data2"

    # Process the filings and get the DataFrame
    ma_i_dataframe = process_ma_i_filings_in_folder(root_directory_for_filings)

    if not ma_i_dataframe.empty:
        print("\n--- Extracted Data DataFrame (First 5 Rows) ---")
        pd.set_option('display.max_columns', None) # Show all columns
        pd.set_option('display.width', 1000)       # Expand display width
        print(ma_i_dataframe.head()) # Print only the head for brevity
        print(f"\nDataFrame columns: {ma_i_dataframe.columns.tolist()}")

        output_csv_path = os.path.join(root_directory_for_filings, "sec_form_ma_i.csv")
        ma_i_dataframe.to_csv(output_csv_path, index=False)
        print(f"\nCombined data saved to {output_csv_path}")

    else:
        print("\nNo data extracted or DataFrame is empty.")


Starting to process 31399 .txt files in 'D:\PhD_Fin\Muni_adv_PBF\Data2'...
Processing file: D:\PhD_Fin\Muni_adv_PBF\Data2\0000009211\MA_0000009211_14_000018.txt (1/31399)
Skipping MA_0000009211_14_000018.txt: Form type 'MA' is not MA-I, this script processes MA-I only.
Processing file: D:\PhD_Fin\Muni_adv_PBF\Data2\0000009211\MA_0000009211_14_000019.txt (2/31399)
Successfully extracted data from MA_0000009211_14_000019.txt (Form Type: MA-I).
Processing file: D:\PhD_Fin\Muni_adv_PBF\Data2\0000009211\MA_0000009211_14_000020.txt (3/31399)
Successfully extracted data from MA_0000009211_14_000020.txt (Form Type: MA-I).
Processing file: D:\PhD_Fin\Muni_adv_PBF\Data2\0000009211\MA_0000009211_14_000021.txt (4/31399)
Successfully extracted data from MA_0000009211_14_000021.txt (Form Type: MA-I).
Processing file: D:\PhD_Fin\Muni_adv_PBF\Data2\0000009211\MA_0000009211_14_000022.txt (5/31399)
Successfully extracted data from MA_0000009211_14_000022.txt (Form Type: MA-I).
Processing file: D:\PhD_Fi

In [13]:
ma_i_dataframe

Unnamed: 0,FileName,FolderName,SubmissionType_XML,FilerId,FilerCcc,FilerFileNumber,ContactName,ContactPhoneNumber,ContactEmail,NotificationEmails,IsAmendment,HasMoreThanOneAdvisoryFirms,NoOfAdvisoryFirms,Applicant_FirstName,Applicant_MiddleName,Applicant_LastName,ResidentialHistoryPresent,CurrentEmployer_Name,CurrentEmployer_StartDate,CurrentEmployer_EndDate,CurrentEmployer_Position,CurrentEmployer_IsRelatedToMunicipalAdvisor,CurrentEmployer_IsRelatedToInvestment,Disclosure_Crim_ConvictedOfFelony,Disclosure_Crim_ChargedWithFelony,Disclosure_Crim_ConvictedOfMisdemeanor,Disclosure_Crim_ChargedWithMisdemeanor,Disclosure_Reg_MadeFalseStatement,Disclosure_Reg_ViolatedRegulation,Disclosure_Reg_OrderAgainst,Disclosure_Reg_DeniedLicense,Disclosure_Reg_ViolatedSecurityAct,Disclosure_Reg_AssociationBared,Disclosure_Inv_IsInvestigated,Disclosure_Civil_IsEnjoined,Disclosure_Civil_FoundInViolationOfRegulation,Disclosure_Civil_IsDismissed,Disclosure_Civil_NamedInCivilProceeding,Disclosure_Comp_IsComplaintPending,Disclosure_Comp_IsComplaintSettled,Disclosure_Comp_IsFraudCasePending,Disclosure_Comp_IsFraudCaseResultedAward,Disclosure_Comp_IsFraudCaseSettled,Disclosure_Term_IsViolatedIndustryStandard,Disclosure_Term_IsInvolvedInFraud,Disclosure_Term_IsFailedToSupervise,Disclosure_Fin_IsCompromised,Disclosure_Fin_IsBankruptcyPetition,Disclosure_Fin_IsTrusteeApointed,Disclosure_Fin_IsBondRevoked,Disclosure_JudgLien_IsLienAgainst,Signature_DateSigned,Signature_Signature,Signature_Title,AccessionNumber_Header,SubmissionType_Header,FiledAsOfDate_Header,MunicipalAdvisorOffices_Details,PriorEmployers_Details,OtherBusinesses_Details
0,MA_0000009211_14_000019.txt,0000009211,MA-I,0000009211,XXXXXXXX,,Tamara Olszewski,414-298-7590,registrationdepartment@rwbaird.com,,N,N,1,Erica,Lynn,Willems,Y,Robert W. Baird & Co.,07-2001,,FICM RISK & REGULATORY AFFAIRS MANAGER,Y,Y,,,N,N,,,,,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,07-21-2014,Tamara Olszewski,Registrations Manager,0000009211-14-000019,MA-I,20140721,"Firm: ROBERT W. BAIRD & CO. INCORPORATED, Comm...",,
1,MA_0000009211_14_000020.txt,0000009211,MA-I,0000009211,XXXXXXXX,,Tamara Olszewski,414-298-7590,registrationdepartment@rwbaird.com,,N,N,1,Todd,LEGGATT,Barnes,Y,ROBERT W. BAIRD & CO. INCORPORATED,07-2009,,Public Finance Banker,Y,Y,,,N,N,,,,,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,07-22-2014,Tamara Olszewski,Registration Manager,0000009211-14-000020,MA-I,20140722,"Firm: ROBERT W. BAIRD & CO. INCORPORATED, Comm...","Name: WACHOVIA CAPITAL MARKETS, LLC, Start: 11...",
2,MA_0000009211_14_000021.txt,0000009211,MA-I,0000009211,XXXXXXXX,,Tamara Olszewski,414-298-7590,registrationdepartment@rwbaird.com,,N,N,1,LESLIE,LEE,BEAR,Y,ROBERT W. BAIRD & CO. INC.,06-2008,,Public Finance Banker,Y,Y,,,N,N,,,,,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,07-22-2014,Tamara Olszewski,Registration Manager,0000009211-14-000021,MA-I,20140722,"Firm: ROBERT W. BAIRD & CO. INCORPORATED, Comm...","Name: FERRIS, BAKER WATTS INCORPORATED, Start:...",
3,MA_0000009211_14_000022.txt,0000009211,MA-I,0000009211,XXXXXXXX,,Tamara Olszewski,414-298-7590,registrationdepartment@rwbaird.com,,N,N,1,ROBERT,KIRK,LONDON,Y,ROBERT W. BAIRD & CO.,06-2012,,VICE PRESIDENT - PUBLIC FINANCE,Y,Y,,,N,N,,,,,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,07-22-2014,Tamara Olszewski,Registration Manager,0000009211-14-000022,MA-I,20140722,"Firm: ROBERT W. BAIRD & CO. INCORPORATED, Comm...","Name: RAYMOND JAMES MORGAN KEEGAN & COMPANY, S...",
4,MA_0000009211_14_000023.txt,0000009211,MA-I,0000009211,XXXXXXXX,,Tamara Olszewski,414-298-7590,registrationdepartment@rwbaird.com,,N,N,1,JOHN,ANTHONY,MEHAN,Y,ROBERT W. BAIRD & CO. INCORPORATED,06-1987,,INVESTMENT BANKER,Y,Y,,,N,N,,,,,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,07-22-2014,Tamara Olszewski,Registration Manager,0000009211-14-000023,MA-I,20140722,"Firm: ROBERT W. BAIRD & CO. INCORPORATED, Comm...",,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23081,MA_0002041221_25_000016.txt,0002041221,MA-I/A,0002041221,XXXXXXXX,,Walter D. Lomax III,5019787915,blomax@crewsfs.com,scott@fsbeardsley.com; ainverso@muniadvisors.com,N,N,1,Matthew,Andrew,Spoerndle,Y,"First Security Municipal Advisors, Inc.",01-2025,,Sr. Managing Director,Y,N,,,N,N,,,,,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,02-05-2025,Walter D. Lomax,Chief Compliance Officer,0002041221-25-000016,MA-I/A,20250205,"Firm: First Security Municipal Advisors, Inc.,...","Name: Phoenix Advisors, LLC, Start: 02-2008, E...",
23082,MA_0002041221_25_000018.txt,0002041221,MA-I/A,0002041221,XXXXXXXX,,Walter D. Lomax III,5019787915,blomax@crewsfs.com,scott@fsbeardsley.com; ainverso@muniadvisors.com,N,N,1,Riley,Lewis,Green,Y,"First Security Municipal Advisors, Inc.",01-2025,,Analyst,Y,N,,,N,N,,,,,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,02-06-2025,Walter Devillers Lomax,Chief Compliance Officer,0002041221-25-000018,MA-I/A,20250206,"Firm: First Security Municipal Advisors, Inc.,...","Name: Phoenix Advisors, LLC, Start: 10-2014, E...",
23083,MA_0002041221_25_000019.txt,0002041221,MA-I/A,0002041221,XXXXXXXX,,Walter D. Lomax III,5019787915,blomax@crewsfs.com,scott@fsbeardsley.com; ainverso@muniadvisors.com,N,N,1,Roberta,Salema,Acampora,Y,"First Security Municipal Advisors, Inc.",01-2025,,Managing Director,Y,N,,,N,N,,,,,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,02-06-2025,Walter Devillers Lomax,Chief Compliance Officer,0002041221-25-000019,MA-I/A,20250206,"Firm: First Security Municipal Advisors, Inc.,...","Name: Phoenix Advisors, LLC, Start: 01-2013, E...",
23084,MA_0002053271_25_000003.txt,0002053271,MA-I,0002053271,XXXXXXXX,,Kevin George Quinn,4104562376,kquinn@patriot-advisors.com,kquinn@patriot-advisors.com,N,N,1,Kevin,George,Quinn,Y,Patriot Advisors LLC,01-2025,,Managing Member,Y,N,,,N,N,,,,,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,Y,,,,0002053271-25-000003,MA-I,20250204,"Firm: Patriot Advisors LLC, Commenced: 01-23-2...","Name: Quinn Consulting, Start: 07-2024, End: 0...","Name: Quinn Consulting, Nature: Strategic plan..."


In [17]:
#check missings

missing_counts = ma_i_dataframe.isnull().sum()
print("Missings = \n", missing_counts)

missing_pct = (missing_counts / len(ma_i_dataframe))*100
print("Missing Pct = \n", missing_pct)

Missings = 
 FileName                                             0
FolderName                                           0
SubmissionType_XML                                   0
FilerId                                              0
FilerCcc                                             0
FilerFileNumber                                      0
ContactName                                          0
ContactPhoneNumber                                   0
ContactEmail                                         0
NotificationEmails                                   0
IsAmendment                                          0
HasMoreThanOneAdvisoryFirms                          0
NoOfAdvisoryFirms                                    0
Applicant_FirstName                                  0
Applicant_MiddleName                                 0
Applicant_LastName                                   0
ResidentialHistoryPresent                            0
CurrentEmployer_Name                                

In [None]:
print(ma_i_dataframe.columns)