In [None]:
import os
import re
import pandas as pd
from bs4 import BeautifulSoup

def extract_info_from_xml(xml_content):
    """Extract Form MA fields from the text of one rendered filing document.

    The markup is parsed with BeautifulSoup's ``lxml`` parser. Most fields are
    resolved through a chain of lookups joined with ``or`` (``<input>`` value,
    ``fakeBox`` div following a matching label, sibling cell of a label, and
    finally a regex over the raw text); the first truthy result wins.

    Args:
        xml_content: Full text of the filing document.

    Returns:
        dict mapping output column names to the extracted value. Fields that
        no lookup could resolve are ``None``; the three compensation checkbox
        fields and the four disclosure fields are always ``'Yes'`` or ``'No'``.
    """
    checked_radio_src = '/Images/radio-checked.jpg'
    soup = BeautifulSoup(xml_content, 'lxml')
    extracted_data = {}

    def text_contains(needle):
        """Return a matcher for text that contains *needle*, ignoring case."""
        lowered = needle.lower()
        return lambda text: text and lowered in text.lower()

    def get_text_or_none(tag):
        """Return the stripped text of *tag*, or None when *tag* is missing."""
        return tag.get_text(strip=True) if tag else None

    def fakebox_after_label(label_text):
        """Return the text of the first ``fakeBox`` div after a matching label."""
        label = soup.find(string=text_contains(label_text))
        if label:
            parent = label.find_parent(['div', 'td'])
            if parent:
                fakebox = parent.find_next('div', class_='fakeBox')
                if fakebox:
                    return fakebox.get_text(strip=True)
        return None

    def get_input_or_fakebox_value(key):
        """Look up *key* as an ``<input>`` name/id, then as a fakeBox label."""
        input_tag = soup.find('input', {'name': key}) or soup.find('input', {'id': key})
        if input_tag and input_tag.has_attr('value'):
            return input_tag['value'].strip()
        return fakebox_after_label(key)

    def get_label_value(label_text):
        """Return the text of the div/td sibling that follows a matching label."""
        label_tag = soup.find(string=text_contains(label_text))
        if label_tag:
            parent = label_tag.find_parent(['div', 'td'])
            if parent:
                return get_text_or_none(parent.find_next_sibling(['div', 'td']))
        return None

    def extract_yes_no_by_label(label):
        """Return 'Yes'/'No' depending on a checked radio in the label's tableRow."""
        section = soup.find(string=text_contains(label))
        if section:
            parent = section.find_parent('div', class_='tableRow')
            if parent:
                yes = parent.find('img', {'alt': 'radio button checked'})
                return 'Yes' if yes else 'No'
        return None

    def extract_checkbox(label):
        """Return 'Yes' when a checked-box image precedes the matching span."""
        span = soup.find('span', string=text_contains(label))
        if span:
            checked = span.find_previous_sibling('img', {'src': '/Images/box-checked.jpg'})
            return 'Yes' if checked else 'No'
        return 'No'

    def extract_fallback_regex(label, value_pattern=r'[\w\-\.@]+'):
        """Search the raw text for ``label [:|-] value`` and return the value.

        The label is escaped so punctuation in it (e.g. ``No.``) is matched
        literally, and the shape of the value is supplied separately so that
        exactly one capture group is ever present in the pattern.
        """
        pattern = re.compile(
            rf'{re.escape(label)}\s*[:\-]?\s*({value_pattern})', re.IGNORECASE
        )
        match = pattern.search(xml_content)
        return match.group(1) if match else None

    def h3_input_value(*phrases):
        """Return the text-input value inside the first <h3> mentioning a phrase."""
        for h3 in soup.find_all('h3'):
            heading = h3.get_text(strip=True).lower()
            if any(phrase in heading for phrase in phrases):
                input_tag = h3.find('input', {'type': 'text'})
                if input_tag and input_tag.has_attr('value'):
                    return input_tag['value'].strip()
        return None

    def extract_principal_office_zip():
        """Return a US ZIP found in a fakeBox after a 'postal code' label.

        Falls back to the value of the ``<input id="zip">`` element.
        """
        for label in soup.find_all(string=text_contains('postal code')):
            parent_div = label.find_parent('div')
            if parent_div:
                fakebox = parent_div.find_next('div', class_='fakeBox')
                if fakebox:
                    zip_candidate = fakebox.get_text(strip=True)
                    if re.match(r'^\d{5}(-\d{4})?$', zip_candidate):
                        return zip_candidate
        zip_input = soup.find('input', {'id': 'zip'})
        if zip_input and zip_input.has_attr('value'):
            return zip_input['value'].strip()
        return None

    def extract_type_of_organization():
        """Return the first known organization type followed by a checked radio."""
        org_types = [
            "Corporation",
            "Partnership",
            "Limited Liability Company",
            "Sole Proprietorship",
            "Limited Liability Partnership",
            "Limited Partnership",
            "Other",
        ]
        # NOTE(review): find_next() scans the rest of the document, so any
        # checked radio appearing after the label text satisfies this test, and
        # "Partnership" also matches the longer partnership names. The markup
        # is not visible here, so the lookup is left as-is - verify the result
        # against a few filings before relying on this column.
        for org in org_types:
            for candidate in soup.find_all(string=text_contains(org)):
                parent = candidate.find_parent(['div', 'td'])
                if parent and parent.find_next('img', {'src': checked_radio_src}):
                    return org
        return None

    def is_plausible_website(value):
        """Accept values that look like a URL and are not a w3.org namespace."""
        return bool(
            value
            and value.lower().startswith(('http', 'www'))
            and 'w3.org' not in value.lower()
        )

    def extract_website():
        """Return the website from the ``websiteAddr`` input or a nearby fakeBox."""
        input_tag = (
            soup.find('input', {'id': 'websiteAddr'})
            or soup.find('input', {'name': 'websiteAddr'})
        )
        if input_tag and input_tag.has_attr('value'):
            website = input_tag['value'].strip()
            if is_plausible_website(website):
                return website

        text = fakebox_after_label('website address')
        if is_plausible_website(text):
            return text
        return None

    def extract_solicitation_activity():
        """Return 'Yes'/'No' from the checked radio in the solicitation row."""
        for row in soup.find_all('div', class_='tableRow'):
            label = row.find('span', string=text_contains('engage in solicitation activities'))
            if label:
                for img in row.find_all('img', {'src': checked_radio_src}):
                    # The answer is read from the text node preceding the image.
                    text = img.find_previous(string=True)
                    if text and 'yes' in text.lower():
                        return 'Yes'
                    elif text and 'no' in text.lower():
                        return 'No'
        return None

    def extract_employee_count_robust(label_text, input_key):
        """Return an employee count via radio row, input/fakeBox, then regex."""
        # 1. Radio-button layout: text preceding the checked image in the row.
        for row in soup.find_all('div', class_='tableRow'):
            if row.find(string=text_contains(label_text)):
                checked = row.find('img', {'src': checked_radio_src})
                if checked:
                    value = checked.find_previous(string=True)
                    if value:
                        return value.strip()

        # 2. Input element or fakeBox; an empty string still counts as found.
        value = get_input_or_fakebox_value(input_key)
        if value is not None:
            return value

        # 3. Regex over the raw text.
        return extract_fallback_regex(label_text, r'[\w\-]+')

    def extract_gross_revenues():
        """Return the gross-revenue bracket whose table cell holds a checked radio.

        A ``string=option`` tag search only matches cells whose sole content is
        the text, so such a cell can never also contain the radio image. The
        text node is located first and its enclosing ``<td>`` inspected instead.
        """
        options = (
            "Less than $1,000,000",
            "$1,000,000 to $5,000,000",
            "$5,000,001 to $25,000,000",
            "More than $25,000,000",
        )
        radio_src = re.compile(r'radio', re.IGNORECASE)
        for option in options:
            for node in soup.find_all(string=lambda text: text and option in text):
                cell = node.find_parent('td')
                if cell is None:
                    continue
                radios = cell.find_all('img', src=radio_src)
                # Cells holding several radios are ambiguous and are skipped.
                if len(radios) == 1 and radios[0].get('src') == checked_radio_src:
                    return option
        return None

    # --- Identification and contact ---
    extracted_data['Form Name'] = get_text_or_none(soup.find('div', class_='title1'))
    extracted_data['CIK'] = (
        get_input_or_fakebox_value('CIK')
        or get_input_or_fakebox_value('cik')
        or get_label_value('CIK')
        or extract_fallback_regex('CIK')
    )
    extracted_data['Firm Legal Name'] = (
        get_input_or_fakebox_value('legalNm')
        or get_label_value('Firm Name')
        or extract_fallback_regex('Firm Name')
    )
    extracted_data['IRS EIN'] = (
        get_input_or_fakebox_value('IRSId')
        or get_input_or_fakebox_value('irsEin')
        or extract_fallback_regex('IRS EIN')
    )
    extracted_data['SEC File No. MA-T'] = (
        get_input_or_fakebox_value('maTFileNumber')
        or get_input_or_fakebox_value('secFileNumMat')
        or extract_fallback_regex('SEC File No. MA-T')
    )
    extracted_data['Organization CRD No.'] = (
        get_input_or_fakebox_value('CrdNo')
        or get_input_or_fakebox_value('orgCrdNum')
        or get_label_value('Organization CRD No.')
        or extract_fallback_regex('Organization CRD No.')
    )
    extracted_data['Contact Person'] = (
        get_input_or_fakebox_value('contactPersonNm')
        or get_label_value('Contact Person')
        or extract_fallback_regex('Contact Person')
    )
    extracted_data['Contact Phone'] = (
        get_input_or_fakebox_value('contactPh')
        or get_input_or_fakebox_value('mainPh')
        or extract_fallback_regex('Contact Phone')
    )
    extracted_data['Contact Email'] = (
        get_input_or_fakebox_value('contactEmailAddr')
        or get_label_value('Contact Email Address')
        or extract_fallback_regex('Contact Email Address')
    )

    # --- Principal office ---
    extracted_data['Principal Office Street Address'] = (
        get_input_or_fakebox_value('street1')
        or get_label_value('Street Address 1')
        or extract_fallback_regex('Street Address 1')
    )
    extracted_data['Principal Office City'] = (
        get_input_or_fakebox_value('city')
        or get_label_value('City')
        or extract_fallback_regex('City')
    )
    extracted_data['Principal Office State'] = (
        get_input_or_fakebox_value('state')
        or get_label_value('State')
        or extract_fallback_regex('State')
    )
    extracted_data['Principal Office Zip'] = (
        extract_principal_office_zip()
        or get_label_value('Zip')
        or extract_fallback_regex('Zip')
    )
    extracted_data['Principal Office Website'] = extract_website()

    # --- Organization ---
    extracted_data['Type of Organization'] = extract_type_of_organization()
    extracted_data['Date of Formation'] = (
        get_input_or_fakebox_value('dtOfForm')
        or h3_input_value('date of organization', 'date of formation')
        or extract_fallback_regex('Date of Formation', r'\d{1,2}[-/]\d{1,2}[-/]\d{2,4}')
    )
    extracted_data['State of Incorporation/Formation'] = (
        get_input_or_fakebox_value('stateOfInc')
        or get_input_or_fakebox_value('orgJur')
        or h3_input_value('state of incorporation', 'state of formation')
        or extract_fallback_regex('State of Incorporation/Formation')
    )

    # --- Compensation ---
    extracted_data['Compensation contingent on transaction closing or based on size'] = extract_yes_no_by_label('compensation')
    extracted_data['Receives compensation based on par amount'] = extract_checkbox('par amount')
    extracted_data['Receives compensation contingent on closing'] = extract_checkbox('contingent on the closing')
    extracted_data['Receives compensation based on size'] = extract_checkbox('based on the size')

    # --- Solicitation ---
    extracted_data['Engages in solicitation activities'] = extract_solicitation_activity()

    # --- Employees and clients ---
    extracted_data['Total Number of Employees'] = extract_employee_count_robust(
        'Total number of employees', 'numEmpl'
    )
    extracted_data['Number of Municipal Advisor Employees'] = extract_employee_count_robust(
        'Number of municipal advisor employees', 'numMaEmpl'
    )
    extracted_data['Number of Municipal Entity Clients'] = (
        get_input_or_fakebox_value('numMaClients')
        or extract_fallback_regex('Number of Municipal Entity Clients')
    )
    extracted_data['Number of Obligated Person Clients'] = (
        get_input_or_fakebox_value('numObligatedClients')
        or extract_fallback_regex('Number of Obligated Person Clients')
    )
    extracted_data['Total Number of Clients'] = (
        get_input_or_fakebox_value('totNumClients')
        or extract_fallback_regex('Total Number of Clients')
    )

    # --- Business size ---
    extracted_data['Gross Revenues'] = extract_gross_revenues()
    extracted_data['Fiscal Year End Date'] = (
        get_input_or_fakebox_value('fiscalYrEndDt')
        or extract_fallback_regex('Fiscal Year End Date')
    )

    # --- Disclosures ---
    # A disclosure counts as 'Yes' when the <h1> naming it is followed by a
    # sibling div.form1 that contains any text.
    for label in [
        "Criminal Action Disclosure",
        "Regulatory Action Disclosure",
        "Civil Judicial Action Disclosure",
        "Customer Complaint/Arbitration/Civil Litigation Disclosure",
    ]:
        heading_text = label.replace(" Disclosure", "")
        heading = soup.find('h1', string=lambda text: text and heading_text in text)
        form_div = heading.find_next_sibling('div', class_='form1') if heading else None
        extracted_data[label] = 'Yes' if form_div and form_div.get_text(strip=True) else 'No'

    return extracted_data

def process_multiple_ma_files(file_paths):
    """Parse several Form MA documents and combine them into one DataFrame.

    Each file is read as UTF-8 and re-read as ISO-8859-1 when UTF-8 decoding
    fails. Progress and, per file, the list of fields that came back as
    ``None`` are printed to stdout.

    Args:
        file_paths: Iterable of paths to the documents to parse.

    Returns:
        pandas.DataFrame with one row per parsed file; each row carries the
        extracted fields plus a ``'Source File'`` column holding the file's
        base name. An empty input yields an empty DataFrame.

    Raises:
        OSError: If a file cannot be opened (e.g. ``FileNotFoundError``).
    """
    all_data = []
    for file_path in file_paths:
        print(f'Processing file:{file_path}')
        # Only the read can fail with a decode error, so only it is retried;
        # parsing happens once, after the text has been obtained.
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                xml_content = f.read()
        except UnicodeDecodeError:
            with open(file_path, 'r', encoding='iso-8859-1') as f:
                xml_content = f.read()

        data = extract_info_from_xml(xml_content)
        if data:
            data['Source File'] = os.path.basename(file_path)
            all_data.append(data)
            missing_fields = [k for k, v in data.items() if v is None]
            if missing_fields:
                print(f"Missing fields in {file_path}: {missing_fields}")
    return pd.DataFrame(all_data)


# --- Example usage ---
if __name__ == "__main__":
    # Root folder that is walked recursively for Form MA primary documents.
    # Single-filer folders tried earlier:
    # folder_path = r"C:\Users\barid\Documents\Papers\Muni_adv_PBF\ma_filings\0002020463" #works somewhat
    # folder_path = r"C:\Users\barid\Documents\Papers\Muni_adv_PBF\ma_filings\0001623387" #works somewhat
    # folder_path = r"C:\Users\barid\Documents\Papers\Muni_adv_PBF\ma_filings\0001617993"
    folder_path = r"D:\PhD_Fin\Muni_adv_PBF\Data2"
    target_substring = "FormMA_X01_primary_doc"

    # Collect every .xml file whose name contains the target substring
    # (substring comparison ignores case; the extension check does not).
    list_of_files = []
    for root, dirs, files in os.walk(folder_path):
        for file in files:
            if target_substring.lower() in file.lower() and file.endswith(".xml"):
                file_path = os.path.join(root, file)
                list_of_files.append(file_path)

    # Single-file sample list; not used by the batch run below.
    dummy_file_path = [r"C:\Users\barid\Documents\Papers\Muni_adv_PBF\ma_filings\0002020463\MA_0002020463_24_000004_xslFormMA_X01_primary_doc.xml"]

    df = process_multiple_ma_files(list_of_files)

    print("\nCombined DataFrame from multiple files:")
    # A bare `df` expression inside an `if` block displays nothing, so the
    # frame is printed explicitly under the header above.
    print(df)



In [2]:
# Show the combined DataFrame as this cell's output.
df

Unnamed: 0,Form Name,CIK,Firm Legal Name,IRS EIN,SEC File No. MA-T,Organization CRD No.,Contact Person,Contact Phone,Contact Email,Principal Office Street Address,...,Number of Municipal Entity Clients,Number of Obligated Person Clients,Total Number of Clients,Gross Revenues,Fiscal Year End Date,Criminal Action Disclosure,Regulatory Action Disclosure,Civil Judicial Action Disclosure,Customer Complaint/Arbitration/Civil Litigation Disclosure,Source File
0,FORM MA,0000009211,ROBERT W. BAIRD & CO. INCORPORATED,39-6037917,866-00120-00,8158,DeVona WrightCottrell,,dwrightcottrell@rwbaird.com,215 South Washington Square,...,,,,,,No,No,No,No,MA_0000009211_14_000018_xslFormMA_X01_primary_...
1,FORM MA,0000009211,ROBERT W. BAIRD & CO. Inc,39-6037917,866-00120-00,8158,DeVona WrightCottrell,,dwrightcottrell@rwbaird.com,124 West Allegan Street,...,,,,,,No,No,No,No,MA_0000009211_15_000011_xslFormMA_X01_primary_...
2,FORM MA,0000009211,ROBERT W. BAIRD & CO. INCORPORATED,39-6037917,866-00120-00,8158,DeVona WrightCottrell,,dwrightcottrell@rwbaird.com,124 West Allegan Street,...,,,,,,No,No,No,No,MA_0000009211_15_000026_xslFormMA_X01_primary_...
3,FORM MA,0000009211,ROBERT W. BAIRD & CO. INCORPORATED,39-6037917,866-00120-00,8158,DeVona WrightCottrell,,dwrightcottrell@rwbaird.com,124 West Allegan Street,...,,,,,,No,No,No,No,MA_0000009211_16_000043_xslFormMA_X01_primary_...
4,FORM MA,0000009211,ROBERT W. BAIRD & CO. INCORPORATED,39-6037917,866-00120-00,8158,DeVona WrightCottrell,,dwrightcottrell@rwbaird.com,124 West Allegan Street,...,,,,,,No,No,No,No,MA_0000009211_16_000048_xslFormMA_X01_primary_...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7856,FORM MA,0002041221,"First Security Municipal Advisors, Inc.",33-1383623,,,Walter D. Lomax III,,blomax@crewsfs.com,625 Farnsworth Avenue,...,,,,,,No,No,No,No,MA_0002041221_24_000006_xslFormMA_X01_primary_...
7857,FORM MA,0002041221,"First Security Municipal Advisors, Inc.",33-1383623,,,Walter D. Lomax III,,blomax@crewsfs.com,625 Farnsworth Avenue,...,,,,,,No,No,No,No,MA_0002041221_25_000015_xslFormMA_X01_primary_...
7858,FORM MA,0002041221,"First Security Municipal Advisors, Inc.",33-1383623,,,Walter D. Lomax III,,blomax@crewsfs.com,625 Farnsworth Avenue,...,,,,,,No,No,No,No,MA_0002041221_25_000017_xslFormMA_X01_primary_...
7859,FORM MA,0002053271,Patriot Advisors LLC,33-3125094,,,Kevin George Quinn,,kquinn@patriot-advisors.com,,...,,,,,,No,No,No,No,MA_0002053271_25_000007_xslFormMA_X01_primary_...


In [4]:
# Write the combined results to CSV, leaving out the DataFrame index.
output_csv_path = r'C:\Users\barid\Documents\Papers\Muni_adv_PBF\Data\output_step_3i.csv'
df.to_csv(output_csv_path, index=False)

In [6]:
# Reconciliation: walk the data folder again, list every Form MA primary
# document that matches the selection rule, and report how many were found.
ctr = 0
wanted = target_substring.lower()
for root, dirs, files in os.walk(folder_path):
    for file in files:
        # Skip anything that is not a matching .xml file.
        if wanted not in file.lower() or not file.endswith(".xml"):
            continue
        file_path = os.path.join(root, file)
        print(f"File path: {file_path}")
        ctr += 1

print(f"total number of MA files: {ctr}")


File path: D:\PhD_Fin\Muni_adv_PBF\Data2\0000009211\MA_0000009211_14_000018_xslFormMA_X01_primary_doc.xml
File path: D:\PhD_Fin\Muni_adv_PBF\Data2\0000009211\MA_0000009211_15_000011_xslFormMA_X01_primary_doc.xml
File path: D:\PhD_Fin\Muni_adv_PBF\Data2\0000009211\MA_0000009211_15_000026_xslFormMA_X01_primary_doc.xml
File path: D:\PhD_Fin\Muni_adv_PBF\Data2\0000009211\MA_0000009211_16_000043_xslFormMA_X01_primary_doc.xml
File path: D:\PhD_Fin\Muni_adv_PBF\Data2\0000009211\MA_0000009211_16_000048_xslFormMA_X01_primary_doc.xml
File path: D:\PhD_Fin\Muni_adv_PBF\Data2\0000009211\MA_0000009211_16_000065_xslFormMA_X01_primary_doc.xml
File path: D:\PhD_Fin\Muni_adv_PBF\Data2\0000009211\MA_0000009211_16_000069_xslFormMA_X01_primary_doc.xml
File path: D:\PhD_Fin\Muni_adv_PBF\Data2\0000009211\MA_0000009211_17_000003_xslFormMA_X01_primary_doc.xml
File path: D:\PhD_Fin\Muni_adv_PBF\Data2\0000009211\MA_0000009211_18_000008_xslFormMA_X01_primary_doc.xml
File path: D:\PhD_Fin\Muni_adv_PBF\Data2\00000