In [2]:
def parse(response, infos):
    # 2. PARSE THE XML RESPONSE (Our Previous Logic)
    # ================================================
    try:
        # The response.text is the XML data
        xml_data = response.text
        
        # Parse the entire document as XML
        root = ET.fromstring(xml_data)

        # Find the <update id="form"> tag and get its text content
        # This gives us the clean HTML string
        html_string = root.find('.//update[@id="form"]').text

        # Parse the extracted HTML string with BeautifulSoup
        soup = BeautifulSoup(html_string, 'html.parser')

        # Find the results table (the colon ':' must be escaped with a backslash)
        results_table = soup.find('table', id='form:grid')
        
        if not results_table:
            print("❌ Could not find the results table in the response.")
        else:
            # Find all table rows in the tbody
            rows = results_table.find('tbody').find_all('tr')
            print(f"\nFound {len(rows)} results on this page.\n")

            # 3. EXTRACT AND DISPLAY DATA FROM THE TABLE
            # =============================================
            for i, row in enumerate(rows):
                cells = row.find_all('td')
                if len(cells) < 8: # Ensure the row has enough columns
                    continue
                
                num_ident = cells[0].text.strip()
                numero = cells[1].text.strip()
                origem = cells[2].text.strip()
                tipo_norma = cells[3].text.strip()
                data_pub = cells[4].text.strip()
                ementa = cells[5].text.strip()
                link_tag = cells[7].find('a', {'title': 'Texto Completo'})
                link_url = link_tag['href'] if link_tag else 'N/A'

                infos.append([tipo_norma, numero, data_pub, origem, ementa, link_url])

    except ET.ParseError:
        print("❌ Failed to parse XML. The response might not be in the expected format.")
        print("Response Text:\n", response.text)
    except Exception as e:
        print(f"An error occurred: {e}")

    
    return infos

In [1]:
import os
import glob
from bs4 import BeautifulSoup

# Get all HTML files from the saudelegis_pages directory
html_files = glob.glob('/home/bdcdo/Desktop/dev/sabara-doencas-raras/v3-consolidacao/saudelegis_pages/*.html')
print(f"Found {len(html_files)} HTML files to process:")
for file in html_files:
    print(f"  - {os.path.basename(file)}")

# Initialize list to store all parsed information
all_infos = []

# Process each HTML file
for html_file in html_files:
    print(f"\nProcessing: {os.path.basename(html_file)}")
    
    # Read the HTML file
    with open(html_file, 'r', encoding='utf-8') as f:
        html_content = f.read()
    
    # Parse with BeautifulSoup
    soup = BeautifulSoup(html_content, 'html.parser')
    
    # Find the results table
    results_table = soup.find('table', id='form:grid')
    
    if not results_table:
        print(f"❌ Could not find the results table in {os.path.basename(html_file)}")
        continue
    
    # Find all table rows in the tbody
    tbody = results_table.find('tbody')
    if not tbody:
        print(f"❌ Could not find tbody in {os.path.basename(html_file)}")
        continue
        
    rows = tbody.find_all('tr')
    print(f"Found {len(rows)} results in this file")
    
    # Extract data from each row
    for i, row in enumerate(rows):
        cells = row.find_all('td')
        if len(cells) < 8:  # Ensure the row has enough columns
            continue
        
        num_ident = cells[0].text.strip()
        numero = cells[1].text.strip()
        origem = cells[2].text.strip()
        tipo_norma = cells[3].text.strip()
        data_pub = cells[4].text.strip()
        ementa = cells[5].text.strip()
        link_tag = cells[7].find('a', {'title': 'Texto Completo'})
        link_url = link_tag['href'] if link_tag else 'N/A'
        
        all_infos.append([tipo_norma, numero, data_pub, origem, ementa, link_url])

print(f"\n✅ Processing complete! Total records collected: {len(all_infos)}")
print(f"Sample records:")
for i, info in enumerate(all_infos[:3]):  # Show first 3 records
    print(f"  {i+1}. {info[0]} - {info[1]} ({info[2]})")

Found 5 HTML files to process:
  - search_results_page_1_20250724_230036.html
  - search_results_initial_20250724_230036.html
  - search_results_page_2_20250724_230036.html
  - search_results_initial_20250724_230308.html
  - search_results_page_3_20250724_230036.html

Processing: search_results_page_1_20250724_230036.html
Found 10 results in this file

Processing: search_results_initial_20250724_230036.html
Found 10 results in this file

Processing: search_results_page_2_20250724_230036.html
Found 10 results in this file

Processing: search_results_initial_20250724_230308.html
Found 9 results in this file

Processing: search_results_page_3_20250724_230036.html
Found 10 results in this file

✅ Processing complete! Total records collected: 49
Sample records:
  1. PRT - 514 (28/04/2023)
  2. PRT - 1240 (13/09/2023)
  3. PRT - 887 (18/07/2023)


In [4]:
import pandas as pd

df = pd.DataFrame(all_infos, columns=['tipo_norma', 'numero', 'data_pub', 'origem', 'ementa', 'link_url'])

In [7]:
df.to_csv('ms.csv')