# Bundestag XML Parser

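This notebook fetches a Bundestag plenary protocol XML file, parses it into a Python dictionary using the `xmltodict` library, and provides helper functions to select speeches by parliamentary group (Fraktion) and to extract each speech's text and speaker information.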


In [None]:
#!pip install xmltodict

In [14]:
# Debugging aid: prints the kernel's current working directory via the shell.
# NOTE(review): the recorded output of this cell exposes an absolute local path;
# consider clearing the output or removing the cell before sharing the notebook.
!pwd

/Users/acb/code/project


In [5]:
# import requests
import xmltodict
# from langchain_google_genai import GoogleGenerativeAIEmbeddings
# from langchain_chroma import Chroma
# from langchain_text_splitters import RecursiveCharacterTextSplitter


In [None]:
# from dotenv import load_dotenv
# load_dotenv() 

## Functions


In [2]:
def fetch_and_parse_xml(url: str, timeout: float = 30.0) -> dict:
    """
    Fetch XML from URL and parse it into a dictionary using xmltodict.

    Args:
        url: URL of the XML file
        timeout: Seconds to wait for the server before giving up. Forwarded to
            ``requests.get``; without it a stalled connection would block the
            notebook kernel indefinitely.
    Returns:
        Dictionary representation of the XML
    Raises:
        requests.HTTPError: If the server responds with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    # Imported locally because the notebook's import cell has `import requests`
    # commented out; relying on it makes this function fail with NameError on a
    # fresh kernel (Restart & Run All).
    import requests

    print(f"Fetching XML from {url}...")
    response = requests.get(url, timeout=timeout)
    # Fail loudly on HTTP errors instead of trying to parse an error page as XML.
    response.raise_for_status()

    print("Parsing XML with xmltodict...")
    # response.content is the undecoded response body (bytes).
    return xmltodict.parse(response.content)

In [3]:
def get_speeches_by_fraktion(xml_data: dict, fraktion: str) -> list:
    """
    Collect every speech element in which a speaker of the given party appears.

    The parsed document is walked depth-first. A dictionary counts as a speech
    candidate when its '@id' attribute starts with 'ID' and it has a 'p' entry;
    it is selected when any of its paragraphs carries a 'redner' whose 'name'
    dictionary has a 'fraktion' equal to ``fraktion``.

    Args:
        xml_data: The parsed XML dictionary
        fraktion: The party name (e.g., "AfD", "SPD", "CDU/CSU", "BÜNDNIS 90/DIE GRÜNEN", "Die Linke")
    Returns:
        List of matching speech dictionaries, in document (pre-order) order.
        Each speech appears at most once per position in the document.
    """

    def names_matching_speaker(paragraph) -> bool:
        """Tell whether a paragraph introduces a speaker of the wanted party."""
        if not isinstance(paragraph, dict) or 'redner' not in paragraph:
            return False
        speaker = paragraph['redner']
        if not isinstance(speaker, dict) or 'name' not in speaker:
            return False
        name = speaker['name']
        return isinstance(name, dict) and name.get('fraktion') == fraktion

    matches = []
    # Explicit stack instead of recursion; children are pushed in reverse so
    # they are popped (and therefore visited) in their original order.
    pending = [xml_data]
    while pending:
        node = pending.pop()

        if isinstance(node, list):
            pending.extend(reversed(node))
            continue
        if not isinstance(node, dict):
            continue

        if '@id' in node and node['@id'].startswith('ID') and 'p' in node:
            content = node['p']
            # A single <p> is not wrapped in a list, so normalise it.
            paragraphs = content if isinstance(content, list) else [content]
            if any(names_matching_speaker(p) for p in paragraphs):
                matches.append(node)

        # Keep descending even below a matched speech.
        pending.extend(reversed(list(node.values())))

    return matches


def extract_speech_text(speech_dict: dict) -> str:
    """
    Extract ONLY the actual speech text content from a speech's paragraphs.

    Speaker-introduction paragraphs (those with a 'redner' key) are skipped, as
    are paragraphs without any text. This is what gets embedded - metadata is
    stored separately for filtering.

    Args:
        speech_dict: Dictionary containing speech data; its 'p' entry may be a
            single paragraph or a list of paragraphs.
    Returns:
        The stripped paragraph texts joined by single spaces, or '' when the
        speech has no 'p' entry or no textual paragraphs.
    """
    if 'p' not in speech_dict:
        return ''

    content = speech_dict['p']
    # A single <p> is not wrapped in a list, so normalise it.
    paragraphs = content if isinstance(content, list) else [content]

    text_parts = []
    for para in paragraphs:
        if isinstance(para, str):
            # xmltodict returns an attribute-less <p>text</p> as a bare string;
            # it is still speech text and must not be dropped.
            text = para
        elif isinstance(para, dict):
            # Skip paragraphs that contain speaker info (redner key)
            if 'redner' in para:
                continue
            text = para.get('#text')
        else:
            # e.g. None for an empty <p/> element
            continue

        # Guard against a missing or non-string '#text' instead of crashing.
        if not isinstance(text, str):
            continue
        text = text.strip()
        if text:
            text_parts.append(text)

    # Join all text parts with spaces
    return ' '.join(text_parts)


def get_speaker_info(speech_dict: dict) -> dict:
    """
    Extract speaker information from a speech.

    The first paragraph that carries a 'redner' entry with a dictionary-valued
    'name' determines the result; later speaker paragraphs are ignored.

    Args:
        speech_dict: Dictionary containing speech data; its 'p' entry may be a
            single paragraph or a list of paragraphs.

    Returns:
        Dictionary with the keys 'titel', 'vorname', 'nachname' and 'fraktion'.
        Missing or empty fields are returned as ''. An empty dictionary is
        returned when no usable speaker paragraph is found.
    """
    if 'p' not in speech_dict:
        return {}

    content = speech_dict['p']
    # A single <p> is not wrapped in a list, so normalise it.
    paragraphs = content if isinstance(content, list) else [content]

    for para in paragraphs:
        if not (isinstance(para, dict) and 'redner' in para):
            continue
        redner = para['redner']
        if not (isinstance(redner, dict) and 'name' in redner):
            continue
        name_info = redner['name']
        # Same guard as get_speeches_by_fraktion: a text-only <name> is parsed
        # as a plain string and has no fields to read.
        if not isinstance(name_info, dict):
            continue
        # `or ''` maps None (an empty child element) to the documented default.
        return {
            field: name_info.get(field) or ''
            for field in ('titel', 'vorname', 'nachname', 'fraktion')
        }
    return {}

                    

## Parse XML


In [6]:
# Location of the Bundestag plenary-protocol XML file to analyse.
# NOTE(review): "21042" presumably encodes electoral term 21, sitting 42 — confirm.
url = "https://www.bundestag.de/resource/blob/1127848/21042.xml"

# Download the document and parse it into a nested dictionary; `xml_data` is the
# input for the speech helpers used in the cells below.
xml_data = fetch_and_parse_xml(url)


Fetching XML from https://www.bundestag.de/resource/blob/1127848/21042.xml...
Parsing XML with xmltodict...


## Get Speeches by Party (Fraktion)


In [16]:
# Pick one sample speech of the "Die Linke" group to try the helpers on.
# NOTE(review): the hard-coded index 2 raises IndexError if fewer than three
# speeches match; it presumably just selects an arbitrary example.
dielinke_sample_speech = get_speeches_by_fraktion(xml_data, "Die Linke")[2]
# NOTE(review): the next line's 500-character preview is computed but never shown,
# because a notebook cell only displays the value of its last expression.
extract_speech_text(dielinke_sample_speech)[:500]
# Displayed cell result: the party recorded for the sample speech's speaker.
get_speaker_info(dielinke_sample_speech)['fraktion']


'Die Linke'