In [5]:
from bs4 import BeautifulSoup
import os
import re

with open('raw_data/CRE-10-2024-10-09_EN.xml', 'r', encoding='utf-8') as file:
    content = file.read()
    soup = BeautifulSoup(content, 'xml')  # Parse the XML with lxml parser in BeautifulSoup

In [6]:
def find_relevant_chapters(soup):
    # Find all relevant chapters - those which contain debates
    relevant_chapters = []
    
    # Find all TL-CHAP tags where language (VL attribute) is 'EN'
    chapters = soup.find_all('CHAPTER')
    
    for chapter in chapters:
        chapter_number = chapter.get('NUMBER')
        title = chapter.find_all('TL-CHAP', {'VL': 'EN'})
        heading = title[0].get_text(strip=True)
        
        if "(debate)" in heading or "(continuation of debate)" in heading or "(topical debate)" in heading or "(short presentation)" in heading:
            relevant_chapters.append(chapter)
            #print(chapter_number)
            #print(heading)
            
            # Find subchapters
            next_sibling = chapter.find_next_sibling()
            while next_sibling and next_sibling.get('NUMBER') and chapter_number in next_sibling.get('NUMBER'):
                title_sibling = next_sibling.find_all('TL-CHAP', {'VL': 'EN'})
                heading_sibling = title_sibling[0].get_text(strip=True)
                #print(next_sibling.get('NUMBER'))
                #print(heading_sibling)
                relevant_chapters.append(next_sibling)
                next_sibling = next_sibling.find_next_sibling()

    return relevant_chapters

In [7]:
# Function to sanitize and shorten strings
def sanitize_and_shorten_string(s, max_length=100):
    sanitized = re.sub(r'[<>:"/\\|?*] ', '-', s)  # Sanitize
    sanitized = sanitized.replace("/","-")
    sanitized = sanitized.replace(" ","-")
    sanitized = sanitized.replace("&","-")
    return sanitized if len(sanitized) <= max_length else sanitized[:max_length] + "_LIMIT"


def processing_relevant_chapters(relevant_chapters):
    
    # Step 1: Find all INTERVENTION tags in the chapter
    for chapter in relevant_chapters:
        interventions = chapter.find_all('INTERVENTION')

        for intervention in interventions:
            # Filter speeches in English
            language = intervention.find('ORATEUR').find('LG')
            if language and language.get_text(strip=True) == 'EN':
                
                # Extract speaker information
                speaker = intervention.find('ORATEUR')
                if speaker:
                    # Get a name
                    speaker_name = speaker.get('LIB').split(" |")
                    if len (speaker_name) == 1:
                        speaker_name = speaker_name[0]
                    else:
                        speaker_name = speaker_name[0] + speaker_name[1]

                    speaker_name = sanitize_and_shorten_string(speaker_name)

                    # Get a party
                    if speaker_name == "President":
                        speaker_party = "President"
                    else:
                        speaker_party = speaker.get('PP')
                        if speaker_party == "NULL":
                            emphas_tag = intervention.find('EMPHAS', {'NAME': 'I'})
                            if emphas_tag:
                                speaker_party = emphas_tag.get_text(strip=True)

                    speaker_party = sanitize_and_shorten_string(speaker_party)

                    #print(speaker_name)
                    #print(speaker_party)
                    
                    # Extract speech text
                    paragraphs = intervention.find_all('PARA')
                    speech_text = " ".join([para.get_text(strip=True) for para in paragraphs])
                    speech_text = speech_text.split("–")
                    speech_text = " ".join(speech_text[1:]) if len(speech_text) > 1 else speech_text[0]
                    speech_text = speech_text.strip()

                    #print(speech_text)

                    # Create a file name 
                    chapter_heading = chapter.find('TL-CHAP', {'VL': 'EN'}).get_text(strip=True)
                    chapter_heading = sanitize_and_shorten_string(chapter_heading)
                    print(chapter_heading)
                    
                    chapter_date = chapter.find('TL-CHAP', {'VL': 'EN'}).get("VOD-END").split("T")[0]
                    #print(chapter_date)
                    
                    file_name = f"{speaker_party}_{speaker_name}.txt"
                    #print(file_name)

                    
                    # Create directory path if it doesn't exist
                    directory_path = f"speeches/{chapter_date}/{chapter_heading}"
                    os.makedirs(directory_path, exist_ok=True)

                    # Error handling for file creation
                    try:
                        with open(f"{directory_path}/{file_name}", 'w', encoding='utf-8') as f:
                            f.write(speech_text)
                        #print(f"File saved: {directory_path}/{file_name}")
                    except Exception as e:
                        print(f"Error writing file {file_name}: {e}")

                    #print()

In [8]:
processing_relevant_chapters(find_relevant_chapters(soup))

Presentation-of-the-programme-of-activities-of-the-Hungarian-Presidency-(debate)
Presentation-of-the-programme-of-activities-of-the-Hungarian-Presidency-(debate)
Presentation-of-the-programme-of-activities-of-the-Hungarian-Presidency-(debate)
Presentation-of-the-programme-of-activities-of-the-Hungarian-Presidency-(debate)
Presentation-of-the-programme-of-activities-of-the-Hungarian-Presidency-(debate)
Presentation-of-the-programme-of-activities-of-the-Hungarian-Presidency-(debate)
Presentation-of-the-programme-of-activities-of-the-Hungarian-Presidency-(debate)
Presentation-of-the-programme-of-activities-of-the-Hungarian-Presidency-(debate)
Presentation-of-the-programme-of-activities-of-the-Hungarian-Presidency-(debate)
Presentation-of-the-programme-of-activities-of-the-Hungarian-Presidency-(debate)
Presentation-of-the-programme-of-activities-of-the-Hungarian-Presidency-(debate)
Presentation-of-the-programme-of-activities-of-the-Hungarian-Presidency-(debate)
Presentation-of-the-programm