In [9]:
import xml.etree.ElementTree as ET
import os
import glob

In [28]:
def extract_sentences_from_xml(file_path):
    """Return the text of every TEI ``<seg type="original">`` element in a file.

    Args:
        file_path: Path (or file object) of the XML document, as accepted by
            ``xml.etree.ElementTree.parse``.

    Returns:
        list[str]: One stripped string per matching segment, in document
        order. Segments that contain no text at all are skipped.

    Raises:
        xml.etree.ElementTree.ParseError: If the file is not well-formed XML.
        OSError: If the file cannot be opened.
    """
    # Elements are looked up in the TEI namespace.
    ns = {'tei': 'http://www.tei-c.org/ns/1.0'}

    root = ET.parse(file_path).getroot()

    sentences = []
    for seg in root.iterfind('.//tei:seg[@type="original"]', ns):
        # itertext() also collects text that sits inside or after inline child
        # elements; seg.text alone stops at the first child and would
        # truncate (or entirely miss) such a sentence.
        text = ''.join(seg.itertext())
        if text:
            sentences.append(text.strip())

    return sentences

def process_files(directory, dutch_output_file, english_output_file, id_length=13):
    """Append the sentences of every matching English/Dutch file pair in a folder.

    An ``*-en-tei.xml`` file and an ``*-nl-tei.xml`` file form a pair when the
    first ``id_length`` characters of their base names are equal. For each pair
    the sentences of both files are extracted with
    ``extract_sentences_from_xml`` and appended, one per line, to the outputs.

    NOTE: the parameter names are swapped relative to what is written.
    Sentences from the ``-en-`` file go to ``dutch_output_file`` (the 2nd
    positional argument) and sentences from the ``-nl-`` file go to
    ``english_output_file`` (the 3rd). The names are kept so existing callers
    keep working; pass the English path second and the Dutch path third.

    Args:
        directory: Folder that is searched (non-recursively) for TEI files.
        dutch_output_file: Path that receives the sentences of the ``-en-`` files.
        english_output_file: Path that receives the sentences of the ``-nl-`` files.
        id_length: Number of leading file-name characters used as the pairing
            key. NOTE(review): a fixed-length prefix pairs any two files that
            share those characters; confirm 13 matches the corpus' ID length.

    Returns:
        None. The output files are opened in append mode, and only when at
        least one pair was found.
    """
    # glob returns results in arbitrary order; sorting makes the output
    # reproducible from run to run.
    en_files = sorted(glob.glob(os.path.join(directory, '*-en-tei.xml')))
    nl_files = sorted(glob.glob(os.path.join(directory, '*-nl-tei.xml')))

    # '*-en-tei.xml' also matches '*-nl-en-tei.xml', the pattern that
    # clean_directory deletes as unwanted. Skip those files here as well so an
    # uncleaned folder does not pair them with a Dutch file.
    en_files = [path for path in en_files if not path.endswith('-nl-en-tei.xml')]

    # Index the Dutch files by ID so each English file is matched with a
    # lookup instead of a scan over every Dutch file.
    nl_by_id = {}
    for nl_file in nl_files:
        nl_by_id.setdefault(os.path.basename(nl_file)[:id_length], []).append(nl_file)

    pairs = [
        (en_file, nl_file)
        for en_file in en_files
        for nl_file in nl_by_id.get(os.path.basename(en_file)[:id_length], [])
    ]
    if not pairs:
        return

    with open(dutch_output_file, 'a', encoding='utf-8') as en_sink, \
            open(english_output_file, 'a', encoding='utf-8') as nl_sink:
        for en_file, nl_file in pairs:
            # Parse both sides before writing either one, so a file that
            # fails to parse cannot leave one output ahead of the other.
            en_sentences = extract_sentences_from_xml(en_file)
            nl_sentences = extract_sentences_from_xml(nl_file)
            en_sink.writelines(sentence + '\n' for sentence in en_sentences)
            nl_sink.writelines(sentence + '\n' for sentence in nl_sentences)
    

In [11]:
def clean_directory(directory):
    """Delete every file in ``directory`` that is not a per-language TEI file.

    Files matching ``*en-tei.xml`` or ``*nl-tei.xml`` are kept, except those
    matching ``*-nl-en-tei.xml``, which are deleted even though the first keep
    pattern also matches them. Everything else returned by ``glob('*')`` is
    deleted. Sub-directories are left in place, and names starting with a dot
    are never matched by ``*`` and are therefore untouched.

    This is destructive and cannot be undone. A ``Removed: <path>`` line is
    printed for each deleted file.

    Args:
        directory: Folder to clean (non-recursive).

    Raises:
        OSError: If a file cannot be deleted.
    """
    # Define the patterns to keep and remove
    patterns_to_keep = ['*en-tei.xml', '*nl-tei.xml']
    pattern_to_remove = '*-nl-en-tei.xml'

    # Everything matched by a keep pattern
    files_to_keep = set()
    for pattern in patterns_to_keep:
        files_to_keep.update(glob.glob(os.path.join(directory, pattern)))

    # Everything in the directory (files and sub-directories alike)
    all_entries = glob.glob(os.path.join(directory, '*'))

    # The remove pattern wins over the keep patterns; anything not kept goes too
    files_to_remove = set(glob.glob(os.path.join(directory, pattern_to_remove)))
    files_to_remove.update(set(all_entries) - files_to_keep)

    # Sorted so the deletions (and the printed log) are in a stable order
    for file_path in sorted(files_to_remove):
        # os.remove() raises on a directory, which would abort the cleanup
        # half-way through; real directories are skipped instead.
        if os.path.isdir(file_path) and not os.path.islink(file_path):
            continue
        os.remove(file_path)
        print(f'Removed: {file_path}')


In [30]:
# Root folder that holds one sub-folder per corpus section
base_path = 'core'

# Reference list of sub-folder codes. It is not used by the loop below, which
# processes whatever sub-folders actually exist under base_path.
directories = [
    "aby", "arc", "bal", "bco", "bek", "bev", "bmm", "bos", "cam", "dns",
    "eli", "erp", "eup", "fda", "fsz", "gaz", "gim", "goh", "gra", "gru",
    "ibm", "ind", "ing", "jus", "kam", "kok", "kon", "lan", "lux", "med",
    "mel", "mis", "mok", "mvg", "nmb", "nmr", "ons", "pnl", "pos", "qty",
    "riz", "rou", "sta", "svb", "vfl", "vhs", "vla", "wat", "wst"
]


# Define output files
english_output_file = 'DPC/english_sentences.txt'
dutch_output_file = 'DPC/dutch_sentences.txt'


# Ensure the output directory exists
os.makedirs(os.path.dirname(english_output_file), exist_ok=True)
os.makedirs(os.path.dirname(dutch_output_file), exist_ok=True)

# Truncate both output files: process_files only ever appends, so re-running
# this cell would otherwise duplicate every sentence.
for output_file in (english_output_file, dutch_output_file):
    with open(output_file, 'w', encoding='utf-8'):
        pass

# Process each sub-folder in a stable (sorted) order so the output files are
# reproducible; os.listdir returns entries in arbitrary order.
for folder in sorted(os.listdir(base_path)):
    directory = os.path.join(base_path, folder)

    # Skip stray files sitting next to the corpus folders
    if not os.path.isdir(directory):
        continue

    # The argument order is intentional: process_files names its 2nd parameter
    # `dutch_output_file` but writes the English sentences to it (and the
    # Dutch ones to the 3rd), so passing (english, dutch) positionally puts
    # each language in the correctly named file.
    process_files(directory, english_output_file, dutch_output_file)
