## Extraction of RMH 2018 texts from XML

In [50]:
# Root directory of the RMH corpus that will be read.
rmh_dir = "/work/helgasvala/data/raw/risamalheild"

# Root directory under which the extracted text files are written.
target_dir = "/work/inga/data/rmh"

In [51]:
from pathlib import Path

# Turn the configured locations into Path objects and echo them so the
# notebook output records which directories this run used.
rmh_dir = Path(rmh_dir)
print(rmh_dir)
target_dir = Path(target_dir)
print(target_dir)

# The corpus has to be present already; there is nothing to extract otherwise.
assert rmh_dir.exists(), f"RMH directory not found: {rmh_dir}"

# Create the output directory together with any missing parents; this is a
# no-op when the directory already exists.
target_dir.mkdir(parents=True, exist_ok=True)

/work/helgasvala/data/raw/risamalheild
/work/inga/data/rmh


In [52]:
from concurrent.futures import ProcessPoolExecutor
from tqdm import tqdm
from pathlib import Path
from typing import Sequence
from xml.etree import ElementTree as ET

def tei_read_file(path: Path) -> Sequence[str]:
    """Read one TEI file and return its sentences as lines of text.

    Hand-tailored to reading the RMH.

    Adjusted code from xml_tools.py from Róbert Kjaran <robert@kjaran.com>

    :param path: A pathlib.Path to the .tei file to read.
    :return: A list with one string per ``<s>`` element found inside a ``<p>``
        of the document body. Each string is the text of the sentence's
        direct child elements joined by single spaces and terminated by a
        newline. Child elements without text are skipped.
    """
    NS = {'a': 'http://www.tei-c.org/ns/1.0'}
    root = ET.parse(str(path)).getroot()
    sentences = []
    # Collect every paragraph anywhere below the body, regardless of how many
    # intermediate elements (e.g. divs) wrap it.
    for paragraph_node in root.findall('.//a:body//a:p', NS):
        for sentence_node in paragraph_node.findall('.//a:s', NS):
            # Only the token text is needed; attributes such as ``type`` are
            # deliberately not read so tokens lacking them do not fail.
            words = [token_node.text
                     for token_node in sentence_node.findall('./*')
                     if token_node.text is not None]
            sentences.append(' '.join(words).strip() + '\n')
    return sentences


def tei_read(paths: Sequence[Path], out_path: Path, *,
             max_workers: int = 14, chunksize: int = 200) -> bool:
    """Read a sequence of RMH TEI files and write their text to a single file.

    The files are parsed in parallel by a pool of worker processes. Results
    are written in the same order as ``paths`` and a tqdm progress bar is
    shown while they arrive.

    :param paths: A Sequence of pathlib.Path of .tei files to read.
    :param out_path: pathlib.Path to write the contents to. An existing file
        is overwritten.
    :param max_workers: Number of worker processes in the pool.
    :param chunksize: Number of paths handed to a worker at a time.
    :return: True if successful.
    """
    # Explicit UTF-8 so the output does not depend on the locale's default
    # encoding.
    with out_path.open('w', encoding='utf-8') as f_out:
        with ProcessPoolExecutor(max_workers=max_workers) as executor:
            results = tqdm(
                executor.map(tei_read_file, paths, chunksize=chunksize),
                total=len(paths))
            for result in results:
                # Each result is the list of newline-terminated sentences of
                # one input file.
                f_out.writelines(result)
    return True

In [69]:
# When choosing only from a few specific data sources
from glob import glob
import os
import random

# Every directory two levels below rmh_dir (<rmh_dir>/<group>/<source>/) is
# extracted into its own text file <target_dir>/<group>/<source>.txt.
# Both glob results are sorted so that the processing order, and therefore
# the line order inside each output file, is reproducible between runs.
dirsDepth2 = sorted(glob(f"{rmh_dir}/*/*/"))
for d2 in dirsDepth2:
    d2 = Path(d2)
    parent = d2.parent
    parent_base = parent.name
    name = d2.name
    if not d2.is_dir():
        continue

    # All XML files at any depth below this source directory.
    xml_files = sorted(glob(f"{d2}/**/*.xml", recursive=True))

    outdir = Path(target_dir) / parent_base
    outdir.mkdir(parents=True, exist_ok=True)
    rmh_txt = outdir / f'{name}.txt'

    tei_read([Path(tei_file) for tei_file in xml_files], rmh_txt)

100%|██████████| 23880/23880 [00:15<00:00, 1526.81it/s]
100%|██████████| 774/774 [00:04<00:00, 185.70it/s]
100%|██████████| 6092/6092 [00:29<00:00, 203.26it/s]
100%|██████████| 69991/69991 [05:31<00:00, 211.18it/s]
100%|██████████| 48556/48556 [04:52<00:00, 165.75it/s]
100%|██████████| 12292/12292 [01:14<00:00, 165.36it/s]
100%|██████████| 15/15 [00:00<00:00, 346.58it/s]
100%|██████████| 7314/7314 [00:33<00:00, 218.05it/s]
100%|██████████| 8444/8444 [00:36<00:00, 233.22it/s]
100%|██████████| 205849/205849 [20:37<00:00, 166.38it/s]
100%|██████████| 12450/12450 [01:18<00:00, 158.49it/s]
100%|██████████| 5377/5377 [00:34<00:00, 157.05it/s] 
100%|██████████| 39695/39695 [04:35<00:00, 144.15it/s]
100%|██████████| 1705/1705 [00:05<00:00, 288.86it/s]
100%|██████████| 2679/2679 [00:07<00:00, 362.76it/s] 
100%|██████████| 1027987/1027987 [1:35:09<00:00, 180.06it/s]
100%|██████████| 115/115 [01:01<00:00,  1.86it/s]
100%|██████████| 1003304/1003304 [1:27:24<00:00, 191.29it/s]
100%|██████████| 911