# Einhliða málheild fyrir íslensku - RMH
Hér er unnið með RMH. Gangasafnið lesið og sett í staðlað form. Fyrir RMH lesum við öll tei-XML skjölin úr RMH1 og RMH2 og sameinum í eitt skjal.

Gert er ráð fyrir að eftirfarandi gögn séu til staðar
- RMH eins og hún er sótt af málföng

Eftir að hafa keyrt reikniritið verður `target_dir` eftirfarandi
- `target_dir/rmh.is`, ein skrá þar sem er búið að sameina RMH í eitt stórt skjal.

In [1]:
# The location of RMH to read
rmh_dir = '/work/haukurpj/data/raw/risamalheild'

# The location of where to write the results
target_dir = '/work/haukurpj/data/formatted/risamalheild'

from glob import glob
from pprint import pprint

In [2]:
import pathlib

rmh_dir = pathlib.Path(rmh_dir)
target_dir = pathlib.Path(target_dir)
assert rmh_dir.exists()
if not target_dir.exists():
    target_dir.mkdir()

In [3]:
xml_files = glob(f'{rmh_dir}/**/*.xml', recursive=True)
print(len(xml_files))

4152480


In [4]:
tei_files = []
for xml_file in xml_files:
    if 'rmh2Hdr.xml' in xml_file:
        continue
    elif 'rmh1Hdr.xml' in xml_file:
        continue
    else:
        tei_files.append(xml_file)

In [7]:
from concurrent.futures import ProcessPoolExecutor
from tqdm import tqdm
from pathlib import Path
from typing import Sequence
from xml.etree import ElementTree as ET

def tei_read_file(path: Path) -> Sequence[str]:
    """Reads a tei file to extract the contents. Hand-tailored to reading the RMH.

    Adjusted code from xml_tools.py from Róbert Kjaran <robert@kjaran.com>

    :param path: A pathlib.Path to the .tei file to read.\n
    :return: A List of sentences as defined in the .tei file.
    """
    NS = {'a': 'http://www.tei-c.org/ns/1.0'}
    root = ET.parse(str(path)).getroot()
    sentences = []
    # We gather all the paragraphs from the body, avoiding the divs
    for paragraph_node in root.findall('.//a:body//a:p', NS):
        for sentence_node in paragraph_node.findall('.//a:s', NS):
            tokens = [(token_node.text, token_node.attrib['type'])
                      for token_node in sentence_node.findall('./*')]
            sentence: List[str] = []
            for i, token in enumerate(tokens):
                if token[0] is None:
                    continue
                sentence.append(token[0] + ' ')
            sentences.append(''.join(sentence).strip() + '\n')
    return sentences


def tei_read(paths: Sequence[Path], out_path: Path) -> bool:
    """Reads a sequence of Path of TEI files from RMH and writes to a single file.
    Uses multiple threads.

    :param paths: A Sequence of pathlib.Path of .tei files to read.\n
    :param out_path: pathlib.Path to write the contents to.\n
    :return: True if successful.
    """
    with out_path.open('w+') as f_out:
        with ProcessPoolExecutor(max_workers=14) as executor:
            results = tqdm(executor.map(
                tei_read_file,
                paths,
                chunksize=200),
                total=len(paths))
            for result in results:
                f_out.write(''.join(result))
    return True

In [8]:
rmh_txt = Path(target_dir).joinpath('rmh.is')
tei_read([Path(xml_file) for xml_file in xml_files], rmh_txt)

100%|██████████| 4152480/4152480 [10:27<00:00, 6613.68it/s] 


True

In [10]:
!ls -l {target_dir}/rmh.is

-rw-r--r-- 1 haukurpj local-staff 8764458500 Jan 20 16:24 /work/haukurpj/data/formatted/risamalheild/rmh.is
