## Creation of a subset of RMH 2018 for punctuation training and testing

Select text only from sources with rather reliable punctuation and structure. The sources are morgunbladid, i.e. newspaper material, textasafn_arnastofnun, i.e. published books, and 'ras1_og_2', 'ruv', 'sjonvarpid', 'stod2', 'bylgjan', which are tv and radio material, together called "Ljósvakamidlar" on malheildir.arnastofnun.is.


In [1]:
# The location of RMH to read
rmh_dir = '/work/helgasvala/data/raw/risamalheild'

# The location of where to write the results
target_dir = '/work/inga/data/rmh_subset'

In [2]:
import pathlib

rmh_dir = pathlib.Path(rmh_dir)
print(rmh_dir)
target_dir = pathlib.Path(target_dir)
assert rmh_dir.exists()
if not target_dir.exists():
    target_dir.mkdir()

/work/helgasvala/data/raw/risamalheild


In [3]:
from concurrent.futures import ProcessPoolExecutor
from tqdm import tqdm
from pathlib import Path
from typing import Sequence
from xml.etree import ElementTree as ET

def tei_read_file(path: Path) -> Sequence[str]:
    """Reads a tei file to extract the contents. Hand-tailored to reading the RMH.

    Adjusted code from xml_tools.py from Róbert Kjaran <robert@kjaran.com>

    :param path: A pathlib.Path to the .tei file to read.\n
    :return: A List of sentences as defined in the .tei file.
    """
    NS = {'a': 'http://www.tei-c.org/ns/1.0'}
    root = ET.parse(str(path)).getroot()
    sentences = []
    # We gather all the paragraphs from the body, avoiding the divs
    for paragraph_node in root.findall('.//a:body//a:p', NS):
        for sentence_node in paragraph_node.findall('.//a:s', NS):
            tokens = [(token_node.text, token_node.attrib['type'])
                      for token_node in sentence_node.findall('./*')]
            sentence: List[str] = []
            for i, token in enumerate(tokens):
                if token[0] is None:
                    continue
                sentence.append(token[0] + ' ')
            sentences.append(''.join(sentence).strip() + '\n')
    return sentences


def tei_read(paths: Sequence[Path], out_path: Path) -> bool:
    """Reads a sequence of Path of TEI files from RMH and writes to a single file.
    Uses multiple threads.

    :param paths: A Sequence of pathlib.Path of .tei files to read.\n
    :param out_path: pathlib.Path to write the contents to.\n
    :return: True if successful.
    """
    with out_path.open('w+') as f_out:
        with ProcessPoolExecutor(max_workers=14) as executor:
            results = tqdm(executor.map(
                tei_read_file,
                paths,
                chunksize=200),
                total=len(paths))
            for result in results:
                f_out.write(''.join(result))
    return True

In [4]:
# When choosing only from a few specific data sources
from glob import glob
import os
import random

dir_names = ['morgunbladid', 'textasafn_arnastofnun' \
             'ras1_og_2', 'ruv', 'sjonvarpid', 'stod2', 'bylgjan']

for d in dir_names:
    p1 = os.path.join(os.path.join(rmh_dir, 'MIM'), d)
    p2 = os.path.join(os.path.join(rmh_dir, 'CC_BY'), d)
    if os.path.isdir(p1):
        xml_files = glob(f'{p1}/**/*.xml', recursive=True)
    elif os.path.isdir(p2):
        xml_files = glob(f'{p2}/**/*.xml', recursive=True)
    
    print(d, len(xml_files))
    rmh_txt = Path(target_dir).joinpath(f'rmh_{d}.txt')
    tei_read([Path(tei_file) for tei_file in xml_files], rmh_txt)

textasafn_arnastofnun 115


100%|██████████| 115/115 [00:26<00:00,  4.33it/s]


kjarninn 9110


100%|██████████| 9110/9110 [00:17<00:00, 521.74it/s] 


ras1_og_2 143232


100%|██████████| 143232/143232 [01:40<00:00, 1422.46it/s]


ruv 259570


100%|██████████| 259570/259570 [05:30<00:00, 784.26it/s] 


sjonvarpid 75786


100%|██████████| 75786/75786 [01:05<00:00, 1160.12it/s]


stod2 58189


100%|██████████| 58189/58189 [01:26<00:00, 676.55it/s] 


bylgjan 57723


100%|██████████| 57723/57723 [01:38<00:00, 584.67it/s] 
