# ParIce + RMH + EN mono - map/pre-process
Í þessu reikniriti forvinnum við öll gögnin.

In [1]:
# --- English monolingual corpus -------------------------------------------
# Input directory (filtered data) and output directory (mapped data).
en_mono_dir = '/work/haukurpj/data/filtered/en-monolingual'
en_mono_target_dir = '/work/haukurpj/data/mapped/en-monolingual'

# --- Icelandic monolingual corpus (RMH / risamalheild) ---------------------
# Input directory (filtered data) and output directory (mapped data).
is_mono_dir = '/work/haukurpj/data/filtered/risamalheild'
is_mono_target_dir = '/work/haukurpj/data/mapped/risamalheild'

# --- ParIce parallel corpus ------------------------------------------------
# Input directory (filtered data) and output directory (mapped data).
parice_dir = '/work/haukurpj/data/filtered/Parice1.0'
parice_target_dir = '/work/haukurpj/data/mapped/Parice1.0'

# Upper bound on worker processes used when pre-processing a file.
THREADS = 14

In [2]:
import pathlib

# Convert the configured directory strings into Path objects so the rest of
# the notebook can use joinpath()/open() on them.
en_mono_dir = pathlib.Path(en_mono_dir)
en_mono_target_dir = pathlib.Path(en_mono_target_dir)
is_mono_dir = pathlib.Path(is_mono_dir)
is_mono_target_dir = pathlib.Path(is_mono_target_dir)
parice_dir = pathlib.Path(parice_dir)
parice_target_dir = pathlib.Path(parice_target_dir)

# Fail early, and with the offending path in the message, when an input
# directory is missing. An explicit raise is used instead of `assert` because
# asserts are stripped when Python runs with -O.
for _source_dir in (en_mono_dir, is_mono_dir, parice_dir):
    if not _source_dir.exists():
        raise FileNotFoundError(
            'Input directory does not exist: ' + str(_source_dir))

# Create the output directories. `parents=True` also creates missing
# ancestors and `exist_ok=True` makes re-running the cell a no-op.
for _target_dir in (en_mono_target_dir, is_mono_target_dir, parice_target_dir):
    _target_dir.mkdir(parents=True, exist_ok=True)

In [4]:
from frontend import bulk as b
from frontend import core as c
from frontend import definitions as d
from concurrent.futures import ProcessPoolExecutor
from tqdm import tqdm
import re

# Matches text introduced by one or more of: an http(s):// scheme, the
# literal "www", or an "<chars>@" prefix, followed by a run of URI characters.
URI = re.compile(
    r"((http(s)?:\/\/)|(www)|([-a-zA-Z0-9:%_\+.~#?&/=]+?@))+([-a-zA-Z0-9@:%_\+.~#?&/=]+)",
    flags=re.IGNORECASE,
)

# Matches a run of URI characters that ends in ".is" or ".com".
URI_SIMPLE = re.compile(
    r"([-a-zA-Z0-9@:%_\+.~#?&/=]+?)(\.is|\.com)",
    flags=re.IGNORECASE,
)

# Short aliases for the frontend.core steps used by preprocess_sent.
reg, tok, low = c.regexp, c.tokenize, c.lowercase_normalize

# Substitutions applied to the raw sentence before tokenization: both URI
# patterns are replaced by the `_uri_` placeholder, followed by the
# predefined SUB_EMPTY_BRACKETS rule.
first_reg = [
    dict(pattern=URI, repl='_uri_'),
    dict(pattern=URI_SIMPLE, repl='_uri_'),
    d.SUB_EMPTY_BRACKETS,
]

# Substitutions applied after tokenization and lowercasing; all of them are
# predefined rules from frontend.definitions.
second_reg = [
    d.SUB_PIPE,
    d.SUB_LT,
    d.SUB_GT,
    d.SUB_BRACKET_OPEN,
    d.SUB_BRACKET_CLOSE,
    d.SUB_FIX_PLACEHOLDERS,
]

def preprocess_sent(sent, lang, method):
    """Pre-process a single sentence and return it terminated by a newline.

    The sentence goes through four steps, in this order: the ``first_reg``
    substitutions, tokenization, lowercase normalization, and the
    ``second_reg`` substitutions.

    Args:
        sent: The sentence to process.
        lang: Language passed on to the tokenizer.
        method: Tokenization method passed on to the tokenizer.

    Returns:
        The processed sentence with ``'\\n'`` appended.
    """
    substituted = reg(sent, first_reg)
    tokenized = tok(substituted, lang, method=method)
    normalized = reg(low(tokenized), second_reg)
    return normalized + '\n'

def bulk_process_sent(p_in, p_out, function):
    """Apply ``function`` to every line of ``p_in`` and write the results to ``p_out``.

    Lines are distributed over a pool of ``THREADS`` worker processes in
    chunks of 6000; results are written in input order while a tqdm progress
    counter is shown.

    Args:
        p_in: Path of the text file to read, one sentence per line.
        p_out: Path of the file to write; it is created or truncated.
        function: Picklable callable taking one line and returning the string
            to write. Language and tokenization method are NOT derived from
            ``p_in`` here; they must already be bound in ``function``
            (e.g. with ``functools.partial``).
    """
    # Explicit UTF-8 keeps reading and writing independent of the locale's
    # default encoding.
    with p_in.open(encoding='utf-8') as f_in, \
            p_out.open('w', encoding='utf-8') as f_out:
        with ProcessPoolExecutor(max_workers=THREADS) as executor:
            # NOTE(review): Executor.map is documented to collect its input
            # iterables immediately, so the whole input file is read up
            # front; very large inputs may need batching.
            results = executor.map(function, f_in, chunksize=6000)
            f_out.writelines(tqdm(results))

In [5]:
from functools import partial

# The English monolingual corpus is currently disabled:
# bulk_process_sent(en_mono_dir.joinpath('mono.en'),
#                   en_mono_target_dir.joinpath('mono.en'),
#                   partial(preprocess_sent, method='moses', lang=c.Lang.EN))

# Tokenization method and language for each file suffix.
_settings_by_suffix = {
    'is': ('shallow', c.Lang.IS),
    'en': ('moses', c.Lang.EN),
}

# (source directory, target directory, file name), in processing order:
# first the Icelandic monolingual corpus, then every ParIce split with the
# Icelandic side before the English side.
_jobs = [(is_mono_dir, is_mono_target_dir, 'rmh.is')]
for _split in ('train', 'dev', 'test-ees', 'test-ema', 'test-opensubtitles'):
    for _suffix in ('is', 'en'):
        _jobs.append((parice_dir, parice_target_dir, _split + '.' + _suffix))

for _src_dir, _dst_dir, _name in _jobs:
    _method, _lang = _settings_by_suffix[_name.rsplit('.', 1)[1]]
    bulk_process_sent(_src_dir.joinpath(_name),
                      _dst_dir.joinpath(_name),
                      partial(preprocess_sent, method=_method, lang=_lang))

71137920it [32:24, 36582.69it/s]
3256945it [01:16, 42844.44it/s]
3256945it [01:57, 27678.60it/s]
2000it [00:00, 5472.98it/s]
2000it [00:00, 2899.21it/s]
1930it [00:00, 3375.47it/s]
1930it [00:00, 2411.62it/s]
1963it [00:00, 3910.51it/s]
1963it [00:00, 2621.60it/s]
2059it [00:00, 9533.33it/s]
2059it [00:00, 3354.13it/s]
