# Data preparation
Í þessu reikniriti er tekið við gögnum eftir fyrstu samhæfinu og unnin frekar.

In [3]:
from collections import defaultdict, Counter, OrderedDict
import os
import pathlib
from pathlib import Path
import re
from pprint import pprint
import importlib
from typing import List, Sequence

import matplotlib.pyplot as plt
import numpy as np

import frontend.core as c
import frontend.bulk as b
import frontend.definitions as d

# Worker count stored on the frontend core module; the bulk_* helpers below
# also pass it explicitly to b.in_parallel.
c.THREADS = 6

# Root of the working tree. Defaults to the original location; set the
# PARICE_WORK_DIR environment variable to run the notebook on another machine
# without editing this cell (an unset or empty variable keeps the default).
working_dir = pathlib.Path(os.environ.get('PARICE_WORK_DIR') or '/work/haukurpj')
data_dir = working_dir / 'data'
processing_dir = working_dir / 'process'
p = processing_dir  # short alias used in the b.read / b.write calls
parice_dir = data_dir / 'parice'
rmh_dir = data_dir / 'risamalheild'
train_dir = p / 'train'
test_dir = p / 'test'
dev_dir = p / 'dev'

# Language identifiers taken from the frontend core module.
IS = c.Lang.IS
EN = c.Lang.EN

# Corpus names passed as path components to b.read / b.write.
RMH, PARICE = 'rmh', 'parice'
EES, EMA, OPENSUB = 'ees', 'ema', 'opensubtitles'
# NOTE(review): TRAIN is 'train-dev', not 'train' - confirm this is intended.
TRAIN, DEV, TEST = 'train-dev', 'dev', 'test'

langs = [IS, EN]
splits = [TRAIN, DEV, TEST]

# Processing-stage labels, also passed as path components to b.read / b.write.
SENT_FIX = 'sent_fix'
PROCESSED = 'processed'
LOWER = 'lower'

FINAL = 'final'

In [17]:
from functools import partial

# Matches one or more leading "http://", "https://", "www" or "<chars>@"
# prefixes followed by a run of URL-safe characters (case-insensitive).
URI = re.compile(
    r"((http(s)?:\/\/)|(www)|([-a-zA-Z0-9:%_\+.~#?&/=]+?@))+([-a-zA-Z0-9@:%_\+.~#?&/=]+)",
    flags=re.IGNORECASE,
)

# Matches a run of URL-safe characters ending in ".is" or ".com"
# (case-insensitive); catches bare host names without a scheme.
URI_SIMPLE = re.compile(
    r"([-a-zA-Z0-9@:%_\+.~#?&/=]+?)(\.is|\.com)",
    flags=re.IGNORECASE,
)

def preprocess_sent(sent, lang, method):
    """Clean, tokenize and lowercase one sentence, then apply the d.SUB_* rules.

    Steps, in order:
      1. ``c.regexp`` with: both URI patterns replaced by ``_uri_``,
         ``d.SUB_EMPTY_BRACKETS``, and three rewrites that insert periods and
         spaces around numbers followed by certain Icelandic words or
         abbreviations (mgr, gr, skv, month names, ...).
      2. ``c.tokenize(sent, lang, method=method)``.
      3. ``c.lowercase_normalize``.
      4. ``c.regexp`` with the pipe / lt / gt / bracket substitutions and
         ``d.SUB_FIX_PLACEHOLDERS``.

    Args:
        sent: The sentence to process.
        lang: Language identifier forwarded to ``c.tokenize``.
        method: Tokenization method forwarded to ``c.tokenize``.

    Returns:
        The result of the final ``c.regexp`` call.
    """
    # "<number><word>" -> "<number>. <word>".
    # NOTE(review): the "." inside "(.\d+)?" is unescaped, so it matches any
    # character, not only a literal period - confirm this is intended.
    number_then_word = re.compile(r"(\d+(.\d+)?)(mgr|gr|skv|og|eða|til|með|janúar|febrúar|mars|apríl|maí|júní|júlí|ágúst|september|október|nóvember|desember)", re.IGNORECASE)
    # "skv" followed by any character other than "." -> "skv."
    # (case-sensitive, and not anchored to word boundaries).
    skv_without_period = re.compile(r"(skv)(?=[^.])")
    # "<number>. mgr|gr" not followed by "." -> "<number>. mgr. " / "<number>. gr. "
    numbered_abbreviation = re.compile(r"(\d+(.\d+)?\. )(mgr|gr)(?=[^.])", re.IGNORECASE)

    before_tokenizing = [
        dict(pattern=URI, repl='_uri_'),
        dict(pattern=URI_SIMPLE, repl='_uri_'),
        d.SUB_EMPTY_BRACKETS,
        dict(pattern=number_then_word, repl=r"\1. \3"),
        dict(pattern=skv_without_period, repl=r"\1."),
        dict(pattern=numbered_abbreviation, repl=r"\1\3. "),
    ]
    cleaned = c.regexp(sent, before_tokenizing)
    tokenized = c.tokenize(cleaned, lang, method=method)
    lowered = c.lowercase_normalize(tokenized)

    after_tokenizing = [
        d.SUB_PIPE,
        d.SUB_LT,
        d.SUB_GT,
        d.SUB_BRACKET_OPEN,
        d.SUB_BRACKET_CLOSE,
        d.SUB_FIX_PLACEHOLDERS,
    ]
    return c.regexp(lowered, after_tokenizing)

def bulk_preprocess_sent(p_in, p_out):
    """Apply ``preprocess_sent`` to ``p_in`` through ``b.in_parallel``, writing ``p_out``.

    The language comes from ``b._lang(p_in)``; Icelandic input is tokenized
    with the "shallow" method and every other language with "moses".

    NOTE(review): a later cell defines ``bulk_preprocess_sent`` again on top of
    ``simple_preprocess_sent``; whichever definition was executed last is the
    one in effect.
    """
    lang = b._lang(p_in)
    method = "shallow" if lang == IS else "moses"
    worker = partial(preprocess_sent, lang=lang, method=method)
    b.in_parallel(p_in, p_out, c.THREADS, worker, chunksize=10000)

In [6]:
from functools import partial

def simple_preprocess_sent(sent, lang, method):
    """Tokenize and lowercase one sentence, then apply the d.SUB_* rules.

    Unlike ``preprocess_sent`` this performs no regexp clean-up before
    tokenizing and does not apply ``d.SUB_FIX_PLACEHOLDERS`` afterwards.

    Args:
        sent: The sentence to process.
        lang: Language identifier forwarded to ``c.tokenize``.
        method: Tokenization method forwarded to ``c.tokenize``.

    Returns:
        The result of the final ``c.regexp`` call.
    """
    tokenized = c.tokenize(sent, lang, method=method)
    lowered = c.lowercase_normalize(tokenized)
    substitutions = [
        d.SUB_PIPE,
        d.SUB_LT,
        d.SUB_GT,
        d.SUB_BRACKET_OPEN,
        d.SUB_BRACKET_CLOSE,
    ]
    return c.regexp(lowered, substitutions)

def lowercase_sent(sent, lang):
    """Return ``sent`` passed through ``c.lowercase_normalize``.

    ``lang`` is accepted but not used by this function.
    """
    lowered = c.lowercase_normalize(sent)
    return lowered

def bulk_lower_sent(p_in, p_out):
    """Apply ``lowercase_sent`` to ``p_in`` through ``b.in_parallel``, writing ``p_out``.

    No tokenization happens here, so no tokenizer method is selected; the
    language from ``b._lang(p_in)`` is only forwarded to ``lowercase_sent``.

    Args:
        p_in: Input path, as produced by ``b.read``.
        p_out: Output path, as produced by ``b.write``.
    """
    lang = b._lang(p_in)
    b.in_parallel(p_in,
                  p_out,
                  c.THREADS,
                  partial(lowercase_sent, lang=lang),
                  chunksize=10000)
    
def bulk_preprocess_sent(p_in, p_out):
    """Apply ``simple_preprocess_sent`` to ``p_in`` through ``b.in_parallel``, writing ``p_out``.

    The language comes from ``b._lang(p_in)``; Icelandic input is tokenized
    with the "shallow" method and every other language with "moses".

    NOTE(review): an earlier cell defines a function with this same name on
    top of ``preprocess_sent``; running this cell afterwards replaces it.
    """
    lang = b._lang(p_in)
    method = "shallow" if lang == IS else "moses"
    worker = partial(simple_preprocess_sent, lang=lang, method=method)
    b.in_parallel(p_in, p_out, c.THREADS, worker, chunksize=10000)

In [12]:
# Preprocess the EES, EMA and OpenSubtitles files for both languages.
# Call order matches the original cell: per corpus, Icelandic first, then English.
for corpus in (EES, EMA, OPENSUB):
    for corpus_lang in (IS, EN):
        bulk_preprocess_sent(b.read(working_dir, corpus_lang, corpus),
                             b.write(working_dir, corpus_lang, corpus, PROCESSED))

# Disabled runs, kept for reference.
# NOTE(review): the last two reference TOK_LOW, which no visible cell defines.
#bulk_preprocess_sent(b.read(train_dir, IS, TRAIN), b.write(train_dir, IS, PROCESSED))
#bulk_preprocess_sent(b.read(train_dir, EN, TRAIN), b.write(train_dir, EN, PROCESSED))
#bulk_preprocess_sent(b.read(dev_dir, IS, DEV), b.write(dev_dir, IS, PROCESSED))
#bulk_preprocess_sent(b.read(dev_dir, EN, DEV), b.write(dev_dir, EN, PROCESSED))
#bulk_preprocess_sent(b.read(p, IS, RMH, SENT_FIX), b.write(p, IS, RMH, FINAL, TOK_LOW))
#bulk_preprocess_sent(b.read(p, EN, 'mono'), b.write(p, EN, 'mono', FINAL, TOK_LOW))

100%|██████████| 1930/1930 [00:00<00:00, 3650.37it/s]
100%|██████████| 1930/1930 [00:00<00:00, 2653.68it/s]
100%|██████████| 1963/1963 [00:00<00:00, 5399.93it/s]
100%|██████████| 1963/1963 [00:00<00:00, 2896.50it/s]
100%|██████████| 2059/2059 [00:00<00:00, 11307.56it/s]
100%|██████████| 2059/2059 [00:00<00:00, 3219.93it/s]


Create the lowercased reference files for the test sets - the ground truth we will compare against.

In [10]:
# Write lowercase-only copies of the EES, EMA and OpenSubtitles files for both
# languages. OPENSUB is taken from the configuration cell rather than being
# re-assigned here, so the corpus names have a single definition.
# Call order matches the original cell: per corpus, Icelandic first, then English.
for corpus in (EES, EMA, OPENSUB):
    for corpus_lang in (IS, EN):
        bulk_lower_sent(b.read(working_dir, corpus_lang, corpus),
                        b.write(working_dir, corpus_lang, corpus, LOWER))

100%|██████████| 1930/1930 [00:00<00:00, 106276.92it/s]
100%|██████████| 1930/1930 [00:00<00:00, 134847.11it/s]
100%|██████████| 1963/1963 [00:00<00:00, 120394.50it/s]
100%|██████████| 1963/1963 [00:00<00:00, 520312.11it/s]
100%|██████████| 2059/2059 [00:00<00:00, 215955.79it/s]
100%|██████████| 2059/2059 [00:00<00:00, 246477.31it/s]


We are now done processing the dev (validation), test, EN mono and RMH datasets.

In [6]:
# Show the first lines of the file that b.read resolves for (EN, 'mono', FINAL, TOK_LOW).
# NOTE(review): TOK_LOW is not defined in any cell visible here, so on a fresh
# kernel the {...} interpolation cannot be evaluated as intended - confirm
# where TOK_LOW is meant to come from.
!head {b.read(p, EN, 'mono', FINAL, TOK_LOW)}

resumption of the session
i declare resumed the session of the european parliament adjourned on friday , 15 december 2000 .
statements by the president
ladies and gentlemen , on saturday , as you know , an earthquake struck central america once again , with tragic consequences .
this is an area which has already been seriously affected on a number of occasions since the beginning of the twentieth century .
the latest , provisional , figures for victims in el salvador are already very high .
there are 350 people dead , 1 200 people missing , the area is completely devastated and thousands of homes have been destroyed throughout the country .
the european union has already shown its solidarity by sending a rescue team to the area , whilst financial assistance from the union and member states has been , or is in the process of being , released and i am able to inform you that some groups in the european parliament have requested that this issue be included in the debate on topical 

## Fjarlægja slæmar línur
Núna búum við til safn af "íslenskum" orðum frá RMH og förum yfir íslenskar setningar í ParIce og athugum hversu stórt hlutfall af orðunum í ParIce-IS eru í safninu okkar.

In [30]:
# Token counts for the processed Icelandic ParIce training file, taken before
# any lines are dropped. The printed number is the count of distinct entries;
# it is compared with is_counter_2 after filtering.
is_counter_1 = b.token_counter(b.read(p, IS, PARICE, TRAIN, PROCESSED))
print(len(is_counter_1))

655511


In [12]:
# Build the reference vocabulary of "Icelandic" words: every distinct key that
# b.token_counter reports for the processed RMH file.
rmh_counter = b.token_counter(b.read(p, IS, RMH, PROCESSED))
is_words = set(rmh_counter.keys())
# Bare expression on purpose: the vocabulary size is shown as the cell output.
len(is_words)

6073000

In [14]:
# Setting the chunksize higher is better here.
# This overwrites the module-level b.CHUNKSIZE for the rest of the kernel
# session; the value is printed before and after the change.
print(b.CHUNKSIZE)
b.CHUNKSIZE = 50000
print(b.CHUNKSIZE)

4000
50000


In [25]:
# Find the lines of the processed Icelandic ParIce training file to drop.
# Inputs: the first element of three c.REGEXP_SUB entries (the 'CRYLLIC' key is
# spelled as the project defines it) and the RMH vocabulary built above.
# NOTE(review): keep_ratio / normalize / keep_sent_length are interpreted by
# b.get_drop_lines; their exact semantics are not visible here.
skip_lines = b.get_drop_lines(
    b.read(p, IS, PARICE, TRAIN, PROCESSED),
    [c.REGEXP_SUB[name][0] for name in ('CRYLLIC', 'GREEK', 'UNKNOWN-CHARS')],
    is_words,
    keep_ratio=0.8,
    normalize=True,
    keep_sent_length=1,
)


100%|██████████| 3540825/3540825 [02:30<00:00, 23545.40it/s] 


In [26]:
# Line numbers flagged by b.get_drop_lines (first field of each tuple).
lines = [number for number, _fraction, _line in skip_lines]
# Look the value up once instead of calling b.info twice on the same file.
# NOTE(review): index 2 of b.info(...) is assumed to be the total line count - confirm.
total_lines = b.info(b.read(p, IS, PARICE, TRAIN, PROCESSED))[2]
# 1 - kept/total, i.e. the share of lines flagged for removal. The arithmetic
# is kept exactly as before so the printed float does not change.
print("new fraction", (1 - (total_lines - len(lines))/total_lines))

new fraction 0.04822237755325387


In [28]:
# Drop the flagged line numbers from the Icelandic file and the same line
# numbers from the English file, writing both results under FINAL
# (presumably so the two sides of the parallel corpus stay line-aligned).
b.drop_lines(b.read(p, IS, PARICE, TRAIN, PROCESSED),
             b.write(p, IS, PARICE, TRAIN, FINAL),
             lines_in=lines)
# The return value of this last call is what the cell displays as its output.
b.drop_lines(b.read(p, EN, PARICE, TRAIN, PROCESSED),
             b.write(p, EN, PARICE, TRAIN, FINAL),
             lines_in=lines)

True

In [32]:
# Recount the tokens of the filtered Icelandic file and report how much the
# number of distinct entries shrank relative to is_counter_1 (pre-filtering).
is_counter_2 = b.token_counter(b.read(p, IS, PARICE, TRAIN, FINAL))
print(len(is_counter_2))
# Relative reduction: 1 - after/before.
print(1-(len(is_counter_2)/(len(is_counter_1))))

580166
0.11494086292983641
