# Bæta við mörkun og lemmun
Hér er unnið með málheildirnar. Mörkun og lemmun er bætt við gögnin.

Gert er ráð fyrir að það sé búið að sía gögnin.
- Síuð gögn, `input_dir`

Eftir að hafa keyrt reikniritið verður `target_dir` eftirfarandi.
- `target_dir/x.(is|en).tsv`, þar sem `x` hefur sama nafn og í `input_dir`.
Það er, útkomurnar eru `.tsv` skjöl með þremur dálkum. Fyrsti dálkurinn er tilreiddur texti, annar dálkurinn eru mörk tókanna, þriðji er lemmur orðanna. Bil er skrifað á milli hvers tóka, marks og lemmu.

Reikniritinu er skipt í eftirfarandi skref:
1. Uppsetning
2. Málheildir lesnar
3. Mörkun og lemmun bætt við málheildir
4. Málheildir skrifaðar.

## 1. Uppsetning

In [1]:
# Directory holding the filtered corpus files to read.
input_dir = '/work/haukurpj/data/filtered/Parice1.0'

# Directory the results are written to.
target_dir = '/work/haukurpj/data/filtered/Parice1.0'

from IPython.core.interactiveshell import InteractiveShell
# Make IPython echo every bare expression statement in a cell,
# not only the last one.
InteractiveShell.ast_node_interactivity = "all"
import pathlib

input_dir = pathlib.Path(input_dir)
target_dir = pathlib.Path(target_dir)

# Fail early, with a clear message, when there is nothing to read.
if not input_dir.exists():
    raise FileNotFoundError(f"Input directory does not exist: {input_dir}")

# Create the target directory together with any missing parents;
# this is a no-op when the directory already exists.
target_dir.mkdir(parents=True, exist_ok=True)

## 2. Málheildir lesnar

In [2]:
from typing import List, Tuple
# Language codes handled by this notebook.
langs = ['en', 'is']

# A sentence, its POS tags and its lemmas are all plain, unparameterised lists.
Sentence = POS = Lemma = List

# A corpus is a list of sentences.
Corpus = List[Sentence]

# An enriched sentence is a (tokens, tags, lemmas) triple and an enriched
# corpus is a list of such triples.
EnrichedSentence = Tuple[Sentence, POS, Lemma]
EnrichedCorpus = List[EnrichedSentence]

In [3]:
def path_to_corpus(in_path: str) -> "Corpus":
    """Read a corpus file and return its lines.

    The file is decoded as UTF-8 explicitly so that the text is read the
    same way regardless of the platform's default encoding.
    NOTE(review): assumes the corpus files are stored as UTF-8 -- confirm.

    Args:
        in_path: Path of the text file to read.

    Returns:
        The lines of the file in order, each keeping its line terminator.
    """
    with open(in_path, encoding="utf-8") as f_in:
        return f_in.readlines()

In [4]:
from glob import glob

# Map each language code to {corpus file path: lines of that file} for every
# file in input_dir whose extension is that language code.
input_corpora = {
    lang_code: {
        corpus_file: path_to_corpus(corpus_file)
        for corpus_file in glob(f'{input_dir}/*.{lang_code}')
    }
    for lang_code in ('en', 'is')
}

In [6]:
# List the corpus files that were found for each language. The bare
# expression inside the loop is what gets echoed as cell output.
for lang in langs:
    input_corpora[lang].keys()

dict_keys(['/work/haukurpj/data/filtered/Parice1.0/test-opensubtitles.en', '/work/haukurpj/data/filtered/Parice1.0/test-ees.en', '/work/haukurpj/data/filtered/Parice1.0/train.en', '/work/haukurpj/data/filtered/Parice1.0/test-ema.en', '/work/haukurpj/data/filtered/Parice1.0/dev.en'])

dict_keys(['/work/haukurpj/data/filtered/Parice1.0/test-opensubtitles.is', '/work/haukurpj/data/filtered/Parice1.0/test-ees.is', '/work/haukurpj/data/filtered/Parice1.0/test-ema.is', '/work/haukurpj/data/filtered/Parice1.0/train.is', '/work/haukurpj/data/filtered/Parice1.0/dev.is'])

## 3. Bæta við mörkun og lemmun

In [10]:
from time import time
import requests
import nltk
#nltk.download('wordnet')
#nltk.download('averaged_perceptron_tagger')
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize, pos_tag
from collections import defaultdict

# Translate the first letter of a tag returned by pos_tag into a WordNet
# part-of-speech constant; any letter not listed falls back to noun.
tag_map = defaultdict(
    lambda: wn.NOUN,
    {'J': wn.ADJ, 'V': wn.VERB, 'R': wn.ADV},
)

def make_chunks(sentences, chunksize):
    """Yield consecutive slices of ``sentences`` with at most ``chunksize`` items.

    The last slice is shorter when the length of ``sentences`` is not a
    multiple of ``chunksize``; an empty input yields nothing.
    """
    offsets = range(0, len(sentences), chunksize)
    yield from (sentences[begin:begin + chunksize] for begin in offsets)

def enrich_corpus(corpus: "Corpus", lang: str, chunksize=4000) -> "EnrichedCorpus":
    """Enrich the given corpus with POS tags and lemmas.

    The corpus is processed in chunks of ``chunksize`` sentences and the
    time spent on each chunk is printed.

    English processing is offline.
    Icelandic processing is done via online API.

    Args:
        corpus: The sentences to enrich.
        lang: Language code, either ``'en'`` or ``'is'``.
        chunksize: Number of sentences handed to the enricher per call.

    Returns:
        The enriched sentences of all chunks, in chunk order.

    Raises:
        ValueError: If ``lang`` is not a supported language code.
    """
    # Reject unknown languages up front rather than failing on the first
    # chunk with an opaque "'NoneType' object is not callable".
    if lang == 'en':
        enrich_chunk = enrich_sentences_en
    elif lang == 'is':
        enrich_chunk = enrich_sentences_is
    else:
        raise ValueError(f"Unsupported language: {lang!r}")

    fully_enriched_corpus = []
    for chunk in make_chunks(corpus, chunksize):
        start = time()
        fully_enriched_corpus.extend(enrich_chunk(chunk))
        end = time()
        print(f"Bulk enrichment took={end - start:.2f}")

    return fully_enriched_corpus

# Endpoint of the online tagging/lemmatising service used for Icelandic.
URL = 'http://malvinnsla.arnastofnun.is'

def enrich_sentences_is(corpus: Corpus) -> EnrichedCorpus:
    """Tag and lemmatise Icelandic text through the web API at ``URL``.

    All lines of ``corpus`` are joined and posted in a single request. Every
    paragraph in the response yields one (tokens, tags, lemmas) triple; the
    sentences reported inside a paragraph are merged into that one triple.

    NOTE(review): callers presumably expect one triple per input line (the
    sample call in this notebook returns one paragraph per line) -- confirm
    how the API treats blank lines before relying on that alignment.

    Args:
        corpus: Lines of text, each expected to end with a newline.

    Returns:
        One (tokens, tags, lemmas) triple per paragraph in the response.

    Raises:
        requests.HTTPError: If the API answers with an error status.
    """
    response = requests.post(URL, data={
        'text': "".join(corpus),
        'lemma': 'on'
    })
    # Surface HTTP failures directly instead of as a JSON decoding error or
    # a KeyError on the missing 'paragraphs' field.
    response.raise_for_status()
    payload = response.json()

    enriched_sentences = []
    for paragraph in payload['paragraphs']:
        forms: Sentence = []
        poss: POS = []
        lemmas: Lemma = []
        for sentence in paragraph['sentences']:
            for token in sentence:
                forms.append(token['word'])
                poss.append(token['tag'])
                lemmas.append(token['lemma'])
        enriched_sentences.append((forms, poss, lemmas))
    return enriched_sentences

def enrich_sentences_en(corpus: Corpus) -> EnrichedCorpus:
    """Tokenise, POS-tag and lemmatise English sentences with NLTK.

    Each input line is tokenised with ``word_tokenize`` and tagged with
    ``pos_tag``. Every token is lemmatised with ``WordNetLemmatizer``, using
    the WordNet part of speech that ``tag_map`` gives for the first letter
    of its tag, and the lemma is lowercased.

    NOTE(review): the token is lemmatised in its original casing and only
    lowercased afterwards -- confirm whether the lookup should instead be
    done on the lowercased token.

    Args:
        corpus: Lines of English text.

    Returns:
        One (tokens, tags, lemmas) triple per input line.
    """
    lmtzr = WordNetLemmatizer()
    enriched_sentences = []
    for sentence in corpus:
        forms: Sentence = []
        poss: POS = []
        lemmas: Lemma = []
        for token, tag in pos_tag(word_tokenize(sentence)):
            lemma = lmtzr.lemmatize(token, tag_map[tag[0]])
            forms.append(token)
            poss.append(tag)
            lemmas.append(lemma.lower())
        enriched_sentences.append((forms, poss, lemmas))
    return enriched_sentences

In [11]:
import requests
def test_api(sentences):
    """Post the joined sentences to the tagging API and return the decoded JSON reply."""
    form_fields = {'text': "".join(sentences), 'lemma': 'on'}
    reply = requests.post(URL, data=form_fields)
    return reply.json()

# Two input lines; the first one holds two sentences.
test = ["Einn. Tveir.\n", "Þrír.\n"]
test_api(test)

{'paragraphs': [{'sentences': [[{'lemma': 'einn',
      'tag': 'tfken',
      'word': 'Einn'},
     {'lemma': '.', 'tag': '.', 'word': '.'}],
    [{'lemma': 'tveir', 'tag': 'tfkfn', 'word': 'Tveir'},
     {'lemma': '.', 'tag': '.', 'word': '.'}]]},
  {'sentences': [[{'lemma': 'þrír', 'tag': 'tfkfn', 'word': 'Þrír'},
     {'lemma': '.', 'tag': '.', 'word': '.'}]]}]}

In [12]:
# Smoke test of the English pipeline: two input lines handled as one chunk.
test = ['One. Two.\n', 'Three.\n']
enrich_corpus(test, 'en', chunksize=2)

Bulk enrichment took=0.00


[(['One', '.', 'Two', '.'], ['CD', '.', 'CD', '.'], ['one', '.', 'two', '.']),
 (['Three', '.'], ['CD', '.'], ['three', '.'])]

In [None]:
def map_to_target(input_file, target_dir):
    """Return the ``.tsv`` output path inside ``target_dir`` for ``input_file``.

    NOTE(review): ``suffix`` already starts with a dot, so the generated name
    contains two consecutive dots (``train.en`` becomes ``train..en.tsv``).
    The cell that later rebuilds the corpus names splits these names on
    ``'.'`` and expects that shape, so the two must be changed together.
    """
    source = pathlib.Path(input_file)
    file_name = f'{source.stem}.{source.suffix}.tsv'
    return pathlib.Path(target_dir) / file_name

# For each language, map every output path to the enriched version of the
# corpus that was read from the corresponding input file.
enriched_corpus = {
    lang: {
        map_to_target(corpus_path, target_dir): enrich_corpus(sentences, lang, chunksize=4000)
        for corpus_path, sentences in input_corpora[lang].items()
    }
    for lang in langs
}

Bulk enrichment took=0.94
Bulk enrichment took=1.45
Bulk enrichment took=2.56
Bulk enrichment took=2.38
Bulk enrichment took=3.52
Bulk enrichment took=2.11
Bulk enrichment took=2.18
Bulk enrichment took=2.23
Bulk enrichment took=2.63
Bulk enrichment took=2.30
Bulk enrichment took=2.09
Bulk enrichment took=2.09
Bulk enrichment took=1.71
Bulk enrichment took=1.62
Bulk enrichment took=2.26
Bulk enrichment took=2.41
Bulk enrichment took=2.37
Bulk enrichment took=2.33
Bulk enrichment took=1.87
Bulk enrichment took=1.92
Bulk enrichment took=1.89
Bulk enrichment took=3.57
Bulk enrichment took=3.20
Bulk enrichment took=3.52
Bulk enrichment took=3.79
Bulk enrichment took=3.45
Bulk enrichment took=2.61
Bulk enrichment took=3.31
Bulk enrichment took=2.70
Bulk enrichment took=3.23
Bulk enrichment took=3.35
Bulk enrichment took=3.66
Bulk enrichment took=3.41
Bulk enrichment took=3.44
Bulk enrichment took=3.72
Bulk enrichment took=3.06
Bulk enrichment took=3.15
Bulk enrichment took=3.59
Bulk enrichm

Bulk enrichment took=2.47
Bulk enrichment took=2.59
Bulk enrichment took=2.68
Bulk enrichment took=2.55
Bulk enrichment took=2.75
Bulk enrichment took=3.05
Bulk enrichment took=3.54
Bulk enrichment took=3.25
Bulk enrichment took=3.55
Bulk enrichment took=3.21
Bulk enrichment took=3.02
Bulk enrichment took=2.98
Bulk enrichment took=2.70
Bulk enrichment took=2.92
Bulk enrichment took=2.67
Bulk enrichment took=2.59
Bulk enrichment took=2.81
Bulk enrichment took=2.69
Bulk enrichment took=2.41
Bulk enrichment took=2.03
Bulk enrichment took=2.40
Bulk enrichment took=2.98
Bulk enrichment took=2.48
Bulk enrichment took=2.66
Bulk enrichment took=3.21
Bulk enrichment took=3.70
Bulk enrichment took=3.77
Bulk enrichment took=3.16
Bulk enrichment took=3.14
Bulk enrichment took=3.09
Bulk enrichment took=2.73
Bulk enrichment took=2.58
Bulk enrichment took=1.58
Bulk enrichment took=1.38
Bulk enrichment took=1.40
Bulk enrichment took=1.58
Bulk enrichment took=1.38
Bulk enrichment took=1.41
Bulk enrichm

Bulk enrichment took=60.36
Bulk enrichment took=52.39
Bulk enrichment took=54.99
Bulk enrichment took=55.63
Bulk enrichment took=59.02
Bulk enrichment took=53.69
Bulk enrichment took=39.47
Bulk enrichment took=54.40
Bulk enrichment took=55.49
Bulk enrichment took=52.38
Bulk enrichment took=51.11
Bulk enrichment took=58.68
Bulk enrichment took=56.84
Bulk enrichment took=49.37
Bulk enrichment took=61.44
Bulk enrichment took=47.76
Bulk enrichment took=54.27
Bulk enrichment took=48.68
Bulk enrichment took=55.99
Bulk enrichment took=57.38
Bulk enrichment took=59.22
Bulk enrichment took=43.38
Bulk enrichment took=49.17
Bulk enrichment took=55.30
Bulk enrichment took=55.02
Bulk enrichment took=56.05
Bulk enrichment took=61.20
Bulk enrichment took=45.55
Bulk enrichment took=58.33
Bulk enrichment took=47.26
Bulk enrichment took=57.70
Bulk enrichment took=63.21
Bulk enrichment took=56.36
Bulk enrichment took=55.02
Bulk enrichment took=47.75
Bulk enrichment took=49.29
Bulk enrichment took=58.63
B

In [15]:
import pickle

# Serialise fix_enriched_corpus to filtered.pickle inside target_dir, using
# the highest pickle protocol available.
# NOTE(review): fix_enriched_corpus is built in a cell further down in this
# file; that cell has to be run before this one.
with open(f'{target_dir}/filtered.pickle', 'wb') as handle:
    pickle.dump(fix_enriched_corpus, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [16]:
!ls {target_dir}

dev.en		 test-ees.en  test-ema.is	     tmp.pickle
dev.is		 test-ees.is  test-opensubtitles.en  train.en
filtered.pickle  test-ema.en  test-opensubtitles.is  train.is


In [2]:
import pickle
# Restore a previously pickled enriched corpus from target_dir.
# NOTE(review): this reads tmp.pickle, whereas the dump cell in this notebook
# writes filtered.pickle -- confirm which file is intended.
# SECURITY: unpickling can execute arbitrary code; only load pickle files
# that come from a trusted source.
with open(f'{target_dir}/tmp.pickle', 'rb') as handle:
    enriched_corpus = pickle.load(handle)

In [11]:
# Re-key the enriched corpora from output paths to short "<name>.<lang>" names.
fix_enriched_corpus = {}
for lang in enriched_corpus.keys():
    fix_enriched_corpus[lang] = {}
    for corpus_path in enriched_corpus[lang]:
        # Drop empty parts so that both "train..en.tsv" (the shape produced
        # by map_to_target) and "train.en.tsv" are understood.
        parts = [part for part in pathlib.Path(corpus_path).name.split('.') if part]
        # The last part is the "tsv" extension and the one before it is the
        # language code; whatever precedes them is the corpus name.
        *name_parts, file_lang, _extension = parts
        # The language taken from the file name gets its own variable so the
        # outer loop variable is not overwritten while it is still needed
        # for the dictionary lookups.
        short_name = ".".join([*name_parts, file_lang])
        fix_enriched_corpus[lang][short_name] = enriched_corpus[lang][corpus_path]
        

In [14]:
# Preview: for every corpus, echo its name followed by its first two
# enriched sentences. The bare expressions are what produce the cell output.
for lang in fix_enriched_corpus.keys():
    for corpus_name in fix_enriched_corpus[lang]:
        corpus_name
        fix_enriched_corpus[lang][corpus_name][:2]

'test-opensubtitles.en'

[(['Somebody',
   'threw',
   'a',
   'brick',
   'through',
   'Meyer',
   'Blum',
   "'s",
   'window',
   'last',
   'night',
   '.'],
  ['NN', 'VBD', 'DT', 'NN', 'IN', 'NNP', 'NNP', 'POS', 'NN', 'JJ', 'NN', '.'],
  ['somebody',
   'throw',
   'a',
   'brick',
   'through',
   'meyer',
   'blum',
   "'s",
   'window',
   'last',
   'night',
   '.']),
 (['They', 'ai', "n't", 'gon', 'na', 'be', 'so', 'bad', '.'],
  ['PRP', 'VBP', 'RB', 'VB', 'TO', 'VB', 'RB', 'JJ', '.'],
  ['they', 'ai', "n't", 'gon', 'na', 'be', 'so', 'bad', '.'])]

'test-ees.en'

[(['Whereas',
   'Chapter',
   '6',
   'of',
   'Annex',
   'I',
   'to',
   'Directive',
   '92/118/EEC',
   'lays',
   'down',
   'requirements',
   'for',
   'the',
   'importation',
   'of',
   'processed',
   'animal',
   'protein',
   ';'],
  ['NNP',
   'NNP',
   'CD',
   'IN',
   'NNP',
   'PRP',
   'TO',
   'VB',
   'CD',
   'NNS',
   'RP',
   'NNS',
   'IN',
   'DT',
   'NN',
   'IN',
   'VBN',
   'JJ',
   'NN',
   ':'],
  ['whereas',
   'chapter',
   '6',
   'of',
   'annex',
   'i',
   'to',
   'directive',
   '92/118/eec',
   'lay',
   'down',
   'requirement',
   'for',
   'the',
   'importation',
   'of',
   'process',
   'animal',
   'protein',
   ';']),
 (['Whereas',
   'Commission',
   'Decision',
   '94/278/EC',
   '(',
   '3',
   ')',
   ',',
   'as',
   'last',
   'amended',
   'by',
   'Decision',
   '96/344/EC',
   '(',
   '4',
   ')',
   ',',
   'has',
   'laid',
   'down',
   'a',
   'list',
   'of',
   'third',
   'countries',
   'from',
   'which',
   'Member'

'train.en'

[(['BOOK', 'I'], ['NNP', 'PRP'], ['book', 'i']),
 (['CHAPTER', 'I', 'A', 'FETE', 'AT', 'THE', 'NEW', 'PALACE'],
  ['NN', 'PRP', 'VBP', 'NNP', 'NNP', 'NNP', 'NNP', 'NNP'],
  ['chapter', 'i', 'a', 'fete', 'at', 'the', 'new', 'palace'])]

'test-ema.en'

[(['(',
   '2',
   ')',
   'TVC',
   ':',
   'includes',
   'all',
   'vaccinated',
   'subjects',
   '(',
   'who',
   'received',
   'at',
   'least',
   'one',
   'dose',
   'of',
   'vaccine',
   ')',
   'irrespective',
   'of',
   'HPV',
   'DNA',
   'status',
   ',',
   'cytology',
   'and',
   'serostatus',
   'at',
   'baseline',
   '.'],
  ['(',
   'CD',
   ')',
   'NN',
   ':',
   'VBZ',
   'DT',
   'JJ',
   'NNS',
   '(',
   'WP',
   'VBD',
   'IN',
   'JJS',
   'CD',
   'NN',
   'IN',
   'NN',
   ')',
   'NN',
   'IN',
   'NNP',
   'NNP',
   'NN',
   ',',
   'NN',
   'CC',
   'NN',
   'IN',
   'NN',
   '.'],
  ['(',
   '2',
   ')',
   'tvc',
   ':',
   'include',
   'all',
   'vaccinated',
   'subject',
   '(',
   'who',
   'receive',
   'at',
   'least',
   'one',
   'dose',
   'of',
   'vaccine',
   ')',
   'irrespective',
   'of',
   'hpv',
   'dna',
   'status',
   ',',
   'cytology',
   'and',
   'serostatus',
   'at',
   'baseline',
   '.']),
 (['(', 'atazanavir', '30

'dev.en'

[(['Slide', 'over', ',', 'honey', '.'],
  ['NN', 'IN', ',', 'NN', '.'],
  ['slide', 'over', ',', 'honey', '.']),
 (['•',
   'Where',
   'the',
   'Commission',
   'ascertains',
   'that',
   'a',
   'notified',
   'body',
   'does',
   'not',
   'meet',
   'or',
   'no',
   'longer',
   'meets',
   'the',
   'requirements',
   'for',
   'its',
   'notification',
   ',',
   'it',
   'shall',
   'adopt',
   'an',
   'implementing',
   'act',
   'requesting',
   'the',
   'notifying',
   'Member',
   'State',
   'to',
   'take',
   'the',
   'necessary',
   'corrective',
   'measures',
   ',',
   'including',
   'withdrawal',
   'of',
   'notification',
   'if',
   'necessary',
   '.'],
  ['NN',
   'WRB',
   'DT',
   'NNP',
   'VBZ',
   'IN',
   'DT',
   'JJ',
   'NN',
   'VBZ',
   'RB',
   'VB',
   'CC',
   'DT',
   'JJR',
   'VBZ',
   'DT',
   'NNS',
   'IN',
   'PRP$',
   'NN',
   ',',
   'PRP',
   'MD',
   'VB',
   'DT',
   'NN',
   'NN',
   'VBG',
   'DT',
   'JJ',
   'NNP',
   'NNP'

'test-opensubtitles.is'

[(['Einhver',
   'henti',
   'múrsteini',
   'inn',
   'um',
   'glugga',
   'hjá',
   'Meyer',
   'Blum',
   'í',
   'nótt',
   '.'],
  ['foken',
   'svg3eþ',
   'nken',
   'aa',
   'ao',
   'nkfo',
   'aþ',
   'e',
   'e',
   'aþ',
   'nveo',
   '.'],
  ['einhver',
   'henda',
   'múrsteini',
   'inn',
   'um',
   'gluggi',
   'hjá',
   'Meyer',
   'Blum',
   'í',
   'nótt',
   '.']),
 (['Þeir', 'standa', 'sig', '.'],
  ['fpkfn', 'sfg3fn', 'fpkfo', '.'],
  ['hann', 'standa', 'sig', '.'])]

'test-ees.is'

[(['Í',
   '6.',
   'kafla',
   'I.',
   'viðauka',
   'við',
   'tilskipun',
   '92',
   '/',
   '118',
   '/',
   'EBE',
   'er',
   'mælt',
   'fyrir',
   'um',
   'kröfur',
   'vegna',
   'innflutnings',
   'á',
   'unnu',
   'dýraprótíni',
   '.'],
  ['aþ',
   'ta',
   'nkeþ',
   'ta',
   'nkee',
   'ao',
   'nveo',
   'ta',
   '/',
   'ta',
   '/',
   'as',
   'sfg3en',
   'sþghen',
   'aa',
   'ao',
   'nvfo',
   'ae',
   'nkee',
   'aa',
   'sfg3fþ',
   'nheþ',
   '.'],
  ['í',
   '6.',
   'kafli',
   'I.',
   'viðauki',
   'við',
   'tilskipun',
   '92',
   '/',
   '118',
   '/',
   'EBE',
   'vera',
   'mæla',
   'fyrir',
   'um',
   'krafa',
   'vegna',
   'innflutningur',
   'á',
   'vinna',
   'dýraprótín',
   '.']),
 (['Í',
   'ákvörðun',
   'framkvæmdastjórnarinnar',
   '94',
   '/',
   '278',
   '/',
   'EB',
   '(',
   '3',
   ')',
   ',',
   'eins',
   'og',
   'henni',
   'var',
   'síðast',
   'breytt',
   'með',
   'ákvörðun',
   '96',
   '/',
   '344',
   '/',
   

'test-ema.is'

[(['(',
   '2',
   ')',
   'Heildarþýði',
   'bólusettra',
   ':',
   'í',
   'hópnum',
   'voru',
   'allir',
   'sem',
   'voru',
   'bólusettir',
   '(',
   'fengu',
   'a.m.k.',
   'einn',
   'skammt',
   'af',
   'bóluefninu',
   ')',
   ',',
   'óháð',
   'HPV-DNA-stöðu',
   'þeirra',
   ',',
   'frumufræðilegum',
   'greiningum',
   'og',
   'mótefnastöðu',
   'í',
   'upphafi',
   '.'],
  ['(',
   'ta',
   ')',
   'nhen',
   'lkfesf',
   ':',
   'aþ',
   'nkeþg',
   'sfg3fþ',
   'fokfn',
   'ct',
   'sfg3fþ',
   'sþgkfn',
   '(',
   'sfg3fþ',
   'as',
   'tfkeo',
   'aa',
   'aþ',
   'nheþg',
   ')',
   ',',
   'aþ',
   'nveþ-s',
   'fpvfe',
   ',',
   'lvfþsf',
   'nvfþ',
   'c',
   'nveþ',
   'aþ',
   'nheþ',
   '.'],
  ['(',
   '2',
   ')',
   'heildarþýði',
   'bólusettur',
   ':',
   'í',
   'hópur',
   'vera',
   'allur',
   'sem',
   'vera',
   'bólusetja',
   '(',
   'fá',
   'a.m.k.',
   'einn',
   'skammt',
   'af',
   'bóluefni',
   ')',
   ',',
   'óháð',
   'HPV-DN

'train.is'

[(['FYRRI', 'ÞÁTTUR'], ['e', 'nken'], ['FYRRI', 'þáttur']),
 (['I.', 'Veizlan', 'í', 'nýju', 'höllinni', '.'],
  ['ta', 'nveng', 'aþ', 'lveþvf', 'nveþg', '.'],
  ['I.', 'veizla', 'í', 'nýr', 'höll', '.'])]

'dev.is'

[(['Færðu', 'þig', ',', 'ástin', '.'],
  ['sfg2en', 'fp2eo', ',', 'nveng', '.'],
  ['fá', 'þú', ',', 'ást', '.']),
 (['•',
   'Ef',
   'framkvæmdastjórnin',
   'kemst',
   'að',
   'raun',
   'um',
   'að',
   'tilkynnt',
   'stofa',
   'uppfylli',
   'ekki',
   ',',
   'eða',
   'uppfylli',
   'ekki',
   'lengur',
   ',',
   'kröfurnar',
   'viðvíkjandi',
   'tilkynningu',
   'hennar',
   ',',
   'skal',
   'hún',
   'samþykkja',
   'framkvæmdargerð',
   'þar',
   'sem',
   'óskað',
   'er',
   'eftir',
   'því',
   'að',
   'tilkynningaraðildarríkið',
   'geri',
   'nauðsynlegar',
   'ráðstafanir',
   'til',
   'úrbóta',
   ',',
   'þ.m.t.',
   'að',
   'afturkalla',
   'tilkynningu',
   ',',
   'ef',
   'nauðsyn',
   'krefur',
   '.'],
  ['as',
   'c',
   'nveng',
   'sfm3en',
   'aþ',
   'nveþ',
   'ao',
   'c',
   'sþgven',
   'nven',
   'svg3en',
   'aa',
   ',',
   'c',
   'svg3en',
   'aa',
   'aam',
   ',',
   'nvfng',
   'slg',
   'nveo',
   'fpvee',
   ',',
   'sfg3en',
   '