In [15]:
import re
import json
import time
from tokenizer import split_into_sentences
import pos
import time

In [2]:
import number_help as nh
import number_functions as nf
import wlinks 

In [3]:
import area_dict as ad
import currency_dict as cd
import distance_dict as dd
import electronic_dict as ed
import period_dict as pd
import rest_dict as rd
import time_dict as td
import volume_dict as vd
import weight_dict as wd
import pre_help_dicts as phd
import symbols_dict as sd

In [4]:
import cardinal_ones_tuples as cot
import cardinal_thousands_tuples as ctt
import cardinal_million_tuples as cmt
import cardinal_big_tuples as cbt
import ordinal_ones_tuples as oot
import ordinal_thousands_tuples as ott
import ordinal_million_tuples as omt
import ordinal_big_tuples as obt
import decimal_thousands_tuples as dtt
import fraction_tuples as ft
import sport_tuples as st
import time_tuples as tt
import abbr_functions as af

In [5]:
# Part-of-speech tagger shared by the notebook; handle_sentence calls
# tagger.tag_sent on each tokenised sentence. Loaded from the local model
# file and pinned to the CPU.
tagger = pos.Tagger(model_file="tagger-v2.0.0.pt", device="cpu")

In [6]:
def _load_json(path):
    """Parse the JSON document stored at `path` and return the resulting object.

    The file is opened in a `with` block so the handle is closed as soon as
    parsing finishes, and it is decoded as UTF-8 explicitly so the result does
    not depend on the platform's default encoding.
    """
    with open(path, encoding="utf-8") as handle:
        return json.load(handle)


# Replacement tables consumed by replace_abbreviations.
abbr_dict = _load_json("abbrdict.txt")
# Regex passed to af.replace_all together with direction_dict
# (presumably compass-direction abbreviations -- confirm against directiondict.txt).
direction_ptrn = "[SN]?V|N|[SN]?A|S"
direction_dict = _load_json("directiondict.txt")
denominator_dict = _load_json("denominatordict.txt")

In [7]:
# Tables that the helper modules expose ready-made.
pre_help_dict = phd.pre_help_dicts
symb_dict = sd.symb_dict

# Unit tables built by each module's make_* factory; replace_abbreviations
# applies them together with the matching *_ptrn from the same module.
weight_dict = wd.make_weight_dict()
distance_dict = dd.make_distance_dict()
area_dict = ad.make_area_dict()
volume_dict = vd.make_volume_dict()
time_dict = td.make_time_dict()
currency_dict = cd.make_currency_dict()
electronic_dict = ed.make_electronic_dict()
rest_dict = rd.make_rest_measure_dict()
period_dict = pd.make_period_dict()

In [8]:
# Tables taken straight from their modules.
sport_tuples = st.sport_tuples
time_tuples = tt.time_tuples

# Cardinal tables: every larger range extends the one below it.
cardinal_thousand_tuples = cot.cardinal_ones_tuples + ctt.cardinal_thousands_tuples
cardinal_million_tuples = cardinal_thousand_tuples + cmt.cardinal_million_tuples
cardinal_big_tuples = cardinal_million_tuples + cbt.cardinal_big_tuples

# Ordinal tables: each range combines the ordinal entries with the cardinal
# entries of the same magnitude, on top of the previous ordinal range.
ordinal_thousand_tuples = (
    oot.ordinal_ones_tuples
    + ott.ordinal_thousands_tuples
    + ctt.cardinal_thousands_tuples
)
ordinal_million_tuples = (
    ordinal_thousand_tuples
    + cmt.cardinal_million_tuples
    + omt.ordinal_million_tuples
)
ordinal_big_tuples = (
    ordinal_million_tuples
    + cbt.cardinal_big_tuples
    + obt.ordinal_big_tuples
)

# Decimal and fraction tables reuse the cardinal tables and append their own entries.
decimal_thousand_tuples = cardinal_thousand_tuples + dtt.decimal_thousands_tuples
decimal_big_tuples = cardinal_big_tuples + dtt.decimal_thousands_tuples
fraction_tuples = cardinal_thousand_tuples + ft.fraction_tuples

In [9]:
# Sample article used to exercise the pipeline. It mixes plain prose with
# cardinals, ordinals, a fraction, clock times, a score, a date, Roman numerals,
# an acronym, a symbol, a URL, a range and a phone number.
grein = (
    "Auknar takmarkanir hefjast á miðnætti í París og víðar í Frakklandi vegna ótta við þriðju bylgju "
    "kórónuveirunnar. Aðgerðirnar hafa áhrif á um 21 milljón manns sem búa á 16 svæðum í landinu. Þær "
    "verða ekki ekki eins strangar og áður. Að sögn forsætisráðherrans Jean Castex fær fólk t.a.m. "
    "að stunda æfingar utandyra, að því er kemur fram á BBC. Þetta var 4. dagur 120. mánaðar, "
    "300. árs. Á ég 4 krónur, 40.000 krónur eða 234.000.000 krónur? Áttu 1/2 mínútu? Klukkan er "
    "10:08, klukkan var 07 áðan. Hann átti 23/23 fráköst. Það er hneyksli að KFUM sé ekki búið að skipta "
    "sér af. Kristján IX. kom til landsins um daginn, bíðum eftir Kristjáni IXIIXIXVIX. Létt & laggott seldi "
    "hlut sinn þann 02.03.2021. Það er hægt að lesa um það á https://mbl.is/innlent. Leikurinn tók 3-4 tíma. "
    "Síminn minn er 867 9086."
)

# Materialise the sentence iterator so the sentences can be reused by later cells.
sent_grein = [*split_into_sentences(grein)]

In [10]:
def replace_abbreviations(sent):
    """Run one sentence through every abbreviation/unit replacement table.

    The tables are applied with `af.replace_all` in a fixed order: first the
    two tables that need no pattern (`pre_help_dict`, `abbr_dict`), then each
    remaining table together with its regex pattern. The order is kept exactly
    as written because later replacements see the output of earlier ones.

    Args:
        sent: The sentence to process (a string; it is `.split()` at the end).

    Returns:
        Whatever `af.replace_domain` returns for the whitespace-split tokens
        and the domain label 'other' (presumably the re-joined sentence, since
        handle_sentence later calls `.split()` on it -- confirm in abbr_functions).
    """
    sent = af.replace_all(sent, pre_help_dict)
    sent = af.replace_all(sent, abbr_dict)

    # (replacement table, regex pattern) pairs, applied top to bottom.
    patterned_tables = (
        (direction_dict, direction_ptrn),
        (denominator_dict, r"\/"),
        (weight_dict, wd.weight_ptrn),
        (distance_dict, dd.distance_ptrn),
        (area_dict, ad.area_ptrn),
        (volume_dict, vd.volume_ptrn),
        (time_dict, td.time_ptrn),
        (currency_dict, cd.currency_ptrn),
        (electronic_dict, ed.electronic_ptrn),
        (rest_dict, rd.rest_ptrn),
        (period_dict, pd.period_ptrn),
    )
    for table, pattern in patterned_tables:
        sent = af.replace_all(sent, table, pattern)

    return af.replace_domain(sent.split(), 'other')

0.05456900596618652
0.42977094650268555
0.040628910064697266
0.009467124938964844
0.007036924362182617
0.0011610984802246094
0.007748842239379883
0.0009388923645019531
0.0008440017700195312
0.0008668899536132812
0.0019741058349609375
0.0010471343994140625
0.001196146011352539
0.0007939338684082031
0.0008399486541748047


In [11]:
def handle_sentence(sent):
    """Expand numbers, letter sequences, links and symbols in one sentence.

    The sentence is split on whitespace and tagged with `tagger.tag_sent`.
    Each token is paired with the tag of the token that FOLLOWS it (the last
    token gets an empty tag), and that tag is handed to `number_findall` when
    the token starts with a digit.

    Args:
        sent: The sentence as a string of whitespace-separated tokens.

    Returns:
        The processed tokens, each followed by a single space (so the result
        ends with a trailing space). An empty or whitespace-only sentence
        yields "" without invoking the tagger.
    """
    tokens = sent.split()
    if not tokens:
        # Nothing to tag or expand; skip the tagger call entirely.
        return ""

    tags = tagger.tag_sent(tokens)
    # Shift the tags left by one so every token sees its successor's tag.
    following_tags = list(tags[1:]) + [""]

    pieces = []
    for word, nexttag in zip(tokens, following_tags):
        if re.match(r"\d", word):
            word = number_findall(word, nexttag)
        # NOTE(review): this is a separate `if`, not `elif`, so the text
        # produced for a number is also tested against the patterns below --
        # confirm that is intended.
        if re.match(nh.roman_letters_ptrn, word):
            word = " ".join(word)
        elif re.match(nh.letters_ptrn, word):
            word = " ".join(word)
        elif re.match(wlinks.link_ptrn_all, word):
            word = wlinks.wlink_fun(word)
        elif re.match(nh.symbol_ptrn, word):
            word = af.replace_all(word, symb_dict, nh.symbol_ptrn)
        pieces.append(word + " ")

    # Join once at the end instead of growing a string inside the loop.
    return "".join(pieces)

In [12]:
 def number_findall(word, tag):
    """Expand a numeric token using the first number pattern that matches it.

    The patterns are tried in a fixed order (ordinals, cardinals, decimals,
    fractions, clock times, sport scores). For the first one found in `word`,
    a column dict is built with `nf.make_dict` and handed to `nf.fill_dict`
    together with the matching tuple table. If none matches, a token of the
    form "0<digit>." goes to `nf.digit_ord_fun`, and anything else goes to
    `nf.digit_fun`.

    Args:
        word: The token to expand; the caller only passes tokens that start
            with a digit.
        tag: The tag forwarded unchanged to `nf.fill_dict` (handle_sentence
            passes the tag of the following token).

    Returns:
        The value returned by the selected `nf` helper (the caller
        concatenates it with a string, so a string is expected).
    """
    # (pattern, tuple table, column layout) -- order matters, first hit wins.
    rules = (
        (nh.ordinal_thousand_ptrn, ordinal_thousand_tuples, nh.int_cols_thousand),
        (nh.ordinal_million_ptrn, ordinal_million_tuples, nh.int_cols_million),
        (nh.ordinal_big_ptrn, ordinal_big_tuples, nh.int_cols_big),
        (nh.cardinal_thousand_ptrn, cardinal_thousand_tuples, nh.int_cols_thousand),
        (nh.cardinal_million_ptrn, cardinal_million_tuples, nh.int_cols_million),
        (nh.cardinal_big_ptrn, cardinal_big_tuples, nh.int_cols_big),
        (nh.decimal_thousand_ptrn, decimal_thousand_tuples, nh.decimal_cols_thousand),
        (nh.decimal_big_ptrn, decimal_big_tuples, nh.decimal_cols_big),
        (nh.fraction_ptrn, fraction_tuples, nh.decimal_cols_thousand),
        (nh.time_ptrn, time_tuples, nh.time_sport_cols),
        (nh.sport_ptrn, sport_tuples, nh.time_sport_cols),
    )
    for pattern, tuples, cols in rules:
        # re.search is truthy exactly when re.findall would return a
        # non-empty list, but stops at the first match.
        if re.search(pattern, word):
            column_dict = nf.make_dict(word, cols)
            return nf.fill_dict(word, tag, tuples, column_dict, cols)

    # A zero-prefixed single digit followed by a full stop, e.g. "02.".
    if re.search(r"^0\d\.$", word):
        return nf.digit_ord_fun(word)

    return nf.digit_fun(word)

In [13]:
# Build the abbreviation-expanded sentences explicitly from sent_grein so this
# cell does not depend on a variable left over from an earlier kernel session.
sent_expand_abbr = [replace_abbreviations(sent) for sent in sent_grein]

# Expand numbers/letters/links/symbols sentence by sentence, printing how many
# seconds handle_sentence took for each one.
sent_expand_no = []
for sent in sent_expand_abbr:
    start = time.time()
    normalized = handle_sentence(sent)
    print(time.time() - start)
    sent_expand_no.append(normalized)

0.09571099281311035
0.13285398483276367
0.06630396842956543
0.08393383026123047
0.09348607063293457
0.313190221786499
0.1851646900177002
0.08957314491271973
0.12955904006958008
0.06879210472106934
0.06743884086608887
0.06951785087585449
0.06721377372741699
0.06266593933105469
0.06692194938659668


In [14]:
# Display the list of sentences produced by handle_sentence in the previous cell.
sent_expand_no

['Auknar takmarkanir hefjast á miðnætti í París og víðar í Frakklandi vegna ótta við þriðju bylgju kórónuveirunnar . ',
 'Aðgerðirnar hafa áhrif á um  tuttugu og eina milljón manns sem búa á  sextán svæðum í landinu . ',
 'Þær verða ekki ekki eins strangar og áður . ',
 'Að sögn forsætisráðherrans Jean Castex fær fólk til að mynda að stunda æfingar utandyra , að því er kemur fram á B B C . ',
 'Þetta var  fjórði dagur hundrað og tuttugasta mánaðar ,  þrjú hundruðasta árs . ',
 'Á ég  fjórar krónur ,  fjörutíu þúsund krónur eða  tvö hundruð þrjátíu og fjórar milljónir krónur ? ',
 'Áttu  hálfa mínútu ? ',
 'Klukkan er  tíu núll átta , klukkan var núll  sjö áðan . ',
 'Hann átti  tuttugu og þrjú skástrik  tuttugu og þrjú fráköst . ',
 'Það er hneyksli að K F U M sé ekki búið að skipta sér af . ',
 'Kristján níundi kom til landsins um daginn , bíðum eftir Kristjáni I X I I X I X V I X . ',
 'Létt og laggott seldi hlut sinn þann núll annan núll þriðja  tvö þúsund tuttugu og eitt . ',
 'Það