In [1]:
import warnings
warnings.filterwarnings("ignore")
import regex as re
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', None)
from tokenizer import split_into_sentences
import time
from tqdm.auto import tqdm
tqdm.pandas()

In [2]:
df_sport = pd.read_csv('sport_df_tagged_0503.csv')

In [2]:
df_other = pd.read_csv('other_df_tagged_0302.csv', keep_default_na=False, na_values=['_'])

In [3]:
ordinal_ptrn = "^(([1-9]\d{0,2}\.)(\d{3}\.)*\d{3}|[1-9]\d*)\.$"
cardinal_ptrn = "^((([1-9]\d{0,2}\.)(\d{3}\.)*\d{3}|[1-9]\d*)|0)((,\d+)?|( [1-9]\d*\/[1-9]\d*| ?(½|⅓|⅔|¼|¾)))$"
fraction_ptrn = "^([1-9]\d*\/([2-9]|[1-9]\d+)|(½|⅓|⅔|¼|¾))$"
time_ptrn = "^([01]?\d|2[0-4])[:\.][0-5]\d$"
time_digit_ptrn = "^0\d$"
sprt_ptrn = "^(?!1\/2)([1-9]\d?\/[1-9]\d?)$"
letters_ptrn = r"^(?!^(RÚV|SPRON|\-|\.)$)[\-\.A-ZÁÐÉÍÓÚÝÞÆÖ]{1,5}$"
roman_letters_ptrn = r"[IVXLCDM]{5,20}"
all_numbers_ptrn = ordinal_ptrn + "|" + cardinal_ptrn + "|" + fraction_ptrn + "|" + time_ptrn + "|" + time_digit_ptrn + "|" + roman_letters_ptrn

In [4]:
def make_type(df, domain):
    start = time.time()
    df.loc[(df['after_measure'].str.match(ordinal_ptrn)), 'type'] = "ordinal"
    df.loc[(df['after_measure'].str.match(cardinal_ptrn)), 'type'] = "cardinal"
    df.loc[(df['after_measure'].str.match(time_ptrn)), 'type'] = "time"
    df.loc[(df['after_measure'].str.match(time_digit_ptrn)), 'type'] = "timedigit"
    df.loc[(df['after_measure'].str.match(fraction_ptrn)), 'type'] = "fraction"
    if domain == 'sport':
        df.loc[(df['after_measure'].str.match(sprt_ptrn)), 'type'] = "sport"
    elif 'other':
        pass
    df.loc[(df['after_measure'].str.match(letters_ptrn)), 'type'] = "letters"
    df.loc[(df['after_measure'].str.match(roman_letters_ptrn)), 'type'] = "letters"
    df.loc[(df['after_measure'].str.match("^0\d\.$")), 'type'] = "digitzero"
    df.loc[((df['after_measure'].str.match("^\d{7}$")) & (df['type'] == "cardinal")), 'type'] = "digit"
    df.loc[((df['after_measure'].str.match("\d")) & (df['type'] == "")), 'type'] = "digit"
    print(f"make_type done in {time.time() - start}")

In [5]:
make_type(df_sport, 'sport')

make_type done in 2.169097900390625


In [5]:
make_type(df_other, 'other')

make_type done in 112.44260120391846


In [6]:
df_sport['tag'] = df_sport['next_tag']
df_sport['word'] = df_sport['after_measure']

In [6]:
df_other['tag'] = df_other['next_tag']
df_other['word'] = df_other['after_measure']

In [171]:
df_sample['tag'] = df_sample['next_tag']
df_sample['word'] = df_sample['after_measure']

In [7]:
df_sport_run = df_sport[['sentence_id', 'token_id', 'word', 'tag', 'type']]

In [7]:
df_other_run = df_other[['sentence_id', 'token_id', 'word', 'tag', 'type']]

In [172]:
df_sample_run = df_sample[['sentence_id', 'token_id', 'word', 'tag', 'type']]

# GRAMMAR

In [8]:
hálfur = "[nl]ke[n]-?((g?s?)|([svo]?[fme]?))"
hálfan = "[nl]ke[o]-?((g?s?)|([svo]?[fme]?))"
hálfum = "[nl][kvh][ef]þ-?((g?s?)|([svo]?[fme]?))"
hálfs = "[nl][kh]ee-?((g?s?)|([svo]?[fme]?))"
hálf = "(([nl]ven)|([n|l]hf[n|o]))-?((g?s?)|([svo]?[fme]?))"
hálfa = "[nl][k|v][f|e]o-?((g?s?)|([svo]?[fme]?))"
hálfri = "[nl]veþ-?((g?s?)|([svo]?[fme]?))"
hálfrar = "[nl]vee-?((g?s?)|([svo]?[fme]?))"
hálft = "[nl]he[n|o]-?((g?s?)|([svo]?[fme]?))"
hálfu = "[nl]heþ-?((g?s?)|([svo]?[fme]?))"
hálfir = "[nl]kfn-?((g?s?)|([svo]?[fme]?))"
hálfa = "[nl][k|v][f|e]o-?((g?s?)|([svo]?[fme]?))"
hálfra = "[nl][k|v|h]fe-?((g?s?)|([svo]?[fme]?))"
hálfar = "[nl]vf[n|o]-?((g?s?)|([svo]?[fme]?))"

einn = "[nl]k[ef][no]-?((g?s?)|([svo]?[fme]?))"
einum = "[nl]k[ef]þ-?((g?s?)|([svo]?[fme]?))"
eins = "[nl][kh][ef]e-?((g?s?)|([svo]?[fme]?))"
ein = "(([nl]v[ef]n)|([n|l]hf[n|o]))-?((g?s?)|([svo]?[fme]?))"
eina = "[nl]v[ef]o-?((g?s?)|([svo]?[fme]?))"
einni = "[nl]v[ef]þ-?((g?s?)|([svo]?[fme]?))"
einnar = "[nl]v[ef]e-?((g?s?)|([svo]?[fme]?))"
eitt = "[nl]h[ef][no]-?((g?s?)|([svo]?[fme]?))"
einu = "[nl]h[ef]þ-?((g?s?)|([svo]?[fme]?))"

tveir = "[nl]k[ef]n-?((g?s?)|([svo]?[fme]?))"
tvo = "[nl]k[ef]o-?((g?s?)|([svo]?[fme]?))"
tveimur = "[nl][kvh][ef]þ-?((g?s?)|([svo]?[fme]?))"
tveggja = "[nl][kvh][ef]e-?((g?s?)|([svo]?[fme]?))"
tvær = "[nl]v[ef][no]-?((g?s?)|([svo]?[fme]?))"
tvö = "[nl]h[ef][no]-?((g?s?)|([svo]?[fme]?))"

þrír = "[nl]k[ef]n-?((g?s?)|([svo]?[fme]?))"
þrjá = "[nl]k[ef]o-?((g?s?)|([svo]?[fme]?))"
þremur = "[nl][kvh][ef]þ-?((g?s?)|([svo]?[fme]?))"
þriggja = "[nl][kvh][ef]e-?((g?s?)|([svo]?[fme]?))"
þrjár = "[nl]v[ef][no]-?((g?s?)|([svo]?[fme]?))"
þrjú = "[nl]h[ef][no]-?((g?s?)|([svo]?[fme]?))"

fjórir = "[nl]k[ef]n-?((g?s?)|([svo]?[fme]?))"
fjóra = "[nl]k[ef]o-?((g?s?)|([svo]?[fme]?))"
fjórum = "[nl][kvh][ef]þ-?((g?s?)|([svo]?[fme]?))"
fjögurra = "[nl][kvh][ef]e-?((g?s?)|([svo]?[fme]?))"
fjórar = "[nl]v[ef][no]-?((g?s?)|([svo]?[fme]?))"
fjögur = "[nl]h[ef][no]-?((g?s?)|([svo]?[fme]?))"

nonoun = "^(?![nl][kvh][ef][noþe]-?((g?s?)|([svo]?[fme]?)))[a-záðéíóúýþæö\d\-]+$"

## REGEX RULES


In [9]:
zeropnt_ptrn = "^(([1-9]((\d{0,2}(\.\d{3})*\.)\d{3}))|\d+|0),"
tns_ptrn = "^((([1-9]((\d{0,2}(\.\d{3})*\.)|\d*))\d)|[1-9])?"
dec_ptrn = "((,\d*)|(\s1\/2|\s?½))?$"
dec_ptrn_def = "(,\d*)|(\s1\/2|\s?½)$"
dec_ptrn_ordinal = "(\.|(,\d*)|(\s1\/2|\s?½))?$"

ones_ptrn_no11 = "^((([1-9]((\d{0,2}(\.\d{3})*\.\d)|\d*))[02-9])|[2-9]?)?"
hndrds_ptrn_no11 = "^([1-9]((\d{0,2}(\.\d{3})*\.)|\d*))?"
hndrds_ptrn_no11_def = "^([1-9]((\d{0,2}(\.\d{3})*\.)|\d*))"

ones_ptrn_11 = "^([1-9]((\d{0,2}(\.\d{3})*\.\d{2}|\d*))|[1-9]?)?"
hndrds_ptrn_11 = "^((([1-9]((\d{0,2}(\.\d{3})*\.)|\d*)))|[1-9]?)"

half_ptrn = "^(([1-9]((\d{0,2}(\.\d{3})*))|\d+))\s?(1\/2|½)$"
fraction_ptrn_before = "^(([1-9]((\d{0,2}(\.\d{3})*))|\d+))\s"
thsnds_ptrn_after = "(\.?(000|(0[2-9][1-9])|([1-9](?!00)\d{2})))"
thsnds_and_ptrn_cardinal = "\.?(0([01][1-9]|[1-9]0)|[1-9]00)"
thsnds_and_ptrn_ordinal = "\.?(0[2-9]0|[1-9]00)\.$"

hndrds_ptrn_after = "(0([01][1-9]|[2-9]0))|([1-9]00)"
hndrd_thsnd_after = "([2-9][1-9])(\.?\d{3})"
hndrd_and_thsnd = "(([01][1-9]\.?\d{3})|([2-9]0\.?\d{3}))"

million_and_cardinal = "\.(([1-9]00\.000)|(0[1-9]0\.000)|(00[1-9]\.000)|(000\.[1-9]00)|(000\.0([01][1-9]|[2-9]0)))"
million_and_ordinal = "\.(([1-9]00\.000)|(0[1-9]0\.000)|(00[1-9]\.000)|(000\.[1-9]00)|(000\.0[2-9]0))\.$"
milln_ptrn_after = "\.((000\.000)|([1-9](?!00\.000)\d{2}\.\d{3})|(0[1-9](?!0\.000)\d\.\d{3})|(00[1-9]\.(?!0{3})\d{3})|(0{3}\.0[2-9][1-9]))"
hndrd_and_million = "([01][1-9]|[2-9]0)(\.\d{3}){2}"
hndrd_million = "([2-9][1-9])(\.\d{3}){2}"

billion_and_cardinal = "\.((([1-9]00)|(0[1-9]0)|(00[1-9]))(\.0{3}){2}|(0{3}\.([1-9]00|0[1-9]0|00[1-9])\.0{3})|((0{3}\.){2}([1-9]00|0[1-9]0|0[01][1-9])))"
billion_and_ordinal = "\.((([1-9]00)|(0[1-9]0)|(00[1-9]))(\.0{3}){2}|(0{3}\.([1-9]00|0[1-9]0|00[1-9])\.0{3})|((0{3}\.){2}([1-9]00|0[1-9]0)|10))\.$"
billion1 = "([1-9](?!00(\.000){2})\d{2}(\.\d{3}){2})"
billion2 = "(0[1-9](?!0(\.000){2})\d(\.\d{3}){2})"
billion3 = "(00[1-9](?!(\.000){2})(\.\d{3}){2})"
billion4 = "(0{3}\.[1-9](?!00\.000)\d{2}\.\d{3})"
billion5 = "(0{3}\.0[1-9](?!0\.000)\d\.\d{3})"
billion6 = "(0{3}\.00[1-9]\.(?!000)\d{3})"
billion7 = "((0{3}\.){2}[1-9](?!00)\d{2})"
billion8 = "((0{3}\.){2}0[2-9][1-9])"

billion_after = "\.((000(\.0{3}){2})|" + billion1 + "|" + billion2 + "|" + billion3 + "|" + billion4 + "|" + billion5 + "|" + billion6 + "|" + billion7 + "|" + billion8 + ")"

hndrd_and_billion = "([01][1-9]|[2-9]0)(\.\d{3}){3}"
hndrd_billion = "([2-9][1-9])(\.\d{3}){3}"

## RULES FOR CARDINAL ONES

In [10]:
def replace_zero_cardinal(df):
    print("replace_zero_cardinal")
    df.loc[df['word'].str.match("^0(,\d+)?$"), 'ones'] = 'núll'
    df.loc[((df['word'].str.match(zeropnt_ptrn + "0\d*$")) & (df['type'] == "cardinal")), 'points'] = ' komma núll'
    df.loc[((df['word'].str.match(zeropnt_ptrn + "\d0\d*$")) & (df['type'] == "cardinal")), 'point2'] = ' núll'
    df.loc[((df['word'].str.match(zeropnt_ptrn + "\d{2}0\d*$")) & (df['type'] == "cardinal")), 'point3'] = ' núll'
    df.loc[((df['word'].str.match(zeropnt_ptrn + "\d{3}0\d*$")) & (df['type'] == "cardinal")), 'point4'] = ' núll'
    df.loc[((df['word'].str.match(zeropnt_ptrn + "\d{4}0\d*$")) & (df['type'] == "cardinal")), 'point5'] = ' núll'
    df.loc[((df['word'].str.match(zeropnt_ptrn + "\d{5}0\d*$")) & (df['type'] == "cardinal")), 'point6'] = ' núll'
    df.loc[((df['word'].str.match(zeropnt_ptrn + "\d{6}0\d*$")) & (df['type'] == "cardinal")), 'point7'] = ' núll'
    df.loc[((df['word'].str.match(zeropnt_ptrn + "\d{7}0\d*$")) & (df['type'] == "cardinal")), 'point8'] = ' núll'
    df.loc[((df['word'].str.match(zeropnt_ptrn + "\d{8}0\d*$")) & (df['type'] == "cardinal")), 'point9'] = ' núll'
    df.loc[((df['word'].str.match(zeropnt_ptrn + "\d{9}0\d*$")) & (df['type'] == "cardinal")), 'point10'] = ' núll'

ones_zip = [(einn, ' einn', "1"),
            (einum, ' einum', "1"), 
            (eins, ' eins', "1"),
            (ein, ' ein', "1"), 
            (eina, ' eina', "1"), 
            (einni, ' einni', "1"), 
            (einnar, ' einnar', "1"),
            (eitt, ' eitt', "1"),
            (einu, ' einu', "1"),
            (tveir, ' tveir', "2"),
            (tvo, ' tvo', "2"),
            (tveimur, ' tveimur', "2"),
            (tveggja, ' tveggja', "2"),
            (tvær, ' tvær', "2"),
            (tvö, ' tvö', "2"),
            (þrír, ' þrír', "3"),
            (þrjá, ' þrjá', "3"),
            (þremur, ' þremur', "3"),
            (þriggja, ' þriggja', "3"),
            (þrjár, ' þrjár', "3"),
            (þrjú, ' þrjú', "3"),
            (fjórir, ' fjórir', "4"),
            (fjóra, ' fjóra', "4"),
            (fjórum, ' fjórum', "4"),
            (fjögurra, ' fjögurra', "4"),
            (fjórar, ' fjórar', "4"),
            (fjögur, ' fjögur', "4")]


def replace_ones_cardinal(df):
    print("replace_ones_cardinal")
    for rule, string, number in tqdm(ones_zip):
        df.loc[((df['word'].str.match(ones_ptrn_no11 + number + dec_ptrn)) & (df['tag'].str.match(rule)) & (df['type'] == "cardinal")), 'ones'] = string
        df.loc[((df['word'].str.match(zeropnt_ptrn + number + "\d*$")) & (df['tag'].str.match(rule)) & (df['type'] == "cardinal")), 'points'] = ' komma' + string
        df.loc[((df['word'].str.match(zeropnt_ptrn + "\d" + number + "\d*$")) & (df['tag'].str.match(rule)) & (df['type'] == "cardinal")), 'point2'] = string
        df.loc[((df['word'].str.match(zeropnt_ptrn + "\d{2}" + number + "\d*$")) & (df['tag'].str.match(rule)) & (df['type'] == "cardinal")), 'point3'] = string
        df.loc[((df['word'].str.match(zeropnt_ptrn + "\d{3}" + number + "\d*$")) & (df['tag'].str.match(rule)) & (df['type'] == "cardinal")), 'point4'] = string
        df.loc[((df['word'].str.match(zeropnt_ptrn + "\d{4}" + number + "\d*$")) & (df['tag'].str.match(rule)) & (df['type'] == "cardinal")), 'point5'] = string
        df.loc[((df['word'].str.match(zeropnt_ptrn + "\d{5}" + number + "\d*$")) & (df['tag'].str.match(rule)) & (df['type'] == "cardinal")), 'point6'] = string
        df.loc[((df['word'].str.match(zeropnt_ptrn + "\d{6}" + number + "\d*$")) & (df['tag'].str.match(rule)) & (df['type'] == "cardinal")), 'point7'] = string
        df.loc[((df['word'].str.match(zeropnt_ptrn + "\d{7}" + number + "\d*$")) & (df['tag'].str.match(rule)) & (df['type'] == "cardinal")), 'point8'] = string
        df.loc[((df['word'].str.match(zeropnt_ptrn + "\d{8}" + number + "\d*$")) & (df['tag'].str.match(rule)) & (df['type'] == "cardinal")), 'point9'] = string
        df.loc[((df['word'].str.match(zeropnt_ptrn + "\d{9}" + number + "\d*$")) & (df['tag'].str.match(rule)) & (df['type'] == "cardinal")), 'point10'] = string


dec_ones_male = [(' einn', '1'), (' tveir', '2'), (' þrír', '3'), (' fjórir', '4')]
def replace_no_rule_cardinal_decimal(df):
    print("replace_no_rule_cardinal_decimal")
    for string, number in tqdm(dec_ones_male):
        df.loc[((df['word'].str.match(ones_ptrn_no11 + number + dec_ptrn_def)) & (df['tag'].str.match(nonoun)) & (df['type'] == 'cardinal')), 'ones'] = string
        df.loc[((df['word'].str.match(zeropnt_ptrn + number + "\d*$")) & (df['tag'].str.match(nonoun)) & (df['type'] == 'cardinal')), 'points'] = ' komma' + string
        df.loc[((df['word'].str.match(zeropnt_ptrn + "\d" + number + "\d*$")) & (df['tag'].str.match(nonoun)) & (df['type'] == 'cardinal')), 'point2'] = string
        df.loc[((df['word'].str.match(zeropnt_ptrn + "\d{2}" + number + "\d*$")) & (df['tag'].str.match(nonoun)) & (df['type'] == 'cardinal')), 'point3'] = string
        df.loc[((df['word'].str.match(zeropnt_ptrn + "\d{3}" + number + "\d*$")) & (df['tag'].str.match(nonoun)) & (df['type'] == 'cardinal')), 'point4'] = string
        df.loc[((df['word'].str.match(zeropnt_ptrn + "\d{4}" + number + "\d*$")) & (df['tag'].str.match(nonoun)) & (df['type'] == 'cardinal')), 'point5'] = string
        df.loc[((df['word'].str.match(zeropnt_ptrn + "\d{5}" + number + "\d*$")) & (df['tag'].str.match(nonoun)) & (df['type'] == 'cardinal')), 'point6'] = string
        df.loc[((df['word'].str.match(zeropnt_ptrn + "\d{6}" + number + "\d*$")) & (df['tag'].str.match(nonoun)) & (df['type'] == 'cardinal')), 'point7'] = string
        df.loc[((df['word'].str.match(zeropnt_ptrn + "\d{7}" + number + "\d*$")) & (df['tag'].str.match(nonoun)) & (df['type'] == 'cardinal')), 'point8'] = string
        df.loc[((df['word'].str.match(zeropnt_ptrn + "\d{8}" + number + "\d*$")) & (df['tag'].str.match(nonoun)) & (df['type'] == 'cardinal')), 'point9'] = string
        df.loc[((df['word'].str.match(zeropnt_ptrn + "\d{9}" + number + "\d*$")) & (df['tag'].str.match(nonoun)) & (df['type'] == 'cardinal')), 'point10'] = string
  

dec_ones_neutral = [(' eitt', '1'), (' tvö', '2'), (' þrjú', '3'), (' fjögur', '4')]
def replace_no_rule_cardinal(df):
    print("replace_no_rule_cardinal")
    for string, number in tqdm(dec_ones_neutral):
        df.loc[((df['word'].str.match(ones_ptrn_no11 + number + "$")) & (df['tag'].str.match(nonoun)) & (df['type'] == 'cardinal')), 'ones'] = string
    
other_numbers_zip = [(' fimm', '5'),
                     (' sex', '6'),
                     (' sjö', '7'),
                     (' átta', '8'),
                     (' níu', '9')]

def replace_other_ones_cardinal(df):
    print("replace_other_ones_cardinal")
    for string, number in tqdm(other_numbers_zip):
        df.loc[(df['word'].str.match(ones_ptrn_no11 + number + dec_ptrn) & (df['type'] == 'cardinal')), 'ones'] = string
        df.loc[(df['word'].str.match(zeropnt_ptrn + number + "\d*$") & (df['type'] == 'cardinal')), 'points'] = ' komma ' + string
        df.loc[(df['word'].str.match(zeropnt_ptrn + "\d" + number + "\d*$") & (df['type'] == 'cardinal')), 'point2'] = string
        df.loc[(df['word'].str.match(zeropnt_ptrn + "\d{2}" + number + "\d*$") & (df['type'] == 'cardinal')), 'point3'] = string
        df.loc[(df['word'].str.match(zeropnt_ptrn + "\d{3}" + number + "\d*$") & (df['type'] == 'cardinal')), 'point4'] = string
        df.loc[(df['word'].str.match(zeropnt_ptrn + "\d{4}" + number + "\d*$") & (df['type'] == 'cardinal')), 'point5'] = string
        df.loc[(df['word'].str.match(zeropnt_ptrn + "\d{5}" + number + "\d*$") & (df['type'] == 'cardinal')), 'point6'] = string
        df.loc[(df['word'].str.match(zeropnt_ptrn + "\d{6}" + number + "\d*$") & (df['type'] == 'cardinal')), 'point7'] = string
        df.loc[(df['word'].str.match(zeropnt_ptrn + "\d{7}" + number + "\d*$") & (df['type'] == 'cardinal')), 'point8'] = string
        df.loc[(df['word'].str.match(zeropnt_ptrn + "\d{8}" + number + "\d*$") & (df['type'] == 'cardinal')), 'point9'] = string
        df.loc[(df['word'].str.match(zeropnt_ptrn + "\d{9}" + number + "\d*$") & (df['type'] == 'cardinal')), 'point10'] = string
    
tens_zip = [(' tíu', '10'), 
            (' ellefu', '11'),
            (' tólf', '12'),
            (' þrettán', '13'), 
            (' fjórtán', '14'),
            (' fimmtán', '15'),
            (' sextán', '16'),
            (' sautján', '17'),
            (' átján', '18'),
            (' nítján', '19')]

def replace_tens_cardinal(df):
    print("replace_tens_cardinal")
    for string, number in tqdm(tens_zip):
        df.loc[((df['word'].str.match(tns_ptrn + number + dec_ptrn))  & (df['type'] == 'cardinal')), 'dozens'] = string
        

In [11]:
ones_numerator = [(' einn', '1'), (' tveir', '2'), (' þrír', '3'), (' fjórir', '4'), (' fimm', '5'), 
                  (' sex', '6'), (' sjö', '7'), (' átta', '8'), (' níu', '9')]

tens_numerator = [(' tíu', '10'), (' ellefu', '11'), (' tólf', '12'), (' þrettán', '13'), (' fjórtán', '14'), 
                   (' fimmtán', '15'), (' sextán', '16'), (' sautján', '17'), (' átján', '18'), (' nítján', '19')]

dozens_numerator = [(' tuttugu', '2'), (' þrjátíu', '3'), (' fjörutíu', '4'), (' fimmtíu', '5'), (' sextíu', '6'),
                    (' sjötíu', '7'), (' áttatíu', '8'), (' níutíu', '9')]

ones_denominator = [(' aðrir', '2'), (' þriðju', '3'), (' fjórðu', '4'), (' fimmtu', '5'), 
                    (' sjöttu', '6'), (' sjöundu', '7'), (' áttundu', '8'), (' níundu', '9')]

tens_denominator = [(' tíundu', '10'), (' elleftu', '11'), (' tólftu', '12'), (' þrettándu', '13'), 
                    (' fjórtándu', '14'), (' fimmtándu', '15'), (' sextándu', '16'), (' sautjándu', '17'),
                    (' átjándu', '18'), (' nítjándu', '19')]

dozens_denominator = [(' tuttugustu', '2'), (' þrítugustu', '3'), (' fertugustu', '4'), (' fimmtugustu', '5'), 
                    (' sextugustu', '6'), (' sjötugustu', '7'), (' áttugustu', '8'), (' nítugustu', '9')]

def replace_numerator_fractions(df):
    print("replace_numerator_fractions")
    for word_num, no_num in tqdm(ones_numerator):
        for word_doz_num, no_doz_num in dozens_numerator:
            df.loc[((df['word'].str.match(fraction_ptrn_before + no_num + "\/\d+$")) & (df['type'] == "cardinal")), 'points'] = ' og' + word_num
            df.loc[((df['word'].str.match("^" + no_num + "\/\d+$")) & (df['type'] == "fraction")), 'points'] = word_num
            df.loc[((df['word'].str.match(fraction_ptrn_before + no_doz_num + no_num + "\/\d+$")) & (df['type'] == "cardinal")), 'points'] = ' og' + word_doz_num + ' og' + word_num
            df.loc[((df['word'].str.match("^" + no_doz_num + no_num + "\/\d+$")) & (df['type'] == "fraction")), 'points'] = word_doz_num + ' og' + word_num         
    
def replace_tens_fractions(df):
    print("replace_tens_fractions")
    for word_num, no_num in tqdm(tens_numerator):
        for word_den, no_den in tens_denominator:
            df.loc[((df['word'].str.match(fraction_ptrn_before + no_num + "\/\d+$")) & (df['type'] == "cardinal")), 'points'] = ' og' + word_num
            df.loc[((df['word'].str.match("^" + no_num + "\/\d+$")) & (df['type'] == "fraction")), 'points'] = word_num
            df.loc[((df['word'].str.match(fraction_ptrn_before + "\d+\/" + no_den + "$")) & (df['type'] == "cardinal")), 'point2'] = ' og' + word_den
            df.loc[((df['word'].str.match("^\d+\/" + no_den + "$")) & (df['type'] == "fraction")), 'point2'] = word_den

def replace_denominator_fractions(df):
    print("replace_denominator_fractions")
    for word_den, no_den in tqdm(ones_denominator):
        for word_doz_den, no_doz_den in dozens_denominator:
            df.loc[((df['word'].str.match(fraction_ptrn_before + "\d+\/" + no_den + "$")) & (df['type'] == "cardinal")), 'point2'] = ' og' + word_den
            df.loc[((df['word'].str.match("^\d+\/" + no_den + "$")) & (df['type'] == "fraction")), 'point2'] = word_den
            df.loc[((df['word'].str.match(fraction_ptrn_before + "\d+\/" + no_doz_den + no_den + "$")) & (df['type'] == "cardinal")), 'point2'] = word_doz_den + ' og' + word_den
            df.loc[((df['word'].str.match("^\d+\/" + no_doz_den + no_den + "$")) & (df['type'] == "fraction")), 'point2'] = word_doz_den + ' og' + word_den         
         
half_zip = [(hálfur, ' hálfur'),
            (hálfan, ' hálfan'),
            (hálfum, ' hálfum'),
            (hálfs, ' hálfs'),
            (hálf, ' hálf'),
            (hálfa, ' hálfa'),
            (hálfri, ' hálfri'),
            (hálfrar, ' hálfrar'),
            (hálft, ' hálft'),
            (hálfu, ' hálfu'),
            (hálfir, ' hálfir'),
            (hálfra, ' hálfra')]

def replace_half_fraction(df):
    print("replace_half_fraction")
    for rule, string in tqdm(half_zip):
        df.loc[((df['word'].str.match(half_ptrn)) & (df['tag'].str.match(rule)) & (df['type'] == "cardinal") & (df['type'] == "cardinal")), 'points'] = ' og' + string
        df.loc[((df['word'].str.match(r"^(1\/2|½)$")) & (df['tag'].str.match(rule)) & (df['type'] == "fraction")), 'points'] = string
        df.loc[((df['word'].str.match(half_ptrn)) & (df['tag'].str.match("^" + nonoun + "$")) & (df['type'] == "cardinal")), 'points'] = ' og hálfur'
        df.loc[((df['word'].str.match(r"^(1\/2|½)$")) & (df['tag'].str.match("^" + nonoun + "$")) & (df['type'] == "fraction")), 'points'] = 'hálfur'
        df.loc[((df['word'].str.match(half_ptrn)) & (df['tag'].str.match(rule)) & (df['type'] == "cardinal") & (df['type'] == "cardinal")), 'point2'] = ''
        df.loc[((df['word'].str.match(r"^(1\/2|½)$")) & (df['tag'].str.match(rule)) & (df['type'] == "fraction")), 'point2'] = ''
        df.loc[((df['word'].str.match(half_ptrn)) & (df['tag'].str.match("^" + nonoun + "$")) & (df['type'] == "cardinal")), 'point2'] = ''
        df.loc[((df['word'].str.match(r"^(1\/2|½)$")) & (df['tag'].str.match("^" + nonoun + "$")) & (df['type'] == "fraction")), 'point2'] = ''


In [12]:
annar = r"[nl]ken-?(g?s?|[svo]?[fme]?)"
annan = r"[nl]keo-?(g?s?|[svo]?[fme]?)"
öðrum = r"[nl]((ke)|([kvh]f))þ-?(g?s?|[svo]?[fme]?)"
annars = r"[nl]kee-?(g?s?|[svo]?[fme]?)"
aðrir = r"[nl]kfn-?(g?s?|[svo]?[fme]?)"
aðra = r"[nl](kf|ve)o-?((g?s?)|([svo]?[fme]?))"
annarra = r"[nl][kvh]fe-?(g?s?|[svo]?[fme]?)"
önnur = r"[nl](ven|hf[no])-?(g?s?|[svo]?[fme]?)"
annarri = r"[nl]veþ-?(g?s?|[svo]?[fme]?)"
annarrar = r"[nl]vee-?(g?s?|[svo]?[fme]?)"
aðrar = r"[nl]vf[no]-?(g?s?|[svo]?[fme]?)"
annað = r"[nl]he[no]-?(g?s?|[svo]?[fme]?)"
öðru = r"[nl]heþ-?(g?s?|[svo]?[fme]?)"
annars = r"[nl][kh]ee-?(g?s?|[svo]?[fme]?)"

two_ordinal_zip = [(annar, ' annar'),
                    (annan, ' annan'),
                    (öðrum, ' öðrum'),
                    (annars, ' annars'),
                    (aðrir, ' aðrir'),
                    (aðra, ' aðra'),
                    (annarra, ' annarra'),
                    (önnur, ' önnur'),
                    (annarri, ' annarri'),
                    (annarrar, ' annarrar'),
                    (aðrar, ' aðrar'),
                    (annað, ' annað'),
                    (öðru, ' öðru'),
                    (annars, ' annars')]

def replace_two_ordinal(df):
    print("replace_two_ordinal")
    for rule, string in tqdm(two_ordinal_zip):
        df.loc[((df['word'].str.match(ones_ptrn_no11 + "2\.$")) & (df['tag'].str.match(rule)) & (df['type'] == 'ordinal')), 'ones'] = string
    df.loc[((df['word'].str.match(ones_ptrn_no11 + "2\.$")) & (df['tag'].str.match(nonoun)) & (df['type'] == 'ordinal')), 'ones'] = ' annan'

In [13]:
fyrsti = r"[nl]ken-?(g?s?|[svo]?[fme]?)"
fyrsta = r"[nl](ke[oþe]|ven|he[noþe])-?(g?s?|[svo]?[fme]?)"
fyrstu = r"[nl](([kvh]f[noþe])|(ve[oþe]))-?(g?s?|[svo]?[fme]?)"

ordinal_ones_zip = [(' fyrst', '1'),
               (' þriðj', '3'), 
               (' fjórð', '4'), 
               (' fimmt', '5'), 
               (' sjött', '6'),
               (' sjöund', '7'),
               (' áttund', '8'),
               (' níund', '9')]


ordinal_tens_zip = [(' tíund', '10'), 
               (' elleft', '11'),
               (' tólft', '12'),
               (' þrettánd', '13'), 
               (' fjórtánd', '14'),
               (' fimmtánd', '15'),
               (' sextánd', '16'),
               (' sautjánd', '17'),
               (' átjánd', '18'),
               (' nítjánd', '19')]

ordinal_letters = [(fyrsti, 'i'), 
           (fyrsta, 'a'), 
           (fyrstu, 'u')]

def replace_zero_ordinal(df):
    print("replace_zero_ordinal")
    for rule, letter in tqdm(ordinal_letters):
        df.loc[((df['word'].str.match("^0\.$")) & (df['tag'].str.match(rule)) & (df['type'] == 'ordinal')), 'ones'] = 'núllt' + letter
    df.loc[((df['word'].str.match("^0\.$")) & (df['tag'].str.match(nonoun)) & (df['type'] == 'ordinal')), 'ones'] = 'núllta'
    
def replace_ones_ordinal(df):
    print("replace_ones_ordinal")
    for string, number in tqdm(ordinal_ones_zip):
        for rule, letter in ordinal_letters:
            df.loc[((df['word'].str.match(ones_ptrn_no11 + number + "\.$")) & (df['tag'].str.match(rule)) & (df['type'] == 'ordinal')), 'ones'] = string + letter
        df.loc[((df['word'].str.match(ones_ptrn_no11 + number + "\.$")) & (df['tag'].str.match(nonoun)) & (df['type'] == 'ordinal')), 'ones'] = string + 'a'
    
def replace_tens_ordinal(df):
    print("replace_tens_ordinal")
    for string, number in tqdm(ordinal_tens_zip):
        for rule, letter in ordinal_letters:
            df.loc[((df['word'].str.match(tns_ptrn + number + "\.$")) & (df['tag'].str.match(rule)) & (df['type'] == 'ordinal')), 'dozens'] = string + letter
        df.loc[((df['word'].str.match(tns_ptrn + number + "\.$")) & (df['tag'].str.match(nonoun)) & (df['type'] == 'ordinal')), 'dozens'] = string + 'a'

In [14]:
dozens_zip = [(' tuttugu', '2'),
               (' þrjátíu', '3'),
               (' fjörutíu', '4'), 
               (' fimmtíu', '5'),
               (' sextíu', '6'), 
               (' sjötíu', '7'),
               (' áttatíu', '8'),
               (' níutíu', '9')]

def replace_dozens_cardinal(df):
    print("replace_dozens_cardinal")
    for string, number in tqdm(dozens_zip):
        # tuttugu
        df.loc[(df['word'].str.match(tns_ptrn + number + "0" + dec_ptrn) & (df['type'] == 'cardinal')), 'dozens'] = string
        # tuttugu og fjórir
        df.loc[(df['word'].str.match(tns_ptrn + number + "[1-9]" + dec_ptrn) & (df['type'] == 'cardinal')), 'dozens'] = string + ' og'
    
dozens_ordinal_zip = [(' tuttug', '2'),
                      (' þrítug', '3'),
                      (' fertug', '4'),
                      (' fimmtug', '5'),
                      (' sextug', '6'),
                      (' sjötug', '7'),
                      (' áttug', '8'),
                      (' nítug', '9')]

dozens_ordinal_letters = [(fyrsti, 'asti'),
                          (fyrsta, 'asta'),
                          (fyrstu, 'ustu')]

def replace_dozens_ordinal(df):
    print("replace_dozens_ordinal")
    for string, number in tqdm(dozens_ordinal_zip):
        for rule, letter in dozens_ordinal_letters:
            df.loc[((df['word'].str.match(tns_ptrn + number + "0\.$")) & (df['tag'].str.match(rule)) & (df['type'] == 'ordinal')), 'dozens'] = string + letter
            df.loc[((df['word'].str.match(tns_ptrn + number + "[1-9]\.$")) & (df['tag'].str.match(rule)) & (df['type'] == 'ordinal')), 'dozens'] = string + letter + ' og'
        df.loc[((df['word'].str.match(tns_ptrn + number + "0\.$")) & (df['tag'].str.match(nonoun)) & (df['type'] == 'ordinal')), 'dozens'] = string + 'asta'
        df.loc[((df['word'].str.match(tns_ptrn + number + "[1-9]\.$")) & (df['tag'].str.match(nonoun)) & (df['type'] == 'ordinal')), 'dozens'] = string + 'asta og'  
    

In [15]:
# CARDINAL

def replace_onehundred_onethousand_cardinal(df):
    print("replace_onehundred_onethousand_cardinal")
    df.loc[((df['word'].str.match(hndrds_ptrn_no11_def + "1([01][1-9]|[1-9]0)" + dec_ptrn)) & (df['type'].str.match("^(cardinal|ordinal)$"))), 'hundreds'] = ' eitt hundrað og'
    df.loc[((df['word'].str.match(hndrds_ptrn_no11_def + "1[2-9]0\.$")) & (df['type'].str.match("^(cardinal|ordinal)$"))), 'hundreds'] = ' eitt hundrað og'
    df.loc[((df['word'].str.match("^1([01][1-9]|[1-9]0)" + dec_ptrn)) & (df['type'].str.match("^(cardinal|ordinal)$"))), 'hundreds'] = 'hundrað og'
    df.loc[((df['word'].str.match("^1([2-9]0)\.$")) & (df['type'].str.match("^(cardinal|ordinal)$"))), 'hundreds'] = 'hundrað og'
    df.loc[((df['word'].str.match(hndrds_ptrn_no11_def + "1([2-9][1-9]|00)" + dec_ptrn_ordinal)) & (df['type'].str.match("^(cardinal|ordinal)$"))), 'hundreds'] = ' eitt hundrað'
    df.loc[((df['word'].str.match("^1([2-9][1-9]|00)" + dec_ptrn_ordinal)) & (df['type'].str.match("^(cardinal|ordinal)$"))), 'hundreds'] = 'hundrað'
    
    df.loc[((df['word'].str.match(ones_ptrn_no11 + "1" + thsnds_and_ptrn_cardinal + dec_ptrn)) & (df['type'].str.match("^(cardinal|ordinal)$"))), 'thousands'] = ' eitt þúsund og'
    df.loc[((df['word'].str.match(ones_ptrn_no11 + "1" + thsnds_and_ptrn_ordinal)) & (df['type'].str.match("^(cardinal|ordinal)$"))), 'thousands'] = ' eitt þúsund og'
    df.loc[((df['word'].str.match(ones_ptrn_no11 + "1\.?((000$)|(([1-9](?!00)\d{2})|(0[2-9][1-9]))" + dec_ptrn_ordinal + ")")) & (df['type'].str.match("^(cardinal|ordinal)$"))), 'thousands'] = ' eitt þúsund'
    
    df.loc[((df['word'].str.match("^1\.?000$")) & (df['type'].str.match("^(cardinal|ordinal)$"))), 'thousands'] = ' þúsund'
    
def replace_ten_hundredthousand_cardinal(df):
    print("replace_ten_hundredthousand_cardinal")
    df.loc[((df['word'].str.match(tns_ptrn + "10" + thsnds_and_ptrn_cardinal + dec_ptrn_ordinal)) & (df['type'].str.match("^(cardinal|ordinal)$"))), 'ten thousands'] = ' tíu þúsund og'
    df.loc[((df['word'].str.match(tns_ptrn + "10" + thsnds_and_ptrn_ordinal)) & (df['type'].str.match("^(cardinal|ordinal)$"))), 'ten thousands'] = ' tíu þúsund og'
    df.loc[((df['word'].str.match(tns_ptrn + "10" + thsnds_ptrn_after + dec_ptrn_ordinal)) & (df['type'].str.match("^(cardinal|ordinal)$"))), 'ten thousands'] = ' tíu þúsund'
    
    df.loc[((df['word'].str.match(hndrds_ptrn_11 + "100" + thsnds_and_ptrn_cardinal + dec_ptrn)) & (df['type'].str.match("^(cardinal|ordinal)$"))), 'hundred thousands'] = ' eitt hundrað þúsund og'
    df.loc[((df['word'].str.match("^100" + thsnds_and_ptrn_cardinal + dec_ptrn)) & (df['type'].str.match("^(cardinal|ordinal)$"))), 'hundred thousands'] = 'hundrað þúsund og'
    df.loc[((df['word'].str.match(hndrds_ptrn_11 + "100" + thsnds_and_ptrn_ordinal)) & (df['type'].str.match("^(cardinal|ordinal)$"))), 'hundred thousands'] = ' eitt hundrað þúsund og'
    df.loc[((df['word'].str.match("^100" + thsnds_and_ptrn_ordinal)) & (df['type'].str.match("^(cardinal|ordinal)$"))), 'hundred thousands'] = 'hundrað þúsund og'
    
    df.loc[((df['word'].str.match(hndrds_ptrn_no11_def + "100" + thsnds_ptrn_after + dec_ptrn_ordinal)) & (df['type'].str.match("^(cardinal|ordinal)$"))), 'hundred thousands'] = ' eitt hundrað þúsund'
    df.loc[((df['word'].str.match("^100" + thsnds_ptrn_after + dec_ptrn_ordinal)) & (df['type'].str.match("^(cardinal|ordinal)$"))), 'hundred thousands'] = 'hundrað þúsund'
    
    df.loc[((df['word'].str.match(hndrds_ptrn_11 + "1" + hndrd_and_thsnd + dec_ptrn_ordinal)) & (df['type'].str.match("^(cardinal|ordinal)$"))), 'hundred thousands'] = ' eitt hundrað og'
    df.loc[((df['word'].str.match("^1" + hndrd_and_thsnd + dec_ptrn_ordinal)) & (df['type'].str.match("^(cardinal|ordinal)$"))), 'hundred thousands'] = ' hundrað og'
    
    df.loc[((df['word'].str.match(hndrds_ptrn_11 + "1" + hndrd_thsnd_after + dec_ptrn_ordinal)) & (df['type'].str.match("^(cardinal|ordinal)$"))), 'hundred thousands'] = ' eitt hundrað'
    df.loc[((df['word'].str.match("^1" + hndrd_thsnd_after + dec_ptrn_ordinal)) & (df['type'].str.match("^(cardinal|ordinal)$"))), 'hundred thousands'] = 'hundrað'

    
def replace_one_million_cardinal(df):
    print("replace_one_million_cardinal")
    df.loc[((df['word'].str.match(ones_ptrn_no11 + "1" + million_and_cardinal + dec_ptrn)) & (df['type'].str.match("^(cardinal|ordinal)$"))), 'millions'] = ' ein milljón og'
    df.loc[((df['word'].str.match(ones_ptrn_no11 + "1" + million_and_ordinal + dec_ptrn)) & (df['type'].str.match("^(cardinal|ordinal)$"))), 'millions'] = ' ein milljón og'
    df.loc[((df['word'].str.match(ones_ptrn_no11 + "1" + milln_ptrn_after + dec_ptrn_ordinal)) & (df['type'].str.match("^(cardinal|ordinal)$"))), 'millions'] = ' ein milljón'
    
    df.loc[((df['word'].str.match(tns_ptrn + "10" + million_and_cardinal + dec_ptrn)) & (df['type'].str.match("^(cardinal|ordinal)$"))), 'ten millions'] = 'tíu milljónir og'
    df.loc[((df['word'].str.match(tns_ptrn + "10" + million_and_ordinal)) & (df['type'].str.match("^(cardinal|ordinal)$"))), 'ten millions'] = 'tíu milljónir og' 
    df.loc[((df['word'].str.match(tns_ptrn + "10" + milln_ptrn_after + dec_ptrn_ordinal)) & (df['type'].str.match("^(cardinal|ordinal)$"))), 'ten millions'] = 'tíu milljónir'
    
    df.loc[((df['word'].str.match(hndrds_ptrn_11 + "100" + million_and_cardinal + dec_ptrn)) & (df['type'].str.match("^(cardinal|ordinal)$"))), 'hundred millions'] = ' eitt hundrað milljónir og'
    df.loc[((df['word'].str.match(hndrds_ptrn_11 + "100" + million_and_ordinal)) & (df['type'].str.match("^(cardinal|ordinal)$"))), 'hundred millions'] = ' eitt hundrað milljónir og'
    df.loc[((df['word'].str.match("^100" + million_and_cardinal + dec_ptrn)) & (df['type'].str.match("^(cardinal|ordinal)$"))), 'hundred millions'] = 'hundrað milljónir og'
    df.loc[((df['word'].str.match("^100" + million_and_ordinal)) & (df['type'].str.match("^(cardinal|ordinal)$"))), 'hundred millions'] = 'hundrað milljónir og'
    df.loc[((df['word'].str.match(hndrds_ptrn_11 + "100" + milln_ptrn_after + dec_ptrn_ordinal)) & (df['type'].str.match("^(cardinal|ordinal)$"))), 'hundred millions'] = ' eitt hundrað milljónir'
    df.loc[((df['word'].str.match("^100" + milln_ptrn_after + dec_ptrn_ordinal)) & (df['type'].str.match("^(cardinal|ordinal)$"))), 'hundred millions'] = 'hundrað milljónir'
    df.loc[((df['word'].str.match(hndrds_ptrn_11 + "1" + hndrd_and_million + dec_ptrn_ordinal)) & (df['type'].str.match("^(cardinal|ordinal)$"))), 'hundred millions'] = ' eitt hundrað og'
    df.loc[((df['word'].str.match("^1" + hndrd_and_million + dec_ptrn_ordinal)) & (df['type'].str.match("^(cardinal|ordinal)$"))), 'hundred millions'] = 'hundrað og'
    df.loc[((df['word'].str.match(hndrds_ptrn_11 + "1" + hndrd_million + dec_ptrn_ordinal)) & (df['type'].str.match("^(cardinal|ordinal)$"))), 'hundred millions'] = ' eitt hundrað'
    df.loc[((df['word'].str.match("^1" + hndrd_million + dec_ptrn_ordinal)) & (df['type'].str.match("^(cardinal|ordinal)$"))), 'hundred millions'] = 'hundrað'

    
def replace_one_billion_cardinal(df):
    print("replace_one_billion_cardinal")
    df.loc[((df['word'].str.match(ones_ptrn_no11 + "1" + billion_and_cardinal + dec_ptrn)) & (df['type'].str.match("^(cardinal|ordinal)$"))), 'billions'] = ' einn milljarður og'
    df.loc[((df['word'].str.match(ones_ptrn_no11 + "1" + billion_and_ordinal)) & (df['type'].str.match("^(cardinal|ordinal)$"))), 'billions'] = ' einn milljarður og'
    df.loc[((df['word'].str.match(ones_ptrn_no11 + "1" + billion_after + dec_ptrn_ordinal)) & (df['type'].str.match("^(cardinal|ordinal)$"))), 'billions'] = ' einn milljarður'
    
    df.loc[((df['word'].str.match(tns_ptrn + "10" + billion_and_cardinal + dec_ptrn)) & (df['type'].str.match("^(cardinal|ordinal)$"))), 'ten billions'] = 'tíu milljarðar og'
    df.loc[((df['word'].str.match(tns_ptrn + "10" + billion_and_ordinal)) & (df['type'].str.match("^(cardinal|ordinal)$"))), 'ten billions'] = 'tíu milljarðar og' 
    df.loc[((df['word'].str.match(tns_ptrn + "10" + billion_after + dec_ptrn_ordinal)) & (df['type'].str.match("^(cardinal|ordinal)$"))), 'ten billions'] = 'tíu milljarðar'
    
    df.loc[((df['word'].str.match(hndrds_ptrn_11 + "100" + billion_and_cardinal + dec_ptrn)) & (df['type'].str.match("^(cardinal|ordinal)$"))), 'hundred billions'] = ' eitt hundrað milljarðar og'
    df.loc[((df['word'].str.match(hndrds_ptrn_11 + "100" + billion_and_ordinal)) & (df['type'].str.match("^(cardinal|ordinal)$"))), 'hundred billions'] = ' eitt hundrað milljarðar og'
    df.loc[((df['word'].str.match("^100" + billion_and_cardinal + dec_ptrn)) & (df['type'].str.match("^(cardinal|ordinal)$"))), 'hundred billions'] = 'hundrað milljarðar og'
    df.loc[((df['word'].str.match("^100" + billion_and_ordinal)) & (df['type'].str.match("^(cardinal|ordinal)$"))), 'hundred billions'] = 'hundrað milljarðar og'
    df.loc[((df['word'].str.match(hndrds_ptrn_11 + "100" + billion_after + dec_ptrn_ordinal)) & (df['type'].str.match("^(cardinal|ordinal)$"))), 'hundred billions'] = ' eitt hundrað milljarðar'
    df.loc[((df['word'].str.match("^100" + billion_after + dec_ptrn_ordinal)) & (df['type'].str.match("^(cardinal|ordinal)$"))), 'hundred billions'] = 'hundrað milljarðar'
    
    df.loc[((df['word'].str.match(hndrds_ptrn_11 + "1" + hndrd_and_billion + dec_ptrn_ordinal)) & (df['type'].str.match("^(cardinal|ordinal)$"))), 'hundred millions'] = ' eitt hundrað og'
    df.loc[((df['word'].str.match("^1" + hndrd_and_billion + dec_ptrn_ordinal)) & (df['type'].str.match("^(cardinal|ordinal)$"))), 'hundred billions'] = 'hundrað og'
    df.loc[((df['word'].str.match(hndrds_ptrn_11 + "1" + hndrd_billion + dec_ptrn_ordinal)) & (df['type'].str.match("^(cardinal|ordinal)$"))), 'hundred billions'] = ' eitt hundrað'
    df.loc[((df['word'].str.match("^1" + hndrd_billion + dec_ptrn_ordinal)) & (df['type'].str.match("^(cardinal|ordinal)$"))), 'hundred billions'] = 'hundrað'

    

In [16]:
# ORDINAL

def replace_onehundred_onethousand_ordinal(df):
    print("replace_onehundred_onethousand_ordinal")
    for rule, letter in tqdm(dozens_ordinal_letters):
        df.loc[((df['word'].str.match(hndrds_ptrn_no11_def + "1([01][1-9]|10)\.$")) & (df['tag'].str.match(rule)) & (df['type'].str.match("^(cardinal|ordinal)$"))), 'hundreds'] = ' eitt hundrað' + letter + ' og'
        df.loc[((df['word'].str.match("^1([01][1-9]|10)\.$")) & (df['tag'].str.match(rule)) & (df['type'].str.match("^(cardinal|ordinal)$"))), 'hundreds'] = 'hundrað' + letter + ' og'
        df.loc[((df['word'].str.match(hndrds_ptrn_no11_def + "100\.$")) & (df['tag'].str.match(rule)) & (df['type'].str.match("^(cardinal|ordinal)$"))), 'hundreds'] = ' eitt hundrað' + letter
        df.loc[((df['word'].str.match("^100\.$")) & (df['tag'].str.match(rule)) & (df['type'].str.match("^(cardinal|ordinal)$"))), 'hundreds'] = 'hundrað' + letter
        
        df.loc[((df['word'].str.match(ones_ptrn_no11 + "1\.?0([01][1-9]|10)\.$")) & (df['tag'].str.match(rule)) & (df['type'].str.match("^(cardinal|ordinal)$"))), 'thousands'] = ' eitt þúsund' + letter + ' og'
        df.loc[((df['word'].str.match(ones_ptrn_no11 + "1\.?000\.$")) & (df['tag'].str.match(rule)) & (df['type'].str.match("^(cardinal|ordinal)$"))), 'thousands'] = ' eitt þúsund' + letter
        
    df.loc[((df['word'].str.match(hndrds_ptrn_no11_def + "1([01][1-9]|10)\.$")) & (df['tag'].str.match(nonoun)) & (df['type'] == 'ordinal')), 'hundreds'] = ' eitt hundraðasta og'
    df.loc[((df['word'].str.match("^1([01][1-9]|10)\.$")) & (df['tag'].str.match(nonoun)) & (df['type'] == 'ordinal')), 'hundreds'] = 'hundraðasta og'
    df.loc[((df['word'].str.match(hndrds_ptrn_no11_def + "100\.$")) & (df['tag'].str.match(nonoun)) & (df['type'] == 'ordinal')), 'hundreds'] = ' eitt hundraðasta'
    df.loc[((df['word'].str.match("^100\.$")) & (df['tag'].str.match(nonoun)) & (df['type'] == 'ordinal')), 'hundreds'] = 'hundraðasta'
    
    df.loc[((df['word'].str.match(ones_ptrn_no11 + "1\.?0([01][1-9]|10)\.$")) & (df['tag'].str.match(nonoun)) & (df['type'] == 'ordinal')), 'thousands'] = ' eitt þúsundasta og'
    df.loc[((df['word'].str.match(ones_ptrn_no11 + "1\.?000\.$")) & (df['tag'].str.match(nonoun)) & (df['type'] == 'ordinal')), 'thousands'] = ' eitt þúsundasta'

    
def replace_ten_hundredthousand_ordinal(df):
    print("replace_ten_hundredthousand_ordinal")
    for rule, letter in tqdm(dozens_ordinal_letters):
        df.loc[((df['word'].str.match(tns_ptrn + "10\.?0([01][1-9]|10)\.$")) & (df['tag'].str.match(rule)) & (df['type'].str.match("^(cardinal|ordinal)$"))), 'ten thousands'] = ' tíu þúsund' + letter + ' og'
        df.loc[((df['word'].str.match(tns_ptrn + "10\.?000\.$")) & (df['tag'].str.match(rule)) & (df['type'].str.match("^(cardinal|ordinal)$"))), 'ten thousands'] = ' tíu þúsund' + letter   
        df.loc[((df['word'].str.match(hndrds_ptrn_11 + "100\.?0([01][1-9]|10)\.$")) & (df['tag'].str.match(rule)) & (df['type'].str.match("^(cardinal|ordinal)$"))), 'hundred thousands'] = ' eitt hundrað þúsund' + letter + ' og'
        df.loc[((df['word'].str.match("^100\.?0([01][1-9]|10)\.$")) & (df['tag'].str.match(rule)) & (df['type'].str.match("^(cardinal|ordinal)$"))), 'hundred thousands'] = 'hundrað þúsund' + letter + ' og'         
        df.loc[((df['word'].str.match(hndrds_ptrn_11 + "100\.?000\.$")) & (df['tag'].str.match(rule)) & (df['type'].str.match("^(cardinal|ordinal)$"))), 'hundred thousands'] = ' eitt hundrað þúsund' + letter
        df.loc[((df['word'].str.match("^100\.?000\.$")) & (df['tag'].str.match(rule)) & (df['type'].str.match("^(cardinal|ordinal)$"))), 'hundred thousands'] = 'hundrað þúsund' + letter
        
    df.loc[((df['word'].str.match(tns_ptrn + "10\.?0([01][1-9]|10)\.$")) & (df['tag'].str.match(nonoun)) & (df['type'] == 'ordinal')), 'ten thousands'] = ' tíu þúsundasta og'
    df.loc[((df['word'].str.match(tns_ptrn + "10\.?000\.$")) & (df['tag'].str.match(nonoun)) & (df['type'] == 'ordinal')), 'ten thousands'] = ' tíu þúsundasta'
    df.loc[((df['word'].str.match(hndrds_ptrn_11 + "100\.?0([01][1-9]|10)\.$")) & (df['tag'].str.match(nonoun)) & (df['type'] == 'ordinal')), 'hundred thousands'] = ' eitt hundrað þúsundasta og'
    df.loc[((df['word'].str.match("^100\.?0([01][1-9]|10)\.$")) & (df['tag'].str.match(nonoun)) & (df['type'] == 'ordinal')), 'hundred thousands'] = 'hundrað þúsundasta og'
    df.loc[((df['word'].str.match(hndrds_ptrn_11 + "100\.?000\.$")) & (df['tag'].str.match(nonoun)) & (df['type'] == 'ordinal')), 'hundred thousands'] = ' eitt hundrað þúsundasta'
    df.loc[((df['word'].str.match("^100\.?000\.$")) & (df['tag'].str.match(nonoun)) & (df['type'] == 'ordinal')), 'hundred thousands'] = 'hundrað þúsundasta'

    
def replace_one_million_ordinal(df):
    print("replace_one_million_ordinal")
    for rule, letter in tqdm(dozens_ordinal_letters):
        df.loc[((df['word'].str.match(ones_ptrn_no11 + "1\.000\.0([01][1-9]|10)\.$")) & (df['tag'].str.match(rule)) & (df['type'].str.match("^(cardinal|ordinal)$"))), 'millions'] = ' einmilljón' + letter + ' og'
        df.loc[((df['word'].str.match("^1\.000\.0([01][1-9]|10)\.$")) & (df['tag'].str.match(rule)) & (df['type'].str.match("^(cardinal|ordinal)$"))), 'millions'] = ' milljón' + letter + ' og'
        df.loc[((df['word'].str.match(ones_ptrn_no11 + "1(\.000){2}\.$")) & (df['tag'].str.match(rule)) & (df['type'].str.match("^(cardinal|ordinal)$"))), 'millions'] = ' einmilljón' + letter
        df.loc[((df['word'].str.match("^1(\.000){2}\.$")) & (df['tag'].str.match(rule)) & (df['type'].str.match("^(cardinal|ordinal)$"))), 'millions'] = ' milljón' + letter
        df.loc[((df['word'].str.match(tns_ptrn + "10\.000\.0([01][1-9]|10)\.$")) & (df['tag'].str.match(rule)) & (df['type'].str.match("^(cardinal|ordinal)$"))), 'ten millions'] = ' tímilljón'+ letter + ' og'
        df.loc[((df['word'].str.match(tns_ptrn + "10(\.000){2}\.$")) & (df['tag'].str.match(rule)) & (df['type'].str.match("^(cardinal|ordinal)$"))), 'ten millions'] = ' tímilljón'+ letter
        df.loc[((df['word'].str.match(hndrds_ptrn_11 + "100\.000\.0([01][1-9]|10)\.$")) & (df['tag'].str.match(rule)) & (df['type'].str.match("^(cardinal|ordinal)$"))), 'hundred millions'] = ' eitt hundraðmilljón' + letter + ' og'
        df.loc[((df['word'].str.match("^100\.000\.0([01][1-9]|10)\.$")) & (df['tag'].str.match(rule)) & (df['type'].str.match("^(cardinal|ordinal)$"))), 'hundred millions'] = 'hundraðmilljón' + letter + ' og'
        df.loc[((df['word'].str.match(hndrds_ptrn_11 + "100(\.000){2}\.$")) & (df['tag'].str.match(rule)) & (df['type'].str.match("^(cardinal|ordinal)$"))), 'hundred millions'] = ' eitt hundraðmilljón' + letter
        df.loc[((df['word'].str.match("^100(\.000){2}\.$")) & (df['tag'].str.match(rule))) & (df['type'].str.match("^(cardinal|ordinal)$")), 'hundred millions'] = 'hundraðmilljón' + letter
    
    df.loc[((df['word'].str.match(ones_ptrn_no11 + "1\.000\.0([01][1-9]|10)\.$")) & (df['tag'].str.match(nonoun)) & (df['type'] == 'ordinal')), 'millions'] = ' einmilljónasta og'
    df.loc[((df['word'].str.match("^1\.000\.0([01][1-9]|10)\.$")) & (df['tag'].str.match(nonoun)) & (df['type'] == 'ordinal')), 'millions'] = ' milljónasta og'
    df.loc[((df['word'].str.match(ones_ptrn_no11 + "1\.000\.000\.$")) & (df['tag'].str.match(nonoun)) & (df['type'] == 'ordinal')), 'millions'] = ' einmilljónasta'
    df.loc[((df['word'].str.match("^1\.000\.000\.$")) & (df['tag'].str.match(nonoun)) & (df['type'] == 'ordinal')), 'millions'] = ' milljónasta'
    df.loc[((df['word'].str.match(tns_ptrn + "10\.000\.0([01][1-9]|10)\.$")) & (df['tag'].str.match(nonoun)) & (df['type'] == 'ordinal')), 'ten millions'] = ' tímilljónasta og'
    df.loc[((df['word'].str.match(tns_ptrn + "10\.000\.000\.$")) & (df['tag'].str.match(nonoun)) & (df['type'] == 'ordinal')), 'ten millions'] = ' tímilljónasta'
    df.loc[((df['word'].str.match(hndrds_ptrn_11 + "100\.000\.0([01][1-9]|10)\.$")) & (df['tag'].str.match(nonoun)) & (df['type'] == 'ordinal')), 'hundred millions'] = ' eitt hundraðmilljónasta og'
    df.loc[((df['word'].str.match("^100\.000\.0([01][1-9]|10)\.$")) & (df['tag'].str.match(nonoun)) & (df['type'] == 'ordinal')), 'hundred millions'] = 'hundraðmilljónasta og'
    df.loc[((df['word'].str.match(hndrds_ptrn_11 + "100(\.000){2}\.$")) & (df['tag'].str.match(nonoun)) & (df['type'] == 'ordinal')), 'hundred millions'] = ' eitt hundraðmilljónasta'
    df.loc[((df['word'].str.match("^100(\.000){2}\.$")) & (df['tag'].str.match(nonoun)) & (df['type'] == 'ordinal')), 'hundred millions'] = 'hundraðmilljónasta'

    
def replace_one_billion_ordinal(df):
    print("replace_one_billion_ordinal")
    for rule, letter in tqdm(dozens_ordinal_letters):
        df.loc[((df['word'].str.match(ones_ptrn_no11 + "1(\.000){2}\.0([01][1-9]|10)\.$")) & (df['tag'].str.match(rule)) & (df['type'].str.match("^(cardinal|ordinal)$"))), 'billions'] = ' einmilljarð' + letter + ' og'
        df.loc[((df['word'].str.match("^1(\.000){2}\.0([01][1-9]|10)\.$")) & (df['tag'].str.match(rule)) & (df['type'].str.match("^(cardinal|ordinal)$"))), 'billions'] = 'milljarð' + letter + ' og'
        df.loc[((df['word'].str.match(tns_ptrn + "10(\.000){2}\.0([01][1-9]|10)\.$")) & (df['tag'].str.match(rule)) & (df['type'].str.match("^(cardinal|ordinal)$"))), 'ten billions'] = ' tímilljarð' + letter + ' og' 
        df.loc[((df['word'].str.match(hndrds_ptrn_11 + "100(\.000){2}\.0([01][1-9]|10)\.$")) & (df['tag'].str.match(rule)) & (df['type'].str.match("^(cardinal|ordinal)$"))), 'hundred billions'] = ' eitt hundraðmilljarð' + letter + ' og'
        df.loc[((df['word'].str.match("^100(\.000){2}\.0([01][1-9]|10)\.$")) & (df['tag'].str.match(rule)) & (df['type'].str.match("^(cardinal|ordinal)$"))), 'hundred billions'] = 'hundraðmilljarð' + letter + ' og'        
        df.loc[((df['word'].str.match(ones_ptrn_no11 + "1(\.000){3}\.$")) & (df['tag'].str.match(rule)) & (df['type'].str.match("^(cardinal|ordinal)$"))), 'billions'] = ' einmilljarð' + letter
        df.loc[((df['word'].str.match("^1(\.000){3}\.$")) & (df['tag'].str.match(rule)) & (df['type'].str.match("^(cardinal|ordinal)$"))), 'billions'] = 'milljarð' + letter
        df.loc[((df['word'].str.match(tns_ptrn + "10(\.000){3}\.$")) & (df['tag'].str.match(rule)) & (df['type'].str.match("^(cardinal|ordinal)$"))), 'ten billions'] = 'tímilljarð' + letter
        df.loc[((df['word'].str.match(hndrds_ptrn_11 + "100(\.000){3}\.$")) & (df['tag'].str.match(rule)) & (df['type'].str.match("^(cardinal|ordinal)$"))), 'hundred billions'] = ' eitt hundraðmilljarð' + letter
        df.loc[((df['word'].str.match("^100(\.000){3}\.$")) & (df['tag'].str.match(rule)) & (df['type'].str.match("^(cardinal|ordinal)$"))), 'hundred billions'] = 'hundrað milljarð' + letter
        
    df.loc[((df['word'].str.match(ones_ptrn_no11 + "1(\.000){2}\.0([01][1-9]|10)\.$")) & (df['tag'].str.match(nonoun)) & (df['type'] == 'ordinal')), 'billions'] = ' einmilljarðasta og'
    df.loc[((df['word'].str.match("^1(\.000){2}\.0([01][1-9]|10)\.$")) & (df['tag'].str.match(nonoun)) & (df['type'] == 'ordinal')), 'billions'] = 'milljarðasta og'
    df.loc[((df['word'].str.match(tns_ptrn + "10(\.000){2}\.0([01][1-9]|10)\.$")) & (df['tag'].str.match(nonoun)) & (df['type'] == 'ordinal')), 'ten billions'] = 'tímilljarðasta og' 
    df.loc[((df['word'].str.match(hndrds_ptrn_11 + "100(\.000){2}\.0([01][1-9]|10)\.$")) & (df['tag'].str.match(nonoun)) & (df['type'] == 'ordinal')), 'hundred billions'] = ' eitt hundrað milljarðasta og'
    df.loc[((df['word'].str.match("^100(\.000){2}\.0([01][1-9]|10)\.$")) & (df['tag'].str.match(nonoun)) & (df['type'] == 'ordinal')), 'hundred billions'] = 'hundraðmilljarðasta og'        
    df.loc[((df['word'].str.match(ones_ptrn_no11 + "1(\.000){3}\.$")) & (df['tag'].str.match(nonoun)) & (df['type'] == 'ordinal')), 'billions'] = ' einmilljarðasta'
    df.loc[((df['word'].str.match("^1(\.000){3}\.$")) & (df['tag'].str.match(nonoun)) & (df['type'] == 'ordinal')), 'billions'] = 'milljarðasta'
    df.loc[((df['word'].str.match(tns_ptrn + "10(\.000){3}\.$")) & (df['tag'].str.match(nonoun)) & (df['type'] == 'ordinal')), 'ten billions'] = 'tímilljarðasta'
    df.loc[((df['word'].str.match(hndrds_ptrn_11 + "100(\.000){3}\.$")) & (df['tag'].str.match(nonoun)) & (df['type'] == 'ordinal')), 'hundred billions'] = ' eitt hundraðmilljarðasta'
    df.loc[((df['word'].str.match("^100(\.000){3}\.$")) & (df['tag'].str.match(nonoun)) & (df['type'] == 'ordinal')), 'hundred billions'] = 'hundraðmilljarðasta'



In [17]:
hundreds_thousands_zip = [
                (' tvö', '2'),
                (' þrjú', '3'),
                (' fjögur', '4'),
                (' fimm', '5'),
                (' sex', '6'),
                (' sjö', '7'),
                (' átta', '8'),
                (' níu', '9')
               ]
# CARDINAL

def replace_hundred_bignos_cardinal(df):
    print("replace_hundred_bignos_cardinal")
    for string, number in tqdm(hundreds_thousands_zip):
        df.loc[((df['word'].str.match(hndrds_ptrn_11 + number + "([01][1-9]|[1-9]0)" + dec_ptrn))  & (df['type'].str.match("^(cardinal|ordinal)$"))), 'hundreds'] = string + ' hundruð og'
        df.loc[((df['word'].str.match(hndrds_ptrn_11 + number + "[2-9]0\.$"))  & (df['type'].str.match("^(cardinal|ordinal)$"))), 'hundreds'] = string + ' hundruð og'
        df.loc[((df['word'].str.match(hndrds_ptrn_11 + number + "([2-9][1-9]|00)" + dec_ptrn_ordinal))  & (df['type'].str.match("^(cardinal|ordinal)$"))), 'hundreds'] = string + ' hundruð'

        df.loc[((df['word'].str.match(ones_ptrn_no11 + number + thsnds_and_ptrn_cardinal + dec_ptrn))  & (df['type'].str.match("^(cardinal|ordinal)$"))), 'thousands'] = string + ' þúsund og'
        df.loc[((df['word'].str.match(ones_ptrn_no11 + number + thsnds_and_ptrn_ordinal))  & (df['type'].str.match("^(cardinal|ordinal)$"))), 'thousands'] = string + ' þúsund og'
        df.loc[((df['word'].str.match(ones_ptrn_no11 + number + "\.?((000$)|(([1-9](?!00)\d{2})|(0[2-9][1-9]))" + dec_ptrn_ordinal + ")"))  & (df['type'].str.match("^(cardinal|ordinal)$"))), 'thousands'] = string + ' þúsund'
        
        #df.loc[(df['word'].str.match(hndrds_ptrn_no11_def + number + "00(\.?000$|" + thsnds_ptrn_after + dec_ptrn_ordinal + ")")), 'hundred thousands'] = string + ' hundruð þúsund'
        df.loc[((df['word'].str.match(hndrds_ptrn_no11 + number + "00" + thsnds_ptrn_after + dec_ptrn_ordinal))  & (df['type'].str.match("^(cardinal|ordinal)$"))), 'hundred thousands'] = string + ' hundruð þúsund'
        df.loc[((df['word'].str.match(hndrds_ptrn_11 + number + "00" + thsnds_and_ptrn_cardinal + dec_ptrn))  & (df['type'].str.match("^(cardinal|ordinal)$"))), 'hundred thousands'] = string + ' hundruð þúsund og'
        df.loc[((df['word'].str.match(hndrds_ptrn_11 + number + "00" + thsnds_and_ptrn_ordinal)) & (df['type'].str.match("^(cardinal|ordinal)$"))), 'hundred thousands'] = string + ' hundruð þúsund og'
        df.loc[((df['word'].str.match(hndrds_ptrn_11 + number + hndrd_and_thsnd + dec_ptrn_ordinal))  & (df['type'].str.match("^(cardinal|ordinal)$"))), 'hundred thousands'] = string + ' hundruð og'
        df.loc[((df['word'].str.match(hndrds_ptrn_11 + number + hndrd_thsnd_after + dec_ptrn_ordinal))  & (df['type'].str.match("^(cardinal|ordinal)$"))), 'hundred thousands'] = string + ' hundruð'
        
        df.loc[((df['word'].str.match(hndrds_ptrn_11 + number + "00" + million_and_cardinal + dec_ptrn))  & (df['type'].str.match("^(cardinal|ordinal)$"))), 'hundred millions'] = string + ' hundruð milljónir og'
        df.loc[((df['word'].str.match(hndrds_ptrn_11 + number + "00" + million_and_ordinal))  & (df['type'].str.match("^(cardinal|ordinal)$"))), 'hundred millions'] = string + ' hundruð milljónir og'
        df.loc[((df['word'].str.match(hndrds_ptrn_11 + number + "00" + milln_ptrn_after + dec_ptrn_ordinal))), 'hundred millions'] = string +  ' hundruð milljónir'
        df.loc[((df['word'].str.match(hndrds_ptrn_11 + number + hndrd_and_million + dec_ptrn_ordinal))  & (df['type'].str.match("^(cardinal|ordinal)$"))), 'hundred millions'] = string + ' hundruð og'
        df.loc[((df['word'].str.match(hndrds_ptrn_11 + number + hndrd_million + dec_ptrn_ordinal))  & (df['type'].str.match("^(cardinal|ordinal)$"))), 'hundred millions'] = string + ' hundruð'
        
        df.loc[((df['word'].str.match(hndrds_ptrn_11 + number + "00" + billion_and_cardinal + dec_ptrn))  & (df['type'].str.match("^(cardinal|ordinal)$"))), 'hundred billions'] = string + ' hundruð milljarðar og'
        df.loc[((df['word'].str.match(hndrds_ptrn_11 + number + "00" + billion_and_ordinal))  & (df['type'].str.match("^(cardinal|ordinal)$"))), 'hundred billions'] = string + ' hundruð milljarðar og'
        df.loc[((df['word'].str.match(hndrds_ptrn_11 + number + "00" + billion_after + dec_ptrn_ordinal))  & (df['type'].str.match("^(cardinal|ordinal)$"))), 'hundred billions'] = string + ' hundruð milljarðar'
        df.loc[((df['word'].str.match(hndrds_ptrn_11 + number + hndrd_and_billion + dec_ptrn_ordinal))  & (df['type'].str.match("^(cardinal|ordinal)$"))), 'hundred billions'] = string + ' hundruð og'
        df.loc[((df['word'].str.match(hndrds_ptrn_11 + number + hndrd_billion + dec_ptrn_ordinal))  & (df['type'].str.match("^(cardinal|ordinal)$"))), 'hundred billions'] = string + ' hundruð'

def replace_ten_bignos_cardinal(df):
    print("replace_ten_bignos_cardinal")
    for string, number in tqdm(dozens_zip):
        df.loc[((df['word'].str.match(tns_ptrn + number + "0" + thsnds_and_ptrn_cardinal + dec_ptrn))  & (df['type'].str.match("^(cardinal|ordinal)$"))), 'ten thousands'] = string + ' þúsund og'
        df.loc[((df['word'].str.match(tns_ptrn + number + "0" + thsnds_and_ptrn_ordinal))  & (df['type'].str.match("^(cardinal|ordinal)$"))), 'ten thousands'] = string + ' þúsund og'
        df.loc[((df['word'].str.match(tns_ptrn + number + "0" + thsnds_ptrn_after + dec_ptrn_ordinal))  & (df['type'].str.match("^(cardinal|ordinal)$"))), 'ten thousands'] = string + ' þúsund'
        df.loc[((df['word'].str.match(tns_ptrn + number + "[1-9]\.?\d{3}" + dec_ptrn_ordinal))  & (df['type'].str.match("^(cardinal|ordinal)$"))), 'ten thousands'] = string + ' og'

        df.loc[((df['word'].str.match(tns_ptrn + number + "0" + million_and_cardinal + dec_ptrn))  & (df['type'].str.match("^(cardinal|ordinal)$"))), 'ten millions'] = string + ' milljónir og'
        df.loc[((df['word'].str.match(tns_ptrn + number + "0" + million_and_ordinal))  & (df['type'].str.match("^(cardinal|ordinal)$"))), 'ten millions'] = string + ' milljónir og' 
        df.loc[((df['word'].str.match(tns_ptrn + number + "0" + milln_ptrn_after + dec_ptrn_ordinal))  & (df['type'].str.match("^(cardinal|ordinal)$"))), 'ten millions'] = string + ' milljónir'
        df.loc[((df['word'].str.match(tns_ptrn + number + "[1-9](\.\d{3}){2}" + dec_ptrn_ordinal))  & (df['type'].str.match("^(cardinal|ordinal)$"))), 'ten millions'] = string + ' og'

        df.loc[((df['word'].str.match(tns_ptrn + number + "0" + billion_and_cardinal + dec_ptrn))  & (df['type'].str.match("^(cardinal|ordinal)$"))), 'ten billions'] = string + ' milljarðar og'
        df.loc[((df['word'].str.match(tns_ptrn + number + "0" + billion_and_ordinal))  & (df['type'].str.match("^(cardinal|ordinal)$"))), 'ten billions'] = string + ' milljarðar og' 
        df.loc[((df['word'].str.match(tns_ptrn + number + "0" + billion_after + dec_ptrn_ordinal))  & (df['type'].str.match("^(cardinal|ordinal)$"))), 'ten billions'] = string + ' milljarðar'
        df.loc[((df['word'].str.match(tns_ptrn + number + "[1-9](\.\d{3}){3}" + dec_ptrn_ordinal))  & (df['type'].str.match("^(cardinal|ordinal)$"))), 'ten billions'] = string + ' og'


millions_zip = [(' tvær', '2'),
                (' þrjár', '3'),
                (' fjórar', '4'),
                (' fimm', '5'),
                (' sex', '6'),
                (' sjö', '7'),
                (' átta', '8'),
                (' níu', '9')]

def replace_millions_cardinal(df):
    print("replace_millions_cardinal")
    for string, number in tqdm(millions_zip):
        df.loc[((df['word'].str.match(ones_ptrn_no11 + number + million_and_cardinal + dec_ptrn))  & (df['type'].str.match("^(cardinal|ordinal)$"))), 'millions'] = string + ' milljónir og'
        df.loc[((df['word'].str.match(ones_ptrn_no11 + number + million_and_ordinal))  & (df['type'].str.match("^(cardinal|ordinal)$"))), 'millions'] = string + ' milljónir og'
        df.loc[((df['word'].str.match(ones_ptrn_no11 + number + milln_ptrn_after + dec_ptrn_ordinal))  & (df['type'].str.match("^(cardinal|ordinal)$"))), 'millions'] = string + ' milljónir'

        
billions_zip = [(' tveir', '2'),
                (' þrír', '3'),
                (' fjórir', '4'),
                (' fimm', '5'),
                (' sex', '6'),
                (' sjö', '7'),
                (' átta', '8'),
                (' níu', '9')]
    
def replace_billions_cardinal(df):
    print("replace_billions_cardinal")
    for string, number in tqdm(billions_zip):
        df.loc[((df['word'].str.match(ones_ptrn_no11 + number + billion_and_cardinal + dec_ptrn))  & (df['type'].str.match("^(cardinal|ordinal)$"))), 'billions'] = string + ' milljarðar og'
        df.loc[((df['word'].str.match(ones_ptrn_no11 + number + billion_and_ordinal))  & (df['type'].str.match("^(cardinal|ordinal)$"))), 'billions'] = string + ' milljarðar og'
        df.loc[((df['word'].str.match(ones_ptrn_no11 + number + billion_after + dec_ptrn_ordinal))  & (df['type'].str.match("^(cardinal|ordinal)$"))), 'billions'] = string + ' milljarðar'

# ORDINAL

def replace_hundred_bignos_ordinal(df):
    print("replace_hundred_bignos_ordinal")
    for string, number in tqdm(hundreds_thousands_zip):
        for rule, letter in dozens_ordinal_letters:
            df.loc[((df['word'].str.match(hndrds_ptrn_11 + number + "([01][1-9]|10)\.$")) & (df['tag'].str.match(rule))  & (df['type'].str.match("^(cardinal|ordinal)$"))), 'hundreds'] = string + ' hundruð' + letter + ' og'
            df.loc[((df['word'].str.match(hndrds_ptrn_11 + number + "00\.$")) & (df['tag'].str.match(rule))  & (df['type'].str.match("^(cardinal|ordinal)$"))), 'hundreds'] = string + ' hundruð' + letter
            df.loc[((df['word'].str.match(ones_ptrn_11 + number + "\.?0([01][1-9]|10)\.$")) & (df['tag'].str.match(rule))  & (df['type'].str.match("^(cardinal|ordinal)$"))), 'thousands'] = string + ' þúsund' + letter + ' og'
            df.loc[((df['word'].str.match(ones_ptrn_11 + number + "\.?000\.$")) & (df['tag'].str.match(rule))  & (df['type'].str.match("^(cardinal|ordinal)$"))), 'thousands'] = string + ' þúsund' + letter
            df.loc[((df['word'].str.match(hndrds_ptrn_11 + number + "00\.?0([01][1-9]|10)\.$")) & (df['tag'].str.match(rule))  & (df['type'].str.match("^(cardinal|ordinal)$"))), 'hundred thousands'] = string + ' hundruðþúsund' + letter + ' og'     
            df.loc[((df['word'].str.match(hndrds_ptrn_11 + number + "00\.?000\.$")) & (df['tag'].str.match(rule))  & (df['type'].str.match("^(cardinal|ordinal)$"))), 'hundred thousands'] = string + ' hundruðþúsund' + letter

            df.loc[((df['word'].str.match(hndrds_ptrn_11 + number + "00\.000\.0([01][1-9]|10)\.$")) & (df['tag'].str.match(rule))  & (df['type'].str.match("^(cardinal|ordinal)$"))), 'hundred millions'] = string + ' hundruðmilljón' + letter + ' og'
            df.loc[((df['word'].str.match(hndrds_ptrn_11 + number + "00(\.000){2}\.$")) & (df['tag'].str.match(rule))  & (df['type'].str.match("^(cardinal|ordinal)$"))), 'hundred millions'] = string + ' hundruðmilljón' + letter

            df.loc[((df['word'].str.match(hndrds_ptrn_11 + number + "00(\.000){2}\.0([01][1-9]|10)\.$")) & (df['tag'].str.match(rule))  & (df['type'].str.match("^(cardinal|ordinal)$"))), 'hundred billions'] = string + ' hundruðmilljarð' + letter + ' og'
            df.loc[((df['word'].str.match(hndrds_ptrn_11 + number + "00(\.000){3}\.$")) & (df['tag'].str.match(rule))  & (df['type'].str.match("^(cardinal|ordinal)$"))), 'hundred billions'] = string + ' hundruðmilljarð' + letter

        df.loc[((df['word'].str.match(hndrds_ptrn_11 + number + "([01][1-9]|10)\.$")) & (df['tag'].str.match(nonoun)) & (df['type'] == 'ordinal')), 'hundreds'] = string + ' hundruðasta og'
        df.loc[((df['word'].str.match(hndrds_ptrn_11 + number + "00\.$")) & (df['tag'].str.match(nonoun)) & (df['type'] == 'ordinal')), 'hundreds'] = string + ' hundruðasta'
        df.loc[((df['word'].str.match(ones_ptrn_11 + number + "\.?0([01][1-9]|10)\.$")) & (df['tag'].str.match(nonoun)) & (df['type'] == 'ordinal')), 'thousands'] = string + ' þúsundasta og'
        df.loc[((df['word'].str.match(ones_ptrn_11 + number + "\.?000\.$")) & (df['tag'].str.match(nonoun)) & (df['type'] == 'ordinal')), 'thousands'] = string + ' þúsundasta'
        df.loc[((df['word'].str.match(hndrds_ptrn_11 + number + "00\.?0([01][1-9]|10)\.$")) & (df['tag'].str.match(nonoun)) & (df['type'] == 'ordinal')), 'hundred thousands'] = string + ' hundruðþúsundasta og'
        df.loc[((df['word'].str.match(hndrds_ptrn_11 + number + "00\.?000\.$")) & (df['tag'].str.match(nonoun)) & (df['type'] == 'ordinal')), 'hundred thousands'] = string + ' hundruðþúsundasta'

        df.loc[((df['word'].str.match(hndrds_ptrn_11 + number + "00\.000\.0([01][1-9]|10)\.$")) & (df['tag'].str.match(nonoun)) & (df['type'] == 'ordinal')), 'hundred millions'] = string + ' hundruðmilljónasta og'
        df.loc[((df['word'].str.match(hndrds_ptrn_11 + number + "00(\.000){2}\.$")) & (df['tag'].str.match(nonoun)) & (df['type'] == 'ordinal')), 'hundred millions'] = string + ' hundruðmilljónasta'

        df.loc[((df['word'].str.match(hndrds_ptrn_11 + number + "00(\.000){2}\.0([01][1-9]|10)\.$")) & (df['tag'].str.match(nonoun)) & (df['type'] == 'ordinal')), 'hundred billions'] = string + ' hundruð milljarðasta og'
        df.loc[((df['word'].str.match(hndrds_ptrn_11 + number + "00(\.000){3}\.$")) & (df['tag'].str.match(nonoun)) & (df['type'] == 'ordinal')), 'hundred billions'] = string + ' hundruðmilljarðasta'

        
def replace_ten_bignos_ordinal(df):
    print("replace_ten_bignos_ordinal")
    for string, number in tqdm(dozens_zip):
        for rule, letter in dozens_ordinal_letters:
            df.loc[((df['word'].str.match(tns_ptrn + number + "0\.?0([01][1-9]|10)\.$")) & (df['tag'].str.match(rule))  & (df['type'].str.match("^(cardinal|ordinal)$"))), 'ten thousands'] = string + ' þúsund' + letter + ' og'
            df.loc[((df['word'].str.match(tns_ptrn + number + "0\.?000\.$")) & (df['tag'].str.match(rule))  & (df['type'].str.match("^(cardinal|ordinal)$"))), 'ten thousands'] = string + ' þúsund' + letter

            df.loc[((df['word'].str.match(tns_ptrn + number + "0\.000\.0([01][1-9]|10)\.$")) & (df['tag'].str.match(rule))  & (df['type'].str.match("^(cardinal|ordinal)$"))), 'ten millions'] = string + 'milljón'+ letter + ' og'
            df.loc[((df['word'].str.match(tns_ptrn + number + "0(\.000){2}\.$")) & (df['tag'].str.match(rule))  & (df['type'].str.match("^(cardinal|ordinal)$"))), 'ten millions'] = string + 'milljón'+ letter

            df.loc[((df['word'].str.match(tns_ptrn + number + "0(\.000){2}\.0([01][1-9]|10)\.$")) & (df['tag'].str.match(rule))  & (df['type'].str.match("^(cardinal|ordinal)$"))), 'ten billions'] = string + 'milljarð' + letter + ' og' 
            df.loc[((df['word'].str.match(tns_ptrn + number + "0(\.000){3}\.$")) & (df['tag'].str.match(rule))  & (df['type'].str.match("^(cardinal|ordinal)$"))), 'ten billions'] = string + 'milljarð' + letter

        df.loc[((df['word'].str.match(tns_ptrn + number + "0\.?0([01][1-9]|10)\.$")) & (df['tag'].str.match(nonoun)) & (df['type'] == 'ordinal')), 'ten thousands'] = string + ' þúsundasta og'
        df.loc[((df['word'].str.match(tns_ptrn + number + "0\.?000\.$")) & (df['tag'].str.match(nonoun)) & (df['type'] == 'ordinal')), 'ten thousands'] = string + ' þúsundasta'

        df.loc[((df['word'].str.match(tns_ptrn + number + "0\.000\.0([01][1-9]|10)\.$")) & (df['tag'].str.match(nonoun)) & (df['type'] == 'ordinal')), 'ten millions'] = string + 'milljónasta og'
        df.loc[((df['word'].str.match(tns_ptrn + number + "0\.000\.000\.$")) & (df['tag'].str.match(nonoun)) & (df['type'] == 'ordinal')), 'ten millions'] = string + 'milljónasta'

        df.loc[((df['word'].str.match(tns_ptrn + number + "0(\.000){2}\.0([01][1-9]|10)\.$")) & (df['tag'].str.match(nonoun)) & (df['type'] == 'ordinal')), 'ten billions'] = string + 'milljarðasta og'
        df.loc[((df['word'].str.match(tns_ptrn + number + "0(\.000){3}\.$")) & (df['tag'].str.match(nonoun)) & (df['type'] == 'ordinal')), 'ten billions'] = string + 'milljarðasta'

        
mb_ordinal_zip = [(' tví', '2'),
                  (' þrí', '3'),
                  (' fer', '4'),
                  (' fimm', '5'),
                  (' sex', '6'),
                  (' sjö', '7'),
                  (' átt', '8'),
                  (' ní', '9')]

def replace_millionsbillions_ordinal(df):
    print("replace_millionsbillions_ordinal")
    for string, number in tqdm(mb_ordinal_zip):
        for rule, letter in dozens_ordinal_letters:
            df.loc[((df['word'].str.match(ones_ptrn_11 + number + "\.000\.0([01][1-9]|10)\.$")) & (df['tag'].str.match(rule))  & (df['type'].str.match("^(cardinal|ordinal)$"))), 'millions'] = string + 'milljón' + letter + ' og'
            df.loc[((df['word'].str.match(ones_ptrn_11 + number + "(\.000){2}\.$")) & (df['tag'].str.match(rule))  & (df['type'].str.match("^(cardinal|ordinal)$"))), 'millions'] = string + 'milljón' + letter

            df.loc[((df['word'].str.match(ones_ptrn_11 + number + "(\.000){2}\.0([01][1-9]|10)\.$")) & (df['tag'].str.match(rule))  & (df['type'].str.match("^(cardinal|ordinal)$"))), 'billions'] = string + 'milljarð' + letter + ' og'
            df.loc[((df['word'].str.match(ones_ptrn_11 + number + "(\.000){3}\.$")) & (df['tag'].str.match(rule))  & (df['type'].str.match("^(cardinal|ordinal)$"))), 'billions'] = string + 'milljarð' + letter

        df.loc[((df['word'].str.match(ones_ptrn_11 + number + "\.000\.0([01][1-9]|10)\.$")) & (df['tag'].str.match(nonoun))  & (df['type'].str.match("^(cardinal|ordinal)$") & (df['type'] == 'ordinal'))), 'millions'] = string + 'milljónasta og'
        df.loc[((df['word'].str.match(ones_ptrn_11 + number + "\.000\.000\.$")) & (df['tag'].str.match(nonoun)) & (df['type'] == 'ordinal')), 'millions'] = string + 'milljónasta'

        df.loc[((df['word'].str.match(ones_ptrn_11 + number + "(\.000){2}\.0([01][1-9]|10)\.$")) & (df['tag'].str.match(nonoun)) & (df['type'] == 'ordinal')  & (df['type'].str.match("^(cardinal|ordinal)$"))), 'billions'] = string + 'milljarðasta og'
        df.loc[((df['word'].str.match(ones_ptrn_11 + number + "(\.000){3}\.$")) & (df['tag'].str.match(nonoun)) & (df['type'] == 'ordinal')), 'billions'] = string + 'milljarðasta'

        

In [18]:
ten_hundreds_thousands_zip = [(' ellefu', '11'),
                    (' tólf', '12'),
                    (' þrettán', '13'),
                    (' fjórtán', '14'),
                    (' fimmtán', '15'),
                    (' sextán', '16'),
                    (' sautján', '17'),
                    (' átján', '18'),
                    (' nítján', '19')]

def replace_ten_hundreds_thousands_cardinal(df):
    print("replace_ten_hundreds_thousands_cardinal")
    for string, number in tqdm(ten_hundreds_thousands_zip):
        df.loc[((df['word'].str.match("^" + number + "([01][1-9]|[1-9]0)" + dec_ptrn)) & (df['type'].str.match("^(cardinal|ordinal|fraction)"))), 'hundreds'] = string + ' hundruð og'
        df.loc[((df['word'].str.match("^" + number + "(00|[2-9][1-9])" + dec_ptrn_ordinal)) & (df['type'].str.match("^(cardinal|ordinal|fraction)"))), 'hundreds'] = string + ' hundruð'
        df.loc[((df['word'].str.match("^" + number + "([01][1-9]|[1-9]0)" + dec_ptrn)) & (df['type'].str.match("^(cardinal|ordinal|fraction)"))), 'thousands'] = ""
        df.loc[((df['word'].str.match("^" + number + "(00|[2-9][1-9])" + dec_ptrn_ordinal)) & (df['type'].str.match("^(cardinal|ordinal|fraction)"))), 'thousands'] = ""
        df.loc[((df['word'].str.match(tns_ptrn + number + thsnds_and_ptrn_cardinal + dec_ptrn)) & (df['type'].str.match("^(cardinal|ordinal|fraction)"))), 'thousands'] = string + ' þúsund og'
        df.loc[((df['word'].str.match(tns_ptrn + number + thsnds_and_ptrn_ordinal)) & (df['type'].str.match("^(cardinal|ordinal|fraction)"))), 'thousands'] = string + ' þúsund og'
        df.loc[((df['word'].str.match(tns_ptrn + number + thsnds_ptrn_after + dec_ptrn_ordinal)) & (df['type'].str.match("^(cardinal|ordinal|fraction)"))), 'thousands'] = string + ' þúsund'
        df.loc[((df['word'].str.match("^[1-9]?" + number +  million_and_cardinal + dec_ptrn)) & (df['type'].str.match("^(cardinal|ordinal|fraction)"))), 'millions'] = string + ' milljónir og'
        df.loc[((df['word'].str.match("^[1-9]?" + number +  million_and_ordinal)) & (df['type'].str.match("^(cardinal|ordinal|fraction)"))), 'millions'] = string + ' milljónir og'
        df.loc[((df['word'].str.match("^[1-9]?" + number +  milln_ptrn_after + dec_ptrn_ordinal)) & (df['type'].str.match("^(cardinal|ordinal|fraction)"))), 'millions'] = string + ' milljónir'
        df.loc[((df['word'].str.match("^[1-9]?" + number +  billion_and_cardinal + dec_ptrn)) & (df['type'].str.match("^(cardinal|ordinal|fraction)"))), 'billions'] = string + ' milljarðar og'
        df.loc[((df['word'].str.match("^[1-9]?" + number +  billion_and_ordinal)) & (df['type'].str.match("^(cardinal|ordinal|fraction)"))), 'billions'] = string + ' milljarðar og'
        df.loc[((df['word'].str.match("^[1-9]?" + number +  billion_after + dec_ptrn_ordinal)) & (df['type'].str.match("^(cardinal|ordinal|fraction)"))), 'billions'] = string + ' milljarðar'
        
def replace_ten_hundreds_thousands_ordinal(df):
    print("replace_ten_hundreds_thousands_ordinal")
    for string, number in tqdm(ten_hundreds_thousands_zip):
        for rule, letter in dozens_ordinal_letters:
            df.loc[((df['word'].str.match("^" + number + "([01][1-9]|10)\.$")) & (df['tag'].str.match(rule)) & (df['type'].str.match("^(cardinal|ordinal|fraction)"))), 'hundreds'] = string + ' hundruð' + letter + ' og'
            df.loc[((df['word'].str.match(tns_ptrn + number + "\.?0([01][1-9]|10)\.$")) & (df['tag'].str.match(rule))), 'thousands'] = string + ' þúsund' + letter + ' og'
            df.loc[((df['word'].str.match("^[1-9]?" + number + "\.000\.0([01][1-9]|10)\.$")) & (df['tag'].str.match(rule))), 'millions'] = string + ' milljón' + letter + ' og'
            df.loc[((df['word'].str.match("^" + number + "([01][1-9]|10)\.$")) & (df['tag'].str.match(rule))), 'thousands'] = ""
            
        df.loc[((df['word'].str.match("^" + number + "([01][1-9]|10)\.$")) & (df['tag'] == 'ordinal')), 'hundreds'] = string + ' hundruðasta og'
        df.loc[((df['word'].str.match(tns_ptrn + number + "\.?0([01][1-9]|10)\.$")) & (df['tag'] == 'ordinal')), 'thousands'] = string + ' þúsundasta og'
        df.loc[((df['word'].str.match("^[1-9]?" + number + "\.000\.0([01][1-9]|10)\.$")) & (df['tag'] == 'ordinal')), 'millions'] = string + ' milljónasta og'
        df.loc[((df['word'].str.match("^" + number + "([01][1-9]|10)\.$")) & (df['tag'] == 'ordinal')), 'thousands'] = ""
 

In [19]:
def run_numbers(df, domain):
    start = time.time()
    replace_zero_cardinal(df)
    replace_ones_cardinal(df)
    replace_no_rule_cardinal_decimal(df)
    replace_no_rule_cardinal(df)
    replace_other_ones_cardinal(df)
    replace_tens_cardinal(df)
    if domain == 'other':
        replace_numerator_fractions(df)
        replace_tens_fractions(df)
        replace_denominator_fractions(df)
        replace_half_fraction(df)
    elif domain == 'sport':
        pass
    replace_two_ordinal(df)
    replace_zero_ordinal(df)
    replace_ones_ordinal(df)
    replace_tens_ordinal(df)
    replace_dozens_cardinal(df)
    replace_dozens_ordinal(df)
    replace_onehundred_onethousand_cardinal(df)
    replace_ten_hundredthousand_cardinal(df)
    replace_one_million_cardinal(df)
    replace_one_billion_cardinal(df)
    replace_onehundred_onethousand_ordinal(df)
    replace_ten_hundredthousand_ordinal(df)
    replace_one_million_ordinal(df)
    replace_one_billion_ordinal(df)
    replace_hundred_bignos_cardinal(df)
    replace_ten_bignos_cardinal(df)
    replace_millions_cardinal(df)
    replace_billions_cardinal(df)
    replace_hundred_bignos_ordinal(df)
    replace_ten_bignos_ordinal(df)
    replace_millionsbillions_ordinal(df)
    replace_ten_hundreds_thousands_cardinal(df)
    replace_ten_hundreds_thousands_ordinal(df)
    print(f"run done in {time.time() - start}")
    
def clean_df(df):   
    df = df.fillna("")
    
    #df['number'] = df['hundred billions'] + \
    df['number'] = df['ten billions'] + \
                   df['billions'] + \
                   df['hundred millions'] + \
                   df['ten millions'] + \
                   df['millions'] + \
                   df['hundred thousands'] + \
                   df['ten thousands'] + \
                   df['thousands'] + \
                   df['hundreds'] + \
                   df['dozens'] + \
                   df['ones'] + \
                   df['points'] + \
                   df['point2'] + \
                   df['point3'] + \
                   df['point4'] + \
                   df['point5'] + \
                   df['point6'] + \
                   df['point7'] + \
                   df['point8'] + \
                   df['point9'] + \
                   df['point10']
    
    return df[['sentence_id', 'token_id', 'word', 'tag', 'type', 'number']]
    

In [21]:
run_numbers(df_sport_run, 'sport')

replace_zero_cardinal
replace_ones_cardinal


HBox(children=(FloatProgress(value=0.0, max=27.0), HTML(value='')))


replace_no_rule_cardinal_decimal


HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))


replace_no_rule_cardinal


HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))


replace_other_ones_cardinal


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


replace_tens_cardinal


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


replace_two_ordinal


HBox(children=(FloatProgress(value=0.0, max=14.0), HTML(value='')))


replace_zero_ordinal


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


replace_ones_ordinal


HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))


replace_tens_ordinal


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


replace_dozens_cardinal


HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))


replace_dozens_ordinal


HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))


replace_onehundred_onethousand_cardinal
replace_ten_hundredthousand_cardinal
replace_one_million_cardinal
replace_one_billion_cardinal
replace_onehundred_onethousand_ordinal


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


replace_ten_hundredthousand_ordinal


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


replace_one_million_ordinal


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


replace_one_billion_ordinal


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


replace_hundred_bignos_cardinal


HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))


replace_ten_bignos_cardinal


HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))


replace_millions_cardinal


HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))


replace_billions_cardinal


HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))


replace_hundred_bignos_ordinal


HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))


replace_ten_bignos_ordinal


HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))


replace_millionsbillions_ordinal


HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))


replace_ten_hundreds_thousands_cardinal


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))


replace_ten_hundreds_thousands_ordinal


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))


run done in 417.230021238327


In [None]:
run_numbers(df_other_run, 'other')

In [22]:
df_sport_clean = clean_df(df_sport_run)

In [None]:
df_other_clean = clean_df(df_other_run)

## TIME

In [23]:
def replace_zero_time(df):
    print("replace_zero_time")
    df.loc[((df['word'].str.match("^0\d[:\.][0-5]\d$")) & (df['type'] == 'time')), 'hour_ten'] = ' núll'
    df.loc[((df['word'].str.match("^00[:\.][0-5]\d$")) & (df['type'] == 'time')), 'hour_one'] = ' núll'
    df.loc[((df['word'].str.match("^([01]?[0-9]|2[0-4])[:\.]0\d$")) & (df['type'] == 'time')), 'minute_ten'] = ' núll'
    df.loc[((df['word'].str.match("^([01]?[0-9]|2[0-4])[:\.]00$")) & (df['type'] == 'time')), 'minute_one'] = ' núll'
    
ones_zip_time = [(' eitt', '1'), 
                     (' tvö', '2'), 
                     (' þrjú', '3'), 
                     (' fjögur', '4'),
                     (' fimm', '5'),
                     (' sex', '6'),
                     (' sjö', '7'),
                     (' átta', '8'),
                     (' níu', '9')]

def replace_ones_time(df):
    print("replace_ones_time")
    for string, number in tqdm(ones_zip_time):
        df.loc[((df['word'].str.match("^[02]?" + number + "[:\.][0-5]\d$")) & (df['type'] == 'time')), 'hour_one'] = string
        df.loc[((df['word'].str.match("^([01]?[0-9]|2[0-4])[:\.][02-5]" + number + "$")) & (df['type'] == 'time')), 'minute_one'] = string
        df.loc[((df['sentbefore'].str.match("^[Kk]l\.$")) & (df['word'].str.match("^0" + number + "$")) & (df['type'] == 'timedigit')), 'minute_one'] = "núll " + string
        df.loc[((df['sent2before'].str.match("^[Kk]l$")) & (df['sentbefore'].str.match("^:$")) & (df['word'].str.match("^0" + number + "$")) & (df['type'] == 'timedigit')), 'minute_one'] = "núll " + string
    
tens_zip_time = [(' tíu', '10'), 
            (' ellefu', '11'),
            (' tólf', '12'),
            (' þrettán', '13'), 
            (' fjórtán', '14'),
            (' fimmtán', '15'),
            (' sextán', '16'),
            (' sautján', '17'),
            (' átján', '18'),
            (' nítján', '19'),
            (' tuttugu', '20')]

def replace_tens_time(df):
    print("replace_tens_time")
    for string, number in tqdm(tens_zip_time):
        df.loc[((df['word'].str.match("^" + number + "[:\.][0-5]\d$")) & (df['type'] == 'time')), 'hour_ten'] = string
        df.loc[((df['word'].str.match("^([01]?[0-9]|2[0-4])[:\.]" + number + "$")) & (df['type'] == 'time')), 'minute_ten'] = string


def replace_hour_twenty_time(df):
    print("replace_hour_twenty_time")
    for string, number in tqdm(tens_zip_time):
        df.loc[((df['word'].str.match("^2[1-4][:\.][0-5]\d$")) & (df['type'] == 'time')), 'hour_ten'] = "tuttugu og"

dozens_zip_time = [(" tuttugu", "2"),
           (" þrjátíu", "3"),
           (" fjörutíu", "4"),
           (" fimmtíu", "5")]

def replace_minutes_tens_time(df):
    print("replace_ones_time")
    for string, number in tqdm(dozens_zip_time):
        df.loc[((df['word'].str.match("^([01]?\d|2[0-4])[:\.]" + number + "0$")) & (df['type'] == 'time')), 'minute_ten'] = string
        df.loc[((df['word'].str.match("^([01]?\d|2[0-4])[:\.]" + number + "[1-9]$")) & (df['type'] == 'time')), 'minute_ten'] = string + " og"


In [24]:
def run_time(df):
    start = time.time()
    replace_zero_time(df)
    replace_ones_time(df)
    replace_tens_time(df)
    replace_hour_twenty_time(df)
    replace_minutes_tens_time(df)
    print(f"run_time done in {time.time() - start}")
    
def clean_df_time(df):
    df = df.fillna("")
    df['number'] = df['number'] + df['hour_ten'] + df['hour_one'] + df['minute_ten'] + df['minute_one']
    return df[['sentence_id', 'token_id', 'word', 'number', 'tag', 'type']]

In [27]:
df_other_clean['sentbefore'] = [""] + list(df_other_clean['word'][:-1])
df_other_clean['sent2before'] = ["", ""] + list(df_other_clean['word'][:-2])

In [25]:
df_sport_clean['sentbefore'] = [""] + list(df_sport_clean['word'][:-1])
df_sport_clean['sent2before'] = ["", ""] + list(df_sport_clean['word'][:-2])

In [175]:
df_sample_clean['sentbefore'] = [""] + list(df_sample_clean['word'][:-1])
df_sample_clean['sent2before'] = ["", ""] + list(df_sample_clean['word'][:-2])

In [26]:
run_time(df_sport_clean)
df_time_sport = clean_df_time(df_sport_clean)

replace_zero_time
replace_ones_time


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))


replace_tens_time


HBox(children=(FloatProgress(value=0.0, max=11.0), HTML(value='')))


replace_hour_twenty_time


HBox(children=(FloatProgress(value=0.0, max=11.0), HTML(value='')))


replace_ones_time


HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))


run_time done in 18.502429962158203


In [28]:
run_time(df_other_clean)
df_time_other = clean_df_time(df_other_clean)

replace_zero_time
replace_ones_time


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))


replace_tens_time


HBox(children=(FloatProgress(value=0.0, max=11.0), HTML(value='')))


replace_hour_twenty_time


HBox(children=(FloatProgress(value=0.0, max=11.0), HTML(value='')))


replace_ones_time


HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))


run_time done in 941.632486820221


## SPORT

In [27]:
def replace_units_sport(df):
    print("replace_units_sport")
    for rule, string, number in tqdm(ones_zip):
        df.loc[((df['word'].str.match("^[1-9]?" + number + "\/[1-9]\d?$")) & (df['tag'].str.match(rule))), 'first_team_one'] = string
        df.loc[((df['word'].str.match("^[1-9]\d?\/[1-9]?" + number + "$")) & (df['tag'].str.match(rule))), 'second_team_one'] = string
        df.loc[((df['word'].str.match("^[1-9]\d?\/[1-9]?" + number + "$")) & (df['tag'].str.match(rule))), 'second_team_one'] = string
        
def replace_units_sport_no_rule(df):
    print("replace_units_sport_no_rule")
    for string, number in tqdm(dec_ones_neutral): 
        df.loc[((df['word'].str.match("^[1-9]?" + number + "\/[1-9]\d?$")) & (df['tag'].str.match("^(" + nonoun + "|[nl]([kvh]e[noþe])?(\-)*(g?s?|[svo]?[fme]?))$"))), 'first_team_one'] = string
        df.loc[((df['word'].str.match("^[1-9]\d?\/[1-9]?" + number + "$")) & (df['tag'].str.match("^(" + nonoun + "|[nl]([kvh]e[noþe])?(\-)*(g?s?|[svo]?[fme]?))$"))), 'second_team_one'] = string
        df.loc[((df['word'].str.match("^[1-9]\d?\/[1-9]?" + number + "$")) & (df['tag'].str.match("^(" + nonoun + "|[nl]([kvh]e[noþe])?(\-)*(g?s?|[svo]?[fme]?))$"))), 'second_team_one'] = string
          
def replace_one_sport(df):
    print("replace_one_sport")
    df.loc[((df['word'].str.match("^[1-9]?1\/[1-9]\d?$")) & (df['tag'] == "^(" + nonoun + "|[nl]([kvh]f[noþe])?(\-)*(g?s?|[svo]?[fme]?)$")), 'first_team_one'] = " ein"
    df.loc[((df['word'].str.match("^[1-9]?1\/[1-9]\d?$")) & (df['tag'] == "^(" + nonoun + "|[nl]([kvh]f[noþe])?(\-)*(g?s?|[svo]?[fme]?)$")), 'first_team_one'] = " eitt"        

def replace_other_ones_sport(df):
    print("replace_other_ones_sport")
    for string, number in tqdm(other_numbers_zip):
        df.loc[(df['word'].str.match("^[1-9]?" + number + "\/[1-9]\d?$")), 'first_team_one'] = string
        df.loc[(df['word'].str.match("^[1-9]\d?\/[1-9]?" + number + "$")), 'second_team_one'] = string
        
def replace_tens_sport(df):
    print("replace_tens_sport")
    for string, number in tqdm(tens_zip):
        df.loc[(df['word'].str.match("^" + number + "\/[1-9]\d?$")), 'first_team_one'] = string
        df.loc[(df['word'].str.match("^[1-9]\d?\/" + number)), 'second_team_one'] = string

def replace_dozens_sport(df):
    print("replace_dozens_sport")
    for string, number in tqdm(dozens_zip):
        df.loc[(df['word'].str.match("^" + number + "0\/[1-9]\d?$")), 'first_team_ten'] = string
        df.loc[(df['word'].str.match("^[1-9]\d?\/" + number + "0$")), 'second_team_ten'] = string
        df.loc[(df['word'].str.match("^" + number + "[1-9]\/[1-9]\d?$")), 'first_team_ten'] = string + " og"
        df.loc[(df['word'].str.match("^[1-9]\d?\/" + number + "[1-9]$")), 'second_team_ten'] = string + " og"
        
def replace_between_sport(df):
    print("replace_between_sport")
    df.loc[(df['word'].str.match("^[1-9]\d?\/[1-9]?\d$")), 'between_teams'] = " <sil>"

In [28]:
def run_sport(df):
    start = time.time()
    replace_units_sport(df)
    replace_one_sport(df)
    replace_units_sport_no_rule(df)
    replace_other_ones_sport(df)
    replace_tens_sport(df)
    replace_dozens_sport(df)
    replace_between_sport(df)
    replace_half_fraction(df)
    print(f"run_sport done in {time.time() - start}")
    
def clean_df_sport(df):
    df = df.fillna("")
    df['number'] = df['number'] + df['first_team_ten'] + df['first_team_one'] + df['between_teams'] + \
                   df['second_team_ten'] + df['second_team_one']
    
    return df[['sentence_id', 'token_id', 'word', 'number', 'tag', 'type']]
    

In [29]:
run_sport(df_time_sport)
df_sport_sport = clean_df_sport(df_time_sport)

replace_units_sport


HBox(children=(FloatProgress(value=0.0, max=27.0), HTML(value='')))


replace_one_sport
replace_units_sport_no_rule


HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))


replace_other_ones_sport


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


replace_tens_sport


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


replace_dozens_sport


HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))


replace_between_sport
replace_half_fraction


HBox(children=(FloatProgress(value=0.0, max=12.0), HTML(value='')))


run_sport done in 68.86919212341309


## DIGIT

In [30]:
digit_numbers = [('0', ' núll'),
                 ('1', ' einn'),
                 ('2', ' tveir'),
                 ('3', ' þrír'),
                 ('4', ' fjórir'),
                 ('5', ' fimm'),
                 ('6', ' sex'),
                 ('7', ' sjö'),
                 ('8', ' átta'),
                 ('9', ' níu'),
                 ('\-', ' <sil>'),
                 ('\+', ' plús'),
                 ('\.', ' punktur'),
                 (':', ' tvípunktur'),
                 (',', ' komma'),
                 ('\/', ' skástrik')]

def digit_fun(substr):
    substr = re.sub(" ", "<sil> ", substr)
    for digit, word in digit_numbers:
        substr = re.sub(digit, word, substr)
    return substr

In [31]:
def replace_digits(df):
    start = time.time()
    for i in tqdm(range(len(df))):
        if df['type'].iloc[i] == 'digit' or (df['type'].iloc[i] == 'timedigit' and df['number'].iloc[i] == ""):
            df['number'].iloc[i] = digit_fun(df['word'].iloc[i])
    print(f"replace_digits done in {time.time() - start}")

In [32]:
replace_digits(df_sport_sport)
df_digit_sport = df_sport_sport

HBox(children=(FloatProgress(value=0.0, max=13512.0), HTML(value='')))


replace_digits done in 0.3570671081542969


In [36]:
replace_digits(df_time_other)
df_digit_other = df_time_other

HBox(children=(FloatProgress(value=0.0, max=738684.0), HTML(value='')))


replace_digits done in 15.56499695777893


In [178]:
replace_digits(df_time_sample)

HBox(children=(FloatProgress(value=0.0, max=97.0), HTML(value='')))


replace_digits done in 0.022760868072509766


In [33]:
digit_numbers_ord = [('0', 'núllta'),
                 ('1', 'fyrsta'),
                 ('2', 'annan'),
                 ('3', 'þriðja'),
                 ('4', 'fjórða'),
                 ('5', 'fimmta'),
                 ('6', 'sjötta'),
                 ('7', 'sjöunda'),
                 ('8', 'áttunda'),
                 ('9', 'níunda')]

def replace_digits_ord(df):
    start = time.time()
    for number, string in tqdm(digit_numbers_ord):
        df.loc[((df['word'].str.match("^0" + number + "\.$")) & (df['type'] == 'digitzero')), 'number'] = "núll " + string
    print(f"replace_digits_ord done in {time.time() - start}")

In [34]:
replace_digits_ord(df_digit_sport)
df_digitzero_sport = df_digit_sport

HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


replace_digits_ord done in 2.466248035430908


In [40]:
replace_digits_ord(df_digit_other)
df_digitzero_other = df_digit_other

HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


replace_digits_ord done in 119.8602499961853


In [179]:
replace_digits_ord(df_time_sample)

HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


replace_digits_ord done in 0.0493779182434082


### LETTERS

In [35]:
def replace_letters(df):
    start = time.time()
    for i in tqdm(range(len(df))):
        if df['type'].iloc[i] == 'letters':
            df['number'].iloc[i] = " ".join(df['word'].iloc[i].replace(".", ""))
    print(f"replace_letters done in {time.time() - start}")

In [36]:
replace_letters(df_sport_sport)
df_letters_sport = df_digitzero_sport

HBox(children=(FloatProgress(value=0.0, max=13512.0), HTML(value='')))


replace_letters done in 0.21475720405578613


In [43]:
replace_letters(df_digitzero_other)
df_letters_other = df_digitzero_other

HBox(children=(FloatProgress(value=0.0, max=738684.0), HTML(value='')))


replace_letters done in 30.062265157699585


In [180]:
replace_letters(df_time_sample)

HBox(children=(FloatProgress(value=0.0, max=97.0), HTML(value='')))


replace_letters done in 0.021661996841430664


### PLAIN

In [37]:
def replace_plain(df):
    start = time.time()
    for i in tqdm(range(len(df))):
        if df['number'].iloc[i] == '':
            df['number'].iloc[i] = df['word'].iloc[i]
    print(f"replace_plain done in {time.time() - start}")

In [74]:
replace_plain(df_letters_sport)
df_plain_sport = df_letters_sport
df_plain_sport['sentafter'] = list(df_plain_sport['word'][1:]) + [""]
df_plain_sport['sentbefore'] = [""] + list(df_plain_sport['word'][:-1])

HBox(children=(FloatProgress(value=0.0, max=13512.0), HTML(value='')))


replace_plain done in 0.3638758659362793


In [47]:
replace_plain(df_letters_other)

HBox(children=(FloatProgress(value=0.0, max=738684.0), HTML(value='')))


replace_plain done in 2554.0069308280945


In [68]:
df_plain_other = df_letters_other
df_plain_other['sentafter'] = list(df_plain_other['word'][1:]) + [""]
df_plain_other['sentbefore'] = [""] + list(df_plain_other['word'][:-1])

In [42]:
def replace_symbols(df, domain):
    start = time.time()
    if domain == 'other':
        df.loc[((df['sentbefore'].str.match(all_numbers_ptrn)) & (df['sentafter'].str.match(all_numbers_ptrn)) & (df['word'].str.match("^(-|–)$"))), 'number'] = "til"
    elif domain == 'sport':
        df.loc[((df['sentbefore'].str.match("^((\d{1,2}(\.\d{3})+)|\d+)$")) & (df['sentafter'].str.match("^((\d{1,2}(\.\d{3})+)|\d+)$")) & (df['word'].str.match("^(-|–|:)$"))), 'number'] = ""
    df.loc[(df['word'] == "&"), 'number'] = "og"
    df.loc[(df['word'] == "/"), 'number'] = "skástrik"
    df.loc[(df['word'] == "+"), 'number'] = "plús"
    df.loc[(df['word'] == "="), 'number'] = "jafnt og"
    df.loc[(df['word'] == "≠"), 'number'] = "ekki jafnt og"
    df.loc[(df['word'] == "ε"), 'number'] = "epsilon"
    df.loc[(df['word'].str.match(r"^\±|\∓$")), 'number'] = "plús mínus"
    df.loc[(df['word'].str.match(r"^\~|\≈$")), 'number'] = "um það bil"
    df.loc[(df['word'] == "→"), 'number'] = "þar af leiðandi"
    df.loc[(df['word'] == "≡"), 'number'] = "skilgreint sem"
    df.loc[(df['word'] == "<"), 'number'] = "minna en"
    df.loc[(df['word'] == ">"), 'number'] = "stærra en"
    df.loc[(df['word'] == "≤"), 'number'] = "minna en eða jafnt og"
    df.loc[(df['word'] == "≥"), 'number'] = "stærra en eða jafnt og"
    df.loc[(df['word'] == "∞"), 'number'] = "óendanlegt"
    df.loc[(df['word'] == "°"), 'number'] = "gráður"
    print(f"replace_symbols done in {time.time() - start}")
    
def replace_greek_letters(df):
    start = time.time()
    df.loc[(df['word'].str.match(r"^(Α|α)$")), 'number'] = "alfa"
    df.loc[(df['word'].str.match(r"^(Β|β)$")), 'number'] = "beta"
    df.loc[(df['word'].str.match(r"^(Γ|γ)$")), 'number'] = "gamma"
    df.loc[(df['word'].str.match(r"^(Δ|δ)$")), 'number'] = "delta"
    df.loc[(df['word'].str.match(r"^(Ε|ε)$")), 'number'] = "epsilon"
    df.loc[(df['word'].str.match(r"^(Ζ|ζ)$")), 'number'] = "zeta"
    df.loc[(df['word'].str.match(r"^(Η|η)$")), 'number'] = "eta"
    df.loc[(df['word'].str.match(r"^(Θ|θ)$")), 'number'] = "þeta"
    df.loc[(df['word'].str.match(r"^(Ι|ι)$")), 'number'] =  "jóta"
    df.loc[(df['word'].str.match(r"^(Κ|κ)$")), 'number'] = "kappa"
    df.loc[(df['word'].str.match(r"^(Λ|λ)$")), 'number'] = "lambda"
    df.loc[(df['word'].str.match(r"^(Μ|μ)$")), 'number'] = "mý"
    df.loc[(df['word'].str.match(r"^(Ν|ν)$")), 'number'] = "ný"
    df.loc[(df['word'].str.match(r"^(Ξ|ξ)$")), 'number'] = "xí"
    df.loc[(df['word'].str.match(r"^(Ο|ο)$")), 'number'] = "ómíkrón"
    df.loc[(df['word'].str.match(r"^(Π π)$")), 'number'] = "pí"
    df.loc[(df['word'].str.match(r"^(Ρ|ρ)$")), 'number'] = "hró"
    df.loc[(df['word'].str.match(r"^(Σ|σ)$")), 'number'] = "sigma"
    df.loc[(df['word'].str.match(r"^(Τ|τ)$")), 'number'] = "tá"
    df.loc[(df['word'].str.match(r"^(Υ|υ)$")), 'number'] = "upsílon"
    df.loc[(df['word'].str.match(r"^(Φ|φ)$")), 'number'] = "fí"
    df.loc[(df['word'].str.match(r"^(Χ|χ)$")), 'number'] = "kí"
    df.loc[(df['word'].str.match(r"^(Ψ|ψ)$")), 'number'] = "psí"
    df.loc[(df['word'].str.match(r"^(Ω|ω)$")), 'number'] = "ómega"
    print(f"replace_greek_letters done in {time.time() - start}")


In [77]:
replace_symbols(df_plain_sport, 'sport')
replace_greek_letters(df_plain_sport)

replace_symbols done in 3.4470958709716797
replace_greek_letters done in 4.970217943191528


In [80]:
sport_number = list(df_plain_sport['number'])
sport_tag = [""] + list(df_plain_sport['tag'][:-1])
df_sport_save = pd.DataFrame(list(zip(sport_number, sport_tag)), columns=["word", "tag"])

In [82]:
df_sport_save.to_csv("output_normalization.csv")

In [99]:
replace_symbols(df_plain_csv, 'other')
replace_greek_letters(df_plain_csv)

replace_symbols done in 184.33853673934937
replace_greek_letters done in 256.29939913749695
