# Exercise 4: Numerical Values

The goal of this exercise is to create a script for detecting and normalizing numerical values like "1/4" or "thirty one". We declare a new annotation type named "NumericValue" with a feature "value" of the type DOUBLE. Each line of the input document contains two numerical expressions separated by semicolons. The first one is the text passage that should be annotated with "NumericValue", and the second one is the expected value of its feature "value". We try to annotate as many expressions as possible with the correct feature value.

In [None]:
%%documentText
101;101;
2.3;2.3;
3,000;3000;
3^2;9;
3.3e3;3300;
1/4;0.25;
9^1/2;3;
4x10^3;4000;
5.5*4^5;5632;
thirty one;31;
three hundred;300;
four thousand one hundred and two;4102;
3 million;3000000;
fünfundzwanzig;25;
½ million;500000;
4²;16;

In [None]:
%displayMode DYNAMIC_HTML

// External lookup tables (CSV resources) that the MARKTABLE actions of this script read.
// The CSV contents are not part of this cell; their column layout is implied only by
// the column indexes passed to the MARKTABLE calls.
// NOTE(review): "SpecialChararactorFractionTable" is misspelled ("Chararactor"). The script
// refers to the table under exactly this name, so a rename has to touch every use.
WORDTABLE NumberTable = 'Numbers.csv';
WORDTABLE MultiplicatorTable = 'Multiplicators.csv';
WORDTABLE SpecialChararactorFractionTable = 'SpecialCharacterFractions.csv';
WORDTABLE SpecialCharacterTable = 'SpecialCharacters.csv';
WORDTABLE ExponentTable = 'Exponents.csv';

// Result type of this script: a recognized numeric expression with its normalized value.
DECLARE NumericValue (DOUBLE value);

// Fraction helpers. SimpleFraction carries plain integer parts, Fraction refers to
// two NumericValue annotations as numerator and denominator.
DECLARE SimpleFraction(INT numerator, INT denominator);
DECLARE Fraction(NumericValue numerator, NumericValue denominator);

// Marker for connecting words between number parts.
DECLARE ConjunctionWord;

// NormNum carries a numeric value; the indicator types below inherit from it.
DECLARE NormNum (DOUBLE value);
DECLARE NormNum NumInd;
DECLARE NormNum MultiplicatorInd;
DECLARE NormNum ExponentInd;

// Configuration of the number parser.
// allowPeriodDecimalSeparator: enables the block "additionalPeriodSep", which accepts
//   "NUM . NUM" as a decimal number parsed with the "en" locale.
// decimalSeparatorString / thousandsSeparatorString: used as REGEXP patterns on
//   punctuation marks, hence the escaped period.
// language: locale argument handed to PARSE for plain numbers; defaults to "en".
BOOLEAN allowPeriodDecimalSeparator = true;
STRING decimalSeparatorString = "\\.";
STRING thousandsSeparatorString = ",";
STRING language = "en";

// Override the default language from the document: first from the "language" feature of
// the document annotation, then from any LanguageContainer annotation. The second rule
// runs later, so its value wins when such an annotation exists.
DECLARE LanguageContainer (STRING language);
Document{IS(DocumentAnnotation)-> GETFEATURE("language", language)};
LanguageContainer{-> GETFEATURE("language", language)};

// Marker types for punctuation that acts as decimal or thousands separator.
DECLARE DecimalSeparator, ThousandsSeparator;

// Mark every occurrence of "and", "und" or "et" as ConjunctionWord. This is a regular
// expression rule on the document text, so hits may lie inside longer words; later rules
// rely on that (PARTOF(W) for compound number words) and remove hits that fall inside
// multiplier words.
"and|und|et" -> ConjunctionWord;

// Classify each punctuation mark (PM) by matching it against the configured
// separator patterns.
FOREACH(pm) PM {} {
    pm{REGEXP(decimalSeparatorString) -> DecimalSeparator};
    pm{REGEXP(thousandsSeparatorString) -> ThousandsSeparator};
}


// Dictionary lookups. Each MARKTABLE call annotates text found in the given column of a
// table with the given type and fills features from other columns ("feature" = column).
// The arguments (true, N, "", 2) between the table and the feature mappings are the
// optional matching options of MARKTABLE (ignoreCase, ignoreLength, ignoreChar,
// maxIgnoreChar according to the Ruta reference).
// NOTE(review): NumberTable is looked up with 2 as the second option while all other
// tables use 4 - confirm that this difference is intended.
MARKTABLE(NumericValue, 2, NumberTable, true, 2, "", 2, "value" = 1);
MARKTABLE(MultiplicatorInd, 2, MultiplicatorTable, true, 4, "", 2, "value" = 1);
MARKTABLE(SimpleFraction, 3, SpecialChararactorFractionTable, true, 4, "", 2, "numerator" = 1, "denominator" = 2);
MARKTABLE(NumericValue, 2, SpecialCharacterTable, true, 4, "", 2,  "value" = 1);
MARKTABLE(ExponentInd, 2, ExponentTable, true, 4, "", 2,  "value" = 1);

// A ConjunctionWord hit inside a multiplier word is a false positive of the regular
// expression rule (presumably e.g. "and" in "thousand" - table contents are not visible
// here), so it is removed again.
ConjunctionWord{PARTOF(MultiplicatorInd)-> UNMARK(ConjunctionWord)};

// Scratch variable shared by all following rules: each rule stores its intermediate
// result here before creating the resulting NumericValue annotation.
DOUBLE value;

// normal numbers like 1,000.95
// Whitespace is made visible so that the parts of a number must be directly adjacent.
RETAINTYPE(WS);
// A NUM that is not already part of a NumericValue, optionally followed by groups of
// thousands separator + a NUM of exactly three characters, and an optional decimal part.
// PARSE is used as a condition: the annotation is only created if the covered text can
// be parsed into "value" with the configured language.
(NUM{-PARTOF(NumericValue)} (ThousandsSeparator NUM{REGEXP("...")})* (DecimalSeparator NUM)?)
    {PARSE(value, language) -> CREATE(NumericValue, "value" = value)};
// Discard a number that directly follows a word, unless that word is "e" or "x"
// (case-insensitive); those two are needed later for "3.3e3" and "4x10".
W{-REGEXP("[ex]", true)} @NumericValue{-> UNMARK(NumericValue)};
// Restore the default filtering (whitespace is skipped again).
RETAINTYPE;

// Only executed when allowPeriodDecimalSeparator is true: two NUM tokens that are each a
// NumericValue of their own and are separated by a period are merged into one number,
// parsed with the fixed "en" locale.
// NOTE(review): presumably meant for documents whose configured language/separator does
// not treat the period as decimal separator - confirm.
BLOCK(additionalPeriodSep) Document{IF(allowPeriodDecimalSeparator)} {
    (NUM{IS(NumericValue)-> UNMARK(NumericValue)} PERIOD NUM{IS(NumericValue)-> UNMARK(NumericValue)})
        {PARSE(value, "en") -> CREATE(NumericValue, "value" = value)};
}

// fractions like 3/4
// Two NumericValues separated by "/" are gathered into a Fraction spanning rule elements
// 1 to 3. The annotations matched by elements 1 and 3 become its numerator and
// denominator, and both are unmarked as standalone NumericValues.
NumericValue{-> UNMARK(NumericValue)} SPECIAL{REGEXP("/")} NumericValue{-> UNMARK(NumericValue), 
    GATHER(Fraction,1,3, "numerator" = 1, "denominator" = 3)};

// Every Fraction yields a NumericValue holding the quotient of its parts.
// NOTE(review): there is no guard against a denominator value of 0.
f:Fraction{-> CREATE(NumericValue, "value" = (f.numerator.value / f.denominator.value))};
// SimpleFraction annotations come from the special character fraction table lookup.
// NOTE(review): both operands are INT features - confirm that the division is evaluated
// as floating point when assigned to the DOUBLE feature.
sf:SimpleFraction{-> CREATE(NumericValue, "value" = (sf.numerator / sf.denominator))};


// exponents like 2^3, 2.3e13, 4²
// All three rules follow the same pattern: the first element stores the base in "value"
// and unmarks it, a later element combines "value" with the exponent, and the action on
// the enclosing composed element creates one NumericValue covering the whole expression.

// Base followed by an exponent indicator from the exponent table (e.g. "²"):
// value = base ^ ExponentInd.value.
(NumericValue{-> ASSIGN(value, NumericValue.value), UNMARK(NumericValue)} 
    ExponentInd{-> ASSIGN(value, (POW(value, ExponentInd.value)))})
    {-> CREATE(NumericValue, "value" = value)};

// Base "^" exponent: value = base ^ exponent.
(NumericValue{-> ASSIGN(value, NumericValue.value), UNMARK(NumericValue)} 
    SPECIAL{REGEXP("\\^")}
    NumericValue{-> ASSIGN(value, (POW(value, NumericValue.value))), UNMARK(NumericValue)})
    {-> CREATE(NumericValue, "value" = value)};

// Scientific notation, mantissa "e" exponent (case-insensitive):
// value = mantissa * 10 ^ exponent.
(NumericValue{-> ASSIGN(value, NumericValue.value), UNMARK(NumericValue)} 
    W{REGEXP("e", true)}
    NumericValue{-> ASSIGN(value, value * (POW(10, NumericValue.value))), UNMARK(NumericValue)})
    {-> CREATE(NumericValue, "value" = value)};

// multiplication like 3x4, 2*2
// Two NumericValues joined by "x" (case-insensitive) or "*" are replaced by a single
// NumericValue holding their product. This runs after the exponent rules, so an
// expression like "5.5*4^5" multiplies with the already evaluated power.

(NumericValue{-> ASSIGN(value, NumericValue.value), UNMARK(NumericValue)} 
    ANY{REGEXP("x|\\*", true)}
    NumericValue{-> ASSIGN(value, (value * NumericValue.value)), UNMARK(NumericValue)})
    {-> CREATE(NumericValue, "value" = value)};


// combination with multipliers like 3 million
// Folds a sequence of number parts into one NumericValue, using "value" as running total:
//   1. a NumericValue starts the total,
//   2. an optional "-" (with a word within one token ahead) may follow,
//   3. an optional second NumericValue is added (e.g. "thirty one"),
//   4. any number of groups: a MultiplicatorInd multiplies the total by
//      10 ^ MultiplicatorInd.value, and an optional NumericValue is added afterwards.
// Every element after the first is optional, so the rule also matches a single
// NumericValue and re-creates it with the same value.
// NOTE(review): the multiplier is applied to the whole running total, so an input like
// "four thousand one hundred" would appear to evaluate as (4*10^3 + 1)*10^2; the rule
// also has no element for a ConjunctionWord such as "and" - confirm against the expected
// 4102 in the sample document.
(NumericValue{-> ASSIGN(value, NumericValue.value), UNMARK(NumericValue)} 
    SPECIAL?{REGEXP("-"), NEAR(W,0,1,true)}
   NumericValue?{-> ASSIGN(value, value + NumericValue.value), UNMARK(NumericValue)}
   (
       MultiplicatorInd{-> ASSIGN(value, value * (POW(10, MultiplicatorInd.value)))} 
       NumericValue?{-> ASSIGN(value, value + NumericValue.value), UNMARK(NumericValue)}
   )*
   ){-> CREATE(NumericValue, "value" = value)};

// compound number words such as "fünfundzwanzig" (listed as 25 in the sample document)
// Matching starts at a ConjunctionWord (anchor "@") that has a NumericValue inside a word
// on both sides. The second number must be non-zero and integer-valued (value % 1 == 0).
// Both parts are unmarked and replaced by one NumericValue holding their sum.
(NumericValue{PARTOF(W)-> ASSIGN(value, NumericValue.value), UNMARK(NumericValue)} 
    @ConjunctionWord NumericValue.value!=0{PARTOF(W),IF((NumericValue.value%1) == 0)
       -> ASSIGN(value, value + NumericValue.value), UNMARK(NumericValue)})
       {-> CREATE(NumericValue, "value" = value)};