In [1]:
# Magnitude words.  The position of a word in its list decides the power of
# ten it stands for (see EnglishNumberUnit.create): "hundred" -> 10^2,
# "thousand" -> 10^3, "million" -> 10^6, "billion" -> 10^9, "trillion" -> 10^12.
SMALLER_ENGLISH_NUMERING_UNITS_SIMPLIFIED = [u'hundred', u'thousand']

LARGER_ENGLISH_NUMERING_UNITS_SIMPLIFIED = [u'million', u'billion', u'trillion']

# Number words that carry a value of their own (0-20 and the round tens).
# 'ninteen' and 'fivty' are historical misspellings; they are kept as aliases
# so that existing callers relying on them keep working, next to the correct
# spellings 'nineteen' and 'fifty' which were previously not recognised at all.
ENGLISH_DIGITS = {
    u'zero': 0, u'one': 1, u'two': 2, u'three': 3, u'four': 4,
    u'five': 5, u'six': 6, u'seven': 7, u'eight': 8, u'nine': 9,
    u'ten': 10, u'eleven': 11, u'twelve': 12, u'thirteen': 13,
    u'fourteen': 14, u'fifteen': 15, u'sixteen': 16, u'seventeen': 17,
    u'eighteen': 18, u'nineteen': 19, u'ninteen': 19,
    u'twenty': 20, u'thirty': 30, u'forty': 40,
    u'fifty': 50, u'fivty': 50,
    u'sixty': 60, u'seventy': 70, u'eighty': 80, u'ninety': 90,
}

# Supported numbering schemes; only the first entry is accepted by
# EnglishNumberUnit.create.
NUMBERING_TYPES = ['mid']

# Words for the sign and the decimal separator.
POSITIVE = [u'positive']
NEGATIVE = [u'negative']
POINT = [u'point']

class EnglishChar(object):
    """
    Base class for every symbol of the number system.

    ``simplified`` is the English word the symbol is written as (it may be
    ``None`` for symbols that are synthesised rather than read from text).
    """

    def __init__(self, simplified):
        self.simplified = simplified

    def __str__(self):
        # __str__ must return a string: returning None (as happens with
        # ``simplified or None``) makes str()/repr()/print() raise TypeError.
        return self.simplified or ''

    def __repr__(self):
        # Use the same text as str() so symbols are readable inside lists.
        return self.__str__()

class EnglishNumberUnit(EnglishChar):
    """
    A magnitude word, stored as the power of ten it multiplies by.

    Used for ordinary cardinals such as 2325 - "two thousand three hundred
    and twenty five".
    """

    def __init__(self, power, simplified):
        self.power = power
        super(EnglishNumberUnit, self).__init__(simplified)

    def __str__(self):
        return '10^{}'.format(self.power)

    @classmethod
    def create(cls, index, value, numbering_type=NUMBERING_TYPES[0], small_unit=False):
        """
        Build the unit for the word ``value`` found at position ``index``.

        Small units get the power ``index + 2``; all other units get
        ``(index + 2) * 3`` and require ``numbering_type`` to be the first
        entry of NUMBERING_TYPES, otherwise ValueError is raised.
        """
        if not small_unit and numbering_type != NUMBERING_TYPES[0]:
            raise ValueError(
                'Counting type should be in {0} ({1} provided).'.format(NUMBERING_TYPES, numbering_type))

        exponent = index + 2
        if not small_unit:
            exponent *= 3
        return EnglishNumberUnit(power=exponent, simplified=value)


class EnglishNumberDigit(EnglishChar):
    """
    A number word that carries its own numeric value.

    Used when reading digits one by one, e.g. 2325 - "two three two five".
    """

    def __init__(self, value, simplified):
        self.value = value
        super(EnglishNumberDigit, self).__init__(simplified)

    def __str__(self):
        return str(self.value)

    @classmethod
    def create(cls, i, v):
        """Return the digit whose numeric value is ``i`` and whose word is ``v``."""
        return EnglishNumberDigit(value=i, simplified=v)


class EnglishMath(EnglishChar):
    """A sign or decimal-point word together with its written symbol."""

    def __init__(self, simplified, symbol, expression=None):
        super(EnglishMath, self).__init__(simplified)
        # ``big_s`` mirrors the word passed in as ``simplified``.
        self.big_s = simplified
        # Written form of the symbol, e.g. '+', '-' or '.'.
        self.symbol = symbol
        # Optional callable supplied by the caller for this symbol.
        self.expression = expression




class MathSymbol(object):
    """Container for the three math symbols: positive, negative and point."""

    def __init__(self, positive, negative, point):
        self.positive = positive
        self.negative = negative
        self.point = point

    def __iter__(self):
        # Iterating the container walks over every attribute value stored
        # on the instance.
        yield from self.__dict__.values()

class NumberSystem(object):
    """Plain attribute holder; create_system attaches units, digits and math to it."""

# class OtherSymbol(object):
#     """
#     Other symbols
#     """
#
#     def __init__(self, sil):
#         self.sil = sil
#
#     def __iter__(self):
#         for v in self.__dict__.values():
#             yield v


# ================================================================================ #
#                                    basic utils
# ================================================================================ #
def create_system(numbering_type=NUMBERING_TYPES[0]):
    """
    Build the NumberSystem used to look number words up.

    The returned object carries three attributes:
        units  - the smaller units followed by the larger units
        digits - one EnglishNumberDigit per entry of ENGLISH_DIGITS
        math   - a MathSymbol holding the positive, negative and point symbols

    ``numbering_type`` is forwarded to EnglishNumberUnit.create for the
    larger units; the default is the first entry of NUMBERING_TYPES.
    """
    # units: larger ones honour the numbering type, smaller ones do not
    big_units = [
        ENU.create(position, word, numbering_type, False)
        for position, word in enumerate(LARGER_ENGLISH_NUMERING_UNITS_SIMPLIFIED)
    ]
    small_units = [
        ENU.create(position, word, small_unit=True)
        for position, word in enumerate(SMALLER_ENGLISH_NUMERING_UNITS_SIMPLIFIED)
    ]

    # digits
    digit_symbols = [END.create(number, word) for word, number in ENGLISH_DIGITS.items()]

    # sign and decimal point symbols
    plus = EM(POSITIVE[0], '+', lambda x: x)
    minus = EM(NEGATIVE[0], '-', lambda x: -x)
    dot = EM(POINT[0], '.', lambda x, y: float(str(x) + '.' + str(y)))

    # assemble the system
    system = NumberSystem()
    system.units = small_units + big_units
    system.digits = digit_symbols
    system.math = MathSymbol(plus, minus, dot)
    return system

# Short aliases for the symbol classes; ENU, END and EM are the names that
# create_system and chn2num use to refer to them.
EC, ENU, END, EM = EnglishChar, EnglishNumberUnit, EnglishNumberDigit, EnglishMath

def chn2num(english_string, numbering_type=NUMBERING_TYPES[0]):
    """
    Convert a spelled-out English number into its digit string.

    This is the main conversion function, e.g.
    "two thousand and fifty eight" -> "2058" and
    "three point one four" -> "3.14".

    Parameters:
        english_string: the number in words.  Case is ignored, and words that
            are not part of the number system (such as "and") are skipped.
        numbering_type: forwarded to create_system; an unsupported value makes
            it raise ValueError.

    Returns:
        The number as a string.  When a "point" word is followed by at least
        one digit word the result is "<integer>.<digits>", otherwise it is
        the integer part alone.
    """

    def get_symbol(word, system):
        """Return the unit, digit or math symbol written as ``word``, else None."""
        for family in (system.units, system.digits, system.math):
            for symbol in family:
                if word == symbol.simplified:
                    return symbol
        return None

    def string2symbols(english_string, system):
        """Split the words at the first "point" and map both halves to symbols."""
        words = english_string.split()
        point_word = system.math.point.simplified
        # Compare whole words so that e.g. "appointed" is not mistaken for
        # the decimal separator, and a second "point" cannot break the split.
        if point_word in words:
            cut = words.index(point_word)
            int_words, dec_words = words[:cut], words[cut + 1:]
        else:
            int_words, dec_words = words, []
        return ([get_symbol(w, system) for w in int_words],
                [get_symbol(w, system) for w in dec_words])

    def compute_value(integer_symbols):
        """
        Add up the integer part.

        Digit words accumulate into the current group ("thirty" + "two").
        "hundred" multiplies the current group.  Units whose power is a
        multiple of three ("thousand", "million", ...) close the group and
        add it, scaled, to the running total.
        """
        total = 0
        group = None          # None means no digit word seen in this group yet
        largest_power = 0     # largest closing unit met so far
        for symbol in integer_symbols:
            if isinstance(symbol, END):
                group = (group or 0) + symbol.value
            elif isinstance(symbol, ENU):
                scale = pow(10, symbol.power)
                multiplier = 1 if group is None else group
                if symbol.power % 3:
                    # "hundred": stays inside the current group
                    group = multiplier * scale
                    continue
                if total and symbol.power > largest_power:
                    # A larger unit after a smaller one scales everything read
                    # so far, e.g. "two thousand million" = 2000 * 10^6.
                    total = (total + (group or 0)) * scale
                else:
                    total += multiplier * scale
                largest_power = max(largest_power, symbol.power)
                group = None
            # anything else (None for unknown words, math symbols) is skipped
        return total + (group or 0)

    system = create_system(numbering_type)
    # Lower-case first so that number words and "and" are recognised in any case.
    int_part, dec_part = string2symbols(english_string.lower(), system)
    int_str = str(compute_value(int_part))
    # Only digit words contribute to the fractional part.
    dec_str = ''.join(str(d.value) for d in dec_part if isinstance(d, END))

    if dec_str:
        return '{0}.{1}'.format(int_str, dec_str)
    return int_str


# Demo input: a spelled-out cardinal that mixes capitalisation and "and" connectors.
text = 'Thirteen million and two thousand four hundred and thirty two'

print(chn2num(text)) # prints the digit string, 13002432 for this input


13002432


In [3]:
import re
from goto import with_goto
def matching(string, matching_list, low_bound=2, up_bound=None):
    """
    Find runs of consecutive words that all belong to ``matching_list``.

    Commas and periods are removed from ``string`` and it is split on single
    spaces.  The longest possible run is searched for first; every time a run
    is found it is recorded, removed from the word list, and the search
    starts over on what is left.

    Parameters:
        string: the text to search.
        matching_list: collection of hashable words that may form a run.
        low_bound: exclusive lower limit for the run length, i.e. only runs
            of more than ``low_bound`` words are reported.
        up_bound: inclusive upper limit for the run length; defaults to the
            number of words in ``string``.

    Returns:
        A list of the matched runs, each joined with single spaces, in the
        order in which they were found.
    """
    words = string.replace(',', '').replace('.', '').split(' ')
    if up_bound is None:
        up_bound = len(words)

    vocabulary = set(matching_list)
    # A run needs at least one word; without this floor a negative low_bound
    # would keep "finding" the empty run forever.
    shortest_excluded = max(low_bound, 0)

    result = []
    found = True
    while found:
        found = False
        # Never look for runs longer than the words that are left, and never
        # let that clamp push the length down to low_bound or below.
        longest = min(up_bound, len(words))
        for length in range(longest, shortest_excluded, -1):
            for start in range(len(words) - length + 1):
                window = words[start:start + length]
                if all(word in vocabulary for word in window):
                    result.append(' '.join(window))
                    del words[start:start + length]  # remove the matched run
                    found = True
                    break
            if found:
                break  # restart with the longest length on the remaining words

    return result

# Ordinal words (singular and plural forms) mapped to their numeric value;
# text2fraction uses the value as the denominator of a fraction.
ORDINAL = {u'first':1, u'second':2, u'third':3, u'thirds':3, u'fourth':4, u'fourths':4, u'fifth':5, u'fifths':5, \
            u'sixth':6, u'sixths':6, u'seventh':7, u'sevenths':7, u'eighth':8, u'eighths':8,\
            u'ninth':9, u'ninths':9, u'tenth':10, u'tenths':10, u'eleventh':11, u'elevenths':11, u'twelfth':12, \
           u'twelfths':12, u'thirteenth':13, u'thirteenths':13, u'fourteenth': 14, u'fourteenths': 14, \
           u'fifteenth':15, u'fifteenths':15, u'sixteenth':16, u'sixteenths':16, u'seventeenth':17, \
           u'seventeenths':17, u'eighteenth':18, u'eighteenths':18, u'nineteenth':19, u'nineteenths':19, \
          u'twentieth':20,  u'twentieths':20, u'thirtieth':30, u'thirtieths':30, u'fortieth':40, u'fortieths':40, \
           u'fiftieth': 50,  u'fiftieths': 50, u'sixtieth':60, u'sixtieths':60,  u'seventieth':70, \
           u'seventieths':70, u'eightieth':80, u'eightieths':80, u'ninetieth':90, u'ninetieths':90}

# Lower-case month names mapped to their month number (1-12).
MONTH = {u'january':1, u'february':2, u'march':3, u'april':4, u'may':5, u'june':6, u'july':7, u'august':8,\
         u'september': 9, 'october': 10, 'november':11, 'december':12}

# Flat word lists holding the keys of the lookup tables, in table order.
ENGLISH_DIGITS_LIST = list(ENGLISH_DIGITS)
ORDINAL_LIST = list(ORDINAL)
MONTH_LIST = list(MONTH)

# Every word that can be part of a cardinal number: units first, then digits.
ENGLISH_NUMERING_UNITS_SIMPLIFIED = (
    SMALLER_ENGLISH_NUMERING_UNITS_SIMPLIFIED
    + LARGER_ENGLISH_NUMERING_UNITS_SIMPLIFIED
    + ENGLISH_DIGITS_LIST
)

    

# this function is designed for fractions like 1/3, 12/5
def text2fraction(text, ordinals):
    """
    Turn "<cardinal> <ordinal>" phrases into "<numerator>/<denominator>".

    Parameters:
        text: the sentence to scan; commas and periods are ignored.
        ordinals: ordinal words to look for (keys of ORDINAL).  An ordinal
            listed several times is matched against its successive
            occurrences in ``text``; one that does not occur is skipped.

    Returns:
        A pair ``(fraction_text, result)`` of equally long lists:
        ``fraction_text[i]`` is the phrase found in the text and ``result[i]``
        is its fraction string.  Ordinals without a cardinal directly in
        front of them produce no entry.
    """
    words = text.replace(',', '').replace('.', '').split(' ')
    number_words = set(ENGLISH_NUMERING_UNITS_SIMPLIFIED)

    result = []
    fraction_text = []
    search_from = {}  # per ordinal: index at which the next lookup starts
    for ordinal in ordinals:
        try:
            position = words.index(ordinal, search_from.get(ordinal, 0))
        except ValueError:
            continue  # this ordinal is not (or no longer) present in the text
        search_from[ordinal] = position + 1

        # The cardinal is the run of number words (and "and") right before
        # the ordinal; walk left until something else is met.
        first = position
        while first > 0 and (words[first - 1] in number_words or words[first - 1] == 'and'):
            first -= 1
        cardinal = words[first:position]

        # A leading "and" joins two phrases and is not part of the number.
        # The emptiness check also covers ordinals with nothing in front.
        while cardinal and cardinal[0] == 'and':
            cardinal = cardinal[1:]

        if cardinal:
            phrase = ' '.join(cardinal)
            result.append(str(chn2num(phrase)) + '/' + str(ORDINAL[ordinal]))
            fraction_text.append(phrase + ' ' + ordinal)

    return fraction_text, result
    

# strip every space character out of a sentence
def remove_space(text):
    """Return ``text`` with all ' ' characters removed (other whitespace is kept)."""
    pieces = text.split(' ')
    return ''.join(pieces)


def text2date(text):
    """
    Translate a month name into its month number.

    Returns the number as a string when ``text`` equals a key of MONTH,
    and an empty string otherwise.
    """
    return ''.join(str(number) for name, number in MONTH.items() if text == name)


def text2digit(text):
    """
    Convert digit words one by one and concatenate the results.

    Example: "two three four" -> "234"
    """
    return ''.join(chn2num(word) for word in text.split(' '))

    

# the input text of ITN
text = 'six six seven eight two nine, sixty two, this is P two P, the result is one hundred thirty two thirds\
 and one seventeenth, today is two thousand twenty two July six. By the way, the price is eleven dollars and forty two cents'
text = text.lower()
print('Input is: ', text)


def _replace_first_phrase(source, phrase, replacement):
    """Replace the first whole-word occurrence of ``phrase`` in ``source``."""
    if not phrase:
        return source
    # Word boundaries keep e.g. "six" from being rewritten inside "sixfold";
    # the callable stops re.sub from interpreting the replacement text.
    whole_phrase = r'\b' + re.escape(phrase) + r'\b'
    return re.sub(whole_phrase, lambda _found: replacement, source, count=1)


# X two X to X2X
pattern = re.compile(r"(([, ][a-zA-Z]) two ([a-zA-Z][ ,]))")
for matcher in pattern.findall(text):
    text = text.replace(matcher[0], matcher[1]+'2'+matcher[2], 1)

# text to fraction
# look for single ordinal words, then rewrite "<cardinal> <ordinal>" phrases
ordinals = matching(text, ORDINAL_LIST, low_bound = 0, up_bound = 1)
fraction_texts, results = text2fraction(text, ordinals)
# Pair phrases with their fractions positionally, so that a phrase occurring
# twice is not looked up by value.
for fraction_text, fraction in zip(fraction_texts, results):
    text = _replace_first_phrase(text, fraction_text, fraction)

# text to digits
# example: two two three four nine to 22349
DIGITS = [u'zero',u'one',u'two',u'three',u'four',u'five',u'six',u'seven',u'eight',u'nine']
for matcher in matching(text, DIGITS, low_bound = 2):
    text = _replace_first_phrase(text, matcher, text2digit(matcher))

# text to number
# example: two thousand three hundred fifty six - to 2356
for matcher in matching(text, ENGLISH_NUMERING_UNITS_SIMPLIFIED, low_bound = 0):
    text = _replace_first_phrase(text, matcher, chn2num(matcher))


# text to date
# The separator pattern must not contain an empty alternative: since Python
# 3.7 re.split also splits on empty matches, which would cut the text into
# single characters and no month word would ever be found.
text_list = re.split(r', |/|\. | ', text)
for i, token in enumerate(text_list):
    if token not in MONTH_LIST: # searching the month related words
        continue
    # transferring ' ' or ', ' to '/' between a number and the month
    if i >= 1 and re.fullmatch(r'[0-9]{1,4}', text_list[i - 1]):
        before = text_list[i - 1]
        text = text.replace(before + ' ' + token, before + '/' + token)
        text = text.replace(before + ', ' + token, before + '/' + token)
    if i + 1 < len(text_list):
        after = text_list[i + 1]
        # The last token of the text may still carry its closing punctuation.
        if re.fullmatch(r'[0-9]{1,2}', after.rstrip('.,')):
            text = text.replace(token + ' ' + after, token + '/' + after)
            text = text.replace(token + ', ' + after, token + '/' + after)


# transferring month names to numbers; substituting through the pattern
# rewrites exactly the whole-word matches and nothing inside longer words
pattern = re.compile(r'\b(?:%s)\b' % '|'.join(MONTH_LIST))
text = pattern.sub(lambda found: text2date(found.group(0)), text)

print('=============================================================================')
print('Output is: ', text) # output of ITN


Input is:  six six seven eight two nine, sixty two, this is p two p, the result is one hundred thirty two thirds and one seventeenth, today is two thousand twenty two july six. by the way, the price is eleven dollars and forty two cents
Output is:  667829, 62, this is p2p, the result is 132/3 and 1/17, today is 2022/7/6. by the way, the price is 11 dollars and 42 cents
