In [2]:
# Units up to '万'. ChineseNumberUnit.create(..., small_unit=True) maps the
# character at index i to the power i + 1. The second string supplies both the
# "traditional" and the "big" form of each unit.
SMALLER_CHINESE_NUMERING_UNITS_SIMPLIFIED = u'十百千万'
SMALLER_CHINESE_NUMERING_UNITS_TRADITIONAL = u'拾佰仟萬'

# Units from '亿' upwards. Their powers depend on the numbering type
# (see ChineseNumberUnit.create).
LARGER_CHINESE_NUMERING_UNITS_SIMPLIFIED = u'亿兆京垓秭穰沟涧正载'
LARGER_CHINESE_NUMERING_UNITS_TRADITIONAL = u'億兆京垓秭穰溝澗正載'


# Digit characters; the index of a character in the string is its value.
CHINESE_DIGITS = u'零一二三四五六七八九'
BIG_CHINESE_DIGITS_SIMPLIFIED = u'零壹贰叁肆伍陆柒捌玖'
BIG_CHINESE_DIGITS_TRADITIONAL = u'零壹貳參肆伍陸柒捌玖'


# Types of Chinese numbering system (how the units above '亿' grow).
NUMBERING_TYPES = ['low', 'mid', 'high']

# Alternative characters for the digits 0, 1 and 2; create_system() stores
# them as alt_s / alt_t on the corresponding digits.
ZERO_ALT = u'〇'
ONE_ALT = u'幺'
TWO_ALTS = [u'两', u'兩']

# Math symbols as [simplified, traditional] pairs.
# NOTE(review): '正' is also one of the larger units above, and chn2num()
# looks characters up among the units first.
POSITIVE = [u'正', u'正']
NEGATIVE = [u'负', u'負']
POINT = [u'点', u'點']

class ChineseChar(object):
    """
    A Chinese character that has a simplified and a traditional form,
    e.g. simplified = '负', traditional = '負'.

    Either form may be None; subclasses create such placeholder symbols.

    :param simplified: simplified form of the character (or None)
    :param traditional: traditional form of the character (or None)
    """

    def __init__(self, simplified, traditional):
        self.simplified = simplified
        self.traditional = traditional

    def __str__(self):
        # __str__ must return a str: returning None makes str()/print()/repr()
        # raise TypeError, so fall back to the empty string when both forms
        # are missing.
        return self.simplified or self.traditional or ''

    def __repr__(self):
        # Delegates to __str__ so subclasses overriding __str__ get a
        # matching repr.
        return self.__str__()

class ChineseNumberUnit(ChineseChar):
    """
    A Chinese positional unit, i.e. a power of ten.

    Besides the simplified/traditional forms, every unit carries an extra
    "big" form for each script (big_s / big_t), e.g. '陆' and '陸'.
    """

    def __init__(self, power, simplified, traditional, big_s, big_t):
        super().__init__(simplified, traditional)
        self.power = power
        self.big_s, self.big_t = big_s, big_t

    def __str__(self):
        exponent = self.power
        return '10^{}'.format(exponent)

    @classmethod
    def create(cls, index, value, numbering_type=NUMBERING_TYPES[1], small_unit=False):
        """
        Build the unit found at position `index` of a unit table.

        `value` is a (simplified, traditional) pair. Small units get the
        power index + 1 and use the traditional character for both big
        forms; for the larger units the power depends on `numbering_type`.

        Raises ValueError for an unknown numbering type (only checked when
        `small_unit` is false).
        """
        # Position inside `value` that supplies big_s.
        big_s_slot = 0

        if small_unit:
            exponent = index + 1
            big_s_slot = 1
        elif numbering_type == NUMBERING_TYPES[0]:
            # 'low': every unit is ten times the previous one
            exponent = index + 8
        elif numbering_type == NUMBERING_TYPES[1]:
            # 'mid': every unit is ten thousand times the previous one
            exponent = (index + 2) * 4
        elif numbering_type == NUMBERING_TYPES[2]:
            # 'high': every unit is the square of the previous one
            exponent = 2 ** (index + 3)
        else:
            raise ValueError(
                'Counting type should be in {0} ({1} provided).'.format(NUMBERING_TYPES, numbering_type))

        return ChineseNumberUnit(power=exponent,
                                 simplified=value[0],
                                 traditional=value[1],
                                 big_s=value[big_s_slot],
                                 big_t=value[1])


class ChineseNumberDigit(ChineseChar):
    """
    A Chinese digit character.

    Stores the numeric value together with the simplified/traditional
    forms, the "big" forms and optional alternative forms.
    """

    def __init__(self, value, simplified, traditional, big_s, big_t, alt_s=None, alt_t=None):
        super().__init__(simplified, traditional)
        self.value = value
        self.big_s, self.big_t = big_s, big_t
        self.alt_s, self.alt_t = alt_s, alt_t

    def __str__(self):
        number = self.value
        return str(number)

    @classmethod
    def create(cls, i, v):
        """Build the digit with value `i` from `v` = (simplified, traditional, big_s, big_t)."""
        simplified, traditional = v[0], v[1]
        big_simplified, big_traditional = v[2], v[3]
        return ChineseNumberDigit(i, simplified, traditional, big_simplified, big_traditional)


class ChineseMath(ChineseChar):
    """
    A Chinese math symbol.

    Keeps the ASCII `symbol` and an optional `expression` callable next to
    the character forms; the "big" forms are the plain forms themselves.
    """

    def __init__(self, simplified, traditional, symbol, expression=None):
        super().__init__(simplified, traditional)
        self.symbol, self.expression = symbol, expression
        # Math symbols have no separate "big" spelling.
        self.big_s, self.big_t = simplified, traditional




class MathSymbol(object):
    """
    Math symbols (traditional/simplified) used by the Chinese number system, e.g.
    positive = ['正', '正']
    negative = ['负', '負']
    point = ['点', '點']
    """

    def __init__(self, positive, negative, point):
        self.positive, self.negative, self.point = positive, negative, point

    def __iter__(self):
        # Iterate over the stored symbols in attribute-definition order.
        yield from vars(self).values()

class NumberSystem(object):
    """
    A Chinese number system.

    Plain attribute container; create_system() attaches `units`, `digits`
    and `math` to an instance.
    """

# class OtherSymbol(object):
#     """
#     其他符号
#     """
#
#     def __init__(self, sil):
#         self.sil = sil
#
#     def __iter__(self):
#         for v in self.__dict__.values():
#             yield v


# ================================================================================ #
#                                    basic utils
# ================================================================================ #
def create_system(numbering_type=NUMBERING_TYPES[1]):
    """
    Build the number system for the given numbering type (default: 'mid').

    NUMBERING_TYPES = ['low', 'mid', 'high'] are the numbering system types:
        low:  '兆' = '亿' * '十' = $10^{9}$,  '京' = '兆' * '十', etc.
        mid:  '兆' = '亿' * '万' = $10^{12}$, '京' = '兆' * '万', etc.
        high: '兆' = '亿' * '亿' = $10^{16}$, '京' = '兆' * '兆', etc.

    Returns a NumberSystem with `units`, `digits` and `math` attributes.
    """

    # Units '亿' and larger; their powers depend on the numbering type.
    units_from_yi = []
    large_forms = zip(LARGER_CHINESE_NUMERING_UNITS_SIMPLIFIED,
                      LARGER_CHINESE_NUMERING_UNITS_TRADITIONAL)
    for position, forms in enumerate(large_forms):
        units_from_yi.append(CNU.create(position, forms, numbering_type, False))

    # Units '十', '百', '千', '万'.
    units_up_to_wan = []
    small_forms = zip(SMALLER_CHINESE_NUMERING_UNITS_SIMPLIFIED,
                      SMALLER_CHINESE_NUMERING_UNITS_TRADITIONAL)
    for position, forms in enumerate(small_forms):
        units_up_to_wan.append(CNU.create(position, forms, small_unit=True))

    # Digits: the plain characters serve as both simplified and traditional.
    digit_forms = zip(CHINESE_DIGITS, CHINESE_DIGITS,
                      BIG_CHINESE_DIGITS_SIMPLIFIED, BIG_CHINESE_DIGITS_TRADITIONAL)
    digit_symbols = [CND.create(number, forms)
                     for number, forms in enumerate(digit_forms)]

    # Alternative spellings of 0, 1 and 2.
    zero, one, two = digit_symbols[0], digit_symbols[1], digit_symbols[2]
    zero.alt_s = zero.alt_t = ZERO_ALT
    one.alt_s = one.alt_t = ONE_ALT
    two.alt_s = TWO_ALTS[0]
    two.alt_t = TWO_ALTS[1]

    # Math symbols: sign and decimal point.
    plus = CM(POSITIVE[0], POSITIVE[1], '+', lambda x: x)
    minus = CM(NEGATIVE[0], NEGATIVE[1], '-', lambda x: -x)
    dot = CM(POINT[0], POINT[1], '.',
             lambda x, y: float(str(x) + '.' + str(y)))

    number_system = NumberSystem()
    number_system.units = units_up_to_wan + units_from_yi
    number_system.digits = digit_symbols
    number_system.math = MathSymbol(plus, minus, dot)
    return number_system

# Short aliases for the symbol classes; create_system() and chn2num() refer to
# CNU, CND and CM by these names.
CC, CNU, CND, CM = ChineseChar, ChineseNumberUnit, ChineseNumberDigit, ChineseMath

def chn2num(chinese_string, numbering_type=NUMBERING_TYPES[1]):
    """
    Convert a Chinese numeral string into a string of Arabic digits,
    e.g. '二千零幺点三' -> '2001.3'.

    A leading negative sign ('负' / '負') produces a leading '-'.

    :param chinese_string: the numeral text; the part after the decimal
        point ('点' / '點'), if any, must consist of digit characters
    :param numbering_type: one of NUMBERING_TYPES, forwarded to create_system()
    :return: the number as a str ('<int>' or '<int>.<decimals>')
    :raises ValueError: if the string contains more than one decimal point,
        or if create_system() rejects `numbering_type`
    """

    def get_symbol(char, system):
        """Return the unit, digit or math symbol written as `char`, or None."""
        # Units are searched first, so a character that is both a unit and a
        # math symbol (e.g. '正') resolves to the unit.
        for u in system.units:
            if char in [u.traditional, u.simplified, u.big_s, u.big_t]:
                return u
        for d in system.digits:
            if char in [d.traditional, d.simplified, d.big_s, d.big_t, d.alt_s, d.alt_t]:
                return d
        for m in system.math:
            if char in [m.traditional, m.simplified]:
                return m
        return None

    def string2symbols(chinese_string, system):
        """Split at the decimal point and map both halves to symbol lists."""
        int_string, dec_string = chinese_string, ''
        for p in [system.math.point.simplified, system.math.point.traditional]:
            if p in chinese_string:
                int_string, dec_string = chinese_string.split(p)
                break
        return [get_symbol(c, system) for c in int_string], \
               [get_symbol(c, system) for c in dec_string]

    def correct_symbols(integer_symbols, system):
        """
        Make implicit units explicit, e.g.
        一百八 -> 一百八十
        一亿一千三百万 -> 一亿 一千万 三百万
        """

        # A leading '十' means '一十'.
        if integer_symbols and isinstance(integer_symbols[0], CNU):
            if integer_symbols[0].power == 1:
                integer_symbols = [system.digits[1]] + integer_symbols

        # A trailing digit right after a unit takes the next lower unit
        # ('一百八' -> '一百八十').
        if len(integer_symbols) > 1:
            if isinstance(integer_symbols[-1], CND) and isinstance(integer_symbols[-2], CNU):
                integer_symbols.append(
                    CNU(integer_symbols[-2].power - 1, None, None, None, None))

        result = []
        unit_count = 0  # units seen since the last digit
        for s in integer_symbols:

            if isinstance(s, CND):
                result.append(s)
                unit_count = 0
            elif isinstance(s, CNU):
                current_unit = CNU(s.power, None, None, None, None)
                unit_count += 1

            # Only the first unit after a digit is kept as its own symbol.
            if unit_count == 1:
                result.append(current_unit)

            # When reaching a unit such as '万' or '亿' (power divisible by 4),
            # fold it into every smaller unit collected so far.
            if unit_count >= 1 and current_unit.power % 4 == 0:
                for i in range(len(result)):
                    if (isinstance(result[-i - 1], CNU) and result[-i - 1].power < current_unit.power):
                        result[-i - 1] = CNU(result[-i - 1].power +
                                             current_unit.power, None, None, None, None)

        return result

    def compute_value(integer_symbols):
        """
        Compute the value.
        When the current unit is larger than every previous unit, it
        multiplies everything accumulated so far,
        e.g. '两千万' = 2000 * 10000 not 2000 + 10000
        """
        value = [0]
        last_power = 0
        for s in integer_symbols:
            if isinstance(s, CND):
                value[-1] = s.value
            elif isinstance(s, CNU):
                value[-1] *= pow(10, s.power)
                if s.power > last_power:
                    value[:-1] = [v * pow(10, s.power) for v in value[:-1]]
                    last_power = s.power
                value.append(0)
        return sum(value)

    system = create_system(numbering_type)
    int_part, dec_part = string2symbols(chinese_string, system)

    # The negative sign is recognised by get_symbol() but carries no value in
    # correct_symbols()/compute_value(); without this step '负十五' would come
    # out as '5'. Strip it here and re-apply it to the final string.
    negative = bool(int_part) and int_part[0] is system.math.negative
    if negative:
        int_part = int_part[1:]

    int_part = correct_symbols(int_part, system)
    int_str = str(compute_value(int_part))
    dec_str = ''.join([str(d.value) for d in dec_part])
    sign = '-' if negative else ''

    if dec_part:
        return '{0}{1}.{2}'.format(sign, int_str, dec_str)
    return sign + int_str


# Demo: convert one Chinese numeral (the captured output below shows 2001.3).
text = '二千零幺点三'

print(chn2num(text))


# Demo: list the (simplified, traditional) pairs of the larger units with
# their indices.
all_larger_units = zip(LARGER_CHINESE_NUMERING_UNITS_SIMPLIFIED, LARGER_CHINESE_NUMERING_UNITS_TRADITIONAL)
for i, v in enumerate(all_larger_units):
    print(i,v)


2001.3
0 ('亿', '億')
1 ('兆', '兆')
2 ('京', '京')
3 ('垓', '垓')
4 ('秭', '秭')
5 ('穰', '穰')
6 ('沟', '溝')
7 ('涧', '澗')
8 ('正', '正')
9 ('载', '載')


In [3]:
import re

class Cardinal:
    """
    Cardinal number, held as digits (`cardinal`) and/or Chinese text
    (`chntext`).
    """

    def __init__(self, cardinal=None, chntext=None):
        self.cardinal, self.chntext = cardinal, chntext

    def chntext2cardinal(self):
        """Return the Chinese text converted by chn2num()."""
        chinese = self.chntext
        return chn2num(chinese)
    

class Fraction:
    """
    Fraction, held as 'numerator/denominator' digits (`fraction`) and/or
    Chinese text of the form '<denominator>分之<numerator>' (`chntext`).
    """

    def __init__(self, fraction=None, chntext=None):
        self.fraction, self.chntext = fraction, chntext

    def chntext2fraction(self):
        """Convert '<denominator>分之<numerator>' to 'numerator/denominator'."""
        # The Chinese form names the denominator first.
        denominator, numerator = self.chntext.split('分之')
        top = chn2num(numerator)
        bottom = chn2num(denominator)
        return top + '/' + bottom

    def fraction2chntext(self):
        """Convert 'numerator/denominator' to '<denominator>分之<numerator>'."""
        # NOTE(review): num2chn is not defined in the visible part of this
        # file - confirm it exists before relying on this method.
        numerator, denominator = self.fraction.split('/')
        bottom = num2chn(denominator)
        top = num2chn(numerator)
        return bottom + '分之' + top
    
def remove_space(text):
    """Return `text` with every ASCII space character removed."""
    # Only ' ' is dropped; tabs, newlines and full-width spaces are kept.
    return text.replace(' ', '')
    
def text2money(text):
    """
    Convert a Chinese money phrase to digits followed by '元',
    e.g. '五十三元两分' -> '53.02元'.

    The phrase may contain any of the parts '<n>元', '<n>角' and a trailing
    fen amount. Formatting:
      * a fen part is present       -> two decimals ('53.02元')
      * only 角 after the 元 part   -> one decimal  ('5.3元')
      * yuan only                   -> an integer   ('5元')

    :param text: the money phrase
    :return: the amount as a str ending in '元'
    :raises ValueError: if '元' or '角' occurs more than once, or a part
        does not convert to an integer
    """
    # A phrase without '元' (e.g. '三角五分') is all jiao/fen; previously
    # such input was returned as '0元'.
    if '元' in text:
        yuan_text, remainder = text.split('元')
    else:
        yuan_text, remainder = '', text

    has_jiao = '角' in remainder
    if has_jiao:
        jiao_text, fen_text = remainder.split('角')
    else:
        jiao_text, fen_text = '', remainder

    yuan = int(chn2num(yuan_text)) if yuan_text else 0
    jiao = int(chn2num(jiao_text)) if jiao_text else 0
    fen = int(chn2num(fen_text)) if fen_text else 0

    # Sum in integer units and format explicitly, so float artefacts such as
    # '3.3000000000000003' can never reach the output.
    if fen_text:
        amount = format((yuan * 100 + jiao * 10 + fen) / 100, '.2f')
    elif has_jiao:
        amount = format((yuan * 10 + jiao) / 10, '.1f')
    else:
        amount = str(yuan)

    return amount + '元'


def text2date(text):
    """
    Convert a Chinese date phrase to digits,
    e.g. '两千零四年十一月八日' -> '2004年11月8日'.

    The year is converted as a whole number when it contains a unit
    ('十', '百', '千'); otherwise it is converted digit by digit. Month and
    day are converted with chn2num(). Each part is optional, but a phrase
    without '年' must contain '月'.

    :param text: the date phrase
    :return: the converted date as a str
    :raises ValueError: if '年' or '月' occurs more than once, or '月' is
        missing where a month is expected
    """
    has_year = '年' in text
    year, remainder = '', text

    if has_year:
        year, remainder = text.split('年')

        if re.search(r"[十百千]", year):
            # Written as a number, e.g. '两千零四'.
            year = chn2num(year)
        else:
            # Written digit by digit, e.g. '二零零四'.
            for digit_char in re.findall(r"[零一二两三四五六七八九]", year):
                year = year.replace(digit_char, chn2num(digit_char), 1)
        year = year + '年'

    # Start empty so a year-only phrase no longer fails on an unbound `day`.
    month = day = ''
    if remainder or not has_year:
        month, day = remainder.split('月')
        month = chn2num(month) + '月'
        if day:
            day = chn2num(day) + '日'

    return year + month + day


def text2digit(text):
    """Convert every character of `text` separately with chn2num() and concatenate the results."""
    return ''.join(chn2num(char) for char in text)

    


# Demo pipeline: normalise the Chinese numerals in a sentence step by step.
# The order matters: the specific patterns (money, date, fraction, digit
# strings) run before the general number pattern at the end.
# NOTE(review): every step uses text.replace(matcher, ..., 1), which rewrites
# the first occurrence of the matched text - not necessarily the position at
# which the regex matched.
text = '五十三元两分, 两千零四年十一月八日，电话号码是一三七七八九六四五二幺，给我十万零六点四块钱，五十六分之八百一十七，还有P2P'

# Step 1 - money: 元 / 角 / 分 phrases are converted by text2money().
pattern = re.compile(r"[零一二两三四五六七八九十百千]+[元]\D角\D分|\D角\D分|[零一二两三四五六七八九十]+元\D角|[零一二两三四五六七八九十百千]+元\D分")
matchers = pattern.findall(text)
if matchers:
    for matcher in matchers:
        text = text.replace(matcher, text2money(matcher),1)

# Step 2 - dates: 年 / 月 / 日 phrases are converted by text2date().
pattern = re.compile(r"[零一二两三四五六七八九十百千]+年\D{1,2}月\D{1,2}日|[零一二两三四五六七八九十]+年\D{1,2}月|\D{1,2}月\D{1,2}日")
matchers = pattern.findall(text)
if matchers:
    for matcher in matchers:
        text = text.replace(matcher, text2date(matcher),1)
    

# Step 3 - '二' between Latin letters becomes '2' (X二X -> X2X).
# findall() returns (whole match, left letters, right letters) tuples here.
pattern = re.compile(r"(([a-zA-Z]+)二([a-zA-Z]+))")
matchers = pattern.findall(text)
if matchers:
    for matcher in matchers:
        text = text.replace(matcher[0], matcher[1]+'2'+matcher[2], 1)

# Step 4 - fractions: '<denominator>分之<numerator>' -> 'numerator/denominator'.
pattern = re.compile(r"([零一二两三四五六七八九十百千]+分之[零一二两三四五六七八九十百千]+)")
matchers = pattern.findall(text)
if matchers:
    #print('fraction')
    for matcher in matchers:
        text = text.replace(matcher, Fraction(chntext=matcher).chntext2fraction(), 1)


# Step 5 - digit strings: runs of 4 to 30 digit characters (e.g. a phone
# number) are converted character by character with text2digit().
pattern = re.compile(r"([零一二三四五六七八九幺两]{4,30})")
matchers = pattern.findall(text)
if matchers:
    for matcher in matchers:
        text = text.replace(matcher, text2digit(matcher),1)
        
# Step 6 - remaining numbers: anything left that looks like a numeral is
# converted with chn2num().
pattern = re.compile(r"([零一二三四五六七八九十个百千万点亿兆京垓秭穰沟涧正载]+)")
matchers = pattern.findall(text)
if matchers:
    for matcher in matchers:
        text = text.replace(matcher, chn2num(matcher),1)
        
# Print the normalised sentence with ASCII spaces removed.
print(remove_space(text))

53.02元,2004年11月8日，电话号码是13778964521，给我100006.4块钱，817/56，还有P2P
