In [1]:
import re

def get_text(filepath):
    """
        Read a UTF-8 text file and return its whole content as one string.

        filepath: path of the file to read.
        returns:  the concatenation of every line of the file, in order.
    """
    with open(filepath, "r", encoding ='utf-8') as source:
        # Iterating the file yields its lines; joining them rebuilds the text.
        return "".join(source)


def cut_top(text):
    """
        Drop everything that precedes the last e-mail header found in text.

        A header is a "From:" / "Từ:" marker that is followed, on the same
        line, by some text containing "@" and then a "Date:" / "Ngày:" marker.
        The text before the seller information is not necessary, so the
        returned string starts at the last such header.

        text:    the raw body text.
        returns: text from the last valid header onwards, or the whole text
                 unchanged when no valid header exists.
    """
    begin_keyword = "From:|Từ:"
    check_regex = re.compile(r"(From:|Từ:).+[@].+(Date:|Ngày:)")

    begin = 0
    # Inspect every candidate marker: a marker that fails the header check
    # (e.g. "From:" appearing in ordinary prose) must not stop the scan,
    # otherwise valid headers further down would never be considered.
    for from_found in re.finditer(begin_keyword, text):
        # Anchoring the check at the marker position avoids re-slicing text.
        if check_regex.match(text, from_found.start()):
            begin = from_found.start()

    return text[begin:]
        


def truncate(text):
    """
        Strip the parts of the text that are not needed for extraction.

        Currently this only removes the leading part via cut_top().
    """
    return cut_top(text)

In [2]:
##################### GET COST #######################
def string_to_float(numstring):
    """
        Convert an amount written with ',' / '.' separators to a number.

        If the third character from the end is not a digit and the last two
        characters are digits, those two digits are taken as the decimal
        part ("1,234.56" -> 1234.56, "1.234,56" -> 1234.56). Every other
        non-digit character is ignored ("1.623.800" -> 1623800).

        numstring: the amount as text.
        returns:   an int when there is no decimal part, otherwise a float.
                   A string without any digit yields 0.
    """
    whole_part = numstring
    cents = None
    # Only treat the tail as decimals when it really is "<separator>dd";
    # otherwise int() would fail on inputs such as "ab1".
    if (len(numstring) > 2
            and not numstring[-3].isdecimal()
            and numstring[-2:].isdecimal()):
        cents = int(numstring[-2:])
        whole_part = numstring[:-3]

    whole = 0
    for char in whole_part:
        if char.isdecimal():
            whole = whole * 10 + int(char)

    if cents is None:
        return whole
    # One division of exact integers is correctly rounded, whereas summing
    # d1/10 + d2/100 accumulates error ("0.15" gave 0.15000000000000002).
    return (whole * 100 + cents) / 100

def get_currency(string):
    """
        Name the currency mentioned in string.

        Each marker is used as a case-insensitive regular expression and the
        markers are tried in order; the first one found decides the result.

        returns: 'Dollar', 'VND', or "" when no marker is found.
    """
    CURRENCIES = (
        ('[$]', 'Dollar'),
        ('USD', 'Dollar'),
        ('VND', 'VND'),
        ('₫', 'VND'),
        ('đ', 'VND'),
    )
    matching_names = (
        name
        for marker, name in CURRENCIES
        if re.search(marker, string, re.IGNORECASE)
    )
    return next(matching_names, "")

def get_cost_by_cost_regex_and_keyword(text, cost_regex, keyword):
    """
        Find the amount that follows the last usable occurrence of keyword.

        The text is split into segments, each running from the end of one
        keyword occurrence to the start of the next occurrence (or to the end
        of the text for the last one). In every segment the first match of
        cost_regex is a candidate; the candidate of the latest segment wins.

        text:       the text to scan.
        cost_regex: regular expression describing an amount.
        keyword:    regular expression (matched case-insensitively) that
                    introduces an amount.
        returns:    (matched amount text, its start index in text), or None
                    when the keyword is absent or no amount follows it.
    """
    keyword_found = list(re.finditer(keyword, text, re.IGNORECASE))
    if not keyword_found:
        return None

    # Right edge of each segment: the next keyword's start, and the real end
    # of the text for the last one (slicing to -1 used to drop the final
    # character and truncate an amount sitting at the very end).
    segment_ends = [match.start() for match in keyword_found[1:]] + [len(text)]

    result = None
    for match, right in zip(keyword_found, segment_ends):
        left = match.end()
        # Search a slice so the regex cannot see characters outside the segment.
        cost_found = re.search(cost_regex, text[left:right])
        if cost_found:
            result = (cost_found.group(), left + cost_found.start())
    return result

def get_cost_by_cost_regex(text, cost_regex):
    """
        Try every known "total" keyword and keep the amount found furthest
        into the text.

        text:       the text to scan.
        cost_regex: regular expression describing an amount.
        returns:    (matched amount text, its start index in text), or None
                    when no keyword yields an amount. On equal positions the
                    keyword listed first wins.
    """
    KEYWORDS = ['amount paid', 'amount', 'charged', 'total payment', 'total', 'you paid',
                'thanh toán', 'tổng cộng', 'tổng', 'số tiền', 'thanh toan', 'tong cong', 'tong', 'so tien']

    candidates = []
    for keyword in KEYWORDS:
        candidate = get_cost_by_cost_regex_and_keyword(text, cost_regex, keyword)
        if candidate:
            candidates.append(candidate)

    if not candidates:
        return None
    # max() returns the first of several equal maxima, i.e. the earliest keyword.
    return max(candidates, key=lambda candidate: candidate[1])
                
                
def get_cost(text):
    """
        Extract the total amount and its currency from text.

        The amount patterns are tried in order and the first one that yields
        a result is used:
          1. groups of three digits separated by ',' with optional '.dd'
          2. groups of three digits separated by '.' with optional ',dd'
          3. a plain run of digits with optional '.dd' or ',dd'
        The currency is looked up in the 10 characters on either side of the
        matched amount.

        returns: (amount, currency name), or (0, "") when nothing is found.
    """
    COST_REGEXES = (
        r'(?<!([,\d]))(\d{1,3})([,]\d{3})+([.]\d{2})?(?!\d)',
        r'(?<!([.\d]))(\d{1,3})([.]\d{3})+([,]\d{2})?(?!\d)',
        r'\d+([.,]\d{2})?(?!\d)',
    )

    for pattern in COST_REGEXES:
        hit = get_cost_by_cost_regex(text, pattern)
        if not hit:
            continue
        amount_text, position = hit
        expense = string_to_float(amount_text)
        # Window around the amount, clamped to the bounds of the text.
        window_start = max(position - 10, 0)
        window_end = min(position + len(amount_text) + 10, len(text))
        currency = get_currency(text[window_start:window_end])
        return expense, currency

    return (0, "")

In [3]:
def extract_from_email_bodytext(text):
    """
        Extract the payment information from the body text of an e-mail.

        text:    the raw body text.
        returns: a dict with keys 'total' (the amount) and 'currency'
                 (the currency name, possibly "").
    """
    body = truncate(text)
    total, currency = get_cost(body)
    return {'total': total, 'currency': currency}

In [4]:
# Spot check: run the extraction on one sample e-mail body.
inputfile = "bodytext/5.txt"
text = get_text(inputfile)
# print(text)
# extract_from_email_bodytext() calls truncate() itself, so the text is
# truncated a second time inside the call below.
text = truncate(text)

print(extract_from_email_bodytext(text))

{'total': 1623800, 'currency': 'VND'}


In [8]:
# Batch check: run the extraction on bodytext/1.txt .. bodytext/<n_test>.txt
# and print one result dict per file.
n_test = 34
for i in range(1, n_test+1):
    inputfile = "bodytext/" + str(i) + ".txt"
    text = get_text(inputfile)
    print(extract_from_email_bodytext(text))

{'total': 3000000.0, 'currency': 'VND'}
{'total': 0.0, 'currency': 'Dollar'}
{'total': 787566, 'currency': 'VND'}
{'total': 4300000, 'currency': ''}
{'total': 1623800, 'currency': 'VND'}
{'total': 141536, 'currency': 'VND'}
{'total': 45000, 'currency': 'VND'}
{'total': 351996.0, 'currency': 'VND'}
{'total': 27000, 'currency': 'VND'}
{'total': 60.0, 'currency': 'Dollar'}
{'total': 30000, 'currency': 'VND'}
{'total': 30000, 'currency': 'VND'}
{'total': 30000, 'currency': 'VND'}
{'total': 30000, 'currency': 'VND'}
{'total': 100000, 'currency': 'VND'}
{'total': 25000, 'currency': 'VND'}
{'total': 20000, 'currency': 'VND'}
{'total': 28000, 'currency': 'VND'}
{'total': 20000, 'currency': 'VND'}
{'total': 25000, 'currency': 'VND'}
{'total': 59398, 'currency': 'VND'}
{'total': 131554, 'currency': 'VND'}
{'total': 65700, 'currency': 'VND'}
{'total': 100000, 'currency': 'VND'}
{'total': 158000, 'currency': 'VND'}
{'total': 56000, 'currency': 'VND'}
{'total': 100000, 'currency': 'VND'}
{'total': 