In [1]:
import re

In [2]:
def get_text(filepath):
    """
    Read a UTF-8 text file and return its whole content as one string.

    Args:
        filepath: path of the file to read.

    Returns:
        str: the complete text of the file.

    Raises:
        OSError: if the file cannot be opened.
        UnicodeDecodeError: if the content is not valid UTF-8.
    """
    with open(filepath, "r", encoding='utf-8') as inputfile:
        # One read() returns the same text as concatenating every line,
        # without repeatedly rebuilding an ever-growing string.
        return inputfile.read()


def find_text_begin_with_keyword(begin_keyword, text, ignorecase = False):
    """
    Locate the first match of the regex ``begin_keyword`` in ``text``.

    Returns a ``(suffix, index)`` tuple where ``index`` is the start of the
    match and ``suffix`` is ``text`` from that index to the end, or ``None``
    when the pattern does not occur.
    """
    flags = re.IGNORECASE if ignorecase else 0
    match = re.search(begin_keyword, text, flags)
    if match is None:
        return None
    position = match.start()
    return text[position:], position

    
def find_last_text_begin_with_keyword(begin_keyword, text, ignorecase = False):
    """
    Locate the last match of the regex ``begin_keyword`` in ``text``.

    Matches are scanned left to right without overlapping. The previous
    implementation stepped forward by ``len(begin_keyword)`` -- the length of
    the *pattern*, not of the matched text -- so with a pattern such as
    ``"From:|Từ:"`` it could jump over later occurrences.

    Args:
        begin_keyword: regular expression to look for.
        text: string to search.
        ignorecase: match case-insensitively when true.

    Returns:
        ``(suffix, index)`` where ``index`` is the start of the last match and
        ``suffix`` is ``text[index:]``, or ``None`` when there is no match.
    """
    flags = re.IGNORECASE if ignorecase else 0

    begin = -1
    for found in re.finditer(begin_keyword, text, flags):
        begin = found.start()

    if begin >= 0:
        return text[begin:], begin
    return None


def find_last_text_begin_with_keyword_and_match_regex(begin_keyword, text, check_regex, ignorecase = False):
    """
    Walk the occurrences of ``begin_keyword`` and keep the last one whose
    suffix also matches ``check_regex``.

    Occurrences are visited left to right. For each one, ``check_regex`` is
    matched (anchored at the start) against the text beginning at that
    occurrence. Scanning stops at the first occurrence that fails the check,
    so occurrences after a failing one are never considered.

    The previous implementation stepped forward by ``len(begin_keyword)`` --
    the length of the *pattern*, not of the matched text -- so with a pattern
    such as ``"From:|Từ:"`` it could jump over later occurrences.

    Args:
        begin_keyword: regular expression marking candidate start points.
        text: string to search.
        check_regex: regular expression the candidate suffix must match.
        ignorecase: apply case-insensitive matching to both patterns.

    Returns:
        ``(suffix, index)`` for the last accepted occurrence, or ``None`` when
        the first occurrence is missing or fails ``check_regex``.
    """
    flags = re.IGNORECASE if ignorecase else 0

    begin = -1
    for found in re.finditer(begin_keyword, text, flags):
        # Slice (rather than pass a start offset) so that check_regex sees the
        # candidate exactly as a standalone string.
        if not re.match(check_regex, text[found.start():], flags):
            break
        begin = found.start()

    if begin >= 0:
        return text[begin:], begin
    return None


def cut_top(text):
    """
    Drop everything that precedes the selected "From:"/"Từ:" header.

    The header is chosen by find_last_text_begin_with_keyword_and_match_regex,
    which also requires an "@" and a "Date:"/"Ngày:" marker to follow it.
    Returns the text from that header onwards, or "" when no header qualifies.
    """
    header_keyword = "From:|Từ:"
    header_shape = r"(From:|Từ:).+[@].+(Date:|Ngày:)"
    located = find_last_text_begin_with_keyword_and_match_regex(
        header_keyword, text, header_shape, ignorecase=True
    )
    return located[0] if located else ""


def truncate(text):
    """
    Strip the parts of the text that are not needed for extraction.

    Currently this only applies cut_top.
    """
    return cut_top(text)

In [3]:
def get_creator(text):
    """
    Return the first e-mail address found after a "created by" marker.

    Each marker in KEY_FIELDS is searched case-insensitively over the whole
    text, in list order. For the first marker that is followed by something
    matching one of ``email_regexes`` (tried in order, the stricter pattern
    first), that address is returned.

    Previously the search offset was carried over from one marker to the
    next, so a marker with no address after it hid every marker located
    earlier in the text.

    Args:
        text: e-mail body text.

    Returns:
        str: the matched address, or "" when no marker is followed by one.
    """
    # Matched with re.IGNORECASE, so one spelling per separator is enough.
    KEY_FIELDS = ['created_by:', 'created by:']
    email_regexes = [r"[a-z][a-z0-9_\.]{5,32}@[a-z0-9]{2,}(\.[a-z0-9]{2,4}){1,2}", \
                   r"[A-Za-z0-9\.\+_-]+@[A-Za-z0-9\._-]+\.[a-zA-Z]*"]

    for key_field in KEY_FIELDS:
        found = re.search(key_field, text, re.IGNORECASE)
        if not found:
            continue
        # Only text after the marker can hold the creator's address.
        after_key = text[found.end():]
        for regex in email_regexes:
            creator_found = re.search(regex, after_key)
            if creator_found:
                return creator_found.group().strip()

    return ""

In [4]:
def preprocess_and_split_datestring(datestring):
    """
    Remove every comma from ``datestring`` and split it on whitespace.

    Returns the list of resulting tokens.
    """
    without_commas = datestring.replace(",", "")
    return without_commas.split()


def is_eng_date(datestring):
    """
    Tell whether ``datestring`` starts with a three-letter English weekday
    abbreviation ("Mon" ... "Sun", case-sensitive).
    """
    english_weekdays = ('Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun')
    leading = datestring[:3]
    return leading in english_weekdays


def is_vie_date(datestring):
    """
    Placeholder: not implemented, so it always returns None.

    extract_date does not call it; any date string that is not recognised
    as English is handed to extract_vie_date directly.
    """
    pass


def extract_eng_date(datestring):
    """
    Pull day, month and year out of an English date string.

    The string is stripped of commas and split on whitespace. Each token is
    then assigned to the first still-empty slot it fits: a three-letter month
    abbreviation ("Jan" ... "Dec"), a day number (1-31, optional leading zero)
    or a four-digit year starting with 19 or 20. Tokens must match in full,
    so "2019" or "6:53" can no longer be mistaken for a day.

    Args:
        datestring: e.g. "Fri, Oct 4, 2019 at 6:53 PM".

    Returns:
        str: "DD/MM/YYYY"; a component that was not found is left empty
        (e.g. "//" when nothing is recognised).
    """
    month_dict = {'Jan':'01', 'Feb':'02', 'Mar':'03', 'Apr':'04', 'May':'05', 'Jun':'06', \
                  'Jul':'07', 'Aug':'08', 'Sep':'09', 'Oct':'10', 'Nov':'11', 'Dec':'12'}
    yearregex = r'(19|20)\d{2}'
    dayregex = r'([12]\d|3[01]|0?[1-9])'

    # Same normalisation as preprocess_and_split_datestring:
    # drop commas, then split on whitespace.
    tokens = datestring.replace(",", "").split()

    year = ''
    month = ''
    day = ''

    for token in tokens:
        # Classify each token as month, day or year (first hit wins per slot).
        if (month=='') and (token in month_dict):
            month = month_dict[token]
        elif (day=='') and re.fullmatch(dayregex, token):
            day = token
        elif (year=='') and re.fullmatch(yearregex, token):
            year = token

    if len(day)==1:
        day = '0' + day
    return day + '/' + month + '/' + year


def extract_vie_date(datestring):
    """
    Pull day, month and year out of a Vietnamese-style date string.

    Only the part starting at the first comma that is followed by another
    comma is examined (this skips a leading weekday such as "Th 2"). That
    part is stripped of commas and split on whitespace. Each token is then
    assigned to the first still-empty slot it fits, checked in the order
    day (1-31), month (1-12), year (19xx/20xx). Tokens must match in full,
    so a time such as "19:47" is never mistaken for a number.

    Args:
        datestring: e.g. "Th 2, 4 thg 11, 2019 vào lúc 19:47".

    Returns:
        str: "DD/MM/YYYY"; a component that was not found is left empty.
        A string without the expected two-comma layout yields "//" instead
        of raising AttributeError as it did before.
    """
    substring_found = re.search(r",.+,.+", datestring)
    if substring_found is None:
        # Nothing that looks like ", <day> ... , <year>": report no components.
        tokens = []
    else:
        # Same normalisation as preprocess_and_split_datestring.
        tokens = substring_found.group().replace(",", "").split()

    year = ''
    month = ''
    day = ''

    yearregex = r'(19|20)\d{2}'
    monthregex = r'(1[012]|0?[1-9])'
    dayregex = r'([12]\d|3[01]|0?[1-9])'

    for token in tokens:
        # Classify each token as day, month or year (first hit wins per slot).
        if (day=='') and re.fullmatch(dayregex, token):
            day = token
        elif (month=='') and re.fullmatch(monthregex, token):
            month = token
        elif (year=='') and re.fullmatch(yearregex, token):
            year = token

    if len(day)==1:
        day = '0' + day
    if len(month)==1:
        month = '0' + month
    return day + '/' + month + '/' + year


def extract_date(datestring):
    """
    Dispatch to the English parser when is_eng_date accepts the string,
    otherwise to the Vietnamese parser, and return that parser's result.
    """
    parser = extract_eng_date if is_eng_date(datestring) else extract_vie_date
    return parser(datestring)
    

def get_date(text):
    """
    Extract the date of the message from header-style text.

    The first "Date"/"Ngày" field name is located (case-insensitive), an
    optional ":" right after it is skipped, and the value runs up to the
    nearest following field name from FIELDS. That value is passed to
    extract_date. When no date field is present, the whole text is passed.

    Field names are matched as whole words only. Previously they matched
    inside other words as well ("date" in "Candidate", "to" in "October"),
    which picked the wrong start or cut the value short. A field name at
    the very end of the text no longer raises IndexError.

    Args:
        text: header text, e.g. "From: A <a@b.c> Date: Fri, Oct 4, 2019 ...".

    Returns:
        str: whatever extract_date returns for the isolated value.
    """
    FIELDS = ['From|Từ', 'Subject|Chủ đề', 'Date|Ngày', 'To|Tới']
    key_field = 'Date|Ngày'

    def as_whole_word(alternatives):
        # Wrap "A|B" so that it only matches A or B as complete words.
        return r'\b(?:' + alternatives + r')\b'

    begin = 0
    end = len(text)

    found = re.search(as_whole_word(key_field), text, re.IGNORECASE)
    if found:
        # Start of the value: right after the field name and its optional colon.
        begin = found.end()
        if text.startswith(':', begin):
            begin += 1

        for field in FIELDS:
            # End of the value: the closest field name that follows.
            endfound = re.search(as_whole_word(field), text[begin:end], re.IGNORECASE)
            if endfound:
                end = min(end, begin + endfound.start())

    return extract_date(text[begin:end].strip())

In [5]:
def count(text, word):
    """
    Count the case-insensitive, non-overlapping occurrences of ``word``
    in ``text``.

    ``word`` is treated as literal text: it is escaped before being used as
    a pattern, so characters such as "+", "." or "(" neither change the
    meaning of the search nor raise ``re.error``.

    Args:
        text: string to search in.
        word: literal string to look for.

    Returns:
        int: number of occurrences; 0 when ``word`` is empty.
    """
    if len(word)==0:
        return 0
    return len(re.findall(re.escape(word), text, re.IGNORECASE))


def preprocess_and_split_email(rear_part):
    """
    Split the part of an e-mail address after "@" into its dot-separated
    pieces and return them as a list.
    """
    return rear_part.split(".")


def get_vendor_via_email(text, email):
    """
    Guess a vendor name from the domain of ``email``.

    The part after the first "@" is split on dots. Every piece except the
    last one is counted in ``text`` with count(), and the piece with the
    highest non-zero count is returned (the earliest piece wins a tie).
    Returns "" when ``email`` has no "@" or no piece occurs in ``text``.
    """
    if "@" not in email:
        return ""

    domain = email[email.index("@") + 1:]
    labels = preprocess_and_split_email(domain)

    best_label = ""
    best_hits = 0
    # The final piece is left out of the comparison.
    for label in labels[:-1]:
        hits = count(text, label)
        if hits > best_hits:
            best_hits, best_label = hits, label

    return best_label


def get_vendor(text):
    """
    Extract the seller (sender) name from header-style text.

    The first "From"/"Từ" field name is located (case-insensitive), an
    optional ":" right after it is skipped, and the value runs up to the
    nearest following field name from FIELDS. If the value contains an
    address in angle brackets, the text in front of it is returned as the
    name; when that text is empty, the name is guessed from the address via
    get_vendor_via_email.

    Field names are matched as whole words only. Previously they matched
    inside other words as well (e.g. "to" inside "Anton"), which cut the
    value short and lost the sender. A field name at the very end of the
    text no longer raises IndexError.

    Args:
        text: header text, e.g. "From: Shop <info@shop.com> Date: ...".

    Returns:
        str: the sender name, a domain-based guess, or "" when the value
        holds no "<...>" address.
    """
    FIELDS = ['From|Từ', 'Subject|Chủ đề', 'Date|Ngày', 'To|Tới']
    key_field = 'From|Từ'
    email_regex = r"<.+>"

    def as_whole_word(alternatives):
        # Wrap "A|B" so that it only matches A or B as complete words.
        return r'\b(?:' + alternatives + r')\b'

    begin = 0
    end = len(text)

    found = re.search(as_whole_word(key_field), text, re.IGNORECASE)
    if found:
        # Start of the value: right after the field name and its optional colon.
        begin = found.end()
        if text.startswith(':', begin):
            begin += 1

        for field in FIELDS:
            # End of the value: the closest field name that follows.
            endfound = re.search(as_whole_word(field), text[begin:end], re.IGNORECASE)
            if endfound:
                end = min(end, begin + endfound.start())

    result = text[begin:end].strip()

    email_found = re.search(email_regex, result)
    if email_found:
        # Strip the surrounding angle brackets from the address.
        email = email_found.group()[1:-1]
        name = result[:email_found.start()].strip()

        if len(name)>0:
            return name
        else:
            return get_vendor_via_email(text, email)

    return ""

In [6]:
def extract_from_email_bodytext(text):
    """
    Build the extraction result for one e-mail body.

    The creator is read from the full text; the date and the seller are read
    from the text after truncate() has been applied. Returns a dict with the
    keys 'created_by', 'date' and 'seller'.
    """
    creator = get_creator(text)
    header_text = truncate(text)

    # TODO: 'total' and 'currency' (via get_cost) are not wired in yet.
    return {
        'created_by': creator,
        'date': get_date(header_text),
        'seller': get_vendor(header_text),
    }

In [7]:
# Load the sample e-mail body that the extraction cell below works on.
inputfile = "bodytext/1.txt"
text = get_text(inputfile)

In [8]:
# Run the full extraction on the loaded text and print the resulting dict.
print(extract_from_email_bodytext(text))

NameError: name 'get_cost' is not defined

In [None]:
def positive_lookahead(keyword, text, ignorecase = False):
    """
    Find the first position in ``text`` that is immediately followed by
    ``keyword``.

    ``keyword`` is a regular expression, wrapped in a zero-width look-ahead
    ``(?=...)``. The earlier ``"%r" % regex`` step has been removed: it put
    literal quote characters around the pattern and doubled its backslashes,
    so the search never behaved as a look-ahead.

    Args:
        keyword: regular expression that must follow the position.
        text: string to search.
        ignorecase: match case-insensitively when true.

    Returns:
        A zero-width ``re.Match`` whose ``start()`` is the position where
        ``keyword`` begins, or ``None`` when ``keyword`` does not occur.
    """
    new_regex = '(?=' + keyword + ')'
    found = re.search(new_regex, text, re.IGNORECASE if ignorecase else 0)
    return found

def positive_lookbehind(keyword, text, ignorecase = False):
    """
    Find the first position in ``text`` that is immediately preceded by
    ``keyword``.

    ``keyword`` is a regular expression, wrapped in a zero-width look-behind
    ``(?<=...)``. The earlier ``"%r" % regex`` step has been removed: it put
    literal quote characters around the pattern and doubled its backslashes,
    so the search never behaved as a look-behind.

    Args:
        keyword: regular expression that must precede the position. Python
            requires look-behind patterns to have a fixed width.
        text: string to search.
        ignorecase: match case-insensitively when true.

    Returns:
        A zero-width ``re.Match`` whose ``start()`` is the position right
        after ``keyword``, or ``None`` when ``keyword`` does not occur.

    Raises:
        re.error: if ``keyword`` is not a fixed-width pattern.
    """
    new_regex = '(?<=' + keyword + ')'
    found = re.search(new_regex, text, re.IGNORECASE if ignorecase else 0)
    return found

def get_creator(text):
    """
    Return the first e-mail address found after a "created by" marker.

    The marker is "created", any run of whitespace, then "by", matched
    case-insensitively. The text after the marker is scanned with each
    pattern in ``email_regexes`` (the stricter pattern first) and the first
    hit is returned.

    The marker pattern is now a raw string: in a normal string literal
    ``\\s`` is an invalid escape sequence.

    Args:
        text: e-mail body text.

    Returns:
        str: the matched address, or "" when the marker is missing or is
        not followed by an address.
    """
    KEY_FIELDS = [r'created\s+by']
    email_regexes = [r"[a-z][a-z0-9_\.]{5,32}@[a-z0-9]{2,}(\.[a-z0-9]{2,4}){1,2}", \
                   r"[A-Za-z0-9\.\+_-]+@[A-Za-z0-9\._-]+\.[a-zA-Z]*"]

    for key_field in KEY_FIELDS:
        keyfield_found = re.search(key_field, text, re.IGNORECASE)
        if not keyfield_found:
            # Try the next marker instead of giving up on the first miss.
            continue
        after_key = text[keyfield_found.end():]
        for regex in email_regexes:
            creator_found = re.search(regex, after_key)
            if creator_found:
                return creator_found.group().strip()

    return ""

    
text = "---------- Forwarded message --------- Từ: anh quân trần <anhquan6776@gmail.com> Date: Th 2, 4 thg 11, 2019 vào lúc 19:47 Subject: Fwd: Inter Bank Fund Transfer Receipt Confirmation To: <hr@expense.talent.vn> ---------- Forwarded message --------- Từ: anh quân trần <anhquan6776@gmail.com> Date: Th 2, 4 thg 11, 2019 vào lúc 13:20 Subject: Fwd: Inter Bank Fund Transfer Receipt Confirmation To: <anh.tran02@base.vn> ---------- Forwarded message --------- Từ: Nguyễn Phi Hùng <nphihung94@gmail.com> Date: Th 7, 5 thg 10, 2019 vào lúc 08:39 Subject: Fwd: Inter Bank Fund Transfer Receipt Confirmation To: <anhquan6776@gmail.com> ---------- Forwarded message --------- Created by: kimanhnt39@gmail.com From: Ngọc Linh Vũ <vnlinh112@gmail.com> Date: Fri, Oct 4, 2019 at 6:53 PM Subject: Fwd: Inter Bank Fund Transfer Receipt Confirmation To: <nphihung94@gmail.com> ---------- Forwarded message --------- From: <Contact.SCB@sc.com> Date: Thu, Jan 4, 2018, 13:14 Subject: Inter Bank Fund Transfer Receipt Confirmation To: <vnlinh112@gmail.com> Dear VU LINH Your request for Inter Bank Fund Transfer has been received succesfully.You can view the transaction status by clicking on the View Previous Transfers link at the Quick Links section. Following are the details of the transfer. Transaction Date 2018-01-04 13:13:45.045 Payment Reference HYH19 VU NGOC LINH VA TRAN Transfer Currency and Amount VND 3000000.00 Debit Account XXXXXXX0899 Payment Reference No 169292836265147752 Credit Account XXXXXXXXX0417 Debit Charges To XXXXXXX0899 Selection All Charges to my account Please contact Call Center on 84.8.39110000/ 84.4.36960000 if you need any further assistance. Thank you for using Standard Chartered Online Banking. This email and any attachments are confidential and may also be privileged. If you are not the intended recipient, please delete all copies and notify the sender immediately. 
You may wish to refer to the incorporation details of Standard Chartered PLC, Standard Chartered Bank and their subsidiaries at https://www.sc.com/en/incorporation-details.html"   
# Try get_creator on the sample forwarded-message text assigned above.
print(get_creator(text))

In [None]:
text = "---------- Forwarded message --------- Từ: anh quân trần <anhquan6776@gmail.com> Date: Th 2, 4 thg 11, 2019 vào lúc 19:47 Subject: Fwd: Inter Bank Fund Transfer Receipt Confirmation To: <hr@expense.talent.vn> ---------- Forwarded message --------- Từ: anh quân trần <anhquan6776@gmail.com> Date: Th 2, 4 thg 11, 2019 vào lúc 13:20 Subject: Fwd: Inter Bank Fund Transfer Receipt Confirmation To: <anh.tran02@base.vn> ---------- Forwarded message --------- Từ: Nguyễn Phi Hùng <nphihung94@gmail.com> Date: Th 7, 5 thg 10, 2019 vào lúc 08:39 Subject: Fwd: Inter Bank Fund Transfer Receipt Confirmation To: <anhquan6776@gmail.com> ---------- Forwarded message --------- Created by: kimanhnt39@gmail.com From: Ngọc Linh Vũ <vnlinh112@gmail.com> Date: Fri, Oct 4, 2019 at 6:53 PM Subject: Fwd: Inter Bank Fund Transfer Receipt Confirmation To: <nphihung94@gmail.com> ---------- Forwarded message --------- From: <Contact.SCB@sc.com> Date: Thu, Jan 4, 2018, 13:14 Subject: Inter Bank Fund Transfer Receipt Confirmation To: <vnlinh112@gmail.com> Dear VU LINH Your request for Inter Bank Fund Transfer has been received succesfully.You can view the transaction status by clicking on the View Previous Transfers link at the Quick Links section. Following are the details of the transfer. Transaction Date 2018-01-04 13:13:45.045 Payment Reference HYH19 VU NGOC LINH VA TRAN Transfer Currency and Amount VND 3000000.00 Debit Account XXXXXXX0899 Payment Reference No 169292836265147752 Credit Account XXXXXXXXX0417 Debit Charges To XXXXXXX0899 Selection All Charges to my account Please contact Call Center on 84.8.39110000/ 84.4.36960000 if you need any further assistance. Thank you for using Standard Chartered Online Banking. This email and any attachments are confidential and may also be privileged. If you are not the intended recipient, please delete all copies and notify the sender immediately. 
You may wish to refer to the incorporation details of Standard Chartered PLC, Standard Chartered Bank and their subsidiaries at https://www.sc.com/en/incorporation-details.html"   
# Probe: print the zero-width look-behind match located right after the
# "Created_by:" / "Created by:" marker in the sample text.
ignorecase = True
print(re.search(r'(?<=(Created_by:|Created by:))', text, re.IGNORECASE if ignorecase else 0))

In [None]:
def get