In [1]:
import re


####################### ####################
def get_text(filepath):
    """
        Read the input file and return its whole content as a single string.

        filepath: path of the text file to read; it is decoded as UTF-8.
        Returns the file content (empty string for an empty file).
        Raises whatever open()/read() raise, e.g. FileNotFoundError or
        UnicodeDecodeError.
    """
    with open(filepath, "r", encoding='utf-8') as inputfile:
        # One read() yields the same string as joining the lines one by one,
        # without re-copying the accumulated text on every line.
        return inputfile.read()


def cut_top(text):
    """
        Drop everything that precedes the seller header.

        A seller header is a "From:" / "Từ:" keyword that is followed, on the
        same line, by an "@" and later by "Date:" / "Ngày:". The text is cut
        at the LAST keyword that satisfies this check.

        text: the raw body text.
        Returns the suffix of text starting at that header, or text unchanged
        when no such header exists.
    """
    header_start = re.compile(r"From:|Từ:")
    header_check = re.compile(r"(From:|Từ:).+[@].+(Date:|Ngày:)")

    begin = 0
    for candidate in header_start.finditer(text):
        # A keyword that is not a real header (e.g. "From: Hanoi" in the body)
        # is skipped instead of ending the scan, so a genuine header that
        # appears later is still found.
        if header_check.match(text, candidate.start()):
            begin = candidate.start()

    return text[begin:]
        


def truncate(text):
    """
        Strip the parts of the text that are not needed for extraction.

        Currently this only applies cut_top() and returns its result.
    """
    return cut_top(text)

In [2]:
###################### GET VENDOR #######################
def count(text, word):
    """
        Count the case-insensitive, non-overlapping occurrences of word in text.

        text: the string to search in.
        word: the literal string to look for; an empty word counts as 0.
        Returns the number of occurrences as an int.
    """
    if not word:
        return 0
    # word is a literal, not a pattern: escape it so characters such as "."
    # or "+" neither match unrelated text nor raise re.error.
    return len(re.findall(re.escape(word), text, re.IGNORECASE))


def preprocess_and_split_email(rear_part):
    """
        Split the given string on "." and return the list of pieces.

        No other preprocessing is applied; empty pieces are kept.
    """
    return rear_part.split(".")


def get_vendor_via_email(text, email):
    """
        Guess the vendor name from the sender's email address.

        The part after the first "@" is split on "."; every piece except the
        last one is counted in text via count(), and the piece with the
        highest count is returned (the earliest piece wins a tie).
        Returns "" when email has no "@" or when no piece occurs in text.
    """
    if "@" not in email:
        return ""

    domain = email.split("@", 1)[1]
    labels = preprocess_and_split_email(domain)

    best_label, best_hits = "", 0
    # The last piece is left out of the candidates.
    for label in labels[:-1]:
        hits = count(text, label)
        if hits > best_hits:
            best_label, best_hits = label, hits

    return best_label


def get_vendor(text):
    """
        Get seller information from the text

        The seller field is the text between the first "From"/"Từ" keyword
        (plus an optional ":" right after it) and the closest following field
        keyword (From, Subject, Date, To or their Vietnamese forms). Keywords
        are matched case-insensitively anywhere in the text. If no From
        keyword is found, the whole text is used as the field.

        text: the email body text.
        Returns the display name written before the "<...>" address; if that
        name is empty, the guess of get_vendor_via_email(); "" when the field
        holds no "<...>" address.
    """
    FIELDS = ['From|Từ', 'Subject|Chủ đề', 'Date|Ngày', 'To|Tới']
    key_field = 'From|Từ'
    email_regex = r"<.+>"

    begin = 0
    end = len(text)

    found = re.search(key_field, text, re.IGNORECASE)
    if found:
        # the field value starts right after the keyword
        begin = found.end()
        # Slice instead of index: the keyword may be the very last thing in
        # the text, and text[begin] would then raise IndexError.
        if text[begin:begin + 1] == ':':
            begin += 1

        for field in FIELDS:
            # the closest following field keyword marks the end of the value
            endfound = re.search(field, text[begin:end], re.IGNORECASE)
            if endfound:
                end = min(end, begin + endfound.start())

    result = text[begin:end].strip()

    email_found = re.search(email_regex, result)
    if not email_found:
        return ""

    name = result[:email_found.start()].strip()
    if name:
        return name

    # No display name: fall back to the address, without the surrounding brackets.
    email = email_found.group()[1:-1]
    return get_vendor_via_email(text, email)


def get_subject(text):
    """
        Get the subject of the email

        The subject is the text between the first "Subject"/"Chủ đề" keyword
        (plus an optional ":" right after it) and the closest following field
        keyword (From, Subject, Date, To or their Vietnamese forms). Keywords
        are matched case-insensitively anywhere in the text.

        text: the email body text.
        Returns the stripped subject. If no subject keyword is found, the
        whole text (stripped) is returned.
    """
    FIELDS = ['From|Từ', 'Subject|Chủ đề', 'Date|Ngày', 'To|Tới']
    key_field = 'Subject|Chủ đề'

    begin = 0
    end = len(text)

    found = re.search(key_field, text, re.IGNORECASE)
    if found:
        # the subject value starts right after the keyword
        begin = found.end()
        # Slice instead of index: the keyword may be the very last thing in
        # the text, and text[begin] would then raise IndexError.
        if text[begin:begin + 1] == ':':
            begin += 1

        for field in FIELDS:
            # the closest following field keyword marks the end of the value
            endfound = re.search(field, text[begin:end], re.IGNORECASE)
            if endfound:
                end = min(end, begin + endfound.start())

    return text[begin:end].strip()

In [3]:
def extract_from_email_bodytext(text):
    """
        Extract information from the body text of an email.

        The text is first reduced with truncate(); the result is a dict with
        a single key, 'seller', holding the value returned by get_vendor().
    """
    trimmed = truncate(text)
    return {'seller': get_vendor(trimmed)}

In [4]:
# Quick check on one sample file: load it and print the extracted fields.
inputfile = "bodytext/5.txt"
text = get_text(inputfile)
# print(text)
# NOTE: extract_from_email_bodytext() calls truncate() itself, so this explicit
# call is not required for the extraction below.
text = truncate(text)

print(extract_from_email_bodytext(text))

{'seller': 'Vietjet Reservations'}


In [5]:
# Run the extraction over the sample files bodytext/1.txt .. bodytext/<n_test>.txt
# and print one result dict per file.
n_test = 34  # index of the last sample file; files are numbered from 1
for i in range(1, n_test+1):
    inputfile = "bodytext/" + str(i) + ".txt"
    text = get_text(inputfile)
    print(extract_from_email_bodytext(text))

{'seller': 'sc'}
{'seller': 'QooBee Gift Shop'}
{'seller': 'Klook.com'}
{'seller': 'cellphones'}
{'seller': 'Vietjet Reservations'}
{'seller': 'Klook.com'}
{'seller': 'Apple'}
{'seller': 'Tiki'}
{'seller': 'Grab'}
{'seller': 'ngrok'}
{'seller': 'MoMo'}
{'seller': 'MoMo'}
{'seller': 'MoMo'}
{'seller': 'MoMo'}
{'seller': 'MoMo'}
{'seller': 'MoMo'}
{'seller': 'MoMo'}
{'seller': 'MoMo'}
{'seller': 'MoMo'}
{'seller': 'MoMo'}
{'seller': 'MoMo'}
{'seller': 'MoMo'}
{'seller': 'MoMo'}
{'seller': 'MoMo'}
{'seller': 'MoMo'}
{'seller': 'MoMo'}
{'seller': 'MoMo'}
{'seller': 'MoMo'}
{'seller': 'MoMo'}
{'seller': 'Shopee'}
{'seller': 'Shopee'}
{'seller': 'PIZZA HUT'}
{'seller': 'linkedin'}
{'seller': 'Uber Receipts'}
