In [1]:
import re


####################### ####################
def get_text(filepath):
    """
        Read the whole input file and return its content as one string.

        filepath: path of a UTF-8 encoded text file.
        Returns the complete content of the file (an empty string for an
        empty file).
        Raises OSError when the file cannot be opened and UnicodeDecodeError
        when its bytes are not valid UTF-8.
    """
    with open(filepath, "r", encoding='utf-8') as inputfile:
        # read() yields the same string as joining the lines one by one,
        # without re-copying the growing result for every line
        return inputfile.read()


def cut_top(text):
    """
        Drop the part of the text that precedes the seller information.

        The "From:"/"Từ:" markers are visited from left to right. A marker is
        accepted when, without a line break in between, it is followed by an
        "@" and later by a "Date:"/"Ngày:" label. Every accepted marker becomes
        the new starting point; the walk ends at the first marker that is not
        accepted. The text from the last accepted marker onwards is returned,
        or the whole text when no marker was accepted.
    """
    marker_pattern = "From:|Từ:"
    header_pattern = r"(From:|Từ:).+[@].+(Date:|Ngày:)"

    start_at = 0
    for marker in re.finditer(marker_pattern, text):
        # the first marker that does not open a header line ends the walk
        if not re.match(header_pattern, text[marker.start():]):
            break
        start_at = marker.start()

    return text[start_at:]
        


def truncate(text):
    """
        Strip the parts of the text that are not needed for the extraction.

        At the moment this only removes the leading part by means of cut_top.
    """
    return cut_top(text)

In [2]:
###################### GET DATE #####################
def preprocess_and_split_datestring(datestring):
    """
        Remove every comma from the date string and split it on whitespace.

        Returns the list of tokens; a blank string gives an empty list.
    """
    without_commas = "".join(datestring.split(","))
    return without_commas.split()


def is_eng_date(datestring):
    """
        Tell whether the date string starts with an English weekday
        abbreviation ('Mon' ... 'Sun'); the comparison is case sensitive.
    """
    weekday_prefixes = ('Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun')
    return datestring.startswith(weekday_prefixes)


def is_vie_date(datestring):
    """
        Placeholder for a Vietnamese date check; it is not implemented yet.

        The argument is ignored and None is returned.
    """
    return None


def extract_eng_date(datestring):
    """
        Get the day, month and year out of an English date string.

        datestring: text such as "Thu, Jan 4, 2018 at 10:00 AM". Commas are
            removed and the rest is split on whitespace.
        Returns the date as "dd/mm/yyyy". A part for which no token qualifies
        stays empty, e.g. "04/01/" when there is no year.

        Only the first month abbreviation, the first day number and the first
        year number are used. A token must be a day or a year as a whole: a
        year such as "2018" or a time such as "10:00" that precedes the day is
        therefore not mistaken for it.
    """

    month_dict = {'Jan':'01', 'Feb':'02', 'Mar':'03', 'Apr':'04', 'May':'05', 'Jun':'06', \
                  'Jul':'07', 'Aug':'08', 'Sep':'09', 'Oct':'10', 'Nov':'11', 'Dec':'12'}
    tokens = preprocess_and_split_datestring(datestring)

    year = ''
    month = ''
    day = ''

    yearregex = r'(19|20)\d{2}'
    dayregex = r'([12]\d|3[01]|0?[1-9])'

    for token in tokens:
        # classify the token; a part that is already filled is never overwritten
        if (month=='') and (token in month_dict):
            month = month_dict[token]
        # fullmatch: re.match would accept "2018" as a day through its "20" prefix
        elif (day=='') and re.fullmatch(dayregex, token):
            day = token
        elif (year=='') and re.fullmatch(yearregex, token):
            year = token

    # pad a one digit day; an empty day stays empty
    if len(day)==1:
        day = '0' + day
    return day + '/' + month + '/' + year


def extract_vie_date(datestring):
    """
        Get the day, month and year out of a Vietnamese date string.

        datestring: text in which the date follows the first comma and holds a
            second comma, all on one line -- presumably something like
            "Th 3, 16 thg 7, 2019 ..." (TODO confirm against the input files).
        Returns the date as "dd/mm/yyyy" with day and month zero padded. A
        part for which no token qualifies stays empty. When the string does
        not have the two-comma layout nothing can be located and "//" is
        returned.

        After the first comma, the first number that is a valid day is taken
        as the day, the next valid month number as the month and the next
        four digit 19xx/20xx number as the year. A token must qualify as a
        whole, so "2019" or "10:30" is not mistaken for a day or a month.
    """
    substring_found = re.search(r",.+,.+", datestring)
    if substring_found is None:
        # same result as when no token qualifies, instead of failing on None.group()
        return '//'
    # keep only the text from the first comma on; what precedes it is ignored
    datestring = substring_found.group()
    tokens = preprocess_and_split_datestring(datestring)

    year = ''
    month = ''
    day = ''

    yearregex = r'(19|20)\d{2}'
    monthregex = r'(1[012]|0?[1-9])'
    dayregex = r'([12]\d|3[01]|0?[1-9])'

    for token in tokens:
        # classify the token; a part that is already filled is never overwritten
        # fullmatch: re.match would accept "2019" as a day through its "20" prefix
        if (day=='') and re.fullmatch(dayregex, token):
            day = token
        elif (month=='') and re.fullmatch(monthregex, token):
            month = token
        elif (year=='') and re.fullmatch(yearregex, token):
            year = token

    # pad one digit values; empty parts stay empty
    if len(day)==1:
        day = '0' + day
    if len(month)==1:
        month = '0' + month
    return day + '/' + month + '/' + year


def extract_date(datestring):
    """
        Turn a date string into "dd/mm/yyyy".

        Strings recognised by is_eng_date go to extract_eng_date, every other
        string goes to extract_vie_date.
    """
    parser = extract_eng_date if is_eng_date(datestring) else extract_vie_date
    return parser(datestring)
    

def get_date(text):
    """
        Get date from the text

        text: e-mail text holding a "Date"/"Ngày" label followed by the date.
        Returns what extract_date makes of the text between that label and the
        closest following field label (From, Subject, Date, To or their
        Vietnamese counterparts), or the end of the text when none follows.
        When no date label is present the whole text is handed to
        extract_date.

        NOTE(review): the labels are searched case-insensitively and without
        requiring a colon or word boundary, so they also match inside other
        words -- confirm this is acceptable for the input mails.
    """
    FIELDS = ['From|Từ', 'Subject|Chủ đề', 'Date|Ngày', 'To|Tới']
    key_field = 'Date|Ngày'

    begin = 0
    end = len(text)

    found = re.search(key_field, text, re.IGNORECASE)
    if found:
        # the value starts right after the label (begin point of result)
        begin = found.end()
        # skip the colon if there is one; startswith with an offset is also
        # safe when the label is the last thing in the text, where
        # text[begin] would raise IndexError
        if text.startswith(':', begin):
            begin += 1

        for field in FIELDS:
            # search for the other fields (end point of result)
            endfound = re.search(field, text[begin:end], re.IGNORECASE)
            if endfound:
                # update the endpoint (the closest field to the key_field)
                end = min(end, begin + endfound.start())

    return extract_date(text[begin:end].strip())

In [3]:
def extract_from_email_bodytext(text):
    """
        Extract the information from the body text of an e-mail.

        The text is truncated first; the result is a dict whose 'date' entry
        holds what get_date returns for the truncated text.
    """
    trimmed = truncate(text)
    return {'date': get_date(trimmed)}

In [4]:
# Single-file check: run the extraction on one sample mail.
inputfile = "bodytext/5.txt"
text = get_text(inputfile)

# extract_from_email_bodytext truncates the text itself, so the raw file
# content is passed in and `text` is not overwritten with a derived value
print(extract_from_email_bodytext(text))

{'date': '10/03/2019'}


In [5]:
n_test = 34
# run the extraction on bodytext/1.txt up to bodytext/<n_test>.txt, one result per line
for i in range(1, n_test+1):
    inputfile = "bodytext/{}.txt".format(i)
    text = get_text(inputfile)
    print(extract_from_email_bodytext(text))

{'date': '04/01/2018'}
{'date': '19/04/2019'}
{'date': '16/07/2019'}
{'date': '27/06/2019'}
{'date': '10/03/2019'}
{'date': '16/07/2019'}
{'date': '15/08/2018'}
{'date': '13/09/2019'}
{'date': '27/09/2018'}
{'date': '10/05/2018'}
{'date': '31/05/2019'}
{'date': '31/05/2019'}
{'date': '31/05/2019'}
{'date': '31/05/2019'}
{'date': '01/06/2019'}
{'date': '11/08/2019'}
{'date': '16/07/2019'}
{'date': '12/08/2019'}
{'date': '14/08/2019'}
{'date': '15/08/2019'}
{'date': '19/08/2019'}
{'date': '25/08/2019'}
{'date': '03/09/2019'}
{'date': '08/09/2019'}
{'date': '28/09/2019'}
{'date': '14/10/2019'}
{'date': '15/10/2019'}
{'date': '20/10/2019'}
{'date': '20/10/2019'}
{'date': '29/10/2019'}
{'date': '30/10/2019'}
{'date': '23/05/2019'}
{'date': '08/05/2018'}
{'date': '21/01/2018'}
