In [33]:
import re
from datetime import datetime
import email.utils
from pprint import pprint

In [34]:
sample_string = """
Message D

From: John Doe <johndoe@gmail.com>
Sent: Wednesday, January 3, 2018 5:00 PM
To: Doe, Jane <janedoe@gmail.com>
Subject: Re: Fwd: Send this to Bob

Message C

From: Doe, Jane <janedoe@gmail.com>
Sent: Tuesday, January 2, 2018 6:00 PM
To: John Doe <johndoe@gmail.com>
Subject: Fwd: Send this to Bob

Message B

On Monday, Jan 1, 2018 at 12:00 PM, Jason
<jason@gmail.com> wrote:

Message A
"""

In [35]:
email_string = """
Hi Belal,  

I forwarded the information along to the right individuals. However,  Keystone
policy does not allow me to change multiple choice responses points.

Thanks,
Miss Miller

Get Outlook for iOS
* * *
From: Belal Mohammed-Nur <belal.mnur@gmail.com>  
Sent: Wednesday, June 14, 2017 7:08:14 AM  
To: Miller, Kayla  
Subject: Re: Discrepancy in Geography Quiz
Mrs. Miller,

I have not yet seen any change in my test, can you confirm that my test score
will change? Has this discrepancy been recognized?

Thank you,
   Belal Mohammed-Nur

On Wed, Jun 7, 2017 at 10:30 PM, Miller, Kayla
<kmiller@thekeystoneschool.net> wrote:  
> Hi Belal,
>
>  
>
>
> I will send this discrepancy/mistake to the appropriate individuals!
>
>  
>
>
> Thanks,
>
> Miss Miller
>
>  
>
>
> Get  Outlook for iOS
>
> * **
>
> From: Belal Mohammed-Nur <belal.mnur@gmail.com>  
>  Sent: Tuesday, June 6, 2017 8:45:47 AM  
>  To: Miller, Kayla  
>  Subject: Discrepancy in Geography Quiz
>
>  
>
> Hi Mrs.Miller,
>
>  
>
>
> My name is Belal Mohammed-Nur and I am enrolled in the Geography course on
> Keystone. The course ID is GEO01-i-0815-0-0006.
>
>  
>
>
> I am e-mailing you because of a discrepancy in the People and Culture of
> Eastern Asia Quiz, in the Eastern Asia unit. Three marks that were taken off
> of the quiz from Question 10, which asked "Which of the following countries
> has the second largest economy in the world?" I answered Japan, according to
> the phrase "Although China's neighbor, Japan, appears to be a very small
> country, it actually has the 2nd largest economy in the world – after the
> United States!" located in slide ten of the "People and Culture of Eastern
> Europe" lesson, also in the Eastern Asia unit. When I submitted the quiz, I
> got 0 out of the three points on that question, because the answer was
> China, and the response feedback was that "China has the 2nd largest economy
> in the world - after the U.S."
>
>  
>
>
> I should not have had these marks taken off. Can you fix this? Thanks in
> advance,
>
>  
>
>
>      Belal Mohammed-Nur
>
>  
>
>
> Here are some snapshots for proof:
>
>  
>
>
> The question and result:
>
>  
>
>
>  
>
>
>  
>
>
> The lesson:
>
>  
>
>
>  
>
"""

In [36]:
unparsed = "Wed, Jun 7, 2017 10:30 PM"
parsed = datetime(*email.utils.parsedate(unparsed)[:6])
print(parsed)

2017-06-07 10:30:00


In [37]:
def parse_datetime(date_string):
    try:
        return datetime(*email.utils.parsedate(date_string)[:6])
    except:
        try:
            return datetime.strptime(date_string, "%A, %B %d, %Y %I:%M:%S %p")
        except:
            return None

In [38]:
def parse_message(message, headers, type):
    message_dict = {
        "subject": None,
        "sender_name": None,
        "from": None,
        "submit_time": None,
        "receiver_name": None,
        "to": None,
        "message": message
    }
    
    # Extract header information
    if type == "from":
        """
        string looks like:
        From: John Doe <johndoe@gmail.com>
        Sent: Wednesday, January 3, 2018 5:00 PM
        To: Doe, Jane <janedoe@gmail.com>
        Subject: Re: Fwd: Send this to Bob
        """
        for header in headers.split("\n"):
            key, value = header.split(": ", 1)
            if key == "Subject":
                message_dict["subject"] = value
            elif key == "From":
                search = re.search(r"(.*) <(.*)>", value)
                message_dict["sender_name"], message_dict["from"] = search.groups() if search else (value, None)
            elif key == "Sent":
                message_dict["submit_time"] = parse_datetime(value)
            elif key == "To":
                search = re.search(r"(.*) <(.*)>", value)
                message_dict["receiver_name"], message_dict["to"] = search.groups() if search else (value, None)
                
    elif type == "on":
        """
        string looks like:
        On Monday, Jan 1, 2018 at 12:00 PM, Jason
        <jason@gmail.com> wrote:
        """
        headers = headers.replace("\n", " ").replace("On ", "").replace(" at ", " ")
        commas = headers.rfind(",")
        if commas != -1:
            message_dict["submit_time"] = parse_datetime(headers[:commas])
            message_dict["sender_name"], message_dict["from"] = re.search(r"(.*) <(.*)> wrote:", headers[commas+1:]).groups()
    
    return message_dict

In [39]:
def split_email(email_string):
    # Remove quoted text
    email_string = re.sub(r'^>\s', '', email_string, flags=re.MULTILINE)

    # Regex pattern to match the start of a new message
    pattern = r'(From:.*?$(?:\n[^:\n]+:[^\n]+$)*|On.*?at.*?,.*?\n?.*?wrote:)'
    
    # Split the email into messages
    splits = re.split(pattern, email_string, flags=re.MULTILINE | re.DOTALL)
    
    message_dicts = []
    start_index = 0

    # Process first message (usually headerless)
    if not (splits[0].strip().startswith("On") or splits[0].strip().startswith("From")):
        start_index = 1
        message_dicts.append({
            "subject": None,
            "sender_name": None,
            "from": None,
            "submit_time": None,
            "receiver_name": None,
            "to": None,
            "message": splits[0]
        })
    
    # Process subsequent messages
    for i in range(start_index, len(splits), 2):
        headers = splits[i]
        message = splits[i + 1]
        if headers.strip().startswith("On"):
            header_type = "on"
        elif headers.strip().startswith("From"):
            header_type = "from"
        else:
            header_type = "unknown"
        message_dicts.append(parse_message(message, headers, header_type))
    
    return message_dicts

In [40]:
result = split_email(email_string)
for i, message in enumerate(result, 1):
    print(f"Message {i}:")
    for key, value in message.items():
        print(f"  {key}: {value}")
    print("-" * 50)

Message 1:
  subject: None
  sender_name: None
  from: None
  submit_time: None
  receiver_name: None
  to: None
  message: 
Hi Belal,  

I forwarded the information along to the right individuals. However,  Keystone
policy does not allow me to change multiple choice responses points.

Thanks,
Miss Miller

Get Outlook for iOS
* * *

--------------------------------------------------
Message 2:
  subject: Re: Discrepancy in Geography Quiz
  sender_name: Belal Mohammed-Nur
  from: belal.mnur@gmail.com
  submit_time: 2017-06-14 07:08:14
  receiver_name: Miller, Kayla  
  to: None
  message: 
Mrs. Miller,

I have not yet seen any change in my test, can you confirm that my test score
will change? Has this discrepancy been recognized?

Thank you,
   Belal Mohammed-Nur


--------------------------------------------------
Message 3:
  subject: None
  sender_name:  Kayla
  from: kmiller@thekeystoneschool.net
  submit_time: 2017-06-07 10:30:00
  receiver_name: None
  to: None
  message:   
Hi Be