In [2]:
from pathlib import Path
import re
from email import policy
from email.parser import BytesParser

In [6]:
EMAIL_FOLDERS_PATH = Path.cwd().parent.joinpath('data', 'raw', 'unpacked')
EMAIL_FILE_PATTERN = re.compile(r'^\d+\.[0-9a-f]+$')

In [8]:
def extract_paths_and_labels(folder=EMAIL_FOLDERS_PATH, pattern=EMAIL_FILE_PATTERN):
    email_paths_and_labels = []
    for path in folder.rglob('*'):
        
        if path.is_file() and pattern.match(path.name):
            parent_folder_name = path.parent.name
            date = parent_folder_name
            
            if 'spam' in parent_folder_name:
                label = 'spam'
            elif 'ham' in parent_folder_name:
                label = 'ham'
            else:
                label = 'none'

            email_paths_and_labels.append((path, label))
            
    return email_paths_and_labels

In [15]:
def configure_raw_email(email_path):
    with email_path.open(mode="rb") as email_file:
        raw_email = email_file.read()

    configured_email = BytesParser(policy=policy.default).parsebytes(raw_email)
    return configured_email

In [24]:
def extract_email_body(configured_email):
    email_body = ""   # default empty string
    
    if configured_email.is_multipart():
        for email_part in configured_email.walk():   #.walk() traverses the MIME tree of the email.Each part is like a node:Could be text/plain, text/html, or binary attachments.
            ctype = email_part.get_content_type()    # Gets the MIME type of that part.
            disp = str(email_part.get('Content-Disposition')) #The Content-Disposition header describes how the content should be presented."inline" → meant to be displayed (like body text or inline images)."attachment" → means a downloadable file (e.g. PDF, DOCX).By converting to str, we avoid NoneType issues if the header is missing.
            # pick out text/plain and ignore attachments
            if ctype == "text/plain" and "attachment" not in disp:
                email_body = email_part.get_content()
                break
    else:
        try:
            email_body = configured_email.get_content()
        except LookupError:
            #.get_payload() retrieves the raw content of the message part.decode=True tells it to decode the transfer encoding:Base64Quoted-printableBut it doesn’t handle character encoding yet (still bytes).
            payload = configured_email.get_payload(decode=True)
            #Now we manually decode bytes → Unicode string.Assumes UTF-8 (most common modern encoding).errors="replace" ensures decoding won’t crash if invalid characters are found:Invalid bytes get replaced with �.
            email_body = payload.decode("utf-8", errors='replace')

    return email_body            

In [25]:
import pandas as pd
def make_df():
    emails = []
    labels = []
    email_paths_and_labels = extract_paths_and_labels()
    
    for email_path, label in email_paths_and_labels:
        configured_email = configure_raw_email(email_path)
        email_body = extract_email_body(configured_email)
        emails.append(email_body)
        labels.append(label)

    return pd.DataFrame({'email': emails, 'label': labels})

In [26]:
df = make_df()

In [27]:
df

Unnamed: 0,email,label
0,Greetings!\n\nYou are receiving this letter be...,spam
1,"<html>\n<body>\n<center>\n<h3>\n<font color=""b...",spam
2,"<html>\n<body>\n<center>\n<b>\n<font color=""bl...",spam
3,"<html>\n<body>\n<center>\n<b>\n<font color=""bl...",spam
4,"<html><xbody>\n<hr width = ""100%"">\n<center><h...",spam
...,...,...
7944,"URL: http://www.newsisfree.com/click/-3,871436...",ham
7945,"URL: http://www.newsisfree.com/click/-4,872399...",ham
7946,"URL: http://www.newsisfree.com/click/-3,871080...",ham
7947,"URL: http://www.newsisfree.com/click/-4,872400...",ham
