import pandas as pd

f = !cat ./datasets/fradulent_emails.txt
f = '\n'.join(iter(f))
f = f.split('From r ')

In [199]:
import re
from typing import Optional
import string
from datetime import datetime

def remove_punctuation(s: str) -> str:
    """Remove punctuation characters from a string."""
    return s.translate(str.maketrans('', '', string.punctuation))

class Mail:
    subject: Optional[str] = None
    return_email: Optional[str] = None
    to_email: Optional[str] = None
    date: Optional[datetime] = None

    def __init__(self, value: str) -> None:
        """Initialize a Mail object with the given value."""
        self.__value__ = value

    def extract_metadata(self) -> None:
        """Extract metadata (subject and return email) from the mail content."""
        subjects = re.findall(r'Subject: (.*)', self.__value__)
        if subjects:
            self.subject = remove_punctuation(''.join(subjects))

        return_email = re.search(r'Return-Path: .*', self.__value__)
        if return_email:
            self.return_email = self.__extract_email(return_email.group(0))

        date = re.search(r'Date: .*', self.__value__)
        if date:
            self.date = date.group(0).split(" ", 2)[2]

        to_email = re.search(r'To: .*', self.__value__)
        if to_email:
            self.to_email = self.__extract_email(to_email.group(0))

        self.__value__ = re.sub(r'(\n){1,9}', '\n', self.__value__)

    @staticmethod
    def __extract_email(string: str) -> str:
        """Extract an email from a string."""
        email = re.search(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', string)
        if email:
            return email.group(0)

    @classmethod
    def from_string(cls, value: str) -> 'Mail':
        """Create a Mail object from a string."""
        return cls(value)

    def __repr__(self) -> str:
        """Return a string representation of the Mail object."""
        return f'Mail(subject={self.subject}, return_email={self.return_email}, to_email={self.to_email}, date={self.date})'


In [200]:
mails = [Mail.from_string(m) for m in f if m]

In [201]:
_ = [i.extract_metadata() for i in mails]

In [202]:
mails

[Mail(subject=URGENT BUSINESS ASSISTANCE AND PARTNERSHIP, return_email=james_ngola2002@maktoob.com, to_email=james_ngola2002@maktoob.com, date=31 Oct 2002 02:38:20 +0000),
 Mail(subject=URGENT ASSISTANCE RELATIONSHIP P, return_email=bensul2004nng@spinfinder.com, to_email=None, date=31 Oct 2002 05:10:00),
 Mail(subject=GOOD DAY TO YOU, return_email=obong_715@epatra.com, to_email=obong_715@epatra.com, date=31 Oct 2002 22:17:55 +0100),
 Mail(subject=GOOD DAY TO YOU, return_email=obong_715@epatra.com, to_email=webmaster@aclweb.org, date=31 Oct 2002 22:44:20),
 Mail(subject=I Need Your Assistance, return_email=m_abacha03@www.com, to_email=m_abacha03@www.com, date=1 Nov 2002 01:45:04 +0100),
 Mail(subject=Partnership, return_email=davidkuta@postmark.net, to_email=davidkuta@yahoo.com, date=02 Nov 2002 06:23:11 +0000),
 Mail(subject=Urgent Attention, return_email=tunde_dosumu@mailcity.com, to_email=tunde_dosumu@lycos.com, date=None),
 Mail(subject=URGENT BUSINESS PRPOSAL, return_email=william2