In [None]:
import email
from email.policy import default

# class for reading a mails file with a specific format (mbox) and allows for iteration over the messages in the file.
class MboxReader:
    # It opens the file in binary mode and reads the first line, which should start with "From ".   
    def __init__(self, filename):
        self.handle = open(filename, 'rb')
        assert self.handle.readline().startswith(b'From ')

    # enter and exit methods are for context management and allow the use of the "with" statement. 
    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, exc_traceback):
        self.handle.close()

    def __iter__(self):
        return iter(self.__next__())

    def __next__(self):
        lines = []
        while True:
            line = self.handle.readline()
            if line == b'' or line.startswith(b'From '):
                yield email.message_from_bytes(b''.join(lines), policy=default)
                if line == b'':
                    break
                lines = []
                continue
            lines.append(line)


In [3]:
import re

# function to remove html tags
TAG_RE = re.compile(r'<[^>]+>')

def remove_tags(text):
    return TAG_RE.sub('', text)

In [4]:
# function to remove unicode or backslashes from the text
def not_unicode_or_backslash(x):
    try:
        x = x.encode('unicode-escape').decode()
    finally:
        return not x.startswith("\\")

In [5]:
import base64
# base64 decoder
def decodeb64(strin):
    try:
        fin = base64.b64decode(strin)
        return fin
    except:
        return strin

In [6]:

# Cleaning the body content by hardcoding
def clean_body(body):
    body = body.replace('{', ' {')
    body = re.sub(r"{.*}", "{{}}", body)
    body = re.sub("[\(\[].*?[\)\]]", "", body)
    body = body.replace('\n', ' ')
    body = body.replace('jose@monkey.org', '')
    body = body.replace('jose', ' ')
    body = body.replace('monkey.org', ' ')
    body = body.replace('&nbsp;', ' ')
    body = body.replace('&amp;', ' ')
    body = body.replace('=0D', ' ')
    body = body.replace('=A0', ' ')
    body = body.replace('=20', ' ')
    body = body.replace('=E9', ' ')
    body = body.replace('&copy;', ' ')
    body = body.replace('udca0', ' ')
    body = body.replace('udce2', ' ')
    body = body.replace('= ', '')
    body = body.replace('sp;', '')
    body = body.replace('&nb', ' ')
    body = body.replace("=E2=80=99","'")
    body = body.replace("&#8217;","'")
    body = body.replace("&#957;","v")
    &#957;

    
    body = ' '.join(word for word in body.split(' ') if not word.startswith('&'))
    body = ' '.join(word for word in body.split(' ') if not word.startswith('#'))
    body = ' '.join(word for word in body.split(' ') if not word.startswith('.'))
    body = ' '.join(word for word in body.split(' ') if not word.startswith('='))
    body = ' '.join(word for word in body.split(' ') if not word.startswith('|'))
    
    body = re.sub(r'http\S+', '', body)
    # body = body.replace('\\', '')
    body = body.strip()
    body = " ".join(body.split())
    body = re.sub(r"{.*}", "{{}}", body)
    
    body = body.replace("{", "")
    body = body.replace('}', '')

    
    body = ' '.join(word for word in body.split(' ') if not_unicode_or_backslash(word))
    try: 
        body = body.split('--')[0]
    except:
        pass

    try: 
        body = body.split('**')[0]
    except:
        pass
    return body

In [7]:
pre_bodies = []

# reading the mbox file
with MboxReader('/Users/atacank/Documents/Nazario phishing corpus/private-phishing4.mbox') as mbox:
        counter = 0
        for message in mbox:
            try:
                pre_bodies.append(message._payload[0]._payload)
            except:
                try:
                    pre_bodies.append(message._payload)
                
                except:
                    pass
            

In [8]:
# function for getting body content of the emails
def get_bodies(dir_of_mbox):
    bodies = []
    with MboxReader(dir_of_mbox) as mbox:
        counter = 0
        for message in mbox:

            counter = counter + 1
            
            message = decodeb64(message)

            try:
                body = remove_tags(' '.join(word for word in message._payload[0]._payload.split(' ') if not word.startswith('0x')))
                # print(body)
                body = clean_body(body)
                # print(body)
                if len(body)<100:
                    pass
                else:
                    bodies.append(body)
            except:
                try: 
                    body = remove_tags(' '.join(word for word in message._payload.split(' ') if not word.startswith('0x')))
                    body = clean_body(body)
                    if len(body)<100:
                        pass
                    else:
                        bodies.append(body)
                except Exception as e:
                    # print(e, counter)
                    pass
    return bodies



In [9]:
# loading all body contents from mbox files
bodies_pre2015 = get_bodies('/Users/atacank/Documents/Nazario phishing corpus/private-phishing4.mbox')
bodies2016 = get_bodies('/Users/atacank/Documents/Nazario phishing corpus/phishing-2016.mbox')
bodies2017 = get_bodies('/Users/atacank/Documents/Nazario phishing corpus/phishing-2017.mbox')
bodies2018 = get_bodies('/Users/atacank/Documents/Nazario phishing corpus/phishing-2018.mbox')
bodies2019 = get_bodies('/Users/atacank/Documents/Nazario phishing corpus/phishing-2019.mbox')
bodies2020 = get_bodies('/Users/atacank/Documents/Nazario phishing corpus/phishing-2020.mbox')
bodies2021 = get_bodies('/Users/atacank/Documents/Nazario phishing corpus/phishing-2021.mbox')

In [10]:
# concatenating the list
bodies_all = bodies_pre2015 + bodies2016 + bodies2017 + bodies2018 + bodies2019 + bodies2020 + bodies2021

In [12]:
# removing duplicates
bodies_all = list(set(bodies_all))

In [13]:
import pandas as pd
from langdetect import detect

In [14]:
# seperating the english and other languages 
bodies_all_eng = []
bodies_all_other_lang = []
for message in bodies_all:
    if detect(message) == 'en':
        bodies_all_eng.append(message)
    else:
        bodies_all_other_lang.append(message)

In [16]:
len(bodies_all_other_lang)

394

In [17]:
import pandas as pd

df = pd.DataFrame(bodies_all, columns=["email_body"])


In [18]:
import text2emotion as te

[nltk_data] Error loading stopwords: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1125)>
[nltk_data] Error loading punkt: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1125)>
[nltk_data] Error loading wordnet: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1125)>


In [None]:
def freq(lst):
    d = {}
    for i in lst:
        if d.get(i):
            d[i] += 1
        else:
            d[i] = 1
    return d

In [19]:
phishing_emails = pd.DataFrame(columns=['message','is_phishing'])

In [20]:
phishing_emails['message'] = bodies_all_eng
phishing_emails['is_phishing'] = 1

In [21]:
phishing_emails['message'] = phishing_emails['message'].map(lambda x: x.encode('unicode-escape').decode('utf-8'))

In [145]:
phishing_emails.to_csv('/Users/atacank/Documents/Repositories/Thesis-Phishing/phishing_emails.csv', index=False, encoding='utf-8')

In [44]:
phishing_emails['message'][1650]

"wellsfargo.com Your contact information has been updated We have updated your Wells Fargo contact information: Phone Number Email Address You have received this email because you or someone had used your account from different computer. update your contact information. If you did not make this request online, by phone, or at a Wells Fargo branch, please call us immediately at 1-800-869-3557 or 1-800-225-5935 We are available 24 hours a day, 7 days a week. Please do not reply to this email. Note: If you use Bill Pay, you will need to update your contact information for that service separately. You'll find a link on the right side of the Update Your Contact Information screen. wellsfargo.com"