In [None]:
#!/usr/bin/env python3

import email
from email.policy import default
import csv

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

def getcharsets(msg):
    charsets = set({})
    for c in msg.get_charsets():
        if c is not None:
            charsets.update([c])
    return charsets

def handleerror(errmsg, emailmsg,cs):
    print()
    print(errmsg)
    print("This error occurred while decoding with ",cs," charset.")
    print("These charsets were found in the one email.",getcharsets(emailmsg))
    print("This is the subject:",emailmsg['subject'])
    print("This is the sender:",emailmsg['From'])

def getbodyfromemail(msg):
    body = None
    #Walk through the parts of the email to find the text body.    
    if msg.is_multipart():    
        for part in msg.walk():

            # If part is multipart, walk through the subparts.            
            if part.is_multipart(): 

                for subpart in part.walk():
                    if subpart.get_content_type() == 'text/plain':
                        # Get the subpart payload (i.e the message body)
                        body = subpart.get_payload(decode=True) 
                        #charset = subpart.get_charset()

            # Part isn't multipart so get the email body
            elif part.get_content_type() == 'text/plain':
                body = part.get_payload(decode=True)
                #charset = part.get_charset()

    # If this isn't a multi-part message then get the payload (i.e the message body)
    elif msg.get_content_type() == 'text/plain':
        body = msg.get_payload(decode=True) 

   # No checking done to match the charset with the correct part. 
    for charset in getcharsets(msg):
        try:
            body = body.decode(charset)
        except UnicodeDecodeError:
            handleerror("UnicodeDecodeError: encountered.",msg,charset)
        except AttributeError:
             handleerror("AttributeError: encountered" ,msg,charset)
    return body    


class MboxReader:
    def __init__(self, filename):
        self.handle = open(filename, 'rb')
        assert self.handle.readline().startswith(b'From ')

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, exc_traceback):
        self.handle.close()

    def __iter__(self):
        return iter(self.__next__())

    def __next__(self):
        lines = []
        while True:
            line = self.handle.readline()
            if line == b'' or line.startswith(b'From '):
                yield email.message_from_bytes(b''.join(lines), policy=default)
                if line == b'':
                    break
                lines = []
                continue
            lines.append(line)

# ['Message-ID', 'Date', 'From', 'To', 'Subject', 'Mime-Version', 'Content-Type', 'Content-Transfer-Encoding', 'X-From', 'X-To', 'X-cc', 'X-bcc', 'X-Folder', 'X-Origin', 'X-FileName']
header = ['From', 'Subject', 'Body', 'X-FileName', 'IsPhishing']
with open('enron-emails-bag.csv', 'w', encoding='UTF8') as f:
    writer = csv.writer(f)

    writer.writerow(header)
    with MboxReader("emails-enron.mbox") as mbox:
        for message in mbox:
            if message.get('From'):
                sender = str(message.get('From'))
            else:
                sender = "0"
            
            if message.get('Subject'):
                subject = str(message.get('Subject'))
            else:
                subject = "0"
            
            if getbodyfromemail(message):
                CountVec = CountVectorizer(ngram_range=(1,1), stop_words='english')
                try:
                    Count_data = CountVec.fit_transform([str(getbodyfromemail(message))])
                    bagofwords = pd.DataFrame(Count_data.toarray(),columns=CountVec.get_feature_names_out()).to_dict()
                    newbagofwords = {}
                    for bag in bagofwords:
                        newbagofwords[bag] = bagofwords[bag][0]
                    body = newbagofwords
                except:
                    body = "0"
                # body = str(getbodyfromemail(message))
            else:
                body = "0"

            if message.get('X-FileName'):
                filename =  str(message.get('X-FileName'))
            else:
                filename = "0"
            
            data = [sender, subject, body, filename, 0]

            writer.writerow(data)

In [None]:
#!/usr/bin/env python3

import email
from email.policy import default
import csv

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

def getcharsets(msg):
    charsets = set({})
    for c in msg.get_charsets():
        if c is not None:
            charsets.update([c])
    return charsets

def handleerror(errmsg, emailmsg,cs):
    print()
    print(errmsg)
    print("This error occurred while decoding with ",cs," charset.")
    print("These charsets were found in the one email.",getcharsets(emailmsg))
    print("This is the subject:",emailmsg['subject'])
    print("This is the sender:",emailmsg['From'])

def getbodyfromemail(msg):
    body = None
    #Walk through the parts of the email to find the text body.    
    if msg.is_multipart():    
        for part in msg.walk():

            # If part is multipart, walk through the subparts.            
            if part.is_multipart(): 

                for subpart in part.walk():
                    if subpart.get_content_type() == 'text/plain':
                        # Get the subpart payload (i.e the message body)
                        body = subpart.get_payload(decode=True) 
                        #charset = subpart.get_charset()

            # Part isn't multipart so get the email body
            elif part.get_content_type() == 'text/plain':
                body = part.get_payload(decode=True)
                #charset = part.get_charset()

    # If this isn't a multi-part message then get the payload (i.e the message body)
    elif msg.get_content_type() == 'text/plain':
        body = msg.get_payload(decode=True) 

   # No checking done to match the charset with the correct part. 
    for charset in getcharsets(msg):
        try:
            body = body.decode(charset)
        except:
            return "None"
    return body    


class MboxReader:
    def __init__(self, filename):
        self.handle = open(filename, 'rb')
        assert self.handle.readline().startswith(b'From ')

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, exc_traceback):
        self.handle.close()

    def __iter__(self):
        return iter(self.__next__())

    def __next__(self):
        lines = []
        while True:
            line = self.handle.readline()
            if line == b'' or line.startswith(b'From '):
                yield email.message_from_bytes(b''.join(lines), policy=default)
                if line == b'':
                    break
                lines = []
                continue
            lines.append(line)

# ['Message-ID', 'Date', 'From', 'To', 'Subject', 'Mime-Version', 'Content-Type', 'Content-Transfer-Encoding', 'X-From', 'X-To', 'X-cc', 'X-bcc', 'X-Folder', 'X-Origin', 'X-FileName']
header = ['From', 'Subject', 'Body', 'X-FileName', 'IsPhishing']
with open('enron-phishing.csv', 'w', encoding='UTF8') as f:
    writer = csv.writer(f)

    writer.writerow(header)
    with MboxReader("emails-phishing.mbox") as mbox:
        for message in mbox:
            # print(str(getbodyfromemail(message)))
            if message.get('From'):
                sender = str(message.get('From'))
            else:
                sender = "None"
            
            if message.get('Subject'):
                subject = str(message.get('Subject'))
            else:
                subject = "None"
            
            if getbodyfromemail(message):
                # CountVec = CountVectorizer(ngram_range=(1,1), stop_words='english')
                # try:
                #     Count_data = CountVec.fit_transform([str(getbodyfromemail(message))])
                #     bagofwords = pd.DataFrame(Count_data.toarray(),columns=CountVec.get_feature_names_out()).to_dict()
                #     newbagofwords = {}
                #     for bag in bagofwords:
                #         newbagofwords[bag] = bagofwords[bag][0]
                #     body = newbagofwords
                # except:
                #     body = "None"
                body = str(getbodyfromemail(message))
            else:
                body = "None"

            if message.get('X-FileName'):
                filename =  str(message.get('X-FileName'))
            else:
                filename = "None"
            
            data = [sender, subject, body, filename, 1]

            writer.writerow(data)

In [183]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf
import tensorflow_hub as hub

from sklearn.model_selection import train_test_split

def df_to_dataset (dataframe, shuffle=True, batch_size=1024):
    df = dataframe.copy()
    labels = df.pop("label")
    #df = {key: value[:,tf.newaxis] for key, value in dataframe.items()}
    df = {key: value[:,tf.newaxis] for key, value in dataframe.items()}
    ds = tf.data.Dataset.from_tensor_slices((dict(df),labels))
    if shuffle:
        ds = ds.shuffle(buffer_size=len(dataframe))
    ds = ds.batch(batch_size)
    ds = ds.prefetch(tf.data.AUTOTUNE)
    return ds

df = pd.read_csv("enron-emails-bag.csv") 

#df.dropna(subset=["deva","altceva"])
#label = []

#for i in range(len(df.columns)):
#    label.append(df.columns[i])

#print (label)
#print(df.head())

df["label"]=(df.IsPhishing).astype(int)
df = df[["From","Subject","Body","X-FileName","label"]]
#X = df[df.columns[:-1]].values
#Y = df[df.columns[-1]].values

#print (X)
#print(Y)

#P1
#X_train, X_temp, Y_train, Y_temp = train_test_split(X,Y,test_size=0.4,random_state=0)
#X_valid, X_test, Y_valid, Y_test = train_test_split(X_temp,Y_temp,test_size=0.5,random_state=0)

#P2 -> 80% training 10% valid 10% test
train, val, test = np.split(df.sample(frac=1),[int(0.8*len(df)),int(0.9*len(df))])

#len(train),len(val),len(test)
print(len(train))
print(len(val))
print(len(test))


train_data = df_to_dataset(train)
valid_data = df_to_dataset(val)
test_data = df_to_dataset(test)

model = tf.keras.Sequential()
model.add(tf.keras.Input(shape=(5,)))
model.add(tf.keras.layers.Dense(16,activation='relu'))
model.add(tf.keras.layers.Dense(16,activation='relu'))
model.add(tf.keras.layers.Dense(16,activation='softmax'))
model.add(tf.keras.layers.Dense(1,activation='sigmoid'))

#model = tf.keras.models.Sequential ([
#    tf.keras.layers.Dense(16,activation='relu'),
#    tf.keras.layers.Dense(16,activation='relu'),
#    tf.keras.layers.Dense(16,activation='softmax'),
#    tf.keras.layers.Dense(1,activation="sigmoid"),
#])



model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
loss=tf.keras.losses.BinaryCrossentropy(),
metrics=['accuracy'])


model.evaluate(train_data)
#model.evaluate(X_valid, Y_valid)
#print (list(train_data)[0])

#plt.hist(df.points, bins=20)
#plt.title("Points Histogram")
#...
#
#

3423
428
428


  df = {key: value[:,tf.newaxis] for key, value in dataframe.items()}
  df = {key: value[:,tf.newaxis] for key, value in dataframe.items()}
  df = {key: value[:,tf.newaxis] for key, value in dataframe.items()}


ValueError: in user code:

    File "c:\Users\Andrei\.conda\envs\tf\lib\site-packages\keras\engine\training.py", line 1727, in test_function  *
        return step_function(self, iterator)
    File "c:\Users\Andrei\.conda\envs\tf\lib\site-packages\keras\engine\training.py", line 1713, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "c:\Users\Andrei\.conda\envs\tf\lib\site-packages\keras\engine\training.py", line 1701, in run_step  **
        outputs = model.test_step(data)
    File "c:\Users\Andrei\.conda\envs\tf\lib\site-packages\keras\engine\training.py", line 1665, in test_step
        y_pred = self(x, training=False)
    File "c:\Users\Andrei\.conda\envs\tf\lib\site-packages\keras\utils\traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "c:\Users\Andrei\.conda\envs\tf\lib\site-packages\keras\engine\input_spec.py", line 197, in assert_input_compatibility
        raise ValueError(

    ValueError: Missing data for input "input_1". You passed a data dictionary with keys ['From', 'Subject', 'Body', 'X-FileName', 'label']. Expected the following keys: ['input_1']


: 

In [57]:
import mailparser
import mailbox
import csv
from bs4 import BeautifulSoup

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

mymail = mailbox.mbox("phsihingtrainset/phishing-2021")

def bagOfWords(body):
    CountVec = CountVectorizer(ngram_range=(1,1), stop_words='english')
    try:
        Count_data = CountVec.fit_transform([body])
        bagofwords = pd.DataFrame(Count_data.toarray(),columns=CountVec.get_feature_names_out()).to_dict()
        newbagofwords = {}
        for bag in bagofwords:
            newbagofwords[bag] = bagofwords[bag][0]
        return newbagofwords
    except:
        return None

header = ['From', 'Subject', 'Body', 'IsPhishing']
with open('phishing-2021.csv', 'w', encoding='UTF8', newline="") as f:
    writer = csv.writer(f)
    writer.writerow(header)
    for message in mymail:
        mail = mailparser.parse_from_string(str(message))
        body = BeautifulSoup(mail.body).text
        bow = bagOfWords(body)
        sender = mail.from_
        subject = mail.subject
        data = [sender, subject, bow, 1]

        writer.writerow(data)
