# Process an email (.eml) and predict if it's phishing or not using our model

In [52]:
import joblib
import os

from email import policy
from email.parser import BytesParser

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

import string
import pandas as pd

In [None]:
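## Run this cell only if the NLTK data (tokenizer models, stopwords, WordNet) has not been downloaded yet.
## Note: the SSL workaround below disables HTTPS certificate verification for the download.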
# import nltk
# import ssl

# try:
#     _create_unverified_https_context = ssl._create_unverified_context
# except AttributeError:
#     pass
# else:
#     ssl._create_default_https_context = _create_unverified_https_context

# nltk.download()

In [163]:
#'multipart/report', 'multipart/mixed', 'multipart/related', 'multipart/alternative',
content_types = ['text/plain', 'text/html']


def extract_email_content(eml_path):
    """Return the text of the last text/plain or text/html part of an .eml file.

    The file is parsed with the standard library email parser using
    ``policy.default``.  Every MIME part is visited in ``walk()`` order and
    the content of each part whose type is listed in ``content_types`` is
    read; only the last one read is returned.

    Args:
        eml_path: Path to the ``.eml`` file on disk.

    Returns:
        The content of the last matching part, or an empty string when the
        message has no part with a matching content type.
    """
    with open(eml_path, 'rb') as handle:
        message = BytesParser(policy=policy.default).parse(handle)

    # Read every matching part (as before) and keep the final one.
    matching_bodies = [
        section.get_content()
        for section in message.walk()
        if section.get_content_type() in content_types
    ]
    return matching_bodies[-1] if matching_bodies else ""


### We tokenize, lowercase, lemmatize and remove stopwords, punctuation symbols and numbers

In [104]:
def email_preprocess(email_content):
    """Turn raw email text into a list of cleaned, lemmatized word tokens.

    Steps, in order:
      1. tokenize with NLTK's ``word_tokenize``;
      2. lowercase every token and drop English stopwords;
      3. drop purely numeric tokens;
      4. drop tokens that consist only of punctuation;
      5. lemmatize with ``WordNetLemmatizer``;
      6. split every token on non-alphabetic characters, so a URL-like token
         contributes one entry per alphabetic fragment and a token without
         any letters contributes nothing.

    Requires the NLTK tokenizer, stopwords and WordNet data to be installed.

    Args:
        email_content: The email body as a string.

    Returns:
        A list of string tokens, each made up of alphabetic characters only.
    """
    # tokenize
    raw_tokens = word_tokenize(email_content)

    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    lemmas = []
    for raw_token in raw_tokens:
        token = raw_token.lower()
        # remove stopwords
        if token in stop_words:
            continue
        # remove numbers
        if token.isnumeric():
            continue
        # remove tokens made only of punctuation
        if not token.strip(string.punctuation):
            continue
        lemmas.append(lemmatizer.lemmatize(token))

    # Split urls (and any other mixed token) into alphabetic fragments.
    # A new list is built instead of popping/inserting into the list being
    # indexed: the in-place version raised IndexError whenever a token had no
    # letters and left trailing tokens unsplit whenever a token expanded.
    split_tokens = []
    for lemma in lemmas:
        alpha_only = "".join(ch if ch.isalpha() else " " for ch in lemma)
        split_tokens.extend(alpha_only.split())

    return split_tokens

### Label email as phishing or not using our model

In [156]:
def phishing_estimation(preprocessed_email):
    """Predict a label for one preprocessed email with the saved model.

    Loads the fitted TF-IDF vectorizer and the classifier from the ``models``
    directory (relative to the current working directory) on every call,
    vectorizes ``str(preprocessed_email)`` and runs ``predict`` on it.

    Args:
        preprocessed_email: The preprocessed email; it is converted with
            ``str()`` before vectorization, so a token list is passed to the
            vectorizer as its string representation.

    Returns:
        The first (and only) element of the classifier's ``predict`` output.
        NOTE(review): presumably 1 means phishing -- confirm against the
        labels used in training.
    """
    vectorizer_path = os.path.join('models', 'tfidf_vectorizer.joblib')
    classifier_path = os.path.join('models', 'MLP_TFIDF.joblib')

    # Both artifacts are joblib (pickle-based) files; load only trusted ones.
    with open(vectorizer_path, 'rb') as vectorizer_file:
        vectorizer = joblib.load(vectorizer_file)
    with open(classifier_path, 'rb') as classifier_file:
        classifier = joblib.load(classifier_file)

    # A one-element Series: the vectorizer expects an iterable of documents.
    documents = pd.Series([str(preprocessed_email)])
    features = vectorizer.transform(documents)

    # classifier.predict_proba(features) could be used here instead to get
    # class probabilities rather than a hard label.
    return classifier.predict(features)[0]

## Main

In [None]:
# End-to-end run: read one .eml file, preprocess its text and print the
# model's prediction.  Paths are resolved from the current working directory.
base_dir = os.getcwd()
eml_file_path = os.path.join(base_dir, "data", "email_example2.eml")
eml_content = extract_email_content(eml_file_path)
# Alternative inputs: uncomment one of the lines below to classify a raw text
# sample instead of the content read from the .eml file.
# eml_content = "shawn hathaway dwwebmoviebizmwebmoviebizcom restore potency impress partner increase amount ejaculate using wondercum natural product ordered online site product improves mens health natural effective way wondercum wonderful combination herbs known increasing sperm volume efficacy male impotence also contains herbs make calm relieve stress prerequisite desire arousal httpbadetallcom prescription needed wed 06 aug 2008 102259 0300 sperm means longer orgasms"
# eml_content = "hpl nom march 27 2001 see attached file hplno 327 xls hplno 327 xls"
# eml_content = " Dear Dalewoowood iwuhiepychqbigrztfqbkiuzs, Welcome to the Enterprise Plus? membership experience  Your Enterprise Plus member number and user name is RVEBkJmdVrsjTfHkfOInqEDmx  Your membership delivers faster reservations and rentals, a special members-only line at major airport locations and exclusive discounts  In addition, you'll be able to start earning points you can redeem for Free Rental Days after you activate your rewards  Please allow 24 hours for system updates before activating  To get the most from your next rental, simply go to http    and log in with your member number  Thank you for choosing Enterprise  We look forward to making your next rental experience more rewarding "
preprocessed_email = email_preprocess(eml_content)
# Despite its name, phishing_prob holds the label returned by predict(),
# not a probability.
phishing_prob = phishing_estimation(preprocessed_email)
print(f"Phishing: {phishing_prob}")

Phishing: 1
