This notebook implements stacked ensemble using 7 meta learner models and 1 super learner model. Gradient Boost and Naive Bayes have also been used to classify phishing emails.

<b>Import Modules</b>


In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score,auc,roc_curve, confusion_matrix
import re
from difflib import SequenceMatcher
import random
import numpy as np
import nltk
from sklearn import tree
from scipy.sparse import hstack, vstack, csr_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier, GradientBoostingClassifier
# from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold
from sklearn.calibration import CalibratedClassifierCV
from sklearn.preprocessing import minmax_scale
from sklearn.svm import SVC, LinearSVC

<b>User defined functions for data pre-processing</b>

In [2]:
def list_urls(string):
    regex_2 = r"(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:\/~+#-]*[\w@?^=%&\/~+#-])"
    regex_3 = r"(http|ftp|https): / / ([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:\/~+#-]*[\w@?^=%&\/~+#-])"
    url = []
    try:
        url_2 = re.findall(regex_2, string)
        url_second = [x for element in url_2 for x in element if x !=""]
        url_3 = re.findall(regex_3, string)
        url_third = [x for element in url_3 for x in element if x !=""]
        url = url_second + url_third
    except:
        print("error")
    finally:
        return url

def count_urls(url_list):
    num=0
    for url in url_list:
        num = num + len(url)
    return num

def text_url_similarity(text):
    url_list = list_urls(text)
    new_text = text
    count = 0
    for url in url_list:
        # print(url)
        if url != "":
            if url in text:
                try:
                    new_text = re.sub(url, "", new_text)
                except:
                    count = count + 1
    words = nltk.word_tokenize(new_text)
    sim = 0
    try:
        sim = max([similar(url, word) for word in words for url in url_list])
    except:
        sim = 0
    return sim

def domain_url_similarity(domain, text):
    url_list = list_urls(text)
    sim = 0
    try:
        sim = max([similar(url, domain) for url in url_list])
    except:
        sim = 0
    return sim

def isNaN(string):
    return string != string

def similar(a, b):
    if isNaN(a) | isNaN(b):
        return 0
    else:
        return SequenceMatcher(None, a, b).ratio()
    
def get_models():
    models = list()
    models.append(LogisticRegression(max_iter=300))
    models.append(DecisionTreeClassifier())
    models.append(MultinomialNB())
#     models.append(KNeighborsClassifier())
    models.append(AdaBoostClassifier())
    models.append(BaggingClassifier(n_estimators=10))
    models.append(RandomForestClassifier(n_estimators=10))
    models.append(ExtraTreesClassifier(n_estimators=10))
    return models

def get_out_of_fold_predictions(X, y, models, sparse=False):
    meta_X, meta_y = list(), list()
    # define split of data
    kfold = KFold(n_splits=10, shuffle=True)
    # enumerate splits
    for train_ix, test_ix in kfold.split(X):
        fold_yhats = list()
        # get data
        train_X, text_X, train_y, test_y = list(),list(),list(),list()
        if sparse:
            train_X, test_X = X[list(train_ix)], X[list(test_ix)]
            train_y, test_y = y.iloc[train_ix], y.iloc[test_ix]
        else:
            train_X, test_X = X.iloc[list(train_ix)], X.iloc[list(test_ix)]
            train_y, test_y = y.iloc[train_ix], y.iloc[test_ix]
        
        meta_y.extend(test_y)
        # fit and make predictions with each sub-model
        for model in models:
            model.fit(train_X, train_y)
            yhat = model.predict(test_X)
            # store columns
            fold_yhats.append(list(yhat))

        # store fold yhats as columns
        arr_fold_yhats = np.array(fold_yhats)
        meta_X.append(hstack(blocks=[csr_matrix(arr_fold_yhats.T)]))
    
    return vstack(meta_X), np.array(meta_y)

def fit_base_models(X, y, models):
    for model in models:
        model.fit(X, y)

# fit a meta model
def fit_meta_model(X, y):
########## Gradient Boosting ############
    model = GradientBoostingClassifier()
    model.fit(X, y)
    return model
########## SVM ############
#     model = SVC()
#     svm = LinearSVC()
#     model = CalibratedClassifierCV(svm) 
#     model.fit(X, y)
#     return model

def evaluate_models(X, y, models):
    for model in models:
        yhat = model.predict(X)
        model_acc = accuracy_score(y, yhat)
        print('%s: Accuracy - %.3f' % (model.__class__.__name__, model_acc))
        print("\tTrain Accuracy : ",model.score(X,y))
        print("\tVal Accuracy : ",model.score(x_val,y_val))
        
def super_learner_predictions(X, models, meta_model):
    meta_X = list()
    yhat_models = list()
    for model in models:
        yhat = model.predict(X)
        yhat_models.append(list(yhat))
    arr_yhat_models = np.array(yhat_models)
    meta_X = hstack(blocks=[csr_matrix(arr_yhat_models.T)])
    return meta_model.predict(meta_X), meta_model.predict_proba(meta_X)


<b> Read Data</b>

Read Phishing Emails Dataset 1 from Kaggle

In [3]:
emails = pd.read_csv('fraud_email_.csv')

Read phishing Emails Dataset 2 from CCAC

In [4]:
ccac_mails = pd.read_excel('CCAC_data.xlsx')
ccac_mails["Text"] = ccac_mails["Subject"].astype(str) + " " + ccac_mails["Body"].astype(str)

In [5]:
emails.head()

Unnamed: 0,Text,Class
0,Supply Quality China's EXCLUSIVE dimensions at...,1
1,over. SidLet me know. Thx.,0
2,"Dear Friend,Greetings to you.I wish to accost ...",1
3,MR. CHEUNG PUIHANG SENG BANK LTD.DES VOEUX RD....,1
4,Not a surprising assessment from Embassy.,0


Assign Email To randomly for kaggle dataset from ccac dataset

In [6]:
emails["EmailTo"] = random.choices(list(ccac_mails[ccac_mails["EmailTo"].notnull()]["EmailTo"]), k=len(emails))

Replace 10% EmailTo as NA for Kaggle Dataset to match with CCAC dataset

In [7]:
indices = list(emails.index)
length = len(emails)
num = int(0.1*length)
idx_replace = random.choices(indices, k=num)

emails.loc[idx_replace, 'EmailTo'] = np.nan

Replace 50% phishing emails with EmailTo as NA for Kaggle Dataset to match with CCAC dataset

In [8]:
indices = list(emails.loc[emails["Class"]==1].index)
length = len(emails[emails["Class"]==1])
num = int(0.5*length)
idx_replace = random.choices(indices, k=num)

emails.loc[idx_replace, 'EmailTo'] = np.nan

From CCAC data, get sender and sender's email address, and assign to Kaggle dataset with phishing emails as lower similarity and non phishing as higher similarity

In [9]:
senders = ccac_mails["SenderName"]
senderEmails = ccac_mails["SenderEmailAddress"]

In [10]:
similarity_scores=np.zeros((len(senders),len(senderEmails)))
for i in range(len(senders)):
    similarity_scores[i][i] = similar(senders[i],senderEmails[i])
    
good_similarity_indices = np.where(similarity_scores>0.2)
bad_similarity_indices = np.where((similarity_scores<0.2) & (similarity_scores!=0))

idx_good_replace = random.choices(good_similarity_indices[0], k=10)
idx_bad_replace = random.choice(bad_similarity_indices[0])


In [11]:
emails["SenderName"] = random.choices(list(ccac_mails[ccac_mails["SenderName"].notnull()]["SenderName"]), k=len(emails))
emails["SenderEmailAddress"] = random.choices(list(ccac_mails[ccac_mails["SenderEmailAddress"].notnull()]["SenderEmailAddress"]), k=len(emails))

In [12]:
mail_indices = list(emails.index)
length = len(emails)
# num = int(0.1*length)

for i in mail_indices:
    idx_replace = random.choice(good_similarity_indices[0])
    senders = list(ccac_mails.loc[idx_replace,["SenderName","SenderEmailAddress"]])
    emails.loc[i, 'SenderName'] = senders[0]
    emails.loc[i, 'SenderEmailAddress'] = senders[1]


In [13]:
indices = list(emails.loc[emails["Class"]==1].index)
length = len(emails[emails["Class"]==1])
num = int(0.5*length)
replace_indices = random.sample(indices, num)
for i in replace_indices:
    idx_replace = random.choice(bad_similarity_indices[0])
    senders = list(ccac_mails.loc[idx_replace,["SenderName","SenderEmailAddress"]])
    emails.loc[i, 'SenderName'] = senders[0]
    emails.loc[i, 'SenderEmailAddress'] = senders[1]

<b>Pre-process</b>

Keyword counts for phishing keywords

In [14]:
assistant_words = ["personal assistant", "administrative assistant"]
assistant_ind =[any(x in str(ccac_mails["Text"][i]).lower() for x in assistant_words) for i in range(len(ccac_mails))]
assist_flag_mails = ccac_mails.loc[assistant_ind].copy()

In [15]:
piano_words = ["loving home"]
piano_ind =[any(x in str(ccac_mails["Text"][i]).lower() for x in piano_words) for i in range(len(ccac_mails))]
piano_flag_mails = ccac_mails.loc[piano_ind].copy()

In [16]:
payment_words = ["$50/hr","$45/hr","$40/hr","$35/hr","$30/hr","$25/hr","$20/hr","$400 weekly","$350 per week","$350 weekly","$300 weekly","$250 weekly"]
payment_flag_ind =[any(x in str(ccac_mails["Text"][i]) for x in payment_words) for i in range(len(ccac_mails))]
payment_flag_mails = ccac_mails[payment_flag_ind].copy()

In [17]:
staff_words = ["Dear Employee And Staff", "verify your account","received a new message","@ccac sales","versions of our mailbox","moonfruit.com"]
staff_flag_ind =[any(x in str(ccac_mails["Text"][i]) for x in staff_words) for i in range(len(ccac_mails))]
staff_flag_mails = ccac_mails[staff_flag_ind].copy()

In [18]:
ad_words = ["walmart","reward"]
ad_flag_ind =[all(x in str(ccac_mails["Text"][i]).lower() for x in ad_words) for i in range(len(ccac_mails))]
ad_flag_mails = ccac_mails[ad_flag_ind].copy()

In [19]:
ccac_sender_words = ["@ccac sales"]
ccac_sender_flag_ind =[any(x in str(ccac_mails["SenderEmailAddress"][i]).lower() for x in ccac_sender_words) for i in range(len(ccac_mails))]
ccac_sender_flag_mails = ccac_mails[ccac_sender_flag_ind].copy()

In [20]:
subject_words = ["APPLY NOW!!!"]
subject_flag_ind =[any(x in str(ccac_mails["Subject"][i]) for x in subject_words) for i in range(len(ccac_mails))]
subject_flag_mails = ccac_mails[subject_flag_ind].copy()

In [21]:
food_words = ["food"]
food_flag_ind =[any(x in str(ccac_mails["Text"][i]).lower() for x in food_words) for i in range(len(ccac_mails))]
food_flag_mails = ccac_mails[food_flag_ind].copy()

In [22]:
phish_ccac = assist_flag_mails.merge(piano_flag_mails, how="outer").merge(payment_flag_mails, how="outer").merge(staff_flag_mails, how="outer").merge(ad_flag_mails, how="outer").merge(ccac_sender_flag_mails, how="outer").merge(subject_flag_mails, how="outer").merge(food_flag_mails, how="outer")

In [25]:
phish_ccac["Class"] = 1

In [26]:
len(phish_ccac)

536

Keyword counts for Non phishing mails

In [27]:
meeting_words = ["microsoft teams meeting","zoom.us","webex.com", "teams.microsoft.com", "join zoom meeting","do not delete or change", "gartner.com"]
teams_mails_ind =[any(x in str(ccac_mails["Body"][i]).lower() for x in meeting_words) for i in range(len(ccac_mails))]
teams_mails = ccac_mails[teams_mails_ind].copy()

In [28]:
subject_words = ["accepted","canceled","declined","tentative","reminder","confirmation","1 on 1","hold"]
subject_flag_ind =[any(x in str(ccac_mails["Subject"][i]).lower() for x in subject_words) for i in range(len(ccac_mails))]
sub_flag_mails = ccac_mails[subject_flag_ind].copy()

In [29]:
what_words = ["b6b3845926249a034f20cb8c4e628562"]
what_flag_ind =[any(x in str(ccac_mails["Text"][i]).lower() for x in what_words) for i in range(len(ccac_mails))]
what_flag_mails = ccac_mails[what_flag_ind].copy()

In [32]:
print(len(teams_mails))
print(len(food_flag_mails))
print(len(sub_flag_mails))

1620
267
664


In [33]:
nonphish_ccac = teams_mails.merge(sub_flag_mails, how="outer")#.drop(columns=["Id","CC"])

In [34]:
nonphish_ccac["Class"] = 0

In [35]:
nonphish_ccac.head(2)

Unnamed: 0,Id,EmailTo,CC,SenderName,SenderEmailAddress,Subject,Body,Text,Class
0,3,Babur Rais Abolt; Brandon Aristeo Akcali; Osam...,,Suchit Abdullah,AbdullahSuc1912@ccac.sales.com,Canceled: UUDEX Discussion,_________________________________________...,Canceled: UUDEX Discussion ______________...,0
1,4,Hamad A Aivalotis,,Brandon Aristeo Akcali,AkcaliBra3136@ccac.sales.com,check in with Hannah,_______________________________...,check in with Hannah __________...,0


In [36]:
len(nonphish_ccac)

2069

In [37]:
ccac = phish_ccac.merge(nonphish_ccac, how="outer").drop(columns=['Id', 'CC', 'Subject','Body'])

In [38]:
print(len(ccac))

2605


In [39]:
emails.head(2)
# ccac.head(2)

Unnamed: 0,Text,Class,EmailTo,SenderName,SenderEmailAddress
0,Supply Quality China's EXCLUSIVE dimensions at...,1,,Lisa Renee Agresta,AgrestaLis3731@ccac.sales.com
1,over. SidLet me know. Thx.,0,James Akkiris,Brandon Aristeo Akcali,AkcaliBra3136@ccac.sales.com


In [42]:
merged_emails = ccac.copy()

In [43]:
merged_emails = merged_emails[merged_emails["Text"].notna()]

In [44]:
len(merged_emails)

2605

Get valid domains from sender's email address from merged dataset

In [45]:
domainList=[]
for mail in merged_emails["SenderEmailAddress"]:
    try:
        if '@' in mail:
            spl = mail.split('@')
            if len(spl) > 1:
                domainList.append(spl[1])
            else:
                domainList.append("na")
        else:
            domainList.append("na")
    except:
        domainList.append("na")

merged_emails["domain"] = domainList

Store Sender name and email similarity

In [46]:
name_email_sim=[]
for i in range(len(merged_emails)):
    spl = []
    try:
        spl = merged_emails["SenderEmailAddress"][i].split('@')
    except:
        name_email_sim.append(0)
        continue
    email_name = merged_emails["SenderEmailAddress"][i].split('@')[0]
    name_email_sim.append(similar(merged_emails["SenderName"][i], email_name))

merged_emails["name_email_sim"] = name_email_sim

Store Sender email domain and URL in mail similarity

In [47]:
merged_emails["domain_url_sim"] = [domain_url_similarity(str(merged_emails["domain"][i]),str(merged_emails["Text"][i])) for i in range(len(merged_emails))]

Get count of other keywords

In [49]:
money_words=["account","bank","credit","limit","statement","debit","fund","transaction","price","dollars","grants","insurance","$","dollar"]
identity_words=["account","identity","password","user","social","security","member","email"]
access_words=["access","restrict","log","locked","login"]
linker_words=["click","verify","online"]
hook_words = ["inconvenience","update","risk","recently","service","suspension","suspended","confirm","free","win","won","work","closed","easy","opportunity"]
maybe_words=["information","limited","minutes","client","hold","wish"]
meeting_words = ["calender invite", "microsoft teams meeting", "session", "meeting", "meeting id", "zoom.us", "webex.com"]
company_words = ["microsoft", "gartner", "google"]

In [50]:
merged_emails["exclamation_count"] = [str(doc).count('!') for doc in merged_emails.Text]
merged_emails["money_count"] = [sum(str(s).lower().count(x) for x in money_words) for s in merged_emails.Text]
merged_emails["identity_count"] = [sum(str(s).lower().count(x) for x in identity_words) for s in merged_emails.Text]
merged_emails["access_count"] = [sum(str(s).lower().count(x) for x in access_words) for s in merged_emails.Text]
merged_emails["linker_count"] = [sum(str(s).lower().count(x) for x in linker_words) for s in merged_emails.Text]
merged_emails["hook_count"] = [sum(str(s).lower().count(x) for x in hook_words) for s in merged_emails.Text]
merged_emails["maybe_count"] = [sum(str(s).lower().count(x) for x in maybe_words) for s in merged_emails.Text]
merged_emails["meeting_count"] = [sum(str(s).lower().count(x) for x in meeting_words) for s in merged_emails.Text]
merged_emails["company_count"] = [sum(str(s).lower().count(x) for x in company_words) for s in merged_emails.Text]

In [51]:
merged_emails["url_count"] = [count_urls(str(doc)) for doc in merged_emails.Text]

In [None]:
# merged_emails["url_text_sim"] = [text_url_similarity(str(doc)) for doc in merged_emails.Text]

In [54]:
merged_emails.loc[4]

EmailTo                            AksharanugrahaFra2590@ccac.sales.com
SenderName                                      Swanback, Andrew Thomas
SenderEmailAddress                           AbelSco2250@ccac.sales.com
Text                  Administrative Support Opportunity Dr. Sophie ...
Class                                                                 1
domain                                                   ccac.sales.com
name_email_sim                                                 0.176471
domain_url_sim                                                        0
exclamation_count                                                     0
money_count                                                           1
identity_count                                                        1
access_count                                                          0
linker_count                                                          0
hook_count                                                      

In [55]:
# merged_emails = merged_emails.drop(columns=['url_count'])

In [56]:
merged_emails.columns

Index(['EmailTo', 'SenderName', 'SenderEmailAddress', 'Text', 'Class',
       'domain', 'name_email_sim', 'domain_url_sim', 'exclamation_count',
       'money_count', 'identity_count', 'access_count', 'linker_count',
       'hook_count', 'maybe_count', 'meeting_count', 'company_count',
       'url_count'],
      dtype='object')

merged_emails["money_std"] = minmax_scale(merged_emails["money_count"])
merged_emails["identity_std"] = minmax_scale(merged_emails["identity_count"])
merged_emails["url_std"] = minmax_scale(merged_emails["url_count"])
merged_emails["exclamation_std"] = minmax_scale(merged_emails["exclamation_count"])
merged_emails["access_std"] = minmax_scale(merged_emails["access_count"])
merged_emails["linker_std"] = minmax_scale(merged_emails["linker_count"])
merged_emails["hook_std"] = minmax_scale(merged_emails["hook_count"])
merged_emails["maybe_std"] = minmax_scale(merged_emails["maybe_count"])
merged_emails["meeting_std"] = minmax_scale(merged_emails["meeting_count"])
merged_emails["company_std"] = minmax_scale(merged_emails["company_count"])

Randomly drop row from data

In [58]:
# dropped_row_idx = random.choice(merged_emails.index)
# merged_emails = merged_emails.drop(dropped_row_idx)
# y_train = y_train.drop(dropped_row_idx)

In [59]:
len(merged_emails)

2605

Count vectorize the mail text(subject + body)

In [60]:
vectorizer = CountVectorizer(ngram_range=(1, 3), min_df=1, stop_words='english')
# words = [" ".join(doc) for doc in tokenizedMails]
x = vectorizer.fit_transform(merged_emails.Text)


Append calculated columns to vectorized dataset

In [61]:
x=hstack(blocks=[x, np.array(merged_emails[['money_std', 'identity_std', 'url_std', 'exclamation_std','access_std', 'linker_std', 'hook_std', 'maybe_std', 'meeting_std','company_std', 'name_email_sim','domain_url_sim']])]).tocsr()
# x = merged_emails[['money_std', 'identity_std', 'exclamation_std','access_std', 'linker_std', 'hook_std', 'maybe_std', 'meeting_std','company_std']]
y = merged_emails.Class

Split into train and test

In [62]:
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.3)

In [63]:
x_train

<1823x321233 sparse matrix of type '<class 'numpy.float64'>'
	with 866138 stored elements in Compressed Sparse Row format>

In [64]:
print(x_train.shape)
print(x_val.shape)

(1823, 321233)
(782, 321233)


<b> Predict using Metalearner</b>

In [65]:
# get models
models = get_models()
# get out of fold predictions
meta_X, meta_y = get_out_of_fold_predictions(x_train, y_train, models, True)
print('Meta ', meta_X.shape, meta_y.shape)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Meta  (1823, 7) (1823,)


<b> Fit super learner and predict accuracy on test </b>

In [66]:
fit_base_models(x, y, models)
# fit the meta model
meta_model = fit_meta_model(meta_X, meta_y)
# evaluate base models
evaluate_models(x_val, y_val, models)
# evaluate meta model
yhat,yprob = super_learner_predictions(x_val, models, meta_model)
print()
print('Super Learner: Accuracy -  %.3f' %(accuracy_score(y_val, yhat)))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression: Accuracy - 0.980
	Train Accuracy :  0.979539641943734
	Val Accuracy :  0.979539641943734
DecisionTreeClassifier: Accuracy - 0.978
	Train Accuracy :  0.9782608695652174
	Val Accuracy :  0.9782608695652174
MultinomialNB: Accuracy - 0.983
	Train Accuracy :  0.9833759590792839
	Val Accuracy :  0.9833759590792839
AdaBoostClassifier: Accuracy - 0.985
	Train Accuracy :  0.9846547314578005
	Val Accuracy :  0.9846547314578005
BaggingClassifier: Accuracy - 0.983
	Train Accuracy :  0.9833759590792839
	Val Accuracy :  0.9833759590792839
RandomForestClassifier: Accuracy - 0.978
	Train Accuracy :  0.9782608695652174
	Val Accuracy :  0.9782608695652174
ExtraTreesClassifier: Accuracy - 0.978
	Train Accuracy :  0.9782608695652174
	Val Accuracy :  0.9782608695652174

Super Learner: Accuracy -  0.983


<b> Print accuracy through each model in stacked ensemble </b>

In [67]:
evaluate_models(x_val, y_val, models)

LogisticRegression: Accuracy - 0.980
	Train Accuracy :  0.979539641943734
	Val Accuracy :  0.979539641943734
DecisionTreeClassifier: Accuracy - 0.978
	Train Accuracy :  0.9782608695652174
	Val Accuracy :  0.9782608695652174
MultinomialNB: Accuracy - 0.983
	Train Accuracy :  0.9833759590792839
	Val Accuracy :  0.9833759590792839
AdaBoostClassifier: Accuracy - 0.985
	Train Accuracy :  0.9846547314578005
	Val Accuracy :  0.9846547314578005
BaggingClassifier: Accuracy - 0.983
	Train Accuracy :  0.9833759590792839
	Val Accuracy :  0.9833759590792839
RandomForestClassifier: Accuracy - 0.978
	Train Accuracy :  0.9782608695652174
	Val Accuracy :  0.9782608695652174
ExtraTreesClassifier: Accuracy - 0.978
	Train Accuracy :  0.9782608695652174
	Val Accuracy :  0.9782608695652174


In [68]:
print('Super Learner: Accuracy -  %.3f' %(accuracy_score(y_val, yhat)))

Super Learner: Accuracy -  0.983


<b>Create test dataset for submission using CCAC and all calculated columns</b>

In [70]:
ccac_test = ccac_mails.copy()

In [71]:
# ccac_test["exclamation_count"] = [str(doc).count('!') for doc in ccac_test.Text]
ccac_test["exclamation_count"] = [str(doc).count('!') for doc in ccac_test.Text]
ccac_test["url_count"] = [count_urls(str(doc)) for doc in ccac_test.Text]
ccac_test["money_count"] = [sum(str(s).lower().count(x) for x in money_words) for s in ccac_test.Text]
ccac_test["identity_count"] = [sum(str(s).lower().count(x) for x in identity_words) for s in ccac_test.Text]
ccac_test["access_count"] = [sum(str(s).lower().count(x) for x in access_words) for s in ccac_test.Text]
ccac_test["linker_count"] = [sum(str(s).lower().count(x) for x in linker_words) for s in ccac_test.Text]
ccac_test["hook_count"] = [sum(str(s).lower().count(x) for x in hook_words) for s in ccac_test.Text]
ccac_test["maybe_count"] = [sum(str(s).lower().count(x) for x in maybe_words) for s in ccac_test.Text]
ccac_test["meeting_count"] = [sum(str(s).lower().count(x) for x in meeting_words) for s in ccac_test.Text]
ccac_test["company_count"] = [sum(str(s).lower().count(x) for x in company_words) for s in ccac_test.Text]

ccac_test["money_std"] = minmax_scale(ccac_test["money_count"])
ccac_test["identity_std"] = minmax_scale(ccac_test["identity_count"])
ccac_test["url_std"] = minmax_scale(ccac_test["url_count"])
ccac_test["exclamation_std"] = minmax_scale(ccac_test["exclamation_count"])
ccac_test["access_std"] = minmax_scale(ccac_test["access_count"])
ccac_test["linker_std"] = minmax_scale(ccac_test["linker_count"])
ccac_test["hook_std"] = minmax_scale(ccac_test["hook_count"])
ccac_test["maybe_std"] = minmax_scale(ccac_test["maybe_count"])
ccac_test["meeting_std"] = minmax_scale(ccac_test["meeting_count"])
ccac_test["company_std"] = minmax_scale(ccac_test["company_count"])

In [72]:
domainList=[]
for mail in ccac_test["SenderEmailAddress"]:
    try:
        if '@' in mail:
            spl = mail.split('@')
            if len(spl) > 1:
                domainList.append(spl[1])
            else:
                domainList.append("na")
        else:
            domainList.append("na")
    except:
        domainList.append("na")

ccac_test["domain"] = domainList

In [73]:
name_email_sim=[]
for i in range(len(ccac_test)):
    spl = []
    try:
        spl = ccac_test["SenderEmailAddress"][i].split('@')
    except:
        name_email_sim.append(0)
        continue
    email_name = ccac_test["SenderEmailAddress"][i].split('@')[0]
    name_email_sim.append(similar(ccac_test["SenderName"][i], email_name))

ccac_test["name_email_sim"] = name_email_sim

In [74]:
ccac_test["domain_url_sim"] = [domain_url_similarity(str(ccac_test["domain"][i]),str(ccac_test["Text"][i])) for i in range(len(ccac_test))]

In [75]:
# ccac_test["url_text_sim"] = [text_url_similarity(str(doc)) for doc in ccac_test.Text]

In [76]:
ccac_test.columns

Index(['Id', 'EmailTo', 'CC', 'SenderName', 'SenderEmailAddress', 'Subject',
       'Body', 'Text', 'exclamation_count', 'url_count', 'money_count',
       'identity_count', 'access_count', 'linker_count', 'hook_count',
       'maybe_count', 'meeting_count', 'company_count', 'money_std',
       'identity_std', 'url_std', 'exclamation_std', 'access_std',
       'linker_std', 'hook_std', 'maybe_std', 'meeting_std', 'company_std',
       'domain', 'name_email_sim', 'domain_url_sim'],
      dtype='object')

In [77]:
ccac_x = vectorizer.transform(ccac_test.Text)

In [78]:
ccac_x = hstack(blocks=[ccac_x, np.array(ccac_test[['money_std', 'identity_std', 'url_std', 'exclamation_std','access_std', 'linker_std', 'hook_std', 'maybe_std', 'meeting_std','company_std', 'name_email_sim','domain_url_sim']])]).tocsr()

In [79]:
ccac_x

<4898x321233 sparse matrix of type '<class 'numpy.float64'>'
	with 1659244 stored elements in Compressed Sparse Row format>

In [80]:
# ccac_x = ccac_test[['money_std', 'identity_std', 'exclamation_std','access_std', 'linker_std', 'hook_std', 'maybe_std', 'meeting_std','company_std']]

<b> Predict using each model in stacked ensemble(meta learners + super learner) </b>

In [81]:
for model in models:
#     if model.__class__.__name__ == "RandomForestClassifier":
    pred_selected = []
    file_name =  "sl_ccac_correctedphish_"+str(model.__class__.__name__)+".csv"
    try:
        pred_selected = model.predict_proba(ccac_x)
    except:
        print(model.__class__.__name__)
    np.savetxt(file_name, pred_selected, delimiter=',')

In [82]:
yhat,yprob = super_learner_predictions(ccac_x, models, meta_model)

In [83]:
np.savetxt("superlearner_ccac_correctedphish_.csv", yprob, delimiter=',')

<b> Train and predict using only Gradient Boosting model </b>

In [84]:
gb_model = GradientBoostingClassifier()
gb_model.fit(x_train, y_train)
y_pred = gb_model.predict(x_val)
print(' Accuracy -  %.3f' %(accuracy_score(y_val, y_pred)))
print(gb_model.score(x_train,y_train))
print(gb_model.score(x_val,y_val))
y_gb_proba = gb_model.predict_proba(ccac_x)

# np.savetxt("probs_gb_sampled.csv", y_gb_proba, delimiter=',')

 Accuracy -  0.964
0.9912232583653319
0.9641943734015346


In [85]:
np.savetxt("probs_gb_ccac_correctedphish.csv", y_gb_proba, delimiter=',')

<b> Train and predict using only Naive Bayes model </b>

In [86]:
nb = MultinomialNB()
nb.fit(x_train, y_train)
y_pred_nb = nb.predict(x_val)
acc_nb = accuracy_score(y_val, y_pred_nb)
print("NB Accuracy: {}%".format(round(acc_nb*100, 2)))
print(nb.score(x_train,y_train))
print(nb.score(x_val,y_val))
ccac_pred_nb_prob = nb.predict_proba(ccac_x)

NB Accuracy: 96.68%
0.9846407021393307
0.9667519181585678


In [87]:
ccac_pred = nb.predict(ccac_x)

In [88]:
np.savetxt("probs_nb_ccac_correctedphish_.csv", ccac_pred_nb_prob, delimiter=',')
# np.savetxt("preds_nb_sub-sampled_gartner_assist_food.csv", ccac_pred, delimiter=',')