# Data Preprocessing & Feature Engineering

In [2]:
import pandas as pd
# Import the dataset
training = pd.read_csv('./Training Data/Phishing_Mitre_Dataset_Summer_of_AI.csv')
# Split the urls by /
split = training['URL'].str.split('/', expand=True)
# Drop the first column since there was a double slash
split.drop([1], axis=1, inplace=True)
# get rid of the 'www.'
split[2] = split[2].map(lambda x: x.lstrip('www.'))
# Create a column with the number of '.' in the url
training['num_domain_periods'] = split[2].str.count('\.')
# Create a column with the total length of the url
training['domain_length'] = split[2].str.replace('\.', '', regex=True).str.len()
# Create a column with the number of terms in the domain
training['num_domain_terms'] = split[2].str.split('\.').str.len()
# Create a blacklist of sensitive words
sensitive_words = ['confirm' 'account',
'bank', 'secure', 'login', 'signin', 'register', 'update', 'sign-in', 'verify']
# Join all of the words in the blacklist with '|'
sensitive = '|'.join(sensitive_words)
# Create a column of whether a given url contains sensitive words
training['Has_Sensitive_words'] = 0
training.loc[training.URL.str.contains(sensitive), 'Has_Sensitive_words'] = 1
# Create a column of whether a given url contains an IP address
training['Has_IP'] = 0
training.loc[training.URL.str.contains('\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}'), 'Has_IP'] = 1
# Create a column that contains the number of periods in the url not including the last three
training['Num_Periods'] = training['URL'].str.count('\.')-3
# Create a blacklist of sensitive characters
suspicious = ['-', '@', '%']
# Join all of the words in the blacklist with '|'
suspicious_char = '|'.join(suspicious)
# Create a column of whether a given url contains suspicious characters
training['Has_sus_char'] = training.URL.str.replace(r':|\.|/', '', regex=True).str.contains(suspicious_char)
training['Has_sus_char'] = training['Has_sus_char'].astype(int)
# Create a column for the length of the URL
training['URL_Length'] = training.URL.str.len()
# Create a column with the number of the slashes in the URL
training['num_slashes'] = training.URL.str.count('/')
# Create a blacklist for suspicious files
files_list = ['.php','.exe','.py','.doc', '.js', '.vb', '.pdf', '.bat', '.dll', '.tmp', '.msi', '.msp', '.ps[12c]', '.lnk', '.inf', 'cmd', 'asp', 'jsp', 'cgi']
# Join all of the words in the blacklist with '|'
files = '|'.join(files_list)
# Create a column of whether a given url contains suspicious_files
training['sus_files'] = 0
training.loc[training.URL.str.contains(files, case=False), 'sus_files'] = 1
# Reorder columns for future column indexing purposes
cols_at_end = ['Label']
training = training[[c for c in training if c not in cols_at_end] 
        + [c for c in cols_at_end if c in training]]
training

Unnamed: 0,create_age(months),expiry_age(months),update_age(days),URL,num_domain_periods,domain_length,num_domain_terms,Has_Sensitive_words,Has_IP,Num_Periods,Has_sus_char,URL_Length,num_slashes,sus_files,Label
0,-1,-1,-1,http://account-google-com.ngate.my/c44cca40176...,2,25,3,0,0,2,1,70,3,0,1
1,212,16,663,http://www.coffeespecialties.com/...,1,20,2,0,0,2,0,36,3,0,0
2,-1,-1,-1,http://black.pk/wp-content/2013/04/bp.postale/...,1,7,2,0,0,2,1,73,7,0,1
3,198,6,186,http://atomicsoda.com/manutd...,1,13,2,0,0,1,0,31,3,0,0
4,240,24,1684,http://bostoncoffeecake.com/...,1,19,2,0,0,1,0,31,3,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4794,52,8,118,http://aridfoods.com/V4/MGen/F97a8a294cf7c5e90...,1,12,2,0,0,1,0,73,6,0,1
4795,-1,-1,-1,http://www.mazda.co.jp/...,2,9,3,0,0,3,0,26,3,0,0
4796,-1,-1,-1,http://www.fotografaemsaopaulo.com.br/wp-admin...,2,24,3,0,0,3,1,73,6,1,1
4797,-1,-1,-1,http://agenda.wehrensarl.ch/libraries/joomla/h...,2,18,3,0,0,2,0,73,8,0,1


# Cross Validation and Hyper-parameter Tuning

In [3]:
#splitting training and testing data
import scipy
import numpy as np
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler as SScaler
from sklearn.linear_model import LogisticRegression as LR
from sklearn.tree import DecisionTreeClassifier as DTClf
from sklearn.ensemble import RandomForestClassifier as rfClfs
from sklearn.neighbors import KNeighborsClassifier as KNClf
from sklearn.metrics import roc_auc_score as auc
from sklearn.metrics import f1_score as f1
from sklearn.model_selection import ParameterGrid as PGrid
from sklearn.svm import SVC as SvmClf
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier as MlpClf
from xgboost import XGBClassifier as GBClf
import time
train, test = train_test_split(training,  test_size=0.2, random_state=42)
url_train = train[['Label', 'URL']]
url_test = test[['Label', 'URL']]
url_train['Label'] = url_train['Label'].apply(lambda x: "+1" if x == 1 else "-1")
url_test['Label'] = url_test['Label'].apply(lambda x: "+1" if x == 1 else "-1")
url_train.to_csv(r"C:\Users\bmoskowitz\OneDrive - The MITRE Corporation\Desktop\malicious_urls\urlnet_training.txt", sep="\t", header=False, index=False)
url_test.to_csv(r"C:\Users\bmoskowitz\OneDrive - The MITRE Corporation\Desktop\malicious_urls\urlnet_test.txt", sep="\t", header=False, index=False)

add_training = pd.read_csv(r"C:\Users\bmoskowitz\OneDrive - The MITRE Corporation\Desktop\malicious_urls\runs\phishing_emb3_dlm0_32dim_minwf1_1conv3456_5ep\training_output.txt", delimiter='\t')['predict']
add_test = pd.read_csv(r"C:\Users\bmoskowitz\OneDrive - The MITRE Corporation\Desktop\malicious_urls\runs\phishing_emb3_dlm0_32dim_minwf1_1conv3456_5ep\test_output.txt", delimiter='\t')['predict']
train.reset_index(inplace=True)
test.reset_index(inplace=True)
train = pd.concat([train, add_training], axis=1)
test = pd.concat([test, add_test], axis=1)
train['URLNet_Prediction'] = train['predict'].apply(lambda x: 1 if x == 1 else 0)
test['URLNet_Prediction'] = test['predict'].apply(lambda x: 1 if x == 1 else 0)
train.drop(columns=['predict'], inplace=True)
test.drop(columns=['predict'], inplace=True)
# Define the target column
y_train = train["Label"]
y_test = test["Label"]
x_train = train.drop(columns=["Label", 'URL'])
x_test = test.drop(columns=["Label", 'URL'])
x_train

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  url_train['Label'] = url_train['Label'].apply(lambda x: "+1" if x == 1 else "-1")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  url_test['Label'] = url_test['Label'].apply(lambda x: "+1" if x == 1 else "-1")


FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\bmoskowitz\\OneDrive - The MITRE Corporation\\Desktop\\malicious_urls\\urlnet_training.txt'

In [36]:
### define parameters:
param_grid_all = {}
# Define the parameters for the logistic regression model
# param_grid_all['logistic_regression'] = PGrid({'C' : [.01, .5, 1, 5, 10, 100, 1000], 
#                                               'max_iter' : [25000]})
# # Define the parameters for the decision tree model
# param_grid_all['decision_tree'] = PGrid({'max_depth' : [1, 2, 4, 8], 
#                                          'min_samples_split' : [2, 30, 100]})
# # Define the parameters for the random forest model
# param_grid_all['random_forest'] = PGrid({'n_estimators': [10,50,100,200,500],
#                                         'max_depth' : [1,2,4,8], 'min_samples_split' : [2, 30, 100]})
# # Define the parameters for the gradient boosing model
# param_grid_all['gradient_boosting'] = PGrid({'learning_rate' : [0.1, 0.2, 0.5],
#                                          'n_estimators': [50,100,200],
#                                          'max_depth' : [6,4,8], 'min_child_weight' : [1,3,5,20,30,100],
#                                              'use_label_encoder':[False], 'eval_metric': ['auc']})
# # Define the parameters for the k-nearest neighbors model
# param_grid_all['k-nearest_neighbors'] = PGrid({'n_neighbors' : [4,5,6,7,8,9,10], 'weights' : ['uniform']})

# # Define the parameters for the support vector machine model
# param_grid_all['support_vector_machine'] = PGrid({'C' : [.01, .5, 1, 5, 10, 100, 1000], 
#                                                  'probability' : [True]})

# Define the parameters for the MLP neural network model
param_grid_all['mlp_neural_network'] = PGrid({'hidden_layer_sizes' : [20,50,100,150],
                                              'solver' : ['lbfgs', 'adam'], 'max_iter' : [50000]})

# Define the classes for the models
models = {
        #  'logistic_regression' : LR,
        #  'decision_tree' : DTClf,
        #  'random_forest' : rfClfs,
        #  'gradient_boosting' : GBClf,
        #  'k-nearest_neighbors' : KNClf,
        #  'support_vector_machine' : SvmClf,
         'mlp_neural_network' : MlpClf
}


In [37]:
'''The following code performs 5 fold cross validation and collects data about the optimal 
hyper-parameter combinations.'''

outer_cv = StratifiedKFold(n_splits=5, random_state=1, shuffle=True)
inner_cv = StratifiedKFold(n_splits=5, random_state=2, shuffle=True)

transform_feature_names = x_train.filter(regex='length|num|age', axis=1).columns
feature_names = x_train.columns

results_dict = {}
for model_name, the_model in models.items():
    print(f'Processing {model_name}')
    start = time.time()
    results_dict[model_name] = []
    fold = 0
    for train_ix, test_ix in outer_cv.split(X=x_train, y=y_train):
        fold += 1
        X_train = train.iloc[train_ix][feature_names].copy()
        X_test = train.iloc[test_ix][feature_names].copy()
        Y_train = train.iloc[train_ix]['Label']
        Y_test = train.iloc[test_ix]['Label']

        ss = SScaler()
        ss = ss.fit(X_train[transform_feature_names])
        X_train.loc[:, transform_feature_names] = ss.transform(X_train[transform_feature_names])
        X_test.loc[:, transform_feature_names] = ss.transform(X_test[transform_feature_names])

        for pg in param_grid_all[model_name]:
            estimator = the_model(**pg)                
            estimator = estimator.fit(X_train, Y_train)
            auc_score = auc(Y_test, estimator.predict_proba(X_test)[:,1])
            results_dict[model_name].append(pd.DataFrame([pg]))
            results_dict[model_name][-1]['fold'] = fold
            results_dict[model_name][-1]['auc'] = auc_score
    print(f'Processed {model_name} in {time.time()-start} seconds.')

Processing mlp_neural_network


ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
ABNORMAL_TERMINATION

Processed mlp_neural_network in 172.19063472747803 seconds.


# Decision Tree Results

### Display K-fold Cross Validation for Hyper-Parameter Tuning

In [19]:
decision_tree_results = pd.concat(results_dict['decision_tree'])
decision_tree_results.sort_values(by='auc', ascending=False)

Unnamed: 0,max_depth,min_samples_split,fold,auc
0,4,100,5,0.992103
0,4,30,5,0.992097
0,8,30,5,0.990099
0,8,100,4,0.990058
0,8,100,5,0.989679
0,4,2,5,0.989378
0,2,2,5,0.988181
0,2,30,5,0.988181
0,2,100,5,0.988181
0,4,30,4,0.987472


In [20]:
dtclf = DTClf(random_state=0, max_depth=4, min_samples_split=100).fit(x_train, y_train)
labeled_pred_dtclf = dtclf.predict(x_test)
print('Decision Tree Model')
print(f'F1 Score: {f1(y_test, labeled_pred_dtclf)}')
print(f'AUC Score: {auc(y_test, labeled_pred_dtclf)}')

Decision Tree Model
F1 Score: 0.9564315352697096
AUC Score: 0.9563473459394697


# Logistic Regession Results

### Display K-fold Cross Validation for Hyper-Parameter Tuning

In [21]:
logistic_regression_results = pd.concat(results_dict['logistic_regression'])
logistic_regression_results.sort_values(by='auc', ascending=False)

Unnamed: 0,C,max_iter,fold,auc
0,5.0,25000,5,0.994644
0,1000.0,25000,5,0.994638
0,0.5,25000,5,0.994624
0,10.0,25000,5,0.994542
0,1.0,25000,5,0.994508
0,100.0,25000,5,0.994494
0,10.0,25000,3,0.991437
0,5.0,25000,3,0.991437
0,1000.0,25000,4,0.991416
0,100.0,25000,4,0.991368


In [22]:
clf = LR(random_state=0, max_iter=25000, C=5.0).fit(x_train, y_train)
labeled_pred_lr = clf.predict(x_test)
print('Logistic Regression Model')
print(f'F1 Score: {f1(y_test, labeled_pred_lr)}')
print(f'AUC Score: {auc(y_test, labeled_pred_lr)}')

Logistic Regression Model
F1 Score: 0.9507853403141361
AUC Score: 0.9512554044902849


# Random Forest Results

### Display K-fold Cross Validation for Hyper-Parameter Tuning

In [23]:
random_forest_results = pd.concat(results_dict['random_forest'])
random_forest_results.sort_values(by='auc', ascending=False)

Unnamed: 0,max_depth,min_samples_split,n_estimators,fold,auc
0,8,2,500,3,0.996732
0,8,2,100,3,0.996657
0,8,30,200,3,0.996622
0,8,30,500,3,0.996541
0,8,30,100,3,0.996452
...,...,...,...,...,...
0,1,100,50,5,0.977097
0,1,100,10,4,0.975016
0,1,30,50,1,0.972185
0,1,2,10,5,0.971950


In [24]:
rfClf = rfClfs(random_state=0, max_depth=8, n_estimators=500, min_samples_split=2).fit(x_train, y_train)
labeled_pred_rfClfs = rfClf.predict(x_test)
print('Random Forest Model')
print(f'F1 Score: {f1(y_test, labeled_pred_rfClfs)}')
print(f'AUC Score: {auc(y_test, labeled_pred_rfClfs)}')

Random Forest Model
F1 Score: 0.9570680628272251
AUC Score: 0.9575063812053966


# Gradient Boosting Results

### Display K-fold Cross Validation for Hyper-Parameter Tuning

In [25]:
gradient_boosting_results = pd.concat(results_dict['gradient_boosting'])
gradient_boosting_results.sort_values(by='auc', ascending=False).head(30)

Unnamed: 0,eval_metric,learning_rate,max_depth,min_child_weight,n_estimators,use_label_encoder,fold,auc
0,auc,0.1,4,3,50,False,5,0.995999
0,auc,0.1,4,1,50,False,5,0.995893
0,auc,0.1,4,5,50,False,5,0.995838
0,auc,0.2,4,30,200,False,5,0.9958
0,auc,0.2,6,30,200,False,5,0.9958
0,auc,0.2,8,30,200,False,5,0.9958
0,auc,0.5,4,30,100,False,5,0.99578
0,auc,0.5,6,30,100,False,5,0.99578
0,auc,0.5,8,30,100,False,5,0.99578
0,auc,0.5,4,30,50,False,5,0.995749


In [26]:
gbclf = GBClf(random_state=0, max_depth=4, n_estimators= 50, min_child_weight=3, learning_rate=0.100,
             use_label_encoder=False, eval_metric='auc').fit(x_train, y_train)
labeled_pred_gbclf = gbclf.predict(x_test)
print('Gradient Boosting Model')
print(f'F1 Score: {f1(y_test, labeled_pred_gbclf)}')
print(f'AUC Score: {auc(y_test, labeled_pred_gbclf)}')

Gradient Boosting Model
F1 Score: 0.9637305699481866
AUC Score: 0.9636271292389436


# K-Nearest Neighbors Results

### Display K-fold Cross Validation for Hyper-Parameter Tuning

In [27]:
knn_results = pd.concat(results_dict['k-nearest_neighbors'])
knn_results.sort_values(by='auc', ascending=False).head(30)

Unnamed: 0,n_neighbors,weights,fold,auc
0,10,uniform,2,0.884242
0,8,uniform,2,0.88225
0,9,uniform,2,0.880653
0,7,uniform,2,0.876392
0,6,uniform,2,0.873983
0,5,uniform,2,0.869135
0,10,uniform,5,0.864287
0,8,uniform,5,0.863942
0,9,uniform,5,0.863689
0,7,uniform,5,0.861996


In [28]:
knn = KNClf(n_neighbors=10, weights='uniform').fit(x_train, y_train)
labeled_pred_knn = knn.predict(x_test)
print('K-Nearest Neighbors Model')
print(f'F1 Score: {f1(y_test, labeled_pred_knn)}')
print(f'AUC Score: {auc(y_test, labeled_pred_knn)}')

K-Nearest Neighbors Model
F1 Score: 0.8301507537688442
AUC Score: 0.8236312965567537


# Support Vector Machine Results

### Display K-fold Cross Validation for Hyper-Parameter Tuning

In [29]:
svm_results = pd.concat(results_dict['support_vector_machine'])
svm_results.sort_values(by='auc', ascending=False)

Unnamed: 0,C,probability,fold,auc
0,1000.0,True,2,0.948347
0,1000.0,True,5,0.934454
0,1000.0,True,3,0.929638
0,1000.0,True,1,0.926485
0,100.0,True,2,0.921962
0,10.0,True,2,0.912054
0,0.5,True,2,0.907691
0,5.0,True,2,0.907633
0,1.0,True,2,0.907629
0,100.0,True,5,0.906677


In [30]:
svm = SvmClf(C=1000.0, probability=True).fit(x_train, y_train)
labeled_pred_svm = svm.predict(x_test)
print('Support Vector Machine Model')
print(f'F1 Score: {f1(y_test, labeled_pred_svm)}')
print(f'AUC Score: {auc(y_test, labeled_pred_svm)}')

Support Vector Machine Model
F1 Score: 0.9041095890410958
AUC Score: 0.8972495702453509


# MLP Neural Network Results

### Display K-fold Cross Validation for Hyper-Parameter Tuning

In [38]:
mlp_nn_results = pd.concat(results_dict['mlp_neural_network'])
mlp_nn_results.sort_values(by='auc', ascending=False)

Unnamed: 0,hidden_layer_sizes,max_iter,solver,fold,auc
0,50,50000,lbfgs,5,0.993954
0,50,50000,adam,5,0.99262
0,100,50000,lbfgs,5,0.992428
0,100,50000,lbfgs,4,0.991444
0,20,50000,adam,5,0.990903
0,100,50000,lbfgs,3,0.990413
0,150,50000,lbfgs,3,0.990086
0,20,50000,adam,3,0.990052
0,100,50000,adam,2,0.989676
0,50,50000,lbfgs,4,0.989622


In [39]:
mlp_nn = MlpClf(hidden_layer_sizes=50, solver='lbfgs', max_iter=50000).fit(x_train, y_train)
labeled_pred_mlp_nn = mlp_nn.predict(x_test)
print('MLP Neural Network Model')
print(f'F1 Score: {f1(y_test, labeled_pred_mlp_nn)}')
print(f'AUC Score: {auc(y_test, labeled_pred_mlp_nn)}')

MLP Neural Network Model
F1 Score: 0.9572471324296142
AUC Score: 0.9574542897327708


# Assess Model Performance

In [40]:
# Create a DataFrame with the performace metrics for all models
from sklearn.metrics import accuracy_score as accuracy
from sklearn.metrics import precision_score as precision
from sklearn.metrics import recall_score as recall
predictions = [labeled_pred_dtclf, labeled_pred_lr, labeled_pred_rfClfs, labeled_pred_gbclf, labeled_pred_knn, \
              labeled_pred_svm, labeled_pred_mlp_nn]
methods = ['Decision Tree', 'Logistic Regression', 'Random Forest', 'Gradient Boosting', 'K-Nearest Neighbors', \
          'Support Vector Machine', 'MLP Neural Network']
performance = pd.DataFrame({'Accuracy' : [accuracy(y_test, m) for m in predictions],
                            'Precision' : [precision(y_test, m) for m in predictions],
                            'Recall' : [recall(y_test, m) for m in predictions],
                            'AUC' : [auc(y_test, m) for m in predictions], 
                            'F1' : [f1(y_test, m) for m in predictions]}, index=methods)
performance.sort_values(by='F1', ascending=False)

Unnamed: 0,Accuracy,Precision,Recall,AUC,F1
Gradient Boosting,0.963542,0.970772,0.95679,0.963627,0.963731
MLP Neural Network,0.957292,0.970402,0.944444,0.957454,0.957247
Random Forest,0.957292,0.974414,0.940329,0.957506,0.957068
Decision Tree,0.95625,0.964435,0.94856,0.956347,0.956432
Logistic Regression,0.951042,0.968017,0.934156,0.951255,0.950785
Support Vector Machine,0.897917,0.86194,0.950617,0.89725,0.90411
K-Nearest Neighbors,0.823958,0.811395,0.849794,0.823631,0.830151


In [35]:
importances = rfClf.feature_importances_
print(x_train.columns)
importances

Index(['index', 'create_age(months)', 'expiry_age(months)', 'update_age(days)',
       'num_domain_periods', 'domain_length', 'num_domain_terms',
       'Has_Sensitive_words', 'Has_IP', 'Num_Periods', 'Has_sus_char',
       'URL_Length', 'num_slashes', 'sus_files', 'URLNet_Prediction'],
      dtype='object')


array([0.00828529, 0.15278712, 0.02534235, 0.01977681, 0.00973914,
       0.01587744, 0.0092172 , 0.00935408, 0.00057187, 0.01179319,
       0.00971601, 0.14361503, 0.08758353, 0.02254088, 0.47380005])