In [2]:
import pandas as pd

def data_preprocess(path):
    # Import the dataset
    data = pd.read_csv(path)
    # Split the urls by /
    split = data['URL'].str.split('/', expand=True)
    # Drop the first column since there was a double slash
    split.drop([1], axis=1, inplace=True)
    # get rid of the 'www.'
    split[2] = split[2].map(lambda x: x.lstrip('www.'))
    # Create a column with the number of '.' in the url
    data['num_domain_periods'] = split[2].str.count('\.')
    # Create a column with the total length of the url
    data['domain_length'] = split[2].str.replace('\.', '', regex=True).str.len()
    # Create a column with the number of terms in the domain
    data['num_domain_terms'] = split[2].str.split('\.').str.len()
    # Create a blacklist of sensitive words
    sensitive_words = ['confirm' 'account',
    'bank', 'secure', 'login', 'signin', 'register', 'update', 'sign-in', 'verify']
    # Join all of the words in the blacklist with '|'
    sensitive = '|'.join(sensitive_words)
    # Create a column of whether a given url contains sensitive words
    data['Has_Sensitive_words'] = 0
    data.loc[data.URL.str.contains(sensitive), 'Has_Sensitive_words'] = 1
    # Create a column of whether a given url contains an IP address
    data['Has_IP'] = 0
    data.loc[data.URL.str.contains('\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}'), 'Has_IP'] = 1
    # Create a column that contains the number of periods in the url not including the last three
    data['Num_Periods'] = data['URL'].str.count('\.')-3
    # Create a blacklist of sensitive characters
    suspicious = ['-', '@', '%']
    # Join all of the words in the blacklist with '|'
    suspicious_char = '|'.join(suspicious)
    # Create a column of whether a given url contains suspicious characters
    data['Has_sus_char'] = data.URL.str.replace(r':|\.|/', '', regex=True).str.contains(suspicious_char)
    data['Has_sus_char'] = data['Has_sus_char'].astype(int)
    # Create a column for the length of the URL
    data['URL_Length'] = data.URL.str.len()
    # Create a column with the number of the slashes in the URL
    data['num_slashes'] = data.URL.str.count('/')
    # Create a blacklist for suspicious files
    files_list = ['.php','.exe','.py','.doc', '.js', '.vb', '.pdf', '.bat', '.dll', '.tmp', '.msi', '.msp', '.ps[12c]', '.lnk', '.inf', 'cmd', 'asp', 'jsp', 'cgi']
    # Join all of the words in the blacklist with '|'
    files = '|'.join(files_list)
    # Create a column of whether a given url contains suspicious_files
    data['sus_files'] = 0
    data.loc[data.URL.str.contains(files, case=False), 'sus_files'] = 1
    # Reorder columns for future column indexing purposes
    cols_at_end = ['Label']
    data = data[[c for c in data if c not in cols_at_end] 
            + [c for c in cols_at_end if c in data]]
    return data

In [3]:
#splitting training and testing data
import scipy
import numpy as np
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler as SScaler
from sklearn.linear_model import LogisticRegression as LR
from sklearn.tree import DecisionTreeClassifier as DTClf
from sklearn.ensemble import RandomForestClassifier as rfClfs
from sklearn.neighbors import KNeighborsClassifier as KNClf
from sklearn.metrics import roc_auc_score as auc
from sklearn.metrics import f1_score as f1
from sklearn.model_selection import ParameterGrid as PGrid
from sklearn.svm import SVC as SvmClf
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier as MlpClf
from xgboost import XGBClassifier as GBClf
import time
train = data_preprocess('./Training Data/Phishing_Mitre_Dataset_Summer_of_AI.csv')
train['Label'] = train['Label'].apply(lambda x: "+1" if x == 1 else "-1")
train = train[['Label', 'URL']]
train.to_csv(r"C:\Users\bmoskowitz\OneDrive - The MITRE Corporation\Desktop\malicious_urls\urlnet_training.txt", sep="\t", header=False, index=False)

In [None]:
add_training = pd.read_csv(r"C:\Users\bmoskowitz\OneDrive - The MITRE Corporation\Desktop\malicious_urls\runs\phishing_emb3_dlm0_32dim_minwf1_1conv3456_5ep\training_output.txt", delimiter='\t')['predict']
train.reset_index(inplace=True)
train = pd.concat([train, add_training], axis=1)
train['URLNet_Prediction'] = train['predict'].apply(lambda x: 1 if x == 1 else 0)
train.drop(columns=['predict'], inplace=True)
# Define the target column
y_train = train["Label"]
y_test = test["Label"]
x_train = train.drop(columns=["Label", 'URL'])
x_test = test.drop(columns=["Label", 'URL'])
x_train

In [7]:
#splitting training and testing data
from sklearn.metrics import roc_auc_score as auc
from sklearn.metrics import f1_score as f1
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier as GBClf

# Load the training data
training = data_preprocess('./Training Data/Phishing_Mitre_Dataset_Summer_of_AI.csv')
# Define the target column
y_cols = training["Label"]
# Define the features, exclusing the target and URL 
x_cols = training.drop(columns=["Label", 'URL'])
# Split the data into a 80% training - 20% test split

gbclf = GBClf(random_state=0, max_depth=6, n_estimators= 100, min_child_weight=1, learning_rate=0.200,
             use_label_encoder=False, eval_metric='auc').fit(x_cols, y_cols)

# Load the test set
testing_path = ''
testing = data_preprocess(path)

# Get the x and y columns
test_x_col = testing.drop(columns=["Label", 'URL'])
test_y_col = testing["Label"]

# Predict on the testing set
labeled_pred_gbclf = gbclf.predict(test_x_col)

# Assess model performance
print('Gradient Boosting Model')
print(f'F1 Score: {f1(test_y_col, labeled_pred_gbclf)}')
print(f'AUC Score: {auc(test_y_col, labeled_pred_gbclf)}')

NameError: name 'path' is not defined