In [1]:
import pandas as pd
import numpy as np

import pickle

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans
from sklearn.svm import SVC

from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score, make_scorer
from sklearn import tree, metrics
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer


from tensorflow.keras.utils import to_categorical

from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping

from tensorflow.keras.initializers import VarianceScaling
from sklearn.preprocessing import StandardScaler

In [3]:
# Read the pickled interim feature table into `df`.
# NOTE(review): unpickling can execute arbitrary code, so only point this at
# a file that was produced by a trusted step of this project.
features_path = '../data/interim/new_data_url_all_features.pkl'
with open(features_path, mode='rb') as pickle_file:
    df = pickle.load(pickle_file)

In [4]:
# Step 1: remove every object-dtype column from the frame.
# `text_features` holds those object-dtype columns as a DataFrame.
text_features = df.select_dtypes(include=['object'])
df = df.drop(columns=text_features.columns)

# Step 2: keep only the label ('status') and the hand-picked feature columns.
# The order below is the column order of the resulting frame, so it also fixes
# the feature order of any array built from it.
basic_features = [
    'status',
    'len_url', 'len_FQDN', 'len_tld',
    'contains_ip',
    'url_entropy', 'FQDN_entropy', 'tld_entropy', 'url_tld_entropy',
    'has_js', 'js_entropy',
    'num_pat_3', 'pat_3_entropy',
    'FQDN_ratio_capital', 'url_len_capital', 'url_ratio_capital',
    'url_char_num_ratio', 'url_spe_char_ratio',
    'FQDN_char_num_ratio', 'FQDN_spe_char_ratio',
    'url_obfuscation_status', 'FQDN_obfuscation_status',
    'obfuscation_entropy_url', 'obfuscation_entropy_FQDN',
    'url_repeated_subs_weight', 'num_url_reapeated_subs',
    'url_num_chunk_weight',
    'n_hypens', 'n_uscores', 'n_semicolon', 'n_equal_sign', 'n_plus_sign',
    'num_segments_FQDN', 'FQDN_seg_dot_sep_entropy',
    'tld_fuzzy_score_lev', 'tld_fuzzy_score_da_nor',
    'tld_similarity_jaro_winkler',
    'dot_probability', 'sum_spe_prob',
    'similarity_index_legit', 'similarity_index_phi',
    'english_word_count_url', 'english_word_count_FQDN',
]
df = df[basic_features]

In [7]:
# Subsample each class separately, then recombine and shuffle.
#
# NOTE: this cell overwrites `df`, so running it twice subsamples the already
# subsampled frame again. Re-run the loading/selection cells first if needed.

# Fraction of each class that is KEPT (0.9 keeps 90% and drops 10%).
# Lower this value (e.g. 0.1) to actually shrink the data set for quick runs.
SAMPLE_FRACTION = .9
SAMPLE_SEED = 42


def _keep_fraction(frame, fraction=SAMPLE_FRACTION, seed=SAMPLE_SEED):
    """Return a reproducible random subset holding `fraction` of `frame`'s rows.

    `stratify` is given the 'status' column of a frame that contains a single
    class, so it does not balance anything; it is retained only to avoid
    changing which rows are drawn.
    """
    kept, _dropped = train_test_split(
        frame,
        train_size=fraction,
        random_state=seed,
        stratify=frame['status'],
    )
    return kept


# Split by label: status == 1 is legitimate, status == 0 is phishing.
df_legit = df[df['status'] == 1]  # Legitimate URLs
df_phish = df[df['status'] == 0]  # Phishing URLs

# Sampling the same fraction from each class keeps the class proportions of
# the input (it does not equalize the class sizes).
df_legit_sample = _keep_fraction(df_legit)
df_phish_sample = _keep_fraction(df_phish)

# Recombine, shuffle the rows reproducibly and renumber the index.
df = (
    pd.concat([df_legit_sample, df_phish_sample])
    .sample(frac=1, random_state=SAMPLE_SEED)
    .reset_index(drop=True)
)

# Check the new class distribution
print(df['status'].value_counts())

status
1    383990
0    342859
Name: count, dtype: int64


In [9]:
# Feature matrix: every column except the label, as a NumPy array.
predictors = df.drop(columns=['status']).to_numpy()
# One-hot encode the 0/1 label into a two-column array (computed once).
target = to_categorical(df['status'])

n_cols = predictors.shape[1]
input_shape = (n_cols,)

# Fully connected classifier: five hidden ReLU layers of 1000 units each.
# Only the first layer needs `input_shape`; later layers infer their input
# size from the layer before them.
model = Sequential(
    [Dense(1000, activation='relu', input_shape=input_shape)]
    + [Dense(1000, activation='relu') for _ in range(4)]
    # The two one-hot classes are mutually exclusive, so the output is a single
    # softmax distribution rather than two independent sigmoid units. For two
    # classes, binary cross-entropy averaged over both softmax outputs is
    # equivalent to categorical cross-entropy, so the compile cells that use
    # loss='binary_crossentropy' keep working unchanged.
    + [Dense(2, activation='softmax')]
)

In [None]:
# Long training run: 400 epochs with no early stopping.
# Keras holds out the last 30% of the rows as validation data.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.fit(
    x=predictors,
    y=target,
    epochs=400,
    validation_split=0.3,
)

Epoch 1/400
Epoch 2/400

In [None]:

# early_stopping_monitor = EarlyStopping(patience=5)

In [None]:
# Shorter training run: 30 epochs, last 30% of the rows held out for validation.
# NOTE: `model` is not rebuilt in this cell, so fitting resumes from whatever
# weights the model currently holds; compile() only sets up the optimizer,
# loss and metrics.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.fit(
    x=predictors,
    y=target,
    epochs=30,
    validation_split=0.3,
)

In [None]:
# 30-epoch run with early stopping.
# The callback is created in this cell so the cell runs on a fresh kernel:
# training stops once the monitored quantity (Keras default: validation loss)
# has not improved for 5 consecutive epochs.
early_stopping_monitor = EarlyStopping(patience=5)

# NOTE: `model` is not rebuilt in this cell, so fitting resumes from whatever
# weights the model currently holds.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(
    predictors,
    target,
    validation_split=0.3,
    epochs=30,
    callbacks=[early_stopping_monitor],
)

In [None]:
# Standardize every feature column with StandardScaler.
#
# The training cells call model.fit(..., validation_split=0.3), and Keras takes
# the validation rows from the END of the arrays. The scaler statistics are
# therefore estimated on the leading training rows only and then applied to
# all rows, so the validation rows do not influence the preprocessing.
VALIDATION_SPLIT = 0.3  # must match the validation_split passed to model.fit
n_train_rows = int(predictors.shape[0] * (1 - VALIDATION_SPLIT))

scaler = StandardScaler()
scaler.fit(predictors[:n_train_rows])
predictors = scaler.transform(predictors)

In [None]:
# Second architecture: four hidden ReLU layers of 500 units, each with
# VarianceScaling weight initialization.
# The input width is read from `predictors`, so this cell does not rely on an
# `input_shape` variable left over from another cell.
model = Sequential([
    Dense(500, activation='relu', input_shape=(predictors.shape[1],),
          kernel_initializer=VarianceScaling()),
    Dense(500, activation='relu', kernel_initializer=VarianceScaling()),
    Dense(500, activation='relu', kernel_initializer=VarianceScaling()),
    Dense(500, activation='relu', kernel_initializer=VarianceScaling()),
    # Output layer: two softmax units, one per column of the one-hot `target`
    # built with to_categorical. The loss expects probabilities, so the output
    # must not be a bare linear unit.
    Dense(2, activation='softmax'),
])

# Categorical cross-entropy pairs with the softmax output and one-hot labels.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Early stopping to limit overfitting: stop once the monitored quantity
# (Keras default: validation loss) has not improved for 3 consecutive epochs.
early_stopping_monitor = EarlyStopping(patience=3)

# Train for at most 30 epochs; the last 30% of the rows are used for validation.
model.fit(
    predictors,
    target,
    validation_split=0.3,
    epochs=30,
    callbacks=[early_stopping_monitor],
)