Importing libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import math
from sklearn.metrics import confusion_matrix, roc_auc_score, mean_squared_error, cohen_kappa_score, matthews_corrcoef
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.preprocessing import label_binarize
from imblearn.metrics import geometric_mean_score
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier



Reading dataset

In [2]:
df = pd.read_csv('Mendeley_111.csv')
print (df.phishing.value_counts())
df.head()

phishing
0    58000
1    30647
Name: count, dtype: int64


Unnamed: 0,qty_dot_url,qty_hyphen_url,qty_underline_url,qty_slash_url,qty_questionmark_url,qty_equal_url,qty_at_url,qty_and_url,qty_exclamation_url,qty_space_url,...,qty_ip_resolved,qty_nameservers,qty_mx_servers,ttl_hostname,tls_ssl_certificate,qty_redirects,url_google_index,domain_google_index,url_shortened,phishing
0,3,0,0,1,0,0,0,0,0,0,...,1,2,0,892,0,0,0,0,0,1
1,5,0,1,3,0,3,0,2,0,0,...,1,2,1,9540,1,0,0,0,0,1
2,2,0,0,1,0,0,0,0,0,0,...,1,2,3,589,1,0,0,0,0,0
3,4,0,2,5,0,0,0,0,0,0,...,1,2,0,292,1,0,0,0,0,1
4,2,0,0,0,0,0,0,0,0,0,...,1,2,1,3597,0,1,0,0,0,0


Data pre-processing

In [3]:
Label = LabelEncoder()
df['phishing'] = Label.fit_transform(df['phishing'])
print (df.phishing.value_counts())
df.head()

phishing
0    58000
1    30647
Name: count, dtype: int64


Unnamed: 0,qty_dot_url,qty_hyphen_url,qty_underline_url,qty_slash_url,qty_questionmark_url,qty_equal_url,qty_at_url,qty_and_url,qty_exclamation_url,qty_space_url,...,qty_ip_resolved,qty_nameservers,qty_mx_servers,ttl_hostname,tls_ssl_certificate,qty_redirects,url_google_index,domain_google_index,url_shortened,phishing
0,3,0,0,1,0,0,0,0,0,0,...,1,2,0,892,0,0,0,0,0,1
1,5,0,1,3,0,3,0,2,0,0,...,1,2,1,9540,1,0,0,0,0,1
2,2,0,0,1,0,0,0,0,0,0,...,1,2,3,589,1,0,0,0,0,0
3,4,0,2,5,0,0,0,0,0,0,...,1,2,0,292,1,0,0,0,0,1
4,2,0,0,0,0,0,0,0,0,0,...,1,2,1,3597,0,1,0,0,0,0


Data Cleaning

In [4]:
# df = df.replace([np.inf, -np.inf], np.nan)
# df.fillna(0, inplace=True)
# print(df.max())
# print(df.min())
# df.head()

In [5]:
input = df.drop (['phishing'], axis='columns')
target = df.phishing
print(target.head())
input.head()

0    1
1    1
2    0
3    1
4    0
Name: phishing, dtype: int64


Unnamed: 0,qty_dot_url,qty_hyphen_url,qty_underline_url,qty_slash_url,qty_questionmark_url,qty_equal_url,qty_at_url,qty_and_url,qty_exclamation_url,qty_space_url,...,time_domain_expiration,qty_ip_resolved,qty_nameservers,qty_mx_servers,ttl_hostname,tls_ssl_certificate,qty_redirects,url_google_index,domain_google_index,url_shortened
0,3,0,0,1,0,0,0,0,0,0,...,-1,1,2,0,892,0,0,0,0,0
1,5,0,1,3,0,3,0,2,0,0,...,150,1,2,1,9540,1,0,0,0,0
2,2,0,0,1,0,0,0,0,0,0,...,-1,1,2,3,589,1,0,0,0,0
3,4,0,2,5,0,0,0,0,0,0,...,-1,1,2,0,292,1,0,0,0,0
4,2,0,0,0,0,0,0,0,0,0,...,306,1,2,1,3597,0,1,0,0,0


Data splitting

In [6]:
X_train, X_test, y_train, y_test = train_test_split(input, target, test_size=0.2)
print (len (X_test))
print (len (X_train))

17730
70917


In [7]:
y_train.value_counts()

phishing
0    46473
1    24444
Name: count, dtype: int64

In [8]:
# smote = SMOTE(random_state=42)
# X_train, y_train = smote.fit_resample(X_train, y_train)
# y_train.value_counts()

In [9]:
def report (y_test, y_pred):
    n_classes = 2
    y_true_binary = label_binarize(y_test, classes=range(n_classes))
    y_pred_binary = label_binarize(y_pred, classes=range(n_classes))
    auc_score = roc_auc_score(y_true_binary, y_pred_binary, average='macro')

    report = classification_report(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    
    mse = mean_squared_error(y_test, y_pred)
    kappa = cohen_kappa_score(y_test, y_pred)
    mcc = matthews_corrcoef(y_test, y_pred)
    g_mean = geometric_mean_score(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='micro')
    recall = recall_score(y_test, y_pred, average='micro')
    f1 = f1_score(y_test, y_pred, average='micro')

    print("Classification Report:\n", report)
    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1-Score:", f1)
    print("AUC Score:", auc_score)
    print("MSE:", mse)
    print("G-Mean", g_mean)
    print("Kappa:", kappa)
    print("MCC:", mcc)
    print("Confusion Matrix:\n", conf_matrix)


Logistic Regression

In [10]:
model = LogisticRegression ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.92      0.93     11527
           1       0.86      0.89      0.88      6203

    accuracy                           0.91     17730
   macro avg       0.90      0.91      0.90     17730
weighted avg       0.91      0.91      0.91     17730

Accuracy: 0.9114495205865765
Precision: 0.9114495205865765
Recall: 0.9114495205865765
F1-Score: 0.9114495205865765
AUC Score: 0.9069178656742392
MSE: 0.08855047941342357
G-Mean 0.9067922953102974
Kappa: 0.806985729656708
MCC: 0.8073029354378816
Confusion Matrix:
 [[10628   899]
 [  671  5532]]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Decision Tree

In [11]:
model = DecisionTreeClassifier ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.97      0.97     11527
           1       0.94      0.93      0.94      6203

    accuracy                           0.95     17730
   macro avg       0.95      0.95      0.95     17730
weighted avg       0.95      0.95      0.95     17730

Accuracy: 0.9548223350253807
Precision: 0.9548223350253807
Recall: 0.9548223350253807
F1-Score: 0.9548223350253807
AUC Score: 0.9496934567449258
MSE: 0.04517766497461929
G-Mean 0.9495398508636298
Kappa: 0.9005603804270036
MCC: 0.9005688816019805
Confusion Matrix:
 [[11144   383]
 [  418  5785]]


Naive Bayes

In [12]:
model = GaussianNB ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.96      0.88     11527
           1       0.89      0.61      0.73      6203

    accuracy                           0.84     17730
   macro avg       0.85      0.79      0.81     17730
weighted avg       0.84      0.84      0.83     17730

Accuracy: 0.8378454596728708
Precision: 0.8378454596728708
Recall: 0.8378454596728708
F1-Score: 0.8378454596728708
AUC Score: 0.7862392581822313
MSE: 0.16215454032712917
G-Mean 0.7672265728831383
Kappa: 0.6162783885865605
MCC: 0.6373121716570057
Confusion Matrix:
 [[11044   483]
 [ 2392  3811]]


Random Forest

In [13]:
model = RandomForestClassifier ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.98      0.98     11527
           1       0.96      0.96      0.96      6203

    accuracy                           0.97     17730
   macro avg       0.97      0.97      0.97     17730
weighted avg       0.97      0.97      0.97     17730

Accuracy: 0.9703891708967851
Precision: 0.9703891708967851
Recall: 0.9703891708967851
F1-Score: 0.9703891708967851
AUC Score: 0.9675476473861613
MSE: 0.02961082910321489
G-Mean 0.967501371801563
Kappa: 0.9349212604056942
MCC: 0.9349214399802636
Confusion Matrix:
 [[11262   265]
 [  260  5943]]


LightBoost

In [14]:
model = LGBMClassifier ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

[LightGBM] [Info] Number of positive: 24444, number of negative: 46473
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2999
[LightGBM] [Info] Number of data points in the train set: 70917, number of used features: 96
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.344685 -> initscore=-0.642487
[LightGBM] [Info] Start training from score -0.642487
Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.97      0.97     11527
           1       0.95      0.95      0.95      6203

    accuracy                           0.97     17730
   macro avg       0.96      0.96      0.96     17730
weighted avg       0.97      0.97      0.97     17730

Accuracy: 0.9653694303440497
Precision: 0.9653694303440497
Recall: 0.9653694303440497
F1-Score: 0.9653694303440497
AUC Score: 0.9619001185435687
MSE: 0.03463056965595036
G-Mean 0.9618307306450264

CatBoost

In [15]:
model = CatBoostClassifier ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

Learning rate set to 0.063563
0:	learn: 0.5728588	total: 193ms	remaining: 3m 12s
1:	learn: 0.4913475	total: 232ms	remaining: 1m 55s
2:	learn: 0.4231817	total: 272ms	remaining: 1m 30s
3:	learn: 0.3724401	total: 311ms	remaining: 1m 17s
4:	learn: 0.3353419	total: 348ms	remaining: 1m 9s
5:	learn: 0.3060701	total: 393ms	remaining: 1m 5s
6:	learn: 0.2843189	total: 443ms	remaining: 1m 2s
7:	learn: 0.2654632	total: 484ms	remaining: 60s
8:	learn: 0.2516939	total: 518ms	remaining: 57.1s
9:	learn: 0.2398410	total: 546ms	remaining: 54s
10:	learn: 0.2296271	total: 576ms	remaining: 51.8s
11:	learn: 0.2230025	total: 604ms	remaining: 49.7s
12:	learn: 0.2161637	total: 633ms	remaining: 48.1s
13:	learn: 0.2105953	total: 661ms	remaining: 46.5s
14:	learn: 0.2057802	total: 692ms	remaining: 45.5s
15:	learn: 0.2000349	total: 732ms	remaining: 45s
16:	learn: 0.1949914	total: 765ms	remaining: 44.2s
17:	learn: 0.1909314	total: 814ms	remaining: 44.4s
18:	learn: 0.1884265	total: 849ms	remaining: 43.8s
19:	learn: 0.

Gradient Boost

In [16]:
model = GradientBoostingClassifier ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.96      0.96     11527
           1       0.93      0.93      0.93      6203

    accuracy                           0.95     17730
   macro avg       0.95      0.95      0.95     17730
weighted avg       0.95      0.95      0.95     17730

Accuracy: 0.9518894529046813
Precision: 0.9518894529046813
Recall: 0.9518894529046813
F1-Score: 0.9518894529046813
AUC Score: 0.9471400449730197
MSE: 0.048110547095318666
G-Mean 0.9470079742942717
Kappa: 0.894246797372305
MCC: 0.8942468042448309
Confusion Matrix:
 [[11100   427]
 [  426  5777]]


SVM

In [17]:
model = SVC ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.86      0.82     11527
           1       0.68      0.57      0.62      6203

    accuracy                           0.75     17730
   macro avg       0.73      0.71      0.72     17730
weighted avg       0.75      0.75      0.75     17730

Accuracy: 0.7548223350253808
Precision: 0.7548223350253808
Recall: 0.7548223350253808
F1-Score: 0.7548223350253807
AUC Score: 0.7116297728869918
MSE: 0.2451776649746193
G-Mean 0.6969411667119438
Kappa: 0.4398818832270923
MCC: 0.4436928362566817
Confusion Matrix:
 [[9861 1666]
 [2681 3522]]


KNN

In [18]:
X_train = np.array(X_train)
X_test = np.array(X_test)
model = KNeighborsClassifier ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.91      0.90     11527
           1       0.83      0.80      0.82      6203

    accuracy                           0.87     17730
   macro avg       0.86      0.86      0.86     17730
weighted avg       0.87      0.87      0.87     17730

Accuracy: 0.8741116751269036
Precision: 0.8741116751269036
Recall: 0.8741116751269036
F1-Score: 0.8741116751269036
AUC Score: 0.8576890897610235
MSE: 0.12588832487309645
G-Mean 0.8559436426868952
Kappa: 0.7210693668862975
MCC: 0.721322559781017
Confusion Matrix:
 [[10517  1010]
 [ 1222  4981]]


Deep Learning

In [19]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense




ANN

In [20]:
model = Sequential()
model.add(Dense(128, activation='relu', input_dim=X_train.shape[1]))
model.add(Dense(64, activation='relu'))
model.add(Dense(1, activation='sigmoid'))  # Adjust for multiclass

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

y_pred = model.predict (X_test)
y_pred_binary = (y_pred > 0.5).astype(int)
report (y_test, y_pred_binary)



Epoch 1/10


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.91      0.93     11527
           1       0.84      0.91      0.88      6203

    accuracy                           0.91     17730
   macro avg       0.90      0.91      0.90     17730
weighted avg       0.91      0.91      0.91     17730

Accuracy: 0.9086294416243654
Precision: 0.9086294416243654
Recall: 0.9086294416243654
F1-Score: 0.9086294416243654
AUC Score: 0.9100356674033968
MSE: 0.09137055837563451
G-Mean 0.910023617988648
Kappa: 0.8032645403762636
MCC: 0.8051419410814323
Confusion Matrix:
 [[10436  1091]
 [  529  5674]]


  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


CNN

In [21]:
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten

In [22]:
model = Sequential()
model.add(Conv1D(64, kernel_size=3, activation='relu', input_shape=(X_train.shape[1], 1)))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dense(1, activation='sigmoid'))  # Adjust for multiclass

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

y_pred = model.predict (X_test)
y_pred_binary = (y_pred > 0.5).astype(int)
report (y_test, y_pred_binary)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.94      0.95     11527
           1       0.89      0.92      0.90      6203

    accuracy                           0.93     17730
   macro avg       0.92      0.93      0.93     17730
weighted avg       0.93      0.93      0.93     17730

Accuracy: 0.9315284827975183
Precision: 0.9315284827975183
Recall: 0.9315284827975183
F1-Score: 0.9315284827975183
AUC Score: 0.9290239944540837
MSE: 0.06847151720248167
G-Mean 0.9289865548513777
Kappa: 0.8507630413804119
MCC: 0.8511033008168584
Confusion Matrix:
 [[10805   722]
 [  492  5711]]


  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


RNN

In [23]:
from tensorflow.keras.layers import SimpleRNN

In [24]:
model = Sequential()
model.add(SimpleRNN(64, activation='relu', input_shape=(X_train.shape[1], 1)))
model.add(Dense(128, activation='relu'))
model.add(Dense(1, activation='sigmoid'))  # Adjust for multiclass

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

y_pred = model.predict (X_test)
y_pred_binary = (y_pred > 0.5).astype(int)
report (y_test, y_pred_binary)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.82      0.81     11527
           1       0.66      0.63      0.64      6203

    accuracy                           0.76     17730
   macro avg       0.73      0.73      0.73     17730
weighted avg       0.75      0.76      0.75     17730

Accuracy: 0.7555555555555555
Precision: 0.7555555555555555
Recall: 0.7555555555555555
F1-Score: 0.7555555555555555
AUC Score: 0.7262292774797386
MSE: 0.24444444444444444
G-Mean 0.7196325509293594
Kappa: 0.4571216289899115
MCC: 0.45739110124483034
Confusion Matrix:
 [[9497 2030]
 [2304 3899]]


  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


LSTM

In [25]:
from tensorflow.keras.layers import LSTM

In [26]:
model = Sequential()
model.add(LSTM(64, activation='relu', input_shape=(X_train.shape[1], 1)))
model.add(Dense(128, activation='relu'))
model.add(Dense(1, activation='sigmoid'))  # Adjust for multiclass

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

y_pred = model.predict (X_test)
y_pred_binary = (y_pred > 0.5).astype(int)
report (y_test, y_pred_binary)

Epoch 1/10
Epoch 2/10
Epoch 3/10