Importing libraries

In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import math
from sklearn.metrics import confusion_matrix, roc_auc_score, mean_squared_error, cohen_kappa_score, matthews_corrcoef
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from imblearn.metrics import geometric_mean_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier


Reading dataset

In [5]:
df = pd.read_csv('kaggle_2.csv')
print (df.status.value_counts())
df.head()

status
legitimate    5715
phishing      5715
Name: count, dtype: int64


Unnamed: 0,url,length_url,length_hostname,ip,nb_dots,nb_hyphens,nb_at,nb_qm,nb_and,nb_or,...,domain_in_title,domain_with_copyright,whois_registered_domain,domain_registration_length,domain_age,web_traffic,dns_record,google_index,page_rank,status
0,http://www.crestonwood.com/router.php,37,19,0,3,0,0,0,0,0,...,0,1,0,45,-1,0,1,1,4,legitimate
1,http://shadetreetechnology.com/V4/validation/a...,77,23,1,1,0,0,0,0,0,...,1,0,0,77,5767,0,0,1,2,phishing
2,https://support-appleld.com.secureupdate.duila...,126,50,1,4,1,0,1,2,0,...,1,0,0,14,4004,5828815,0,1,0,phishing
3,http://rgipt.ac.in,18,11,0,2,0,0,0,0,0,...,1,0,0,62,-1,107721,0,0,3,legitimate
4,http://www.iracing.com/tracks/gateway-motorspo...,55,15,0,2,2,0,0,0,0,...,0,1,0,224,8175,8725,0,0,6,legitimate


data pre-processing

In [6]:
df = df.drop(['url'], axis='columns')
Label = LabelEncoder()
df['status'] = Label.fit_transform(df['status'])
print (df.status.value_counts())
df.head()

status
0    5715
1    5715
Name: count, dtype: int64


Unnamed: 0,length_url,length_hostname,ip,nb_dots,nb_hyphens,nb_at,nb_qm,nb_and,nb_or,nb_eq,...,domain_in_title,domain_with_copyright,whois_registered_domain,domain_registration_length,domain_age,web_traffic,dns_record,google_index,page_rank,status
0,37,19,0,3,0,0,0,0,0,0,...,0,1,0,45,-1,0,1,1,4,0
1,77,23,1,1,0,0,0,0,0,0,...,1,0,0,77,5767,0,0,1,2,1
2,126,50,1,4,1,0,1,2,0,3,...,1,0,0,14,4004,5828815,0,1,0,1
3,18,11,0,2,0,0,0,0,0,0,...,1,0,0,62,-1,107721,0,0,3,0
4,55,15,0,2,2,0,0,0,0,0,...,0,1,0,224,8175,8725,0,0,6,0


In [7]:
input = df.drop (['status'], axis='columns')
target = df.status
print(target.head())
input.head()

0    0
1    1
2    1
3    0
4    0
Name: status, dtype: int32


Unnamed: 0,length_url,length_hostname,ip,nb_dots,nb_hyphens,nb_at,nb_qm,nb_and,nb_or,nb_eq,...,empty_title,domain_in_title,domain_with_copyright,whois_registered_domain,domain_registration_length,domain_age,web_traffic,dns_record,google_index,page_rank
0,37,19,0,3,0,0,0,0,0,0,...,0,0,1,0,45,-1,0,1,1,4
1,77,23,1,1,0,0,0,0,0,0,...,0,1,0,0,77,5767,0,0,1,2
2,126,50,1,4,1,0,1,2,0,3,...,0,1,0,0,14,4004,5828815,0,1,0
3,18,11,0,2,0,0,0,0,0,0,...,0,1,0,0,62,-1,107721,0,0,3
4,55,15,0,2,2,0,0,0,0,0,...,0,0,1,0,224,8175,8725,0,0,6


Data splitting

In [8]:
X_train, X_test, y_train, y_test = train_test_split(input, target, test_size=0.3)
print (len (X_test))
print (len (X_train))

3429
8001


In [9]:
def report (y_test, y_pred):
    report = classification_report(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    auc_score = roc_auc_score(y_test, y_pred, multi_class='ovo')
    mse = mean_squared_error(y_test, y_pred)
    kappa = cohen_kappa_score(y_test, y_pred)
    mcc = matthews_corrcoef(y_test, y_pred)
    g_mean = geometric_mean_score(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='micro')
    recall = recall_score(y_test, y_pred, average='micro')
    f1 = f1_score(y_test, y_pred, average='micro')

    print("Classification Report:\n", report)
    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1-Score:", f1)
    print("AUC Score:", auc_score)
    print("MSE:", mse)
    print("G-Mean", g_mean)
    print("Kappa:", kappa)
    print("MCC:", mcc)
    print("Confusion Matrix:\n", conf_matrix)


Logistic Regression

In [7]:
model = LogisticRegression ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.77      0.78      1728
           1       0.77      0.79      0.78      1701

    accuracy                           0.78      3429
   macro avg       0.78      0.78      0.78      3429
weighted avg       0.78      0.78      0.78      3429

Accuracy: 0.7809857101195684
Precision: 0.7809857101195684
Recall: 0.7809857101195684
F1-Score: 0.7809857101195684
AUC Score: 0.7810800631981186
MSE: 0.21901428988043162
G-Mean 0.7809881411697129
Kappa: 0.5620388015153255
MCC: 0.5622500376441574
Confusion Matrix:
 [[1329  399]
 [ 352 1349]]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Decision Tree

In [8]:
model = DecisionTreeClassifier ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.93      0.94      1728
           1       0.93      0.94      0.94      1701

    accuracy                           0.94      3429
   macro avg       0.94      0.94      0.94      3429
weighted avg       0.94      0.94      0.94      3429

Accuracy: 0.9364246135899679
Precision: 0.9364246135899679
Recall: 0.9364246135899679
F1-Score: 0.9364246135899679
AUC Score: 0.9364390432098766
MSE: 0.06357538641003208
G-Mean 0.9364372500949226
Kappa: 0.8728460153580483
MCC: 0.872855517940913
Confusion Matrix:
 [[1615  113]
 [ 105 1596]]


Naive Bayes

In [9]:
model = GaussianNB ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.59      0.70      1728
           1       0.68      0.88      0.77      1701

    accuracy                           0.74      3429
   macro avg       0.76      0.74      0.73      3429
weighted avg       0.76      0.74      0.73      3429

Accuracy: 0.7381160688247302
Precision: 0.7381160688247302
Recall: 0.7381160688247302
F1-Score: 0.7381160688247301
AUC Score: 0.7392618312757202
MSE: 0.26188393117526976
G-Mean 0.7247995323811389
Kappa: 0.47741415178203206
MCC: 0.49956347099459997
Confusion Matrix:
 [[1026  702]
 [ 196 1505]]


KNN

In [10]:
X_train = np.array(X_train)
X_test = np.array(X_test)
model = KNeighborsClassifier ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.81      0.82      1728
           1       0.81      0.84      0.83      1701

    accuracy                           0.82      3429
   macro avg       0.82      0.82      0.82      3429
weighted avg       0.82      0.82      0.82      3429

Accuracy: 0.8241469816272966
Precision: 0.8241469816272966
Recall: 0.8241469816272966
F1-Score: 0.8241469816272966
AUC Score: 0.8242486037624926
MSE: 0.17585301837270342
G-Mean 0.8241475569824259
Kappa: 0.6483512951321765
MCC: 0.6486161585810514
Confusion Matrix:
 [[1402  326]
 [ 277 1424]]


Random Forest

In [11]:
model = RandomForestClassifier ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.97      0.97      1728
           1       0.97      0.97      0.97      1701

    accuracy                           0.97      3429
   macro avg       0.97      0.97      0.97      3429
weighted avg       0.97      0.97      0.97      3429

Accuracy: 0.9667541557305337
Precision: 0.9667541557305337
Recall: 0.9667541557305337
F1-Score: 0.9667541557305337
AUC Score: 0.9667475014697237
MSE: 0.033245844269466314
G-Mean 0.9667471320976067
Kappa: 0.9335035778914783
MCC: 0.9335042131248126
Confusion Matrix:
 [[1672   56]
 [  58 1643]]


SVM

In [12]:
model = SVC ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.33      0.46      1728
           1       0.57      0.89      0.69      1701

    accuracy                           0.61      3429
   macro avg       0.66      0.61      0.57      3429
weighted avg       0.66      0.61      0.57      3429

Accuracy: 0.6062992125984252
Precision: 0.6062992125984252
Recall: 0.6062992125984252
F1-Score: 0.6062992125984252
AUC Score: 0.6085207231040564
MSE: 0.3937007874015748
G-Mean 0.5391651867187541
Kappa: 0.21607088968752575
MCC: 0.2625123331866507
Confusion Matrix:
 [[ 564 1164]
 [ 186 1515]]


Light Boost

In [13]:
model = LGBMClassifier ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

[LightGBM] [Info] Number of positive: 4014, number of negative: 3987
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4818
[LightGBM] [Info] Number of data points in the train set: 8001, number of used features: 72
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501687 -> initscore=0.006749
[LightGBM] [Info] Start training from score 0.006749
Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.97      0.97      1728
           1       0.97      0.97      0.97      1701

    accuracy                           0.97      3429
   macro avg       0.97      0.97      0.97      3429
weighted avg       0.97      0.97      0.97      3429

Accuracy: 0.9687955672207641
Precision: 0.9687955672207641
Recall: 0.9687955672207641
F1-Score: 0.9687955672207641
AUC Score: 0.9687867430922988
MSE: 0.03120443277923593
G-Mean 0.9687860949161463
Kapp

Cat Boost

In [14]:
model = CatBoostClassifier ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

Learning rate set to 0.025037
0:	learn: 0.6534968	total: 150ms	remaining: 2m 29s
1:	learn: 0.6203327	total: 155ms	remaining: 1m 17s
2:	learn: 0.5920987	total: 161ms	remaining: 53.6s
3:	learn: 0.5630204	total: 166ms	remaining: 41.4s
4:	learn: 0.5364898	total: 171ms	remaining: 34.1s
5:	learn: 0.5141459	total: 177ms	remaining: 29.3s
6:	learn: 0.4911047	total: 182ms	remaining: 25.9s
7:	learn: 0.4692787	total: 187ms	remaining: 23.2s
8:	learn: 0.4494379	total: 194ms	remaining: 21.3s
9:	learn: 0.4299626	total: 200ms	remaining: 19.8s
10:	learn: 0.4125620	total: 206ms	remaining: 18.5s
11:	learn: 0.3965035	total: 212ms	remaining: 17.5s
12:	learn: 0.3820164	total: 217ms	remaining: 16.5s
13:	learn: 0.3685792	total: 223ms	remaining: 15.7s
14:	learn: 0.3572684	total: 228ms	remaining: 15s
15:	learn: 0.3446822	total: 233ms	remaining: 14.3s
16:	learn: 0.3345706	total: 237ms	remaining: 13.7s
17:	learn: 0.3231471	total: 243ms	remaining: 13.3s
18:	learn: 0.3144515	total: 248ms	remaining: 12.8s
19:	learn: 

Gradient Boost

In [15]:
model = GradientBoostingClassifier ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.96      0.96      1728
           1       0.96      0.95      0.96      1701

    accuracy                           0.96      3429
   macro avg       0.96      0.96      0.96      3429
weighted avg       0.96      0.96      0.96      3429

Accuracy: 0.9559638378536016
Precision: 0.9559638378536016
Recall: 0.9559638378536016
F1-Score: 0.9559638378536016
AUC Score: 0.9559220679012346
MSE: 0.04403616214639837
G-Mean 0.9559073486315552
Kappa: 0.9119153373049593
MCC: 0.9119601809697692
Confusion Matrix:
 [[1661   67]
 [  84 1617]]


Deep Learning

In [3]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense




ANN

In [15]:
model = Sequential()
model.add(Dense(128, activation='relu', input_dim=X_train.shape[1]))
model.add(Dense(64, activation='relu'))
model.add(Dense(1, activation='sigmoid'))  # Adjust for multiclass

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=12, batch_size=32, validation_split=0.2)

y_pred = model.predict (X_test)
y_pred_binary = (y_pred > 0.5).astype(int)
report (y_test, y_pred_binary)

Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12
Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.71      0.77      1739
           1       0.75      0.87      0.80      1690

    accuracy                           0.79      3429
   macro avg       0.80      0.79      0.79      3429
weighted avg       0.80      0.79      0.79      3429

Accuracy: 0.7885680956547099
Precision: 0.7885680956547099
Recall: 0.7885680956547099
F1-Score: 0.7885680956547099
AUC Score: 0.789662834180019
MSE: 0.21143190434529016
G-Mean 0.7859379100067614
Kappa: 0.5780102074332262
MCC: 0.585448610652845
Confusion Matrix:
 [[1240  499]
 [ 226 1464]]


  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


CNN

In [21]:
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten

In [32]:
model = Sequential()
model.add(Conv1D(64, kernel_size=3, activation='relu', input_shape=(X_train.shape[1], 1)))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dense(1, activation='sigmoid'))  # Adjust for multiclass

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=12, batch_size=32, validation_split=0.2)

y_pred = model.predict (X_test)
y_pred_binary = (y_pred > 0.5).astype(int)
report (y_test, y_pred_binary)

Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12
Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.81      0.84      1728
           1       0.82      0.88      0.85      1701

    accuracy                           0.85      3429
   macro avg       0.85      0.85      0.85      3429
weighted avg       0.85      0.85      0.85      3429

Accuracy: 0.8468941382327209
Precision: 0.8468941382327209
Recall: 0.8468941382327209
F1-Score: 0.846894138232721
AUC Score: 0.8471717004703117
MSE: 0.15310586176727908
G-Mean 0.8464380066391914
Kappa: 0.6939449996098286
MCC: 0.6957948187042661
Confusion Matrix:
 [[1403  325]
 [ 200 1501]]


  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


RNN

In [23]:
from tensorflow.keras.layers import SimpleRNN

In [34]:
model = Sequential()
model.add(SimpleRNN(64, activation='relu', input_shape=(X_train.shape[1], 1)))
model.add(Dense(128, activation='relu'))
model.add(Dense(1, activation='sigmoid'))  # Adjust for multiclass

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=12, batch_size=32, validation_split=0.2)

y_pred = model.predict (X_test)
y_pred_binary = (y_pred > 0.5).astype(int)
report (y_test, y_pred_binary)

Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12
Classification Report:
               precision    recall  f1-score   support

           0       0.71      0.88      0.78      1728
           1       0.83      0.63      0.72      1701

    accuracy                           0.76      3429
   macro avg       0.77      0.75      0.75      3429
weighted avg       0.77      0.76      0.75      3429

Accuracy: 0.7556138815981336
Precision: 0.7556138815981336
Recall: 0.7556138815981336
F1-Score: 0.7556138815981335
AUC Score: 0.7546663727219283
MSE: 0.24438611840186644
G-Mean 0.7450108403663496
Kappa: 0.5102842786280266
MCC: 0.5252832257299088
Confusion Matrix:
 [[1512  216]
 [ 622 1079]]


  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


LSTM

In [25]:
from tensorflow.keras.layers import LSTM

In [26]:
model = Sequential()
model.add(LSTM(64, activation='relu', input_shape=(X_train.shape[1], 1)))
model.add(Dense(128, activation='relu'))
model.add(Dense(1, activation='sigmoid'))  # Adjust for multiclass

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

y_pred = model.predict (X_test)
y_pred_binary = (y_pred > 0.5).astype(int)
report (y_test, y_pred_binary)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.34      0.47      1728
           1       0.58      0.92      0.71      1701

    accuracy                           0.62      3429
   macro avg       0.69      0.63      0.59      3429
weighted avg       0.69      0.62      0.59      3429

Accuracy: 0.6237970253718286
Precision: 0.6237970253718286
Recall: 0.6237970253718286
F1-Score: 0.6237970253718286
AUC Score: 0.6260839212228101
MSE: 0.3762029746281715
G-Mean 0.5546423515494047
Kappa: 0.25100807373339307
MCC: 0.30924380040644506
Confusion Matrix:
 [[ 580 1148]
 [ 142 1559]]


  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
