Importing libraries

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import math
from sklearn.metrics import confusion_matrix, roc_auc_score, mean_squared_error, cohen_kappa_score, matthews_corrcoef
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from imblearn.metrics import geometric_mean_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier


Reading dataset

In [3]:
df = pd.read_csv('kaggle_2.csv')
print (df.status.value_counts())
df.head()

status
legitimate    5715
phishing      5715
Name: count, dtype: int64


Unnamed: 0,url,length_url,length_hostname,ip,nb_dots,nb_hyphens,nb_at,nb_qm,nb_and,nb_or,...,domain_in_title,domain_with_copyright,whois_registered_domain,domain_registration_length,domain_age,web_traffic,dns_record,google_index,page_rank,status
0,http://www.crestonwood.com/router.php,37,19,0,3,0,0,0,0,0,...,0,1,0,45,-1,0,1,1,4,legitimate
1,http://shadetreetechnology.com/V4/validation/a...,77,23,1,1,0,0,0,0,0,...,1,0,0,77,5767,0,0,1,2,phishing
2,https://support-appleld.com.secureupdate.duila...,126,50,1,4,1,0,1,2,0,...,1,0,0,14,4004,5828815,0,1,0,phishing
3,http://rgipt.ac.in,18,11,0,2,0,0,0,0,0,...,1,0,0,62,-1,107721,0,0,3,legitimate
4,http://www.iracing.com/tracks/gateway-motorspo...,55,15,0,2,2,0,0,0,0,...,0,1,0,224,8175,8725,0,0,6,legitimate


data pre-processing

In [4]:
df = df.drop(['url'], axis='columns')
Label = LabelEncoder()
df['status'] = Label.fit_transform(df['status'])
print (df.status.value_counts())
df.head()

status
0    5715
1    5715
Name: count, dtype: int64


Unnamed: 0,length_url,length_hostname,ip,nb_dots,nb_hyphens,nb_at,nb_qm,nb_and,nb_or,nb_eq,...,domain_in_title,domain_with_copyright,whois_registered_domain,domain_registration_length,domain_age,web_traffic,dns_record,google_index,page_rank,status
0,37,19,0,3,0,0,0,0,0,0,...,0,1,0,45,-1,0,1,1,4,0
1,77,23,1,1,0,0,0,0,0,0,...,1,0,0,77,5767,0,0,1,2,1
2,126,50,1,4,1,0,1,2,0,3,...,1,0,0,14,4004,5828815,0,1,0,1
3,18,11,0,2,0,0,0,0,0,0,...,1,0,0,62,-1,107721,0,0,3,0
4,55,15,0,2,2,0,0,0,0,0,...,0,1,0,224,8175,8725,0,0,6,0


In [5]:
input = df.drop (['status'], axis='columns')
target = df.status
print(target.head())
input.head()

0    0
1    1
2    1
3    0
4    0
Name: status, dtype: int32


Unnamed: 0,length_url,length_hostname,ip,nb_dots,nb_hyphens,nb_at,nb_qm,nb_and,nb_or,nb_eq,...,empty_title,domain_in_title,domain_with_copyright,whois_registered_domain,domain_registration_length,domain_age,web_traffic,dns_record,google_index,page_rank
0,37,19,0,3,0,0,0,0,0,0,...,0,0,1,0,45,-1,0,1,1,4
1,77,23,1,1,0,0,0,0,0,0,...,0,1,0,0,77,5767,0,0,1,2
2,126,50,1,4,1,0,1,2,0,3,...,0,1,0,0,14,4004,5828815,0,1,0
3,18,11,0,2,0,0,0,0,0,0,...,0,1,0,0,62,-1,107721,0,0,3
4,55,15,0,2,2,0,0,0,0,0,...,0,0,1,0,224,8175,8725,0,0,6


Data splitting

In [6]:
X_train, X_test, y_train, y_test = train_test_split(input, target, test_size=0.3)
print (len (X_test))
print (len (X_train))

3429
8001


In [7]:
def report (y_test, y_pred):
    report = classification_report(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    auc_score = roc_auc_score(y_test, y_pred, multi_class='ovo')
    mse = mean_squared_error(y_test, y_pred)
    kappa = cohen_kappa_score(y_test, y_pred)
    mcc = matthews_corrcoef(y_test, y_pred)
    g_mean = geometric_mean_score(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='micro')
    recall = recall_score(y_test, y_pred, average='micro')
    f1 = f1_score(y_test, y_pred, average='micro')

    print("Classification Report:\n", report)
    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1-Score:", f1)
    print("AUC Score:", auc_score)
    print("MSE:", mse)
    print("G-Mean", g_mean)
    print("Kappa:", kappa)
    print("MCC:", mcc)
    print("Confusion Matrix:\n", conf_matrix)


Logistic Regression

In [8]:
model = LogisticRegression ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.78      0.78      1720
           1       0.78      0.79      0.79      1709

    accuracy                           0.79      3429
   macro avg       0.79      0.79      0.79      3429
weighted avg       0.79      0.79      0.79      3429

Accuracy: 0.7859434237386993
Precision: 0.7859434237386993
Recall: 0.7859434237386993
F1-Score: 0.7859434237386992
AUC Score: 0.7859711581640291
MSE: 0.21405657626130067
G-Mean 0.7859236066361933
Kappa: 0.5719080735977271
MCC: 0.5720077039388838
Confusion Matrix:
 [[1337  383]
 [ 351 1358]]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Decision Tree

In [9]:
model = DecisionTreeClassifier ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.93      0.93      1720
           1       0.93      0.93      0.93      1709

    accuracy                           0.93      3429
   macro avg       0.93      0.93      0.93      3429
weighted avg       0.93      0.93      0.93      3429

Accuracy: 0.9308836395450568
Precision: 0.9308836395450568
Recall: 0.9308836395450568
F1-Score: 0.9308836395450568
AUC Score: 0.9308801216541701
MSE: 0.06911636045494313
G-Mean 0.9308794757163277
Kappa: 0.8617650806005925
MCC: 0.8617663998935329
Confusion Matrix:
 [[1603  117]
 [ 120 1589]]


Naive Bayes

In [10]:
model = GaussianNB ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.62      0.71      1720
           1       0.70      0.88      0.78      1709

    accuracy                           0.75      3429
   macro avg       0.77      0.75      0.74      3429
weighted avg       0.77      0.75      0.74      3429

Accuracy: 0.7489063867016623
Precision: 0.7489063867016623
Recall: 0.7489063867016623
F1-Score: 0.7489063867016623
AUC Score: 0.7493126335270184
MSE: 0.2510936132983377
G-Mean 0.7385338078141529
Kappa: 0.4982178934006054
MCC: 0.515206519162884
Confusion Matrix:
 [[1071  649]
 [ 212 1497]]


KNN

In [11]:
X_train = np.array(X_train)
X_test = np.array(X_test)
model = KNeighborsClassifier ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.80      0.83      1720
           1       0.81      0.86      0.84      1709

    accuracy                           0.83      3429
   macro avg       0.83      0.83      0.83      3429
weighted avg       0.83      0.83      0.83      3429

Accuracy: 0.8323126275882181
Precision: 0.8323126275882181
Recall: 0.8323126275882181
F1-Score: 0.8323126275882182
AUC Score: 0.8324072625090152
MSE: 0.16768737241178186
G-Mean 0.8318843572381596
Kappa: 0.6646864258779581
MCC: 0.6658887039419629
Confusion Matrix:
 [[1381  339]
 [ 236 1473]]


Random Forest

In [12]:
model = RandomForestClassifier ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.96      0.96      1720
           1       0.96      0.96      0.96      1709

    accuracy                           0.96      3429
   macro avg       0.96      0.96      0.96      3429
weighted avg       0.96      0.96      0.96      3429

Accuracy: 0.9635462233887431
Precision: 0.9635462233887431
Recall: 0.9635462233887431
F1-Score: 0.9635462233887431
AUC Score: 0.9635486548641257
MSE: 0.03645377661125693
G-Mean 0.9635483567477889
Kappa: 0.9270921057398775
MCC: 0.9270935250113654
Confusion Matrix:
 [[1656   64]
 [  61 1648]]


SVM

In [13]:
model = SVC ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

Classification Report:
               precision    recall  f1-score   support

           0       0.73      0.30      0.43      1720
           1       0.56      0.89      0.69      1709

    accuracy                           0.59      3429
   macro avg       0.65      0.60      0.56      3429
weighted avg       0.65      0.59      0.56      3429

Accuracy: 0.594925634295713
Precision: 0.594925634295713
Recall: 0.594925634295713
F1-Score: 0.594925634295713
AUC Score: 0.5958691673357193
MSE: 0.40507436570428695
G-Mean 0.5182186410801783
Kappa: 0.19137560073227933
MCC: 0.23696840124619323
Confusion Matrix:
 [[ 519 1201]
 [ 188 1521]]


Light Boost

In [14]:
model = LGBMClassifier ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

[LightGBM] [Info] Number of positive: 4006, number of negative: 3995
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4853
[LightGBM] [Info] Number of data points in the train set: 8001, number of used features: 73
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500687 -> initscore=0.002750
[LightGBM] [Info] Start training from score 0.002750
Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.97      0.97      1720
           1       0.97      0.96      0.97      1709

    accuracy                           0.97      3429
   macro avg       0.97      0.97      0.97      3429
weighted avg       0.97      0.97      0.97      3429

Accuracy: 0.9664625255176437
Precision: 0.9664625255176437
Recall: 0.9664625255176437
F1-Score: 0.9664625255176437
AUC Score: 0.96645750268755
MSE: 0.033537474482356375
G-Mean 0.9664562343521839
Kappa: 0.9329237332462423
MCC: 0.9329277006007629
Confusion Matrix:
 

Cat Boost

In [15]:
model = CatBoostClassifier ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

Learning rate set to 0.025037
0:	learn: 0.6550415	total: 155ms	remaining: 2m 34s
1:	learn: 0.6197172	total: 168ms	remaining: 1m 23s
2:	learn: 0.5921751	total: 181ms	remaining: 1m
3:	learn: 0.5616103	total: 194ms	remaining: 48.3s
4:	learn: 0.5349656	total: 206ms	remaining: 41s
5:	learn: 0.5119371	total: 220ms	remaining: 36.4s
6:	learn: 0.4893308	total: 229ms	remaining: 32.5s
7:	learn: 0.4668608	total: 237ms	remaining: 29.4s
8:	learn: 0.4458715	total: 244ms	remaining: 26.9s
9:	learn: 0.4260074	total: 249ms	remaining: 24.6s
10:	learn: 0.4082273	total: 254ms	remaining: 22.8s
11:	learn: 0.3943380	total: 258ms	remaining: 21.3s
12:	learn: 0.3796349	total: 263ms	remaining: 20s
13:	learn: 0.3651765	total: 268ms	remaining: 18.9s
14:	learn: 0.3523339	total: 273ms	remaining: 17.9s
15:	learn: 0.3398253	total: 278ms	remaining: 17.1s
16:	learn: 0.3297833	total: 282ms	remaining: 16.3s
17:	learn: 0.3185691	total: 287ms	remaining: 15.6s
18:	learn: 0.3095720	total: 292ms	remaining: 15.1s
19:	learn: 0.302

Gradient Boost

In [16]:
model = GradientBoostingClassifier ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.96      0.96      1720
           1       0.96      0.96      0.96      1709

    accuracy                           0.96      3429
   macro avg       0.96      0.96      0.96      3429
weighted avg       0.96      0.96      0.96      3429

Accuracy: 0.9559638378536016
Precision: 0.9559638378536016
Recall: 0.9559638378536016
F1-Score: 0.9559638378536016
AUC Score: 0.9559624491406644
MSE: 0.04403616214639837
G-Mean 0.955962351123134
Kappa: 0.9119266045682655
MCC: 0.911926759687476
Confusion Matrix:
 [[1645   75]
 [  76 1633]]


Deep Learning

In [17]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

ANN

In [20]:
model = Sequential()
model.add(Dense(128, activation='relu', input_dim=X_train.shape[1]))
model.add(Dense(64, activation='relu'))
model.add(Dense(1, activation='sigmoid'))  # Adjust for multiclass

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

y_pred = model.predict (X_test)
y_pred_binary = (y_pred > 0.5).astype(int)
report (y_test, y_pred_binary)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Classification Report:
               precision    recall  f1-score   support

           0       0.62      0.98      0.76      1720
           1       0.95      0.39      0.55      1709

    accuracy                           0.69      3429
   macro avg       0.79      0.69      0.66      3429
weighted avg       0.78      0.69      0.66      3429

Accuracy: 0.6862058909303004
Precision: 0.6862058909303004
Recall: 0.6862058909303004
F1-Score: 0.6862058909303004
AUC Score: 0.685257766679821
MSE: 0.3137941090696996
G-Mean 0.6182432751225077
Kappa: 0.37121705565070684
MCC: 0.4598559004471813
Confusion Matrix:
 [[1687   33]
 [1043  666]]


  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


CNN

In [21]:
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten

In [22]:
model = Sequential()
model.add(Conv1D(64, kernel_size=3, activation='relu', input_shape=(X_train.shape[1], 1)))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dense(1, activation='sigmoid'))  # Adjust for multiclass

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

y_pred = model.predict (X_test)
y_pred_binary = (y_pred > 0.5).astype(int)
report (y_test, y_pred_binary)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.94      0.84      1720
           1       0.92      0.70      0.79      1709

    accuracy                           0.82      3429
   macro avg       0.84      0.82      0.82      3429
weighted avg       0.84      0.82      0.82      3429

Accuracy: 0.8191892680081656
Precision: 0.8191892680081656
Recall: 0.8191892680081656
F1-Score: 0.8191892680081656
AUC Score: 0.8188113203695891
MSE: 0.18081073199183437
G-Mean 0.810290830680953
Kappa: 0.638102604834627
MCC: 0.6564301209224659
Confusion Matrix:
 [[1611  109]
 [ 511 1198]]


  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


RNN

In [23]:
from tensorflow.keras.layers import SimpleRNN

In [24]:
model = Sequential()
model.add(SimpleRNN(64, activation='relu', input_shape=(X_train.shape[1], 1)))
model.add(Dense(128, activation='relu'))
model.add(Dense(1, activation='sigmoid'))  # Adjust for multiclass

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

y_pred = model.predict (X_test)
y_pred_binary = (y_pred > 0.5).astype(int)
report (y_test, y_pred_binary)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Classification Report:
               precision    recall  f1-score   support

           0       0.71      0.88      0.79      1720
           1       0.84      0.64      0.72      1709

    accuracy                           0.76      3429
   macro avg       0.78      0.76      0.76      3429
weighted avg       0.78      0.76      0.76      3429

Accuracy: 0.7591134441528142
Precision: 0.7591134441528142
Recall: 0.7591134441528142
F1-Score: 0.7591134441528142
AUC Score: 0.7587199096438826
MSE: 0.24088655584718577
G-Mean 0.7487366946455515
Kappa: 0.5178448314897857
MCC: 0.5339838592562746
Confusion Matrix:
 [[1516  204]
 [ 622 1087]]


  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


LSTM

In [25]:
from tensorflow.keras.layers import LSTM

In [26]:
model = Sequential()
model.add(LSTM(64, activation='relu', input_shape=(X_train.shape[1], 1)))
model.add(Dense(128, activation='relu'))
model.add(Dense(1, activation='sigmoid'))  # Adjust for multiclass

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

y_pred = model.predict (X_test)
y_pred_binary = (y_pred > 0.5).astype(int)
report (y_test, y_pred_binary)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Classification Report:
               precision    recall  f1-score   support

           0       0.59      0.93      0.72      1720
           1       0.83      0.34      0.48      1709

    accuracy                           0.64      3429
   macro avg       0.71      0.63      0.60      3429
weighted avg       0.71      0.64      0.60      3429

Accuracy: 0.6351706036745407
Precision: 0.6351706036745407
Recall: 0.6351706036745407
F1-Score: 0.6351706036745407
AUC Score: 0.6342247608420537
MSE: 0.3648293963254593
G-Mean 0.5615224566732128
Kappa: 0.26895628287067075
MCC: 0.33264941518049873
Confusion Matrix:
 [[1598  122]
 [1129  580]]


  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
