Importing libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import math
from sklearn.metrics import confusion_matrix, roc_auc_score, mean_squared_error, cohen_kappa_score, matthews_corrcoef
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from imblearn.metrics import geometric_mean_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier


Reading dataset

In [2]:
df = pd.read_csv('kaggle_2.csv')
print (df.status.value_counts())
df.head()

status
legitimate    5715
phishing      5715
Name: count, dtype: int64


Unnamed: 0,url,length_url,length_hostname,ip,nb_dots,nb_hyphens,nb_at,nb_qm,nb_and,nb_or,...,domain_in_title,domain_with_copyright,whois_registered_domain,domain_registration_length,domain_age,web_traffic,dns_record,google_index,page_rank,status
0,http://www.crestonwood.com/router.php,37,19,0,3,0,0,0,0,0,...,0,1,0,45,-1,0,1,1,4,legitimate
1,http://shadetreetechnology.com/V4/validation/a...,77,23,1,1,0,0,0,0,0,...,1,0,0,77,5767,0,0,1,2,phishing
2,https://support-appleld.com.secureupdate.duila...,126,50,1,4,1,0,1,2,0,...,1,0,0,14,4004,5828815,0,1,0,phishing
3,http://rgipt.ac.in,18,11,0,2,0,0,0,0,0,...,1,0,0,62,-1,107721,0,0,3,legitimate
4,http://www.iracing.com/tracks/gateway-motorspo...,55,15,0,2,2,0,0,0,0,...,0,1,0,224,8175,8725,0,0,6,legitimate


data pre-processing

In [3]:
df = df.drop(['url'], axis='columns')
Label = LabelEncoder()
df['status'] = Label.fit_transform(df['status'])
print (df.status.value_counts())
df.head()

status
0    5715
1    5715
Name: count, dtype: int64


Unnamed: 0,length_url,length_hostname,ip,nb_dots,nb_hyphens,nb_at,nb_qm,nb_and,nb_or,nb_eq,...,domain_in_title,domain_with_copyright,whois_registered_domain,domain_registration_length,domain_age,web_traffic,dns_record,google_index,page_rank,status
0,37,19,0,3,0,0,0,0,0,0,...,0,1,0,45,-1,0,1,1,4,0
1,77,23,1,1,0,0,0,0,0,0,...,1,0,0,77,5767,0,0,1,2,1
2,126,50,1,4,1,0,1,2,0,3,...,1,0,0,14,4004,5828815,0,1,0,1
3,18,11,0,2,0,0,0,0,0,0,...,1,0,0,62,-1,107721,0,0,3,0
4,55,15,0,2,2,0,0,0,0,0,...,0,1,0,224,8175,8725,0,0,6,0


In [4]:
input = df.drop (['status'], axis='columns')
target = df.status
print(target.head())
input.head()

0    0
1    1
2    1
3    0
4    0
Name: status, dtype: int32


Unnamed: 0,length_url,length_hostname,ip,nb_dots,nb_hyphens,nb_at,nb_qm,nb_and,nb_or,nb_eq,...,empty_title,domain_in_title,domain_with_copyright,whois_registered_domain,domain_registration_length,domain_age,web_traffic,dns_record,google_index,page_rank
0,37,19,0,3,0,0,0,0,0,0,...,0,0,1,0,45,-1,0,1,1,4
1,77,23,1,1,0,0,0,0,0,0,...,0,1,0,0,77,5767,0,0,1,2
2,126,50,1,4,1,0,1,2,0,3,...,0,1,0,0,14,4004,5828815,0,1,0
3,18,11,0,2,0,0,0,0,0,0,...,0,1,0,0,62,-1,107721,0,0,3
4,55,15,0,2,2,0,0,0,0,0,...,0,0,1,0,224,8175,8725,0,0,6


Data splitting

In [5]:
X_train, X_test, y_train, y_test = train_test_split(input, target, test_size=0.2)
print (len (X_test))
print (len (X_train))

2286
9144


In [6]:
def report (y_test, y_pred):
    report = classification_report(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    auc_score = roc_auc_score(y_test, y_pred, multi_class='ovo')
    mse = mean_squared_error(y_test, y_pred)
    kappa = cohen_kappa_score(y_test, y_pred)
    mcc = matthews_corrcoef(y_test, y_pred)
    g_mean = geometric_mean_score(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='micro')
    recall = recall_score(y_test, y_pred, average='micro')
    f1 = f1_score(y_test, y_pred, average='micro')

    print("Classification Report:\n", report)
    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1-Score:", f1)
    print("AUC Score:", auc_score)
    print("MSE:", mse)
    print("G-Mean", g_mean)
    print("Kappa:", kappa)
    print("MCC:", mcc)
    print("Confusion Matrix:\n", conf_matrix)


Logistic Regression

In [7]:
model = LogisticRegression ()
model.fit (X_train, y_train)
print (model.get_params())
y_pred = model.predict (X_test)
report (y_test, y_pred)

{'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}
Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.76      0.77      1154
           1       0.76      0.78      0.77      1132

    accuracy                           0.77      2286
   macro avg       0.77      0.77      0.77      2286
weighted avg       0.77      0.77      0.77      2286

Accuracy: 0.773403324584427
Precision: 0.773403324584427
Recall: 0.773403324584427
F1-Score: 0.773403324584427
AUC Score: 0.7735002235273225
MSE: 0.22659667541557305
G-Mean 0.7734346885538574
Kappa: 0.5468715082088661
MCC: 0.547035623878444
Confusion Matrix:
 [[881 273]
 [245 887]]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Decision Tree

In [8]:
model = DecisionTreeClassifier ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.92      0.94      1154
           1       0.92      0.95      0.94      1132

    accuracy                           0.94      2286
   macro avg       0.94      0.94      0.94      2286
weighted avg       0.94      0.94      0.94      2286

Accuracy: 0.9352580927384077
Precision: 0.9352580927384077
Recall: 0.9352580927384077
F1-Score: 0.9352580927384077
AUC Score: 0.9353699836488233
MSE: 0.0647419072615923
G-Mean 0.9352977232995456
Kappa: 0.8705347166311046
MCC: 0.8707959633513416
Confusion Matrix:
 [[1066   88]
 [  60 1072]]


Naive Bayes

In [9]:
model = GaussianNB ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.63      0.71      1154
           1       0.69      0.86      0.77      1132

    accuracy                           0.74      2286
   macro avg       0.75      0.74      0.74      2286
weighted avg       0.76      0.74      0.74      2286

Accuracy: 0.7414698162729659
Precision: 0.7414698162729659
Recall: 0.7414698162729659
F1-Score: 0.7414698162729659
AUC Score: 0.7425531719445652
MSE: 0.2585301837270341
G-Mean 0.7339707725970226
Kappa: 0.4840343990815912
MCC: 0.497321363962026
Confusion Matrix:
 [[727 427]
 [164 968]]


KNN

In [10]:
X_train = np.array(X_train)
X_test = np.array(X_test)
model = KNeighborsClassifier ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.82      0.83      1154
           1       0.82      0.84      0.83      1132

    accuracy                           0.83      2286
   macro avg       0.83      0.83      0.83      2286
weighted avg       0.83      0.83      0.83      2286

Accuracy: 0.8333333333333334
Precision: 0.8333333333333334
Recall: 0.8333333333333334
F1-Score: 0.8333333333333334
AUC Score: 0.8334399936310023
MSE: 0.16666666666666666
G-Mean 0.8333663004096068
Kappa: 0.6667171776309899
MCC: 0.6669318090163201
Confusion Matrix:
 [[949 205]
 [176 956]]


Random Forest

In [11]:
model = RandomForestClassifier ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.96      0.97      1154
           1       0.96      0.98      0.97      1132

    accuracy                           0.97      2286
   macro avg       0.97      0.97      0.97      2286
weighted avg       0.97      0.97      0.97      2286

Accuracy: 0.9663167104111986
Precision: 0.9663167104111986
Recall: 0.9663167104111986
F1-Score: 0.9663167104111986
AUC Score: 0.966402006234269
MSE: 0.0336832895888014
G-Mean 0.9663613634049885
Kappa: 0.9326390924558324
MCC: 0.9327965148691263
Confusion Matrix:
 [[1105   49]
 [  28 1104]]


SVM

In [12]:
model = SVC ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

Classification Report:
               precision    recall  f1-score   support

           0       0.71      0.29      0.41      1154
           1       0.55      0.88      0.68      1132

    accuracy                           0.58      2286
   macro avg       0.63      0.59      0.54      2286
weighted avg       0.63      0.58      0.54      2286

Accuracy: 0.5822397200349956
Precision: 0.5822397200349956
Recall: 0.5822397200349956
F1-Score: 0.5822397200349956
AUC Score: 0.585051380663968
MSE: 0.41776027996500437
G-Mean 0.5068819852727229
Kappa: 0.16913855112176734
MCC: 0.20929507484137233
Confusion Matrix:
 [[338 816]
 [139 993]]


Light Boost

In [13]:
model = LGBMClassifier ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

[LightGBM] [Info] Number of positive: 4583, number of negative: 4561
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4891
[LightGBM] [Info] Number of data points in the train set: 9144, number of used features: 74
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501203 -> initscore=0.004812
[LightGBM] [Info] Start training from score 0.004812
Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.97      0.97      1154
           1       0.97      0.98      0.97      1132

    accuracy                           0.97      2286
   macro avg       0.97      0.97      0.97      2286
weighted avg       0.97      0.97      0.97      2286

Accuracy: 0.9724409448818898
Precision: 0.9724409448818898
Recall: 0.9724409448818898
F1-Score: 0.9724409448818898
AUC Score: 0.9725015463191479
MSE: 0.027559055118110236
G-Mean 0.9724811591373801
Kappa: 0.94488374602849
MCC: 0.9449651162099107
Confusion Matrix:
 

Cat Boost

In [14]:
model = CatBoostClassifier ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

Learning rate set to 0.026506
0:	learn: 0.6566286	total: 161ms	remaining: 2m 41s
1:	learn: 0.6253991	total: 187ms	remaining: 1m 33s
2:	learn: 0.5944589	total: 205ms	remaining: 1m 8s
3:	learn: 0.5621414	total: 226ms	remaining: 56.3s
4:	learn: 0.5346694	total: 243ms	remaining: 48.4s
5:	learn: 0.5089243	total: 262ms	remaining: 43.4s
6:	learn: 0.4836444	total: 277ms	remaining: 39.3s
7:	learn: 0.4614161	total: 290ms	remaining: 36s
8:	learn: 0.4415573	total: 307ms	remaining: 33.9s
9:	learn: 0.4214909	total: 321ms	remaining: 31.8s
10:	learn: 0.4031077	total: 334ms	remaining: 30.1s
11:	learn: 0.3895153	total: 348ms	remaining: 28.7s
12:	learn: 0.3756334	total: 362ms	remaining: 27.5s
13:	learn: 0.3627144	total: 377ms	remaining: 26.5s
14:	learn: 0.3502041	total: 385ms	remaining: 25.3s
15:	learn: 0.3369802	total: 399ms	remaining: 24.5s
16:	learn: 0.3276677	total: 412ms	remaining: 23.8s
17:	learn: 0.3184710	total: 423ms	remaining: 23.1s
18:	learn: 0.3088853	total: 436ms	remaining: 22.5s
19:	learn: 

Gradient Boost

In [15]:
model = GradientBoostingClassifier ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.95      0.96      1154
           1       0.95      0.97      0.96      1132

    accuracy                           0.96      2286
   macro avg       0.96      0.96      0.96      2286
weighted avg       0.96      0.96      0.96      2286

Accuracy: 0.9619422572178478
Precision: 0.9619422572178478
Recall: 0.9619422572178478
F1-Score: 0.9619422572178478
AUC Score: 0.9620103067529748
MSE: 0.03805774278215223
G-Mean 0.9619843199076733
Kappa: 0.9238883594907061
MCC: 0.923990552939443
Confusion Matrix:
 [[1102   52]
 [  35 1097]]


Deep Learning

In [16]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense




ANN

In [17]:
model = Sequential()
model.add(Dense(128, activation='relu', input_dim=X_train.shape[1]))
model.add(Dense(64, activation='relu'))
model.add(Dense(1, activation='sigmoid'))  # Adjust for multiclass

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=12, batch_size=32, validation_split=0.2)

y_pred = model.predict (X_test)
y_pred_binary = (y_pred > 0.5).astype(int)
report (y_test, y_pred_binary)



Epoch 1/12


Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12
Classification Report:
               precision    recall  f1-score   support

           0       0.61      0.98      0.75      1154
           1       0.95      0.37      0.53      1132

    accuracy                           0.68      2286
   macro avg       0.78      0.67      0.64      2286
weighted avg       0.78      0.68      0.64      2286

Accuracy: 0.6776027996500438
Precision: 0.6776027996500438
Recall: 0.6776027996500438
F1-Score: 0.6776027996500438
AUC Score: 0.6746467962104463
MSE: 0.3223972003499563
G-Mean 0.6006693920242638
Kappa: 0.3513497905185484
MCC: 0.44412739513731014
Confusion Matrix:
 [[1133   21]
 [ 716  416]]


  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


CNN

In [18]:
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten

In [19]:
model = Sequential()
model.add(Conv1D(64, kernel_size=3, activation='relu', input_shape=(X_train.shape[1], 1)))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dense(1, activation='sigmoid'))  # Adjust for multiclass

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=12, batch_size=32, validation_split=0.2)

y_pred = model.predict (X_test)
y_pred_binary = (y_pred > 0.5).astype(int)
report (y_test, y_pred_binary)


Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12
Classification Report:
               precision    recall  f1-score   support

           0       0.74      0.96      0.83      1154
           1       0.94      0.65      0.77      1132

    accuracy                           0.81      2286
   macro avg       0.84      0.80      0.80      2286
weighted avg       0.84      0.81      0.80      2286

Accuracy: 0.8062117235345582
Precision: 0.8062117235345582
Recall: 0.8062117235345582
F1-Score: 0.8062117235345583
AUC Score: 0.8047243877494779
MSE: 0.19378827646544183
G-Mean 0.789744481327715
Kappa: 0.6112450853711624
MCC: 0.6421004219582058
Confusion Matrix:
 [[1107   47]
 [ 396  736]]


  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


RNN

In [20]:
from tensorflow.keras.layers import SimpleRNN

In [21]:
model = Sequential()
model.add(SimpleRNN(64, activation='relu', input_shape=(X_train.shape[1], 1)))
model.add(Dense(128, activation='relu'))
model.add(Dense(1, activation='sigmoid'))  # Adjust for multiclass

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=12, batch_size=32, validation_split=0.2)

y_pred = model.predict (X_test)
y_pred_binary = (y_pred > 0.5).astype(int)
report (y_test, y_pred_binary)

Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12
Classification Report:
               precision    recall  f1-score   support

           0       0.72      0.86      0.78      1154
           1       0.82      0.66      0.73      1132

    accuracy                           0.76      2286
   macro avg       0.77      0.76      0.76      2286
weighted avg       0.77      0.76      0.76      2286

Accuracy: 0.7589676290463692
Precision: 0.7589676290463692
Recall: 0.7589676290463692
F1-Score: 0.7589676290463692
AUC Score: 0.7580148324157486
MSE: 0.2410323709536308
G-Mean 0.7515215554233818
Kappa: 0.5169917562506854
MCC: 0.5269752866544064
Confusion Matrix:
 [[989 165]
 [386 746]]


  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


LSTM

In [22]:
from tensorflow.keras.layers import LSTM

In [23]:
model = Sequential()
model.add(LSTM(64, activation='relu', input_shape=(X_train.shape[1], 1)))
model.add(Dense(128, activation='relu'))
model.add(Dense(1, activation='sigmoid'))  # Adjust for multiclass

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

y_pred = model.predict (X_test)
y_pred_binary = (y_pred > 0.5).astype(int)
report (y_test, y_pred_binary)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Classification Report:
               precision    recall  f1-score   support

           0       0.54      0.56      0.55      1154
           1       0.53      0.51      0.52      1132

    accuracy                           0.53      2286
   macro avg       0.53      0.53      0.53      2286
weighted avg       0.53      0.53      0.53      2286

Accuracy: 0.5332458442694663
Precision: 0.5332458442694663
Recall: 0.5332458442694663
F1-Score: 0.5332458442694663
AUC Score: 0.5330299894054173
MSE: 0.4667541557305337
G-Mean 0.5325578812518673
Kappa: 0.06608279340244139
MCC: 0.06612538596243664
Confusion Matrix:
 [[641 513]
 [554 578]]


  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
