Importing libraries

In [13]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import math
from sklearn.metrics import confusion_matrix, roc_auc_score, mean_squared_error, cohen_kappa_score, matthews_corrcoef
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.preprocessing import label_binarize
from imblearn.metrics import geometric_mean_score
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier


Reading dataset

In [14]:
df = pd.read_csv('UCI.csv')
print (df.Result.value_counts())
df.head()

Result
 1    6157
-1    4898
Name: count, dtype: int64


Unnamed: 0,having_IP_Address,URL_Length,Shortining_Service,having_At_Symbol,double_slash_redirecting,Prefix_Suffix,having_Sub_Domain,SSLfinal_State,Domain_registeration_length,Favicon,...,popUpWidnow,Iframe,age_of_domain,DNSRecord,web_traffic,Page_Rank,Google_Index,Links_pointing_to_page,Statistical_report,Result
0,-1,1,1,1,-1,-1,-1,-1,-1,1,...,1,1,-1,-1,-1,-1,1,1,-1,-1
1,1,1,1,1,1,-1,0,1,-1,1,...,1,1,-1,-1,0,-1,1,1,1,-1
2,1,0,1,1,1,-1,-1,-1,-1,1,...,1,1,1,-1,1,-1,1,0,-1,-1
3,1,0,1,1,1,-1,-1,-1,1,1,...,1,1,-1,-1,1,-1,1,-1,1,-1
4,1,0,-1,1,1,-1,1,1,-1,1,...,-1,1,-1,-1,0,-1,1,1,1,1


Data pre-processing

In [15]:
Label = LabelEncoder()
df['Result'] = Label.fit_transform(df['Result'])
print (df.Result.value_counts())
df.head()

Result
1    6157
0    4898
Name: count, dtype: int64


Unnamed: 0,having_IP_Address,URL_Length,Shortining_Service,having_At_Symbol,double_slash_redirecting,Prefix_Suffix,having_Sub_Domain,SSLfinal_State,Domain_registeration_length,Favicon,...,popUpWidnow,Iframe,age_of_domain,DNSRecord,web_traffic,Page_Rank,Google_Index,Links_pointing_to_page,Statistical_report,Result
0,-1,1,1,1,-1,-1,-1,-1,-1,1,...,1,1,-1,-1,-1,-1,1,1,-1,0
1,1,1,1,1,1,-1,0,1,-1,1,...,1,1,-1,-1,0,-1,1,1,1,0
2,1,0,1,1,1,-1,-1,-1,-1,1,...,1,1,1,-1,1,-1,1,0,-1,0
3,1,0,1,1,1,-1,-1,-1,1,1,...,1,1,-1,-1,1,-1,1,-1,1,0
4,1,0,-1,1,1,-1,1,1,-1,1,...,-1,1,-1,-1,0,-1,1,1,1,1


Data Cleaning

In [16]:
# df = df.replace([np.inf, -np.inf], np.nan)
# df.fillna(0, inplace=True)
# print(df.max())
# print(df.min())
# df.head()

In [17]:
input = df.drop (['Result'], axis='columns')
target = df.Result
print(target.head())
input.head()

0    0
1    0
2    0
3    0
4    1
Name: Result, dtype: int64


Unnamed: 0,having_IP_Address,URL_Length,Shortining_Service,having_At_Symbol,double_slash_redirecting,Prefix_Suffix,having_Sub_Domain,SSLfinal_State,Domain_registeration_length,Favicon,...,RightClick,popUpWidnow,Iframe,age_of_domain,DNSRecord,web_traffic,Page_Rank,Google_Index,Links_pointing_to_page,Statistical_report
0,-1,1,1,1,-1,-1,-1,-1,-1,1,...,1,1,1,-1,-1,-1,-1,1,1,-1
1,1,1,1,1,1,-1,0,1,-1,1,...,1,1,1,-1,-1,0,-1,1,1,1
2,1,0,1,1,1,-1,-1,-1,-1,1,...,1,1,1,1,-1,1,-1,1,0,-1
3,1,0,1,1,1,-1,-1,-1,1,1,...,1,1,1,-1,-1,1,-1,1,-1,1
4,1,0,-1,1,1,-1,1,1,-1,1,...,1,-1,1,-1,-1,0,-1,1,1,1


Data splitting

In [18]:
X_train, X_test, y_train, y_test = train_test_split(input, target, test_size=0.2)
print (len (X_test))
print (len (X_train))

2211
8844


In [19]:
y_train.value_counts()

Result
1    4913
0    3931
Name: count, dtype: int64

In [20]:
# smote = SMOTE(random_state=42)
# X_train, y_train = smote.fit_resample(X_train, y_train)
# y_train.value_counts()

In [21]:
def report (y_test, y_pred):
    n_classes = 2
    y_true_binary = label_binarize(y_test, classes=range(n_classes))
    y_pred_binary = label_binarize(y_pred, classes=range(n_classes))
    auc_score = roc_auc_score(y_true_binary, y_pred_binary, average='macro')

    report = classification_report(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    
    mse = mean_squared_error(y_test, y_pred)
    kappa = cohen_kappa_score(y_test, y_pred)
    mcc = matthews_corrcoef(y_test, y_pred)
    g_mean = geometric_mean_score(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='micro')
    recall = recall_score(y_test, y_pred, average='micro')
    f1 = f1_score(y_test, y_pred, average='micro')

    print("Classification Report:\n", report)
    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1-Score:", f1)
    print("AUC Score:", auc_score)
    print("MSE:", mse)
    print("G-Mean", g_mean)
    print("Kappa:", kappa)
    print("MCC:", mcc)
    print("Confusion Matrix:\n", conf_matrix)


Logistic Regression

In [22]:
model = LogisticRegression ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.91      0.92       967
           1       0.93      0.95      0.94      1244

    accuracy                           0.93      2211
   macro avg       0.93      0.93      0.93      2211
weighted avg       0.93      0.93      0.93      2211

Accuracy: 0.9298959746720941
Precision: 0.9298959746720941
Recall: 0.9298959746720941
F1-Score: 0.9298959746720941
AUC Score: 0.9276843221818399
MSE: 0.07010402532790593
G-Mean 0.9275163409221902
Kappa: 0.8572439019208684
MCC: 0.8573751845629586
Confusion Matrix:
 [[ 880   87]
 [  68 1176]]


Decision Tree

In [23]:
model = DecisionTreeClassifier ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.94      0.95       967
           1       0.96      0.96      0.96      1244

    accuracy                           0.95      2211
   macro avg       0.95      0.95      0.95      2211
weighted avg       0.95      0.95      0.95      2211

Accuracy: 0.9534147444595206
Precision: 0.9534147444595206
Recall: 0.9534147444595206
F1-Score: 0.9534147444595206
AUC Score: 0.9523840598263599
MSE: 0.046585255540479424
G-Mean 0.95234852652679
Kappa: 0.9052892667157688
MCC: 0.905298834871125
Confusion Matrix:
 [[ 913   54]
 [  49 1195]]


Naive Bayes

In [24]:
model = GaussianNB ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

Classification Report:
               precision    recall  f1-score   support

           0       0.52      1.00      0.68       967
           1       1.00      0.27      0.43      1244

    accuracy                           0.59      2211
   macro avg       0.76      0.64      0.56      2211
weighted avg       0.79      0.59      0.54      2211

Accuracy: 0.5920398009950248
Precision: 0.5920398009950248
Recall: 0.5920398009950248
F1-Score: 0.5920398009950248
AUC Score: 0.637459807073955
MSE: 0.4079601990049751
G-Mean 0.5243277735805246
Kappa: 0.2490550691894945
MCC: 0.3771476875356637
Confusion Matrix:
 [[967   0]
 [902 342]]


Random Forest

In [25]:
model = RandomForestClassifier ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.96      0.96       967
           1       0.97      0.98      0.97      1244

    accuracy                           0.97      2211
   macro avg       0.97      0.97      0.97      2211
weighted avg       0.97      0.97      0.97      2211

Accuracy: 0.968340117593849
Precision: 0.968340117593849
Recall: 0.968340117593849
F1-Score: 0.968340117593849
AUC Score: 0.9669141974549191
MSE: 0.031659882406151064
G-Mean 0.9668472081511098
Kappa: 0.9355518133938298
MCC: 0.9356533385878021
Confusion Matrix:
 [[ 924   43]
 [  27 1217]]


LightBoost

In [26]:
model = LGBMClassifier ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

[LightGBM] [Info] Number of positive: 4913, number of negative: 3931
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 89
[LightGBM] [Info] Number of data points in the train set: 8844, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.555518 -> initscore=0.222991
[LightGBM] [Info] Start training from score 0.222991
Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.96      0.96       967
           1       0.97      0.97      0.97      1244

    accuracy                           0.97      2211
   macro avg       0.96      0.96      0.96      2211
weighted avg       0.97      0.97      0.97      2211

Accuracy: 0.9651741293532339
Precision: 0.9651741293532339
Recall: 0.9651741293532339
F1-Score: 0.9651741293532339
AUC Score: 0.9642158264530137
MSE: 0.03482587064676617
G-Mean 0.9641854857154699
Kappa:

CatBoost

In [27]:
model = CatBoostClassifier ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

Learning rate set to 0.026131
0:	learn: 0.6503509	total: 154ms	remaining: 2m 34s
1:	learn: 0.6070366	total: 172ms	remaining: 1m 25s
2:	learn: 0.5686019	total: 193ms	remaining: 1m 4s
3:	learn: 0.5337101	total: 206ms	remaining: 51.4s
4:	learn: 0.5040325	total: 215ms	remaining: 42.7s
5:	learn: 0.4737876	total: 222ms	remaining: 36.8s
6:	learn: 0.4482424	total: 228ms	remaining: 32.4s
7:	learn: 0.4236776	total: 233ms	remaining: 28.9s
8:	learn: 0.4013360	total: 236ms	remaining: 25.9s
9:	learn: 0.3822166	total: 239ms	remaining: 23.6s
10:	learn: 0.3646077	total: 242ms	remaining: 21.8s
11:	learn: 0.3476154	total: 245ms	remaining: 20.2s
12:	learn: 0.3335495	total: 248ms	remaining: 18.8s
13:	learn: 0.3206872	total: 251ms	remaining: 17.7s
14:	learn: 0.3107724	total: 255ms	remaining: 16.7s
15:	learn: 0.3004659	total: 258ms	remaining: 15.9s
16:	learn: 0.2899363	total: 261ms	remaining: 15.1s
17:	learn: 0.2806191	total: 264ms	remaining: 14.4s
18:	learn: 0.2733736	total: 267ms	remaining: 13.8s
19:	learn

Gradient Boost

In [28]:
model = GradientBoostingClassifier ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.93      0.94       967
           1       0.94      0.96      0.95      1244

    accuracy                           0.95      2211
   macro avg       0.95      0.95      0.95      2211
weighted avg       0.95      0.95      0.95      2211

Accuracy: 0.9479873360470376
Precision: 0.9479873360470376
Recall: 0.9479873360470376
F1-Score: 0.9479873360470376
AUC Score: 0.945718767561025
MSE: 0.05201266395296246
G-Mean 0.945545399320231
Kappa: 0.8940108071335585
MCC: 0.8942482139830485
Confusion Matrix:
 [[ 897   70]
 [  45 1199]]


SVM

In [29]:
model = SVC ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.92      0.94       967
           1       0.94      0.96      0.95      1244

    accuracy                           0.95      2211
   macro avg       0.95      0.94      0.94      2211
weighted avg       0.95      0.95      0.95      2211

Accuracy: 0.9457259158751696
Precision: 0.9457259158751696
Recall: 0.9457259158751696
F1-Score: 0.9457259158751696
AUC Score: 0.9432485859737911
MSE: 0.054274084124830396
G-Mean 0.943041296522974
Kappa: 0.889364257299407
MCC: 0.8896607481453599
Confusion Matrix:
 [[ 893   74]
 [  46 1198]]


KNN

In [30]:
X_train = np.array(X_train)
X_test = np.array(X_test)
model = KNeighborsClassifier ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.93      0.94       967
           1       0.95      0.95      0.95      1244

    accuracy                           0.94      2211
   macro avg       0.94      0.94      0.94      2211
weighted avg       0.94      0.94      0.94      2211

Accuracy: 0.9448213478064225
Precision: 0.9448213478064225
Recall: 0.9448213478064225
F1-Score: 0.9448213478064225
AUC Score: 0.9434809318441029
MSE: 0.055178652193577565
G-Mean 0.9434202654797713
Kappa: 0.8877795713239699
MCC: 0.8878036092475077
Confusion Matrix:
 [[ 902   65]
 [  57 1187]]


Deep Learning

In [31]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense




ANN

In [32]:
model = Sequential()
model.add(Dense(128, activation='relu', input_dim=X_train.shape[1]))
model.add(Dense(64, activation='relu'))
model.add(Dense(1, activation='sigmoid'))  # Adjust for multiclass

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

y_pred = model.predict (X_test)
y_pred_binary = (y_pred > 0.5).astype(int)
report (y_test, y_pred_binary)



Epoch 1/10


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.95      0.95       967
           1       0.96      0.96      0.96      1244

    accuracy                           0.96      2211
   macro avg       0.96      0.96      0.96      2211
weighted avg       0.96      0.96      0.96      2211

Accuracy: 0.9588421528720036
Precision: 0.9588421528720036
Recall: 0.9588421528720036
F1-Score: 0.9588421528720036
AUC Score: 0.9580131477004826
MSE: 0.04115784712799638
G-Mean 0.9579902950729381
Kappa: 0.9163428015459042
MCC: 0.9163462864979987
Confusion Matrix:
 [[ 920   47]
 [  44 1200]]


  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


CNN

In [33]:
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten

In [34]:
model = Sequential()
model.add(Conv1D(64, kernel_size=3, activation='relu', input_shape=(X_train.shape[1], 1)))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dense(1, activation='sigmoid'))  # Adjust for multiclass

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

y_pred = model.predict (X_test)
y_pred_binary = (y_pred > 0.5).astype(int)
report (y_test, y_pred_binary)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.94      0.94       967
           1       0.96      0.94      0.95      1244

    accuracy                           0.94      2211
   macro avg       0.94      0.94      0.94      2211
weighted avg       0.94      0.94      0.94      2211

Accuracy: 0.9439167797376753
Precision: 0.9439167797376753
Recall: 0.9439167797376753
F1-Score: 0.9439167797376753
AUC Score: 0.9438284115356608
MSE: 0.05608322026232474
G-Mean 0.9438281479711068
Kappa: 0.8862283351052838
MCC: 0.8863014574840928
Confusion Matrix:
 [[ 912   55]
 [  69 1175]]


  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


RNN

In [35]:
from tensorflow.keras.layers import SimpleRNN

In [36]:
model = Sequential()
model.add(SimpleRNN(64, activation='relu', input_shape=(X_train.shape[1], 1)))
model.add(Dense(128, activation='relu'))
model.add(Dense(1, activation='sigmoid'))  # Adjust for multiclass

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

y_pred = model.predict (X_test)
y_pred_binary = (y_pred > 0.5).astype(int)
report (y_test, y_pred_binary)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.92      0.92       967
           1       0.94      0.94      0.94      1244

    accuracy                           0.93      2211
   macro avg       0.93      0.93      0.93      2211
weighted avg       0.93      0.93      0.93      2211

Accuracy: 0.9308005427408412
Precision: 0.9308005427408412
Recall: 0.9308005427408412
F1-Score: 0.9308005427408412
AUC Score: 0.9295243850939526
MSE: 0.06919945725915876
G-Mean 0.9294685703602125
Kappa: 0.8593455894123443
MCC: 0.8593488575977949
Confusion Matrix:
 [[ 889   78]
 [  75 1169]]


  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


LSTM

In [37]:
from tensorflow.keras.layers import LSTM

In [38]:
model = Sequential()
model.add(LSTM(64, activation='relu', input_shape=(X_train.shape[1], 1)))
model.add(Dense(128, activation='relu'))
model.add(Dense(1, activation='sigmoid'))  # Adjust for multiclass

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

y_pred = model.predict (X_test)
y_pred_binary = (y_pred > 0.5).astype(int)
report (y_test, y_pred_binary)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Classification Report:
               precision    recall  f1-score   support

           0       0.67      0.90      0.77       967
           1       0.89      0.66      0.76      1244

    accuracy                           0.76      2211
   macro avg       0.78      0.78      0.76      2211
weighted avg       0.80      0.76      0.76      2211

Accuracy: 0.7634554500226142
Precision: 0.7634554500226142
Recall: 0.7634554500226142
F1-Score: 0.7634554500226142
AUC Score: 0.7787381499449685
MSE: 0.2365445499773858
G-Mean 0.7691245580620757
Kappa: 0.5370111824111551
MCC: 0.5616646628406121
Confusion Matrix:
 [[871  96]
 [427 817]]


  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
