Importing libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import math
from sklearn.metrics import confusion_matrix, roc_auc_score, mean_squared_error, cohen_kappa_score, matthews_corrcoef
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.preprocessing import label_binarize
from imblearn.metrics import geometric_mean_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier



Reading dataset

In [2]:
df = pd.read_csv('UCI.csv')
print (df.Result.value_counts())
df.head()

Result
 1    6157
-1    4898
Name: count, dtype: int64


Unnamed: 0,having_IP_Address,URL_Length,Shortining_Service,having_At_Symbol,double_slash_redirecting,Prefix_Suffix,having_Sub_Domain,SSLfinal_State,Domain_registeration_length,Favicon,...,popUpWidnow,Iframe,age_of_domain,DNSRecord,web_traffic,Page_Rank,Google_Index,Links_pointing_to_page,Statistical_report,Result
0,-1,1,1,1,-1,-1,-1,-1,-1,1,...,1,1,-1,-1,-1,-1,1,1,-1,-1
1,1,1,1,1,1,-1,0,1,-1,1,...,1,1,-1,-1,0,-1,1,1,1,-1
2,1,0,1,1,1,-1,-1,-1,-1,1,...,1,1,1,-1,1,-1,1,0,-1,-1
3,1,0,1,1,1,-1,-1,-1,1,1,...,1,1,-1,-1,1,-1,1,-1,1,-1
4,1,0,-1,1,1,-1,1,1,-1,1,...,-1,1,-1,-1,0,-1,1,1,1,1


Data pre-processing

In [3]:
Label = LabelEncoder()
df['Result'] = Label.fit_transform(df['Result'])
print (df.Result.value_counts())
df.head()

Result
1    6157
0    4898
Name: count, dtype: int64


Unnamed: 0,having_IP_Address,URL_Length,Shortining_Service,having_At_Symbol,double_slash_redirecting,Prefix_Suffix,having_Sub_Domain,SSLfinal_State,Domain_registeration_length,Favicon,...,popUpWidnow,Iframe,age_of_domain,DNSRecord,web_traffic,Page_Rank,Google_Index,Links_pointing_to_page,Statistical_report,Result
0,-1,1,1,1,-1,-1,-1,-1,-1,1,...,1,1,-1,-1,-1,-1,1,1,-1,0
1,1,1,1,1,1,-1,0,1,-1,1,...,1,1,-1,-1,0,-1,1,1,1,0
2,1,0,1,1,1,-1,-1,-1,-1,1,...,1,1,1,-1,1,-1,1,0,-1,0
3,1,0,1,1,1,-1,-1,-1,1,1,...,1,1,-1,-1,1,-1,1,-1,1,0
4,1,0,-1,1,1,-1,1,1,-1,1,...,-1,1,-1,-1,0,-1,1,1,1,1


Data Cleaning

In [4]:
# df = df.replace([np.inf, -np.inf], np.nan)
# df.fillna(0, inplace=True)
# print(df.max())
# print(df.min())
# df.head()

In [5]:
input = df.drop (['Result'], axis='columns')
target = df.Result
print(target.head())
input.head()

0    0
1    0
2    0
3    0
4    1
Name: Result, dtype: int64


Unnamed: 0,having_IP_Address,URL_Length,Shortining_Service,having_At_Symbol,double_slash_redirecting,Prefix_Suffix,having_Sub_Domain,SSLfinal_State,Domain_registeration_length,Favicon,...,RightClick,popUpWidnow,Iframe,age_of_domain,DNSRecord,web_traffic,Page_Rank,Google_Index,Links_pointing_to_page,Statistical_report
0,-1,1,1,1,-1,-1,-1,-1,-1,1,...,1,1,1,-1,-1,-1,-1,1,1,-1
1,1,1,1,1,1,-1,0,1,-1,1,...,1,1,1,-1,-1,0,-1,1,1,1
2,1,0,1,1,1,-1,-1,-1,-1,1,...,1,1,1,1,-1,1,-1,1,0,-1
3,1,0,1,1,1,-1,-1,-1,1,1,...,1,1,1,-1,-1,1,-1,1,-1,1
4,1,0,-1,1,1,-1,1,1,-1,1,...,1,-1,1,-1,-1,0,-1,1,1,1


Data splitting

In [6]:
X_train, X_test, y_train, y_test = train_test_split(input, target, test_size=0.3)
print (len (X_test))
print (len (X_train))

3317
7738


In [7]:
def report (y_test, y_pred):
    n_classes = 2
    y_true_binary = label_binarize(y_test, classes=range(n_classes))
    y_pred_binary = label_binarize(y_pred, classes=range(n_classes))
    auc_score = roc_auc_score(y_true_binary, y_pred_binary, average='macro')

    report = classification_report(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    
    mse = mean_squared_error(y_test, y_pred)
    kappa = cohen_kappa_score(y_test, y_pred)
    mcc = matthews_corrcoef(y_test, y_pred)
    g_mean = geometric_mean_score(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='micro')
    recall = recall_score(y_test, y_pred, average='micro')
    f1 = f1_score(y_test, y_pred, average='micro')

    print("Classification Report:\n", report)
    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1-Score:", f1)
    print("AUC Score:", auc_score)
    print("MSE:", mse)
    print("G-Mean", g_mean)
    print("Kappa:", kappa)
    print("MCC:", mcc)
    print("Confusion Matrix:\n", conf_matrix)


Logistic Regression

In [8]:
model = LogisticRegression ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.91      0.92      1474
           1       0.93      0.94      0.94      1843

    accuracy                           0.93      3317
   macro avg       0.93      0.93      0.93      3317
weighted avg       0.93      0.93      0.93      3317

Accuracy: 0.929454326198372
Precision: 0.929454326198372
Recall: 0.929454326198372
F1-Score: 0.929454326198372
AUC Score: 0.9275516071298419
MSE: 0.07054567380162798
G-Mean 0.9273938981594371
Kappa: 0.8568490328644247
MCC: 0.8569933717704667
Confusion Matrix:
 [[1342  132]
 [ 102 1741]]


Decision Tree

In [9]:
model = DecisionTreeClassifier ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.96      0.96      1474
           1       0.97      0.96      0.96      1843

    accuracy                           0.96      3317
   macro avg       0.96      0.96      0.96      3317
weighted avg       0.96      0.96      0.96      3317

Accuracy: 0.9608079589990955
Precision: 0.9608079589990955
Recall: 0.9608079589990955
F1-Score: 0.9608079589990955
AUC Score: 0.960588526317262
MSE: 0.03919204100090443
G-Mean 0.9605865010893706
Kappa: 0.9206768214659142
MCC: 0.9206877907544824
Confusion Matrix:
 [[1413   61]
 [  69 1774]]


Naive Bayes

In [10]:
model = GaussianNB ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

Classification Report:
               precision    recall  f1-score   support

           0       0.53      1.00      0.69      1474
           1       0.99      0.29      0.44      1843

    accuracy                           0.60      3317
   macro avg       0.76      0.64      0.57      3317
weighted avg       0.79      0.60      0.55      3317

Accuracy: 0.6020500452215858
Precision: 0.6020500452215858
Recall: 0.6020500452215858
F1-Score: 0.6020500452215858
AUC Score: 0.6416165608106069
MSE: 0.3979499547784142
G-Mean 0.5340139049907277
Kappa: 0.26006017617518284
MCC: 0.3838111893543743
Confusion Matrix:
 [[1470    4]
 [1316  527]]


Random Forest

In [11]:
model = RandomForestClassifier ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.96      0.97      1474
           1       0.97      0.98      0.98      1843

    accuracy                           0.97      3317
   macro avg       0.97      0.97      0.97      3317
weighted avg       0.97      0.97      0.97      3317

Accuracy: 0.9737714802532409
Precision: 0.9737714802532409
Recall: 0.9737714802532409
F1-Score: 0.9737714802532409
AUC Score: 0.9728655347050079
MSE: 0.02622851974675912
G-Mean 0.972831449399146
Kappa: 0.9468242498329154
MCC: 0.9468753663357291
Confusion Matrix:
 [[1422   52]
 [  35 1808]]


LightBoost

In [12]:
model = LGBMClassifier ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

[LightGBM] [Info] Number of positive: 4314, number of negative: 3424
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 89
[LightGBM] [Info] Number of data points in the train set: 7738, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.557508 -> initscore=0.231056
[LightGBM] [Info] Start training from score 0.231056
Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.96      0.97      1474
           1       0.97      0.98      0.98      1843

    accuracy                           0.97      3317
   macro avg       0.97      0.97      0.97      3317
weighted avg       0.97      0.97      0.97      3317

Accuracy: 0.9722640940608984
Precision: 0.9722640940608984
Recall: 0.9722640940608984
F1-Score: 0.9722640940608983
AUC Score: 0.9713053020302719
MSE: 0.027735905939101597
G-Mean 0.9712670627388873
Kappa

CatBoost

In [13]:
model = CatBoostClassifier ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

Learning rate set to 0.024682
0:	learn: 0.6530794	total: 137ms	remaining: 2m 17s
1:	learn: 0.6169994	total: 140ms	remaining: 1m 9s
2:	learn: 0.5811827	total: 142ms	remaining: 47.3s
3:	learn: 0.5509018	total: 146ms	remaining: 36.3s
4:	learn: 0.5223250	total: 149ms	remaining: 29.6s
5:	learn: 0.4933217	total: 152ms	remaining: 25.1s
6:	learn: 0.4669715	total: 155ms	remaining: 22s
7:	learn: 0.4437805	total: 158ms	remaining: 19.6s
8:	learn: 0.4220583	total: 162ms	remaining: 17.8s
9:	learn: 0.4040747	total: 166ms	remaining: 16.4s
10:	learn: 0.3864711	total: 169ms	remaining: 15.2s
11:	learn: 0.3694030	total: 172ms	remaining: 14.2s
12:	learn: 0.3542742	total: 176ms	remaining: 13.4s
13:	learn: 0.3408100	total: 180ms	remaining: 12.6s
14:	learn: 0.3277938	total: 183ms	remaining: 12s
15:	learn: 0.3181839	total: 186ms	remaining: 11.4s
16:	learn: 0.3085010	total: 190ms	remaining: 11s
17:	learn: 0.2995290	total: 192ms	remaining: 10.5s
18:	learn: 0.2903052	total: 196ms	remaining: 10.1s
19:	learn: 0.283

Gradient Boost

In [14]:
model = GradientBoostingClassifier ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.94      0.95      1474
           1       0.95      0.97      0.96      1843

    accuracy                           0.95      3317
   macro avg       0.95      0.95      0.95      3317
weighted avg       0.95      0.95      0.95      3317

Accuracy: 0.9547784142297256
Precision: 0.9547784142297256
Recall: 0.9547784142297256
F1-Score: 0.9547784142297256
AUC Score: 0.9534646846662461
MSE: 0.045221585770274346
G-Mean 0.9533915484974556
Kappa: 0.9082864919486398
MCC: 0.9083686748093421
Confusion Matrix:
 [[1388   86]
 [  64 1779]]


SVM

In [15]:
model = SVC ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.92      0.94      1474
           1       0.94      0.96      0.95      1843

    accuracy                           0.95      3317
   macro avg       0.95      0.94      0.95      3317
weighted avg       0.95      0.95      0.95      3317

Accuracy: 0.9460355743141393
Precision: 0.9460355743141393
Recall: 0.9460355743141393
F1-Score: 0.9460355743141393
AUC Score: 0.943695423145703
MSE: 0.05396442568586072
G-Mean 0.9434609364799684
Kappa: 0.890353850531931
MCC: 0.8907551781744351
Confusion Matrix:
 [[1360  114]
 [  65 1778]]


KNN

In [16]:
X_train = np.array(X_train)
X_test = np.array(X_test)
model = KNeighborsClassifier ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.93      0.93      1474
           1       0.94      0.95      0.95      1843

    accuracy                           0.94      3317
   macro avg       0.94      0.94      0.94      3317
weighted avg       0.94      0.94      0.94      3317

Accuracy: 0.9424178474525173
Precision: 0.9424178474525173
Recall: 0.9424178474525173
F1-Score: 0.9424178474525173
AUC Score: 0.9409152751509064
MSE: 0.05758215254748267
G-Mean 0.9408183244777801
Kappa: 0.8832101892886894
MCC: 0.8832975462162573
Confusion Matrix:
 [[1367  107]
 [  84 1759]]


Deep Learning

In [17]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense




ANN

In [18]:
model = Sequential()
model.add(Dense(128, activation='relu', input_dim=X_train.shape[1]))
model.add(Dense(64, activation='relu'))
model.add(Dense(1, activation='sigmoid'))  # Adjust for multiclass

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

y_pred = model.predict (X_test)
y_pred_binary = (y_pred > 0.5).astype(int)
report (y_test, y_pred_binary)



Epoch 1/10


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.98      0.95      1474
           1       0.98      0.93      0.95      1843

    accuracy                           0.95      3317
   macro avg       0.95      0.95      0.95      3317
weighted avg       0.95      0.95      0.95      3317

Accuracy: 0.9493518239372928
Precision: 0.9493518239372928
Recall: 0.9493518239372928
F1-Score: 0.9493518239372928
AUC Score: 0.9523846510063013
MSE: 0.050648176062707266
G-Mean 0.9519943675718876
Kappa: 0.8981811856570491
MCC: 0.9001114519485085
Confusion Matrix:
 [[1444   30]
 [ 138 1705]]


  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


CNN

In [19]:
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten

In [20]:
model = Sequential()
model.add(Conv1D(64, kernel_size=3, activation='relu', input_shape=(X_train.shape[1], 1)))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dense(1, activation='sigmoid'))  # Adjust for multiclass

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

y_pred = model.predict (X_test)
y_pred_binary = (y_pred > 0.5).astype(int)
report (y_test, y_pred_binary)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.96      0.94      1474
           1       0.97      0.93      0.95      1843

    accuracy                           0.94      3317
   macro avg       0.94      0.94      0.94      3317
weighted avg       0.94      0.94      0.94      3317

Accuracy: 0.9430208019294544
Precision: 0.9430208019294544
Recall: 0.9430208019294544
F1-Score: 0.9430208019294544
AUC Score: 0.9448536801024229
MSE: 0.056979198070545675
G-Mean 0.9447100174525161
Kappa: 0.8851984099879158
MCC: 0.886118397618231
Confusion Matrix:
 [[1417   57]
 [ 132 1711]]


  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


RNN

In [21]:
from tensorflow.keras.layers import SimpleRNN

In [22]:
model = Sequential()
model.add(SimpleRNN(64, activation='relu', input_shape=(X_train.shape[1], 1)))
model.add(Dense(128, activation='relu'))
model.add(Dense(1, activation='sigmoid'))  # Adjust for multiclass

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

y_pred = model.predict (X_test)
y_pred_binary = (y_pred > 0.5).astype(int)
report (y_test, y_pred_binary)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.88      0.91      1474
           1       0.91      0.96      0.93      1843

    accuracy                           0.93      3317
   macro avg       0.93      0.92      0.92      3317
weighted avg       0.93      0.93      0.93      3317

Accuracy: 0.9255351220982816
Precision: 0.9255351220982816
Recall: 0.9255351220982816
F1-Score: 0.9255351220982816
AUC Score: 0.9212401834363918
MSE: 0.07446487790171842
G-Mean 0.9204308265047134
Kappa: 0.8481833024406144
MCC: 0.8497579500409277
Confusion Matrix:
 [[1301  173]
 [  74 1769]]


  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


LSTM

In [23]:
from tensorflow.keras.layers import LSTM

In [24]:
model = Sequential()
model.add(LSTM(64, activation='relu', input_shape=(X_train.shape[1], 1)))
model.add(Dense(128, activation='relu'))
model.add(Dense(1, activation='sigmoid'))  # Adjust for multiclass

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

y_pred = model.predict (X_test)
y_pred_binary = (y_pred > 0.5).astype(int)
report (y_test, y_pred_binary)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.77      0.83      1474
           1       0.83      0.93      0.88      1843

    accuracy                           0.86      3317
   macro avg       0.86      0.85      0.85      3317
weighted avg       0.86      0.86      0.86      3317

Accuracy: 0.8567983117274646
Precision: 0.8567983117274646
Recall: 0.8567983117274646
F1-Score: 0.8567983117274646
AUC Score: 0.847770838502206
MSE: 0.14320168827253543
G-Mean 0.8438780536639093
Kappa: 0.7057511839892936
MCC: 0.7119761881974903
Confusion Matrix:
 [[1130  344]
 [ 131 1712]]


  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
