Importing libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import math
from sklearn.metrics import confusion_matrix, roc_auc_score, mean_squared_error, cohen_kappa_score, matthews_corrcoef
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.preprocessing import label_binarize
from imblearn.metrics import geometric_mean_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier



Reading dataset

In [2]:
df = pd.read_csv('kaggle_4.csv')
print (df.type.value_counts())
df.head()

type
benign        428103
defacement     96457
phishing       94111
malware        32520
Name: count, dtype: int64


Unnamed: 0,url,type
0,br-icloud.com.br,phishing
1,mp3raid.com/music/krizz_kaliko.html,benign
2,bopsecrets.org/rexroth/cr/1.htm,benign
3,http://www.garage-pirenne.be/index.php?option=...,defacement
4,http://adventure-nicaragua.net/index.php?optio...,defacement


Feature Extraction

In [3]:
df['url_length'] = df['url'].apply(len)
df['num_dots'] = df['url'].apply(lambda x: x.count('.'))
df['num_slash'] = df['url'].apply(lambda x: x.count('/'))
df['num_redir'] = df['url'].apply(lambda x: x.count('//'))
df['num_dash'] = df['url'].apply(lambda x: x.count('-'))
df['contains_anchor'] = df['url'].str.contains('#')
df['has_https'] = df['url'].str.contains("https")
def contains_unicode(url):
    for char in url:
        if ord(char) > 127:
            return True
    return False

df['contains_unicode'] = df['url'].apply(contains_unicode)
df.head()

Unnamed: 0,url,type,url_length,num_dots,num_slash,num_redir,num_dash,contains_anchor,has_https,contains_unicode
0,br-icloud.com.br,phishing,16,2,0,0,1,False,False,False
1,mp3raid.com/music/krizz_kaliko.html,benign,35,2,2,0,0,False,False,False
2,bopsecrets.org/rexroth/cr/1.htm,benign,31,2,3,0,0,False,False,False
3,http://www.garage-pirenne.be/index.php?option=...,defacement,88,3,3,1,1,False,False,False
4,http://adventure-nicaragua.net/index.php?optio...,defacement,235,2,3,1,1,False,False,False


Data pre-processing

In [4]:
Label = LabelEncoder()
df['type'] = Label.fit_transform(df['type'])
df['contains_anchor'] = Label.fit_transform(df['contains_anchor'])
df['has_https'] = Label.fit_transform(df['has_https'])
df['contains_unicode'] = Label.fit_transform(df['contains_unicode'])
print (df.type.value_counts())
df.head()

type
0    428103
1     96457
3     94111
2     32520
Name: count, dtype: int64


Unnamed: 0,url,type,url_length,num_dots,num_slash,num_redir,num_dash,contains_anchor,has_https,contains_unicode
0,br-icloud.com.br,3,16,2,0,0,1,0,0,0
1,mp3raid.com/music/krizz_kaliko.html,0,35,2,2,0,0,0,0,0
2,bopsecrets.org/rexroth/cr/1.htm,0,31,2,3,0,0,0,0,0
3,http://www.garage-pirenne.be/index.php?option=...,1,88,3,3,1,1,0,0,0
4,http://adventure-nicaragua.net/index.php?optio...,1,235,2,3,1,1,0,0,0


In [5]:
input = df.drop(['url', 'type'], axis='columns')
target = df.type
print(target.head())
input.head()

0    3
1    0
2    0
3    1
4    1
Name: type, dtype: int32


Unnamed: 0,url_length,num_dots,num_slash,num_redir,num_dash,contains_anchor,has_https,contains_unicode
0,16,2,0,0,1,0,0,0
1,35,2,2,0,0,0,0,0
2,31,2,3,0,0,0,0,0
3,88,3,3,1,1,0,0,0
4,235,2,3,1,1,0,0,0


Data splitting

In [6]:
X_train, X_test, y_train, y_test = train_test_split(input, target, test_size=0.2)
print (len (X_test))
print (len (X_train))

130239
520952


In [7]:
y_test.head()

309265    0
419786    0
120788    0
195617    0
159331    1
Name: type, dtype: int32

In [8]:
y_test.value_counts()

type
0    85354
1    19416
3    18890
2     6579
Name: count, dtype: int64

In [9]:
def report (y_test, y_pred):
    n_classes = 4
    y_true_binary = label_binarize(y_test, classes=range(n_classes))
    y_pred_binary = label_binarize(y_pred, classes=range(n_classes))
    auc_score = roc_auc_score(y_true_binary, y_pred_binary, average='macro')

    report = classification_report(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    
    mse = mean_squared_error(y_test, y_pred)
    kappa = cohen_kappa_score(y_test, y_pred)
    mcc = matthews_corrcoef(y_test, y_pred)
    g_mean = geometric_mean_score(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='micro')
    recall = recall_score(y_test, y_pred, average='micro')
    f1 = f1_score(y_test, y_pred, average='micro')

    print("Classification Report:\n", report)
    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1-Score:", f1)
    print("AUC Score:", auc_score)
    print("MSE:", mse)
    print("G-Mean", g_mean)
    print("Kappa:", kappa)
    print("MCC:", mcc)
    print("Confusion Matrix:\n", conf_matrix)


Logistic Regression

In [10]:
model = LogisticRegression ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.96      0.90     85354
           1       0.68      0.91      0.78     19416
           2       0.53      0.17      0.26      6579
           3       0.20      0.05      0.08     18890

    accuracy                           0.78    130239
   macro avg       0.57      0.52      0.50    130239
weighted avg       0.71      0.78      0.73    130239

Accuracy: 0.7812022512457866
Precision: 0.7812022512457866
Recall: 0.7812022512457866
F1-Score: 0.7812022512457866
AUC Score: 0.7035579600966009
MSE: 1.2418707146092953
G-Mean 0.29437852982702906
Kappa: 0.5375571123935086
MCC: 0.5570677533657656
Confusion Matrix:
 [[82006  2349     7   992]
 [ 1027 17663   227   499]
 [  393  2783  1124  2279]
 [14204  2993   743   950]]


Decision Tree

In [11]:
model = DecisionTreeClassifier ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.96      0.95     85354
           1       0.89      0.93      0.91     19416
           2       0.88      0.84      0.86      6579
           3       0.78      0.65      0.71     18890

    accuracy                           0.91    130239
   macro avg       0.87      0.85      0.86    130239
weighted avg       0.90      0.91      0.90    130239

Accuracy: 0.9063567748523867
Precision: 0.9063567748523867
Recall: 0.9063567748523867
F1-Score: 0.9063567748523867
AUC Score: 0.9002499294370192
MSE: 0.6144012162255545
G-Mean 0.8371871927702209
Kappa: 0.8180020835208359
MCC: 0.81902430738266
Confusion Matrix:
 [[82125   468    70  2691]
 [  524 18121   387   384]
 [  142   565  5552   320]
 [ 5098  1241   306 12245]]


Naive Bayes

In [12]:
model = GaussianNB ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.92      0.88     85354
           1       0.57      1.00      0.73     19416
           2       0.42      0.20      0.27      6579
           3       0.42      0.02      0.04     18890

    accuracy                           0.76    130239
   macro avg       0.56      0.53      0.48    130239
weighted avg       0.72      0.76      0.70    130239

Accuracy: 0.7620298067399166
Precision: 0.7620298067399166
Recall: 0.7620298067399166
F1-Score: 0.7620298067399167
AUC Score: 0.708999983711036
MSE: 1.2182602753399518
G-Mean 0.2499380774051433
Kappa: 0.5186989753549767
MCC: 0.5402248340058567
Confusion Matrix:
 [[78114  6227   493   520]
 [    2 19399     1    14]
 [  251  4967  1336    25]
 [13966  3170  1357   397]]


Random Forest

In [13]:
model = RandomForestClassifier ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.96      0.95     85354
           1       0.89      0.94      0.91     19416
           2       0.89      0.84      0.87      6579
           3       0.78      0.66      0.71     18890

    accuracy                           0.91    130239
   macro avg       0.87      0.85      0.86    130239
weighted avg       0.90      0.91      0.91    130239

Accuracy: 0.9080766897780235
Precision: 0.9080766897780235
Recall: 0.9080766897780235
F1-Score: 0.9080766897780235
AUC Score: 0.9020717360102206
MSE: 0.6075215565230077
G-Mean 0.8403131704617434
Kappa: 0.8214802170983806
MCC: 0.8224199650699227
Confusion Matrix:
 [[82155   466    69  2664]
 [  446 18175   381   414]
 [  133   551  5554   341]
 [ 5044  1213   250 12383]]


LightBoost

In [14]:
model = LGBMClassifier ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 356
[LightGBM] [Info] Number of data points in the train set: 520952, number of used features: 8
[LightGBM] [Info] Start training from score -0.418660
[LightGBM] [Info] Start training from score -1.911320
[LightGBM] [Info] Start training from score -2.999833
[LightGBM] [Info] Start training from score -1.935227
Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.96      0.95     85354
           1       0.87      0.93      0.90     19416
           2       0.87      0.80      0.83      6579
           3       0.79      0.63      0.70     18890

    accuracy                           0.90    130239
   macro avg       0.86      0.83      0.85    130239
weighted avg       0.90      0.90      0.90    130239

Accuracy: 0.9017728944478997
Precision: 0.9017728944478997
Recall: 0.90177289

CatBoost

In [15]:
model = CatBoostClassifier ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

Learning rate set to 0.108863
0:	learn: 1.1817204	total: 323ms	remaining: 5m 22s
1:	learn: 1.0413195	total: 538ms	remaining: 4m 28s
2:	learn: 0.9329781	total: 693ms	remaining: 3m 50s
3:	learn: 0.8513895	total: 859ms	remaining: 3m 33s
4:	learn: 0.7851595	total: 1.03s	remaining: 3m 24s
5:	learn: 0.7297806	total: 1.19s	remaining: 3m 17s
6:	learn: 0.6829623	total: 1.39s	remaining: 3m 16s
7:	learn: 0.6451942	total: 1.54s	remaining: 3m 11s
8:	learn: 0.6102464	total: 1.73s	remaining: 3m 10s
9:	learn: 0.5800152	total: 1.89s	remaining: 3m 7s
10:	learn: 0.5540167	total: 2.04s	remaining: 3m 3s
11:	learn: 0.5327822	total: 2.19s	remaining: 3m
12:	learn: 0.5121965	total: 2.35s	remaining: 2m 58s
13:	learn: 0.4943519	total: 2.58s	remaining: 3m 1s
14:	learn: 0.4775750	total: 2.75s	remaining: 3m
15:	learn: 0.4627101	total: 2.9s	remaining: 2m 58s
16:	learn: 0.4504137	total: 3.06s	remaining: 2m 56s
17:	learn: 0.4398433	total: 3.2s	remaining: 2m 54s
18:	learn: 0.4288025	total: 3.35s	remaining: 2m 52s
19:	l

  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


Gradient Boost

In [16]:
model = GradientBoostingClassifier ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.97      0.94     85354
           1       0.83      0.92      0.87     19416
           2       0.84      0.71      0.77      6579
           3       0.78      0.55      0.65     18890

    accuracy                           0.89    130239
   macro avg       0.84      0.79      0.81    130239
weighted avg       0.88      0.89      0.88    130239

Accuracy: 0.8857945776610693
Precision: 0.8857945776610693
Recall: 0.8857945776610693
F1-Score: 0.8857945776610693
AUC Score: 0.86458143843332
MSE: 0.6830135366518477
G-Mean 0.7674763490320479
Kappa: 0.7749216970846549
MCC: 0.7778592609219939
Confusion Matrix:
 [[82388   758    54  2154]
 [  744 17884   369   419]
 [  277  1213  4639   450]
 [ 6145  1813   478 10454]]


SVM

In [17]:
model = SVC ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

KNN

In [None]:
X_train = np.array(X_train)
X_test = np.array(X_test)
model = KNeighborsClassifier ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.95      0.93      2425
           1       0.93      0.93      0.93      2316
           2       0.93      0.94      0.93      2015
           3       0.91      0.86      0.88      2253
           4       0.95      0.93      0.94      2004

    accuracy                           0.92     11013
   macro avg       0.92      0.92      0.92     11013
weighted avg       0.92      0.92      0.92     11013

Accuracy: 0.9224552801234904
Precision: 0.9224552801234904
Recall: 0.9224552801234904
F1-Score: 0.9224552801234904
AUC Score: 0.9514913433912356
MSE: 0.3868155815853991
G-Mean 0.9219299967642602
Kappa: 0.9029051546645267
MCC: 0.9030362736433604
Confusion Matrix:
 [[2305   23   13   65   19]
 [  41 2156   63   31   25]
 [  28   34 1899   39   15]
 [ 116   87   66 1945   39]
 [  47   27   10   66 1854]]


In [None]:
X_train = np.array(X_train)
X_test = np.array(X_test)
model = KNeighborsClassifier ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.95      0.93      2425
           1       0.93      0.93      0.93      2316
           2       0.93      0.94      0.93      2015
           3       0.91      0.86      0.88      2253
           4       0.95      0.93      0.94      2004

    accuracy                           0.92     11013
   macro avg       0.92      0.92      0.92     11013
weighted avg       0.92      0.92      0.92     11013

Accuracy: 0.9224552801234904
Precision: 0.9224552801234904
Recall: 0.9224552801234904
F1-Score: 0.9224552801234904
AUC Score: 0.9514913433912356
MSE: 0.3868155815853991
G-Mean 0.9219299967642602
Kappa: 0.9029051546645267
MCC: 0.9030362736433604
Confusion Matrix:
 [[2305   23   13   65   19]
 [  41 2156   63   31   25]
 [  28   34 1899   39   15]
 [ 116   87   66 1945   39]
 [  47   27   10   66 1854]]


Deep Learning

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from keras.utils import to_categorical
from numpy import argmax




ANN

In [None]:
num_classes = 5
model = Sequential()
model.add(Dense(128, activation='relu', input_dim=X_train.shape[1]))
model.add(Dense(64, activation='relu'))
model.add(Dense(num_classes, activation='softmax'))

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

y_pred = model.predict(X_test)
y_pred2= argmax(y_pred, axis=1)
report(y_test, y_pred2)

Epoch 1/10


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.97      0.94    128083
           1       0.85      0.92      0.88     28986
           2       0.84      0.75      0.79      9944
           3       0.78      0.56      0.65     28345

    accuracy                           0.89    195358
   macro avg       0.85      0.80      0.82    195358
weighted avg       0.88      0.89      0.88    195358

Accuracy: 0.8892648368636042
Precision: 0.8892648368636042
Recall: 0.8892648368636042
F1-Score: 0.8892648368636042
AUC Score: 0.8712021111166058
MSE: 0.6820247955036395
G-Mean 0.7813533650263188
Kappa: 0.7813368448604074
MCC: 0.7843108224189104
Confusion Matrix:
 [[123799   1235    185   2864]
 [   890  26650    552    894]
 [   346   1405   7488    705]
 [  9754   2129    674  15788]]


CNN

In [None]:
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten

In [None]:
model = Sequential()
model.add(Conv1D(64, kernel_size=3, activation='relu', input_shape=(X_train.shape[1], 1)))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dense(num_classes, activation='softmax')) 

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

y_pred = model.predict (X_test)
y_pred2= argmax(y_pred, axis=1)
report(y_test, y_pred2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.95      0.94    128083
           1       0.84      0.93      0.88     28986
           2       0.82      0.75      0.79      9944
           3       0.75      0.62      0.68     28345

    accuracy                           0.89    195358
   macro avg       0.84      0.81      0.82    195358
weighted avg       0.89      0.89      0.89    195358

Accuracy: 0.890483113053983
Precision: 0.890483113053983
Recall: 0.890483113053983
F1-Score: 0.890483113053983
AUC Score: 0.8804865058837559
MSE: 0.6684138863010473
G-Mean 0.7999723005376153
Kappa: 0.788028574399831
MCC: 0.7890291010899875
Confusion Matrix:
 [[122071   1142    246   4624]
 [   825  26990    595    576]
 [   349   1536   7456    603]
 [  7649   2496    754  17446]]


RNN

In [None]:
from tensorflow.keras.layers import SimpleRNN

In [None]:
model = Sequential()
model.add(SimpleRNN(64, activation='relu', input_shape=(X_train.shape[1], 1)))
model.add(Dense(128, activation='relu'))
model.add(Dense(num_classes, activation='softmax')) 

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

y_pred = model.predict (X_test)
y_pred2= argmax(y_pred, axis=1)
report(y_test, y_pred2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.96      0.94    128083
           1       0.84      0.93      0.88     28986
           2       0.81      0.78      0.79      9944
           3       0.76      0.59      0.66     28345

    accuracy                           0.89    195358
   macro avg       0.83      0.81      0.82    195358
weighted avg       0.89      0.89      0.89    195358

Accuracy: 0.8896333910052314
Precision: 0.8896333910052314
Recall: 0.8896333910052314
F1-Score: 0.8896333910052314
AUC Score: 0.87981477414524
MSE: 0.6803765394813625
G-Mean 0.798327159513908
Kappa: 0.785246943743656
MCC: 0.7867766539716817
Confusion Matrix:
 [[122569   1051    286   4177]
 [   834  26821    909    422]
 [   290   1285   7773    596]
 [  8355   2702    654  16634]]


LSTM

In [None]:
from tensorflow.keras.layers import LSTM

In [None]:
model = Sequential()
model.add(LSTM(64, activation='relu', input_shape=(X_train.shape[1], 1)))
model.add(Dense(128, activation='relu'))
model.add(Dense(num_classes, activation='softmax'))

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

y_pred = model.predict (X_test)
y_pred2= argmax(y_pred, axis=1)
report(y_test, y_pred2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.96      0.95    128083
           1       0.87      0.92      0.89     28986
           2       0.86      0.76      0.81      9944
           3       0.77      0.62      0.69     28345

    accuracy                           0.90    195358
   macro avg       0.86      0.82      0.83    195358
weighted avg       0.89      0.90      0.89    195358

Accuracy: 0.8970658995280459
Precision: 0.8970658995280459
Recall: 0.8970658995280459
F1-Score: 0.8970658995280459
AUC Score: 0.8826417573021779
MSE: 0.6444783423253719
G-Mean 0.8045652765215167
Kappa: 0.7985728553906537
MCC: 0.8001006703806073
Confusion Matrix:
 [[123440    707    122   3814]
 [  1022  26652    607    705]
 [   364   1289   7585    706]
 [  8195   2050    528  17572]]
