Importing libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import math
from sklearn.metrics import confusion_matrix, roc_auc_score, mean_squared_error, cohen_kappa_score, matthews_corrcoef
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.preprocessing import label_binarize
from imblearn.metrics import geometric_mean_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier



Reading dataset

In [2]:
df = pd.read_csv('kaggle_4.csv')
print (df.type.value_counts())
df.head()

type
benign        428103
defacement     96457
phishing       94111
malware        32520
Name: count, dtype: int64


Unnamed: 0,url,type
0,br-icloud.com.br,phishing
1,mp3raid.com/music/krizz_kaliko.html,benign
2,bopsecrets.org/rexroth/cr/1.htm,benign
3,http://www.garage-pirenne.be/index.php?option=...,defacement
4,http://adventure-nicaragua.net/index.php?optio...,defacement


Feature Extraction

In [3]:
df['url_length'] = df['url'].apply(len)
df['num_dots'] = df['url'].apply(lambda x: x.count('.'))
df['num_slash'] = df['url'].apply(lambda x: x.count('/'))
df['num_redir'] = df['url'].apply(lambda x: x.count('//'))
df['num_dash'] = df['url'].apply(lambda x: x.count('-'))
df['contains_anchor'] = df['url'].str.contains('#')
df['has_https'] = df['url'].str.contains("https")
def contains_unicode(url):
    for char in url:
        if ord(char) > 127:
            return True
    return False

df['contains_unicode'] = df['url'].apply(contains_unicode)
df.head()

Unnamed: 0,url,type,url_length,num_dots,num_slash,num_redir,num_dash,contains_anchor,has_https,contains_unicode
0,br-icloud.com.br,phishing,16,2,0,0,1,False,False,False
1,mp3raid.com/music/krizz_kaliko.html,benign,35,2,2,0,0,False,False,False
2,bopsecrets.org/rexroth/cr/1.htm,benign,31,2,3,0,0,False,False,False
3,http://www.garage-pirenne.be/index.php?option=...,defacement,88,3,3,1,1,False,False,False
4,http://adventure-nicaragua.net/index.php?optio...,defacement,235,2,3,1,1,False,False,False


Data pre-processing

In [18]:
Label = LabelEncoder()
df['type'] = Label.fit_transform(df['type'])
df['contains_anchor'] = Label.fit_transform(df['contains_anchor'])
df['has_https'] = Label.fit_transform(df['has_https'])
df['contains_unicode'] = Label.fit_transform(df['contains_unicode'])
print (df.type.value_counts())
df.head()

type
0    428103
1     96457
3     94111
2     32520
Name: count, dtype: int64


Unnamed: 0,url,type,url_length,num_dots,num_slash,num_redir,num_dash,contains_anchor,has_https,contains_unicode
0,br-icloud.com.br,3,16,2,0,0,1,0,0,0
1,mp3raid.com/music/krizz_kaliko.html,0,35,2,2,0,0,0,0,0
2,bopsecrets.org/rexroth/cr/1.htm,0,31,2,3,0,0,0,0,0
3,http://www.garage-pirenne.be/index.php?option=...,1,88,3,3,1,1,0,0,0
4,http://adventure-nicaragua.net/index.php?optio...,1,235,2,3,1,1,0,0,0


In [19]:
input = df.drop(['url', 'type'], axis='columns')
target = df.type
print(target.head())
input.head()

0    3
1    0
2    0
3    1
4    1
Name: type, dtype: int64


Unnamed: 0,url_length,num_dots,num_slash,num_redir,num_dash,contains_anchor,has_https,contains_unicode
0,16,2,0,0,1,0,0,0
1,35,2,2,0,0,0,0,0
2,31,2,3,0,0,0,0,0
3,88,3,3,1,1,0,0,0
4,235,2,3,1,1,0,0,0


Data splitting

In [20]:
X_train, X_test, y_train, y_test = train_test_split(input, target, test_size=0.3)
print (len (X_test))
print (len (X_train))

195358
455833


In [21]:
y_test.head()

621854    3
535395    2
87566     0
166007    0
376663    0
Name: type, dtype: int64

In [22]:
y_test.value_counts()

type
0    128083
1     28986
3     28345
2      9944
Name: count, dtype: int64

In [23]:
def report (y_test, y_pred):
    n_classes = 4
    y_true_binary = label_binarize(y_test, classes=range(n_classes))
    y_pred_binary = label_binarize(y_pred, classes=range(n_classes))
    auc_score = roc_auc_score(y_true_binary, y_pred_binary, average='macro')

    report = classification_report(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    
    mse = mean_squared_error(y_test, y_pred)
    kappa = cohen_kappa_score(y_test, y_pred)
    mcc = matthews_corrcoef(y_test, y_pred)
    g_mean = geometric_mean_score(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='micro')
    recall = recall_score(y_test, y_pred, average='micro')
    f1 = f1_score(y_test, y_pred, average='micro')

    print("Classification Report:\n", report)
    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1-Score:", f1)
    print("AUC Score:", auc_score)
    print("MSE:", mse)
    print("G-Mean", g_mean)
    print("Kappa:", kappa)
    print("MCC:", mcc)
    print("Confusion Matrix:\n", conf_matrix)


Logistic Regression

In [19]:
model = LogisticRegression ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.96      0.90    128652
           1       0.68      0.91      0.78     28833
           2       0.59      0.09      0.16      9817
           3       0.21      0.06      0.10     28056

    accuracy                           0.78    195358
   macro avg       0.58      0.51      0.48    195358
weighted avg       0.71      0.78      0.73    195358

Accuracy: 0.7815344137429744
Precision: 0.7815344137429744
Recall: 0.7815344137429744
F1-Score: 0.7815344137429744
AUC Score: 0.6949359664041659
MSE: 1.2264969952599842
G-Mean 0.2665064141912997
Kappa: 0.5337921893932194
MCC: 0.5527124646410064
Confusion Matrix:
 [[123877   3837      1    937]
 [  1335  26132      0   1366]
 [   550   3903    899   4465]
 [ 21340   4320    625   1771]]


Decision Tree

In [20]:
model = DecisionTreeClassifier ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.96      0.95    128652
           1       0.88      0.94      0.91     28833
           2       0.89      0.84      0.86      9817
           3       0.78      0.64      0.70     28056

    accuracy                           0.91    195358
   macro avg       0.87      0.84      0.86    195358
weighted avg       0.90      0.91      0.90    195358

Accuracy: 0.9060289315001178
Precision: 0.9060289315001178
Recall: 0.9060289315001178
F1-Score: 0.9060289315001178
AUC Score: 0.8986302578583822
MSE: 0.6193245221593178
G-Mean 0.8339364658822773
Kappa: 0.8159541956590018
MCC: 0.8171031298387016
Confusion Matrix:
 [[123852    730    128   3942]
 [   760  26961    540    572]
 [   208    866   8250    493]
 [  7819   1940    360  17937]]


Naive Bayes

In [21]:
model = GaussianNB ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.91      0.88    128652
           1       0.57      1.00      0.73     28833
           2       0.42      0.22      0.29      9817
           3       0.38      0.02      0.04     28056

    accuracy                           0.76    195358
   macro avg       0.55      0.54      0.48    195358
weighted avg       0.72      0.76      0.71    195358

Accuracy: 0.7621597272699352
Precision: 0.7621597272699352
Recall: 0.7621597272699352
F1-Score: 0.7621597272699352
AUC Score: 0.7100899737250017
MSE: 1.2147442131880957
G-Mean 0.2463543814656348
Kappa: 0.5169939519822138
MCC: 0.5378731232801154
Confusion Matrix:
 [[117422   9527    880    823]
 [     2  28813      1     17]
 [   354   7301   2139     23]
 [ 20851   4567   2118    520]]


Random Forest

In [22]:
model = RandomForestClassifier ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.96      0.95    128652
           1       0.89      0.94      0.91     28833
           2       0.89      0.84      0.87      9817
           3       0.78      0.65      0.71     28056

    accuracy                           0.91    195358
   macro avg       0.87      0.85      0.86    195358
weighted avg       0.90      0.91      0.90    195358

Accuracy: 0.90743148476131
Precision: 0.90743148476131
Recall: 0.90743148476131
F1-Score: 0.90743148476131
AUC Score: 0.9010746484293828
MSE: 0.6135505072738255
G-Mean 0.8384754568080508
Kappa: 0.8191805690965079
MCC: 0.8200832805202957
Confusion Matrix:
 [[123706    743    115   4088]
 [   638  27021    554    620]
 [   193    839   8262    523]
 [  7582   1878    311  18285]]


LightBoost

In [23]:
model = LGBMClassifier ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 357
[LightGBM] [Info] Number of data points in the train set: 455833, number of used features: 8
[LightGBM] [Info] Start training from score -0.420176
[LightGBM] [Info] Start training from score -1.908164
[LightGBM] [Info] Start training from score -2.999629
[LightGBM] [Info] Start training from score -1.931639
Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.96      0.95    128652
           1       0.87      0.93      0.90     28833
           2       0.87      0.80      0.83      9817
           3       0.79      0.62      0.70     28056

    accuracy                           0.90    195358
   macro avg       0.87      0.83      0.84    195358
weighted avg       0.90      0.90      0.90    195358

Accuracy: 0.9023433900838461
Precision: 0.9023433900838461
Recall: 0.9023433900838461
F1-Score: 0.9023433900838461
AUC Score: 0.8902102040831

CatBoost

In [24]:
model = CatBoostClassifier ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

Learning rate set to 0.108124
0:	learn: 1.1833323	total: 238ms	remaining: 3m 57s
1:	learn: 1.0436715	total: 313ms	remaining: 2m 35s
2:	learn: 0.9358597	total: 387ms	remaining: 2m 8s
3:	learn: 0.8543941	total: 468ms	remaining: 1m 56s
4:	learn: 0.7883709	total: 543ms	remaining: 1m 48s
5:	learn: 0.7331507	total: 616ms	remaining: 1m 42s
6:	learn: 0.6861696	total: 692ms	remaining: 1m 38s
7:	learn: 0.6480250	total: 765ms	remaining: 1m 34s
8:	learn: 0.6131779	total: 843ms	remaining: 1m 32s
9:	learn: 0.5856698	total: 911ms	remaining: 1m 30s
10:	learn: 0.5595346	total: 986ms	remaining: 1m 28s
11:	learn: 0.5366510	total: 1.06s	remaining: 1m 26s
12:	learn: 0.5160331	total: 1.13s	remaining: 1m 25s
13:	learn: 0.4972156	total: 1.21s	remaining: 1m 25s
14:	learn: 0.4811444	total: 1.28s	remaining: 1m 23s
15:	learn: 0.4666423	total: 1.35s	remaining: 1m 23s
16:	learn: 0.4532913	total: 1.42s	remaining: 1m 22s
17:	learn: 0.4425448	total: 1.5s	remaining: 1m 21s
18:	learn: 0.4319560	total: 1.57s	remaining: 1

  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


Gradient Boost

In [25]:
model = GradientBoostingClassifier ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.97      0.94    128652
           1       0.83      0.93      0.88     28833
           2       0.84      0.69      0.76      9817
           3       0.78      0.56      0.65     28056

    accuracy                           0.89    195358
   macro avg       0.84      0.79      0.81    195358
weighted avg       0.88      0.89      0.88    195358

Accuracy: 0.8879185904851605
Precision: 0.8879185904851605
Recall: 0.8879185904851605
F1-Score: 0.8879185904851605
AUC Score: 0.8650035418503412
MSE: 0.6718742001863246
G-Mean 0.7672938124364459
Kappa: 0.777728120517402
MCC: 0.7806200938281942
Confusion Matrix:
 [[124268   1068     55   3261]
 [  1063  26765    507    498]
 [   468   1789   6822    738]
 [  9020   2683    746  15607]]


SVM

In [26]:
model = SVC ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

KNN

In [None]:
X_train = np.array(X_train)
X_test = np.array(X_test)
model = KNeighborsClassifier ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.95      0.93      2425
           1       0.93      0.93      0.93      2316
           2       0.93      0.94      0.93      2015
           3       0.91      0.86      0.88      2253
           4       0.95      0.93      0.94      2004

    accuracy                           0.92     11013
   macro avg       0.92      0.92      0.92     11013
weighted avg       0.92      0.92      0.92     11013

Accuracy: 0.9224552801234904
Precision: 0.9224552801234904
Recall: 0.9224552801234904
F1-Score: 0.9224552801234904
AUC Score: 0.9514913433912356
MSE: 0.3868155815853991
G-Mean 0.9219299967642602
Kappa: 0.9029051546645267
MCC: 0.9030362736433604
Confusion Matrix:
 [[2305   23   13   65   19]
 [  41 2156   63   31   25]
 [  28   34 1899   39   15]
 [ 116   87   66 1945   39]
 [  47   27   10   66 1854]]


In [None]:
X_train = np.array(X_train)
X_test = np.array(X_test)
model = KNeighborsClassifier ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.95      0.93      2425
           1       0.93      0.93      0.93      2316
           2       0.93      0.94      0.93      2015
           3       0.91      0.86      0.88      2253
           4       0.95      0.93      0.94      2004

    accuracy                           0.92     11013
   macro avg       0.92      0.92      0.92     11013
weighted avg       0.92      0.92      0.92     11013

Accuracy: 0.9224552801234904
Precision: 0.9224552801234904
Recall: 0.9224552801234904
F1-Score: 0.9224552801234904
AUC Score: 0.9514913433912356
MSE: 0.3868155815853991
G-Mean 0.9219299967642602
Kappa: 0.9029051546645267
MCC: 0.9030362736433604
Confusion Matrix:
 [[2305   23   13   65   19]
 [  41 2156   63   31   25]
 [  28   34 1899   39   15]
 [ 116   87   66 1945   39]
 [  47   27   10   66 1854]]


Deep Learning

In [10]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from keras.utils import to_categorical
from numpy import argmax




ANN

In [24]:
num_classes = 5
model = Sequential()
model.add(Dense(128, activation='relu', input_dim=X_train.shape[1]))
model.add(Dense(64, activation='relu'))
model.add(Dense(num_classes, activation='softmax'))

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

y_pred = model.predict(X_test)
y_pred2= argmax(y_pred, axis=1)
report(y_test, y_pred2)

Epoch 1/10


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.97      0.94    128083
           1       0.85      0.92      0.88     28986
           2       0.84      0.75      0.79      9944
           3       0.78      0.56      0.65     28345

    accuracy                           0.89    195358
   macro avg       0.85      0.80      0.82    195358
weighted avg       0.88      0.89      0.88    195358

Accuracy: 0.8892648368636042
Precision: 0.8892648368636042
Recall: 0.8892648368636042
F1-Score: 0.8892648368636042
AUC Score: 0.8712021111166058
MSE: 0.6820247955036395
G-Mean 0.7813533650263188
Kappa: 0.7813368448604074
MCC: 0.7843108224189104
Confusion Matrix:
 [[123799   1235    185   2864]
 [   890  26650    552    894]
 [   346   1405   7488    705]
 [  9754   2129    674  15788]]


CNN

In [25]:
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten

In [26]:
model = Sequential()
model.add(Conv1D(64, kernel_size=3, activation='relu', input_shape=(X_train.shape[1], 1)))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dense(num_classes, activation='softmax')) 

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

y_pred = model.predict (X_test)
y_pred2= argmax(y_pred, axis=1)
report(y_test, y_pred2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.95      0.94    128083
           1       0.84      0.93      0.88     28986
           2       0.82      0.75      0.79      9944
           3       0.75      0.62      0.68     28345

    accuracy                           0.89    195358
   macro avg       0.84      0.81      0.82    195358
weighted avg       0.89      0.89      0.89    195358

Accuracy: 0.890483113053983
Precision: 0.890483113053983
Recall: 0.890483113053983
F1-Score: 0.890483113053983
AUC Score: 0.8804865058837559
MSE: 0.6684138863010473
G-Mean 0.7999723005376153
Kappa: 0.788028574399831
MCC: 0.7890291010899875
Confusion Matrix:
 [[122071   1142    246   4624]
 [   825  26990    595    576]
 [   349   1536   7456    603]
 [  7649   2496    754  17446]]


RNN

In [27]:
from tensorflow.keras.layers import SimpleRNN

In [28]:
model = Sequential()
model.add(SimpleRNN(64, activation='relu', input_shape=(X_train.shape[1], 1)))
model.add(Dense(128, activation='relu'))
model.add(Dense(num_classes, activation='softmax')) 

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

y_pred = model.predict (X_test)
y_pred2= argmax(y_pred, axis=1)
report(y_test, y_pred2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.96      0.94    128083
           1       0.84      0.93      0.88     28986
           2       0.81      0.78      0.79      9944
           3       0.76      0.59      0.66     28345

    accuracy                           0.89    195358
   macro avg       0.83      0.81      0.82    195358
weighted avg       0.89      0.89      0.89    195358

Accuracy: 0.8896333910052314
Precision: 0.8896333910052314
Recall: 0.8896333910052314
F1-Score: 0.8896333910052314
AUC Score: 0.87981477414524
MSE: 0.6803765394813625
G-Mean 0.798327159513908
Kappa: 0.785246943743656
MCC: 0.7867766539716817
Confusion Matrix:
 [[122569   1051    286   4177]
 [   834  26821    909    422]
 [   290   1285   7773    596]
 [  8355   2702    654  16634]]


LSTM

In [29]:
from tensorflow.keras.layers import LSTM

In [30]:
model = Sequential()
model.add(LSTM(64, activation='relu', input_shape=(X_train.shape[1], 1)))
model.add(Dense(128, activation='relu'))
model.add(Dense(num_classes, activation='softmax'))

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

y_pred = model.predict (X_test)
y_pred2= argmax(y_pred, axis=1)
report(y_test, y_pred2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.96      0.95    128083
           1       0.87      0.92      0.89     28986
           2       0.86      0.76      0.81      9944
           3       0.77      0.62      0.69     28345

    accuracy                           0.90    195358
   macro avg       0.86      0.82      0.83    195358
weighted avg       0.89      0.90      0.89    195358

Accuracy: 0.8970658995280459
Precision: 0.8970658995280459
Recall: 0.8970658995280459
F1-Score: 0.8970658995280459
AUC Score: 0.8826417573021779
MSE: 0.6444783423253719
G-Mean 0.8045652765215167
Kappa: 0.7985728553906537
MCC: 0.8001006703806073
Confusion Matrix:
 [[123440    707    122   3814]
 [  1022  26652    607    705]
 [   364   1289   7585    706]
 [  8195   2050    528  17572]]
