Importing libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import math
from sklearn.metrics import confusion_matrix, roc_auc_score, mean_squared_error, cohen_kappa_score, matthews_corrcoef
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from imblearn.metrics import geometric_mean_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier


Reading dataset

In [2]:
df = pd.read_csv('kaggle_2.csv')
print (df.status.value_counts())
df.head()

status
legitimate    5715
phishing      5715
Name: count, dtype: int64


Unnamed: 0,url,length_url,length_hostname,ip,nb_dots,nb_hyphens,nb_at,nb_qm,nb_and,nb_or,...,domain_in_title,domain_with_copyright,whois_registered_domain,domain_registration_length,domain_age,web_traffic,dns_record,google_index,page_rank,status
0,http://www.crestonwood.com/router.php,37,19,0,3,0,0,0,0,0,...,0,1,0,45,-1,0,1,1,4,legitimate
1,http://shadetreetechnology.com/V4/validation/a...,77,23,1,1,0,0,0,0,0,...,1,0,0,77,5767,0,0,1,2,phishing
2,https://support-appleld.com.secureupdate.duila...,126,50,1,4,1,0,1,2,0,...,1,0,0,14,4004,5828815,0,1,0,phishing
3,http://rgipt.ac.in,18,11,0,2,0,0,0,0,0,...,1,0,0,62,-1,107721,0,0,3,legitimate
4,http://www.iracing.com/tracks/gateway-motorspo...,55,15,0,2,2,0,0,0,0,...,0,1,0,224,8175,8725,0,0,6,legitimate


data pre-processing

In [3]:
df = df.drop(['url'], axis='columns')
Label = LabelEncoder()
df['status'] = Label.fit_transform(df['status'])
print (df.status.value_counts())
df.head()

status
0    5715
1    5715
Name: count, dtype: int64


Unnamed: 0,length_url,length_hostname,ip,nb_dots,nb_hyphens,nb_at,nb_qm,nb_and,nb_or,nb_eq,...,domain_in_title,domain_with_copyright,whois_registered_domain,domain_registration_length,domain_age,web_traffic,dns_record,google_index,page_rank,status
0,37,19,0,3,0,0,0,0,0,0,...,0,1,0,45,-1,0,1,1,4,0
1,77,23,1,1,0,0,0,0,0,0,...,1,0,0,77,5767,0,0,1,2,1
2,126,50,1,4,1,0,1,2,0,3,...,1,0,0,14,4004,5828815,0,1,0,1
3,18,11,0,2,0,0,0,0,0,0,...,1,0,0,62,-1,107721,0,0,3,0
4,55,15,0,2,2,0,0,0,0,0,...,0,1,0,224,8175,8725,0,0,6,0


In [4]:
input = df.drop (['status'], axis='columns')
target = df.status
print(target.head())
input.head()

0    0
1    1
2    1
3    0
4    0
Name: status, dtype: int32


Unnamed: 0,length_url,length_hostname,ip,nb_dots,nb_hyphens,nb_at,nb_qm,nb_and,nb_or,nb_eq,...,empty_title,domain_in_title,domain_with_copyright,whois_registered_domain,domain_registration_length,domain_age,web_traffic,dns_record,google_index,page_rank
0,37,19,0,3,0,0,0,0,0,0,...,0,0,1,0,45,-1,0,1,1,4
1,77,23,1,1,0,0,0,0,0,0,...,0,1,0,0,77,5767,0,0,1,2
2,126,50,1,4,1,0,1,2,0,3,...,0,1,0,0,14,4004,5828815,0,1,0
3,18,11,0,2,0,0,0,0,0,0,...,0,1,0,0,62,-1,107721,0,0,3
4,55,15,0,2,2,0,0,0,0,0,...,0,0,1,0,224,8175,8725,0,0,6


Data splitting

In [5]:
X_train, X_test, y_train, y_test = train_test_split(input, target, test_size=0.3)
print (len (X_test))
print (len (X_train))

3429
8001


In [6]:
def report (y_test, y_pred):
    report = classification_report(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    auc_score = roc_auc_score(y_test, y_pred, multi_class='ovo')
    mse = mean_squared_error(y_test, y_pred)
    kappa = cohen_kappa_score(y_test, y_pred)
    mcc = matthews_corrcoef(y_test, y_pred)
    g_mean = geometric_mean_score(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='micro')
    recall = recall_score(y_test, y_pred, average='micro')
    f1 = f1_score(y_test, y_pred, average='micro')

    print("Classification Report:\n", report)
    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1-Score:", f1)
    print("AUC Score:", auc_score)
    print("MSE:", mse)
    print("G-Mean", g_mean)
    print("Kappa:", kappa)
    print("MCC:", mcc)
    print("Confusion Matrix:\n", conf_matrix)


Logistic Regression

In [7]:
model = LogisticRegression ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.82      0.79      1677
           1       0.82      0.74      0.78      1752

    accuracy                           0.78      3429
   macro avg       0.78      0.78      0.78      3429
weighted avg       0.79      0.78      0.78      3429

Accuracy: 0.7821522309711286
Precision: 0.7821522309711286
Recall: 0.7821522309711286
F1-Score: 0.7821522309711286
AUC Score: 0.7830626485651971
MSE: 0.2178477690288714
G-Mean 0.7819555804903965
Kappa: 0.5649787462341629
MCC: 0.5674164437543797
Confusion Matrix:
 [[1383  294]
 [ 453 1299]]


Decision Tree

In [8]:
model = DecisionTreeClassifier ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.93      0.94      1677
           1       0.93      0.94      0.94      1752

    accuracy                           0.94      3429
   macro avg       0.94      0.94      0.94      3429
weighted avg       0.94      0.94      0.94      3429

Accuracy: 0.9372995042286381
Precision: 0.9372995042286381
Recall: 0.9372995042286381
F1-Score: 0.9372995042286381
AUC Score: 0.9371737692062637
MSE: 0.06270049577136191
G-Mean 0.937156138125481
Kappa: 0.8745149642992278
MCC: 0.8745484802613667
Confusion Matrix:
 [[1562  115]
 [ 100 1652]]


Naive Bayes

In [9]:
model = GaussianNB ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.67      0.71      1677
           1       0.72      0.79      0.75      1752

    accuracy                           0.73      3429
   macro avg       0.73      0.73      0.73      3429
weighted avg       0.73      0.73      0.73      3429

Accuracy: 0.731991834354039
Precision: 0.731991834354039
Recall: 0.731991834354039
F1-Score: 0.731991834354039
AUC Score: 0.7307595306360837
MSE: 0.2680081656459609
G-Mean 0.7285843751238638
Kappa: 0.4625403766147448
MCC: 0.4649261133771047
Confusion Matrix:
 [[1131  546]
 [ 373 1379]]


KNN

In [10]:
X_train = np.array(X_train)
X_test = np.array(X_test)
model = KNeighborsClassifier ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.82      0.83      1677
           1       0.83      0.84      0.84      1752

    accuracy                           0.83      3429
   macro avg       0.83      0.83      0.83      3429
weighted avg       0.83      0.83      0.83      3429

Accuracy: 0.8317293671624381
Precision: 0.8317293671624381
Recall: 0.8317293671624381
F1-Score: 0.8317293671624381
AUC Score: 0.8314637603025624
MSE: 0.16827063283756197
G-Mean 0.8313750772039801
Kappa: 0.6631729852385916
MCC: 0.6632680344226243
Confusion Matrix:
 [[1374  303]
 [ 274 1478]]


Random Forest

In [11]:
model = RandomForestClassifier ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.97      0.97      1677
           1       0.97      0.97      0.97      1752

    accuracy                           0.97      3429
   macro avg       0.97      0.97      0.97      3429
weighted avg       0.97      0.97      0.97      3429

Accuracy: 0.9673374161563137
Precision: 0.9673374161563137
Recall: 0.9673374161563137
F1-Score: 0.9673374161563137
AUC Score: 0.9673345463605101
MSE: 0.032662583843686206
G-Mean 0.9673345374621958
Kappa: 0.9346452343441295
MCC: 0.9346458708401802
Confusion Matrix:
 [[1622   55]
 [  57 1695]]


SVM

In [12]:
model = SVC ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

Classification Report:
               precision    recall  f1-score   support

           0       0.72      0.30      0.43      1677
           1       0.57      0.89      0.70      1752

    accuracy                           0.60      3429
   macro avg       0.65      0.60      0.56      3429
weighted avg       0.65      0.60      0.57      3429

Accuracy: 0.6030912802566346
Precision: 0.6030912802566346
Recall: 0.6030912802566346
F1-Score: 0.6030912802566346
AUC Score: 0.5966919482768479
MSE: 0.3969087197433654
G-Mean 0.5200381833489641
Kappa: 0.1958156772268148
MCC: 0.23932292716164252
Confusion Matrix:
 [[ 510 1167]
 [ 194 1558]]


Light Boost

In [13]:
model = LGBMClassifier ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

[LightGBM] [Info] Number of positive: 3963, number of negative: 4038
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4834
[LightGBM] [Info] Number of data points in the train set: 8001, number of used features: 71
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.495313 -> initscore=-0.018748
[LightGBM] [Info] Start training from score -0.018748
Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.96      0.96      1677
           1       0.96      0.97      0.97      1752

    accuracy                           0.96      3429
   macro avg       0.96      0.96      0.96      3429
weighted avg       0.96      0.96      0.96      3429

Accuracy: 0.9644211140274133
Precision: 0.9644211140274133
Recall: 0.9644211140274133
F1-Score: 0.9644211140274133
AUC Score: 0.9643275050849118
MSE: 0.03557888597258676
G-Mean 0.9643180079040136
Ka

Cat Boost

In [14]:
model = CatBoostClassifier ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

Learning rate set to 0.025037
0:	learn: 0.6580371	total: 163ms	remaining: 2m 42s
1:	learn: 0.6236609	total: 169ms	remaining: 1m 24s
2:	learn: 0.5925967	total: 174ms	remaining: 57.8s
3:	learn: 0.5622912	total: 179ms	remaining: 44.6s
4:	learn: 0.5348447	total: 185ms	remaining: 36.8s
5:	learn: 0.5142279	total: 190ms	remaining: 31.5s
6:	learn: 0.4909541	total: 196ms	remaining: 27.7s
7:	learn: 0.4680619	total: 201ms	remaining: 24.9s
8:	learn: 0.4477071	total: 207ms	remaining: 22.8s
9:	learn: 0.4288343	total: 213ms	remaining: 21.1s
10:	learn: 0.4148346	total: 218ms	remaining: 19.6s
11:	learn: 0.3988982	total: 226ms	remaining: 18.6s
12:	learn: 0.3826308	total: 233ms	remaining: 17.7s
13:	learn: 0.3703862	total: 239ms	remaining: 16.8s
14:	learn: 0.3602406	total: 245ms	remaining: 16.1s
15:	learn: 0.3474601	total: 251ms	remaining: 15.4s
16:	learn: 0.3363856	total: 256ms	remaining: 14.8s
17:	learn: 0.3265846	total: 262ms	remaining: 14.3s
18:	learn: 0.3177846	total: 267ms	remaining: 13.8s
19:	learn

Gradient Boost

In [15]:
model = GradientBoostingClassifier ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.95      0.95      1677
           1       0.95      0.96      0.96      1752

    accuracy                           0.96      3429
   macro avg       0.96      0.96      0.96      3429
weighted avg       0.96      0.96      0.96      3429

Accuracy: 0.9556722076407116
Precision: 0.9556722076407116
Recall: 0.9556722076407116
F1-Score: 0.9556722076407116
AUC Score: 0.9555871745860596
MSE: 0.04432779235928842
G-Mean 0.9555792661702834
Kappa: 0.9112906602344872
MCC: 0.911306180156549
Confusion Matrix:
 [[1596   81]
 [  71 1681]]
