Importing libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import math
from sklearn.metrics import confusion_matrix, roc_auc_score, mean_squared_error, cohen_kappa_score, matthews_corrcoef
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.preprocessing import label_binarize
from imblearn.metrics import geometric_mean_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier



Reading dataset

In [2]:
df = pd.read_csv('Mendely_48.csv')
print (df.CLASS_LABEL.value_counts())
df.head()

CLASS_LABEL
1    5000
0    5000
Name: count, dtype: int64


Unnamed: 0,NumDots,SubdomainLevel,PathLevel,UrlLength,NumDash,NumDashInHostname,AtSymbol,TildeSymbol,NumUnderscore,NumPercent,...,IframeOrFrame,MissingTitle,ImagesOnlyInForm,SubdomainLevelRT,UrlLengthRT,PctExtResourceUrlsRT,AbnormalExtFormActionR,ExtMetaScriptLinkRT,PctExtNullSelfRedirectHyperlinksRT,CLASS_LABEL
0,3,1,5,72,0,0,0,0,0,0,...,0,0,1,1,0,1,1,-1,1,1
1,3,1,3,144,0,0,0,0,2,0,...,0,0,0,1,-1,1,1,1,1,1
2,3,1,2,58,0,0,0,0,0,0,...,0,0,0,1,0,-1,1,-1,0,1
3,3,1,6,79,1,0,0,0,0,0,...,0,0,0,1,-1,1,1,1,-1,1
4,3,0,4,46,0,0,0,0,0,0,...,1,0,0,1,1,-1,0,-1,-1,1


Data pre-processing

In [3]:
Label = LabelEncoder()
df['CLASS_LABEL'] = Label.fit_transform(df['CLASS_LABEL'])
print (df.CLASS_LABEL.value_counts())
df.head()

CLASS_LABEL
1    5000
0    5000
Name: count, dtype: int64


Unnamed: 0,NumDots,SubdomainLevel,PathLevel,UrlLength,NumDash,NumDashInHostname,AtSymbol,TildeSymbol,NumUnderscore,NumPercent,...,IframeOrFrame,MissingTitle,ImagesOnlyInForm,SubdomainLevelRT,UrlLengthRT,PctExtResourceUrlsRT,AbnormalExtFormActionR,ExtMetaScriptLinkRT,PctExtNullSelfRedirectHyperlinksRT,CLASS_LABEL
0,3,1,5,72,0,0,0,0,0,0,...,0,0,1,1,0,1,1,-1,1,1
1,3,1,3,144,0,0,0,0,2,0,...,0,0,0,1,-1,1,1,1,1,1
2,3,1,2,58,0,0,0,0,0,0,...,0,0,0,1,0,-1,1,-1,0,1
3,3,1,6,79,1,0,0,0,0,0,...,0,0,0,1,-1,1,1,1,-1,1
4,3,0,4,46,0,0,0,0,0,0,...,1,0,0,1,1,-1,0,-1,-1,1


Data Cleaning

In [4]:
# df = df.replace([np.inf, -np.inf], np.nan)
# df.fillna(0, inplace=True)
# print(df.max())
# print(df.min())
# df.head()

In [5]:
input = df.drop (['CLASS_LABEL'], axis='columns')
target = df.CLASS_LABEL
print(target.head())
input.head()

0    1
1    1
2    1
3    1
4    1
Name: CLASS_LABEL, dtype: int64


Unnamed: 0,NumDots,SubdomainLevel,PathLevel,UrlLength,NumDash,NumDashInHostname,AtSymbol,TildeSymbol,NumUnderscore,NumPercent,...,SubmitInfoToEmail,IframeOrFrame,MissingTitle,ImagesOnlyInForm,SubdomainLevelRT,UrlLengthRT,PctExtResourceUrlsRT,AbnormalExtFormActionR,ExtMetaScriptLinkRT,PctExtNullSelfRedirectHyperlinksRT
0,3,1,5,72,0,0,0,0,0,0,...,0,0,0,1,1,0,1,1,-1,1
1,3,1,3,144,0,0,0,0,2,0,...,0,0,0,0,1,-1,1,1,1,1
2,3,1,2,58,0,0,0,0,0,0,...,0,0,0,0,1,0,-1,1,-1,0
3,3,1,6,79,1,0,0,0,0,0,...,1,0,0,0,1,-1,1,1,1,-1
4,3,0,4,46,0,0,0,0,0,0,...,0,1,0,0,1,1,-1,0,-1,-1


Data splitting

In [6]:
X_train, X_test, y_train, y_test = train_test_split(input, target, test_size=0.2)
print (len (X_test))
print (len (X_train))

2000
8000


In [7]:
def report (y_test, y_pred):
    n_classes = 2
    y_true_binary = label_binarize(y_test, classes=range(n_classes))
    y_pred_binary = label_binarize(y_pred, classes=range(n_classes))
    auc_score = roc_auc_score(y_true_binary, y_pred_binary, average='macro')

    report = classification_report(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    
    mse = mean_squared_error(y_test, y_pred)
    kappa = cohen_kappa_score(y_test, y_pred)
    mcc = matthews_corrcoef(y_test, y_pred)
    g_mean = geometric_mean_score(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='micro')
    recall = recall_score(y_test, y_pred, average='micro')
    f1 = f1_score(y_test, y_pred, average='micro')

    print("Classification Report:\n", report)
    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1-Score:", f1)
    print("AUC Score:", auc_score)
    print("MSE:", mse)
    print("G-Mean", g_mean)
    print("Kappa:", kappa)
    print("MCC:", mcc)
    print("Confusion Matrix:\n", conf_matrix)


Logistic Regression

In [8]:
model = LogisticRegression ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.91      0.92       965
           1       0.92      0.94      0.93      1035

    accuracy                           0.93      2000
   macro avg       0.93      0.93      0.93      2000
weighted avg       0.93      0.93      0.93      2000

Accuracy: 0.9275
Precision: 0.9275
Recall: 0.9275
F1-Score: 0.9275
AUC Score: 0.9269379990488348
MSE: 0.0725
G-Mean 0.926798910967843
Kappa: 0.8546846657246224
MCC: 0.8549977266721391
Confusion Matrix:
 [[879  86]
 [ 59 976]]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Decision Tree

In [9]:
model = DecisionTreeClassifier ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.96      0.96       965
           1       0.97      0.97      0.97      1035

    accuracy                           0.96      2000
   macro avg       0.96      0.96      0.96      2000
weighted avg       0.96      0.96      0.96      2000

Accuracy: 0.9645
Precision: 0.9645
Recall: 0.9645
F1-Score: 0.9645
AUC Score: 0.9644739806262672
MSE: 0.0355
G-Mean 0.9644736941180101
Kappa: 0.9289154093371111
MCC: 0.9289158749025515
Confusion Matrix:
 [[930  35]
 [ 36 999]]


Naive Bayes

In [10]:
model = GaussianNB ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

Classification Report:
               precision    recall  f1-score   support

           0       0.77      0.95      0.85       965
           1       0.94      0.74      0.83      1035

    accuracy                           0.84      2000
   macro avg       0.86      0.84      0.84      2000
weighted avg       0.86      0.84      0.84      2000

Accuracy: 0.841
Precision: 0.841
Recall: 0.841
F1-Score: 0.841
AUC Score: 0.8445896222873019
MSE: 0.159
G-Mean 0.8383393977731673
Kappa: 0.684001848291076
MCC: 0.7003241802287492
Confusion Matrix:
 [[914  51]
 [267 768]]


Random Forest

In [11]:
model = RandomForestClassifier ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.98      0.98       965
           1       0.98      0.98      0.98      1035

    accuracy                           0.98      2000
   macro avg       0.98      0.98      0.98      2000
weighted avg       0.98      0.98      0.98      2000

Accuracy: 0.9825
Precision: 0.9825
Recall: 0.9825
F1-Score: 0.9825
AUC Score: 0.9824610147430602
MSE: 0.0175
G-Mean 0.9824603833213199
Kappa: 0.9649558443638985
MCC: 0.9649563280603306
Confusion Matrix:
 [[ 947   18]
 [  17 1018]]


LightBoost

In [12]:
model = LGBMClassifier ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

[LightGBM] [Info] Number of positive: 3965, number of negative: 4035
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1508
[LightGBM] [Info] Number of data points in the train set: 8000, number of used features: 44
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.495625 -> initscore=-0.017500
[LightGBM] [Info] Start training from score -0.017500
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.98      0.98       965
           1       0.98      0.99      0.99      1035

    accuracy                           0.98      2000
   macro avg       0.99      0.98      0.98      2000
weighted avg       0.99      0.98      0.98      2000

Accuracy: 0.985
Precision: 0.985
Recall: 0.985
F1-Score: 0.985
AUC Score: 0.9849465595354309
MSE: 0.015
G-Mean 0.9849453760526083
Kappa: 0.9699610996240131
MCC: 0.9699630445862376
Confusion Matrix:


CatBoost

In [13]:
model = CatBoostClassifier ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

Learning rate set to 0.025035
0:	learn: 0.6532927	total: 155ms	remaining: 2m 35s
1:	learn: 0.6196353	total: 165ms	remaining: 1m 22s
2:	learn: 0.5854114	total: 187ms	remaining: 1m 2s
3:	learn: 0.5511028	total: 205ms	remaining: 51.1s
4:	learn: 0.5227179	total: 222ms	remaining: 44.2s
5:	learn: 0.4955749	total: 235ms	remaining: 39s
6:	learn: 0.4700847	total: 248ms	remaining: 35.2s
7:	learn: 0.4457968	total: 260ms	remaining: 32.2s
8:	learn: 0.4261674	total: 270ms	remaining: 29.7s
9:	learn: 0.4069228	total: 284ms	remaining: 28.1s
10:	learn: 0.3888188	total: 295ms	remaining: 26.5s
11:	learn: 0.3715027	total: 308ms	remaining: 25.3s
12:	learn: 0.3548817	total: 315ms	remaining: 24s
13:	learn: 0.3399243	total: 328ms	remaining: 23.1s
14:	learn: 0.3265163	total: 338ms	remaining: 22.2s
15:	learn: 0.3140544	total: 350ms	remaining: 21.5s
16:	learn: 0.3008583	total: 361ms	remaining: 20.9s
17:	learn: 0.2902494	total: 372ms	remaining: 20.3s
18:	learn: 0.2802740	total: 383ms	remaining: 19.8s
19:	learn: 0.

Gradient Boost

In [14]:
model = GradientBoostingClassifier ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.97      0.97       965
           1       0.97      0.98      0.98      1035

    accuracy                           0.97      2000
   macro avg       0.98      0.97      0.97      2000
weighted avg       0.98      0.97      0.97      2000

Accuracy: 0.975
Precision: 0.975
Recall: 0.975
F1-Score: 0.975
AUC Score: 0.9748992515831895
MSE: 0.025
G-Mean 0.9748950019486252
Kappa: 0.9499316567114111
MCC: 0.9499392770454224
Confusion Matrix:
 [[ 938   27]
 [  23 1012]]


SVM

In [15]:
model = SVC ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.81      0.85       965
           1       0.84      0.91      0.87      1035

    accuracy                           0.86      2000
   macro avg       0.87      0.86      0.86      2000
weighted avg       0.86      0.86      0.86      2000

Accuracy: 0.8615
Precision: 0.8615
Recall: 0.8615
F1-Score: 0.8615
AUC Score: 0.8596655903481765
MSE: 0.1385
G-Mean 0.8580663963103814
Kappa: 0.7217338891958411
MCC: 0.72504317618695
Confusion Matrix:
 [[779 186]
 [ 91 944]]


KNN

In [16]:
X_train = np.array(X_train)
X_test = np.array(X_test)
model = KNeighborsClassifier ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.83      0.86       965
           1       0.85      0.90      0.87      1035

    accuracy                           0.87      2000
   macro avg       0.87      0.86      0.86      2000
weighted avg       0.87      0.87      0.87      2000

Accuracy: 0.8655
Precision: 0.8655
Recall: 0.8655
F1-Score: 0.8655
AUC Score: 0.8641610973442467
MSE: 0.1345
G-Mean 0.8633139672114244
Kappa: 0.7300362293387392
MCC: 0.7316921752658001
Confusion Matrix:
 [[797 168]
 [101 934]]


Deep Learning

In [17]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense




ANN

In [18]:
model = Sequential()
model.add(Dense(128, activation='relu', input_dim=X_train.shape[1]))
model.add(Dense(64, activation='relu'))
model.add(Dense(1, activation='sigmoid'))  # Adjust for multiclass

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

y_pred = model.predict (X_test)
y_pred_binary = (y_pred > 0.5).astype(int)
report (y_test, y_pred_binary)



Epoch 1/10


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.93      0.95       965
           1       0.94      0.96      0.95      1035

    accuracy                           0.95      2000
   macro avg       0.95      0.95      0.95      2000
weighted avg       0.95      0.95      0.95      2000

Accuracy: 0.948
Precision: 0.948
Recall: 0.948
F1-Score: 0.948
AUC Score: 0.9474806638131711
MSE: 0.052
G-Mean 0.9473644688345095
Kappa: 0.8957774849303263
MCC: 0.896081709623597
Confusion Matrix:
 [[900  65]
 [ 39 996]]


  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


CNN

In [19]:
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten

In [20]:
model = Sequential()
model.add(Conv1D(64, kernel_size=3, activation='relu', input_shape=(X_train.shape[1], 1)))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dense(1, activation='sigmoid'))  # Adjust for multiclass

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

y_pred = model.predict (X_test)
y_pred_binary = (y_pred > 0.5).astype(int)
report (y_test, y_pred_binary)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.93      0.95       965
           1       0.94      0.98      0.96      1035

    accuracy                           0.95      2000
   macro avg       0.95      0.95      0.95      2000
weighted avg       0.95      0.95      0.95      2000

Accuracy: 0.953
Precision: 0.953
Recall: 0.953
F1-Score: 0.953
AUC Score: 0.952171409977222
MSE: 0.047
G-Mean 0.9518770591159329
Kappa: 0.9057393693562701
MCC: 0.9066222830271625
Confusion Matrix:
 [[ 896   69]
 [  25 1010]]


  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


RNN

In [21]:
from tensorflow.keras.layers import SimpleRNN

In [22]:
model = Sequential()
model.add(SimpleRNN(64, activation='relu', input_shape=(X_train.shape[1], 1)))
model.add(Dense(128, activation='relu'))
model.add(Dense(1, activation='sigmoid'))  # Adjust for multiclass

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

y_pred = model.predict (X_test)
y_pred_binary = (y_pred > 0.5).astype(int)
report (y_test, y_pred_binary)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.91      0.92       965
           1       0.92      0.94      0.93      1035

    accuracy                           0.93      2000
   macro avg       0.93      0.92      0.93      2000
weighted avg       0.93      0.93      0.93      2000

Accuracy: 0.9255
Precision: 0.9255
Recall: 0.9255
F1-Score: 0.9255
AUC Score: 0.9248654601887311
MSE: 0.0745
G-Mean 0.9246877489755957
Kappa: 0.8506550130802153
MCC: 0.8510659453314863
Confusion Matrix:
 [[875  90]
 [ 59 976]]


  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


LSTM

In [23]:
from tensorflow.keras.layers import LSTM

In [24]:
model = Sequential()
model.add(LSTM(64, activation='relu', input_shape=(X_train.shape[1], 1)))
model.add(Dense(128, activation='relu'))
model.add(Dense(1, activation='sigmoid'))  # Adjust for multiclass

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

y_pred = model.predict (X_test)
y_pred_binary = (y_pred > 0.5).astype(int)
report (y_test, y_pred_binary)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.81      0.83       965
           1       0.83      0.87      0.85      1035

    accuracy                           0.84      2000
   macro avg       0.84      0.84      0.84      2000
weighted avg       0.84      0.84      0.84      2000

Accuracy: 0.838
Precision: 0.838
Recall: 0.838
F1-Score: 0.838
AUC Score: 0.8369953192661009
MSE: 0.162
G-Mean 0.836502945627272
Kappa: 0.6750788485355984
MCC: 0.6757982974182112
Confusion Matrix:
 [[780 185]
 [139 896]]


  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
