Importing libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import math
from sklearn.metrics import confusion_matrix, roc_auc_score, mean_squared_error, cohen_kappa_score, matthews_corrcoef
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.preprocessing import label_binarize
from imblearn.metrics import geometric_mean_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier



Reading dataset

In [2]:
df = pd.read_csv('Mendely_48.csv')
print (df.CLASS_LABEL.value_counts())
df.head()

CLASS_LABEL
1    5000
0    5000
Name: count, dtype: int64


Unnamed: 0,NumDots,SubdomainLevel,PathLevel,UrlLength,NumDash,NumDashInHostname,AtSymbol,TildeSymbol,NumUnderscore,NumPercent,...,IframeOrFrame,MissingTitle,ImagesOnlyInForm,SubdomainLevelRT,UrlLengthRT,PctExtResourceUrlsRT,AbnormalExtFormActionR,ExtMetaScriptLinkRT,PctExtNullSelfRedirectHyperlinksRT,CLASS_LABEL
0,3,1,5,72,0,0,0,0,0,0,...,0,0,1,1,0,1,1,-1,1,1
1,3,1,3,144,0,0,0,0,2,0,...,0,0,0,1,-1,1,1,1,1,1
2,3,1,2,58,0,0,0,0,0,0,...,0,0,0,1,0,-1,1,-1,0,1
3,3,1,6,79,1,0,0,0,0,0,...,0,0,0,1,-1,1,1,1,-1,1
4,3,0,4,46,0,0,0,0,0,0,...,1,0,0,1,1,-1,0,-1,-1,1


Data pre-processing

In [3]:
Label = LabelEncoder()
df['CLASS_LABEL'] = Label.fit_transform(df['CLASS_LABEL'])
print (df.CLASS_LABEL.value_counts())
df.head()

CLASS_LABEL
1    5000
0    5000
Name: count, dtype: int64


Unnamed: 0,NumDots,SubdomainLevel,PathLevel,UrlLength,NumDash,NumDashInHostname,AtSymbol,TildeSymbol,NumUnderscore,NumPercent,...,IframeOrFrame,MissingTitle,ImagesOnlyInForm,SubdomainLevelRT,UrlLengthRT,PctExtResourceUrlsRT,AbnormalExtFormActionR,ExtMetaScriptLinkRT,PctExtNullSelfRedirectHyperlinksRT,CLASS_LABEL
0,3,1,5,72,0,0,0,0,0,0,...,0,0,1,1,0,1,1,-1,1,1
1,3,1,3,144,0,0,0,0,2,0,...,0,0,0,1,-1,1,1,1,1,1
2,3,1,2,58,0,0,0,0,0,0,...,0,0,0,1,0,-1,1,-1,0,1
3,3,1,6,79,1,0,0,0,0,0,...,0,0,0,1,-1,1,1,1,-1,1
4,3,0,4,46,0,0,0,0,0,0,...,1,0,0,1,1,-1,0,-1,-1,1


Data Cleaning

In [4]:
# df = df.replace([np.inf, -np.inf], np.nan)
# df.fillna(0, inplace=True)
# print(df.max())
# print(df.min())
# df.head()

In [5]:
input = df.drop (['CLASS_LABEL'], axis='columns')
target = df.CLASS_LABEL
print(target.head())
input.head()

0    1
1    1
2    1
3    1
4    1
Name: CLASS_LABEL, dtype: int64


Unnamed: 0,NumDots,SubdomainLevel,PathLevel,UrlLength,NumDash,NumDashInHostname,AtSymbol,TildeSymbol,NumUnderscore,NumPercent,...,SubmitInfoToEmail,IframeOrFrame,MissingTitle,ImagesOnlyInForm,SubdomainLevelRT,UrlLengthRT,PctExtResourceUrlsRT,AbnormalExtFormActionR,ExtMetaScriptLinkRT,PctExtNullSelfRedirectHyperlinksRT
0,3,1,5,72,0,0,0,0,0,0,...,0,0,0,1,1,0,1,1,-1,1
1,3,1,3,144,0,0,0,0,2,0,...,0,0,0,0,1,-1,1,1,1,1
2,3,1,2,58,0,0,0,0,0,0,...,0,0,0,0,1,0,-1,1,-1,0
3,3,1,6,79,1,0,0,0,0,0,...,1,0,0,0,1,-1,1,1,1,-1
4,3,0,4,46,0,0,0,0,0,0,...,0,1,0,0,1,1,-1,0,-1,-1


Data splitting

In [6]:
X_train, X_test, y_train, y_test = train_test_split(input, target, test_size=0.3)
print (len (X_test))
print (len (X_train))

3000
7000


In [7]:
def report (y_test, y_pred):
    n_classes = 2
    y_true_binary = label_binarize(y_test, classes=range(n_classes))
    y_pred_binary = label_binarize(y_pred, classes=range(n_classes))
    auc_score = roc_auc_score(y_true_binary, y_pred_binary, average='macro')

    report = classification_report(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    
    mse = mean_squared_error(y_test, y_pred)
    kappa = cohen_kappa_score(y_test, y_pred)
    mcc = matthews_corrcoef(y_test, y_pred)
    g_mean = geometric_mean_score(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='micro')
    recall = recall_score(y_test, y_pred, average='micro')
    f1 = f1_score(y_test, y_pred, average='micro')

    print("Classification Report:\n", report)
    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1-Score:", f1)
    print("AUC Score:", auc_score)
    print("MSE:", mse)
    print("G-Mean", g_mean)
    print("Kappa:", kappa)
    print("MCC:", mcc)
    print("Confusion Matrix:\n", conf_matrix)


Logistic Regression

In [8]:
model = LogisticRegression ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.93      0.93      1495
           1       0.93      0.94      0.94      1505

    accuracy                           0.93      3000
   macro avg       0.93      0.93      0.93      3000
weighted avg       0.93      0.93      0.93      3000

Accuracy: 0.9346666666666666
Precision: 0.9346666666666666
Recall: 0.9346666666666666
F1-Score: 0.9346666666666666
AUC Score: 0.9346348292758808
MSE: 0.06533333333333333
G-Mean 0.934586025117469
Kappa: 0.8693237504083633
MCC: 0.869475267743398
Confusion Matrix:
 [[1383  112]
 [  84 1421]]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Decision Tree

In [9]:
model = DecisionTreeClassifier ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.95      0.96      1495
           1       0.95      0.97      0.96      1505

    accuracy                           0.96      3000
   macro avg       0.96      0.96      0.96      3000
weighted avg       0.96      0.96      0.96      3000

Accuracy: 0.9626666666666667
Precision: 0.9626666666666667
Recall: 0.9626666666666667
F1-Score: 0.9626666666666667
AUC Score: 0.9626329181435349
MSE: 0.037333333333333336
G-Mean 0.9625796738142334
Kappa: 0.9253275254742035
MCC: 0.9255126753260742
Confusion Matrix:
 [[1424   71]
 [  41 1464]]


Naive Bayes

In [10]:
model = GaussianNB ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.95      0.86      1495
           1       0.94      0.76      0.84      1505

    accuracy                           0.85      3000
   macro avg       0.87      0.85      0.85      3000
weighted avg       0.87      0.85      0.85      3000

Accuracy: 0.8523333333333334
Precision: 0.8523333333333334
Recall: 0.8523333333333334
F1-Score: 0.8523333333333334
AUC Score: 0.8526483627595861
MSE: 0.14766666666666667
G-Mean 0.8473944252649606
Kappa: 0.704850315359332
MCC: 0.7179113134321949
Confusion Matrix:
 [[1416   79]
 [ 364 1141]]


Random Forest

In [11]:
model = RandomForestClassifier ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.98      0.98      1495
           1       0.98      0.98      0.98      1505

    accuracy                           0.98      3000
   macro avg       0.98      0.98      0.98      3000
weighted avg       0.98      0.98      0.98      3000

Accuracy: 0.9803333333333333
Precision: 0.9803333333333333
Recall: 0.9803333333333333
F1-Score: 0.9803333333333333
AUC Score: 0.9803364481827577
MSE: 0.019666666666666666
G-Mean 0.9803360028223794
Kappa: 0.9606664918510749
MCC: 0.9606684132069014
Confusion Matrix:
 [[1467   28]
 [  31 1474]]


LightBoost

In [12]:
model = LGBMClassifier ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

[LightGBM] [Info] Number of positive: 3495, number of negative: 3505
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1500
[LightGBM] [Info] Number of data points in the train set: 7000, number of used features: 45
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499286 -> initscore=-0.002857
[LightGBM] [Info] Start training from score -0.002857
Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.98      0.98      1495
           1       0.98      0.98      0.98      1505

    accuracy                           0.98      3000
   macro avg       0.98      0.98      0.98      3000
weighted avg       0.98      0.98      0.98      3000

Accuracy: 0.9836666666666667
Precision: 0.9836666666666667
Recall: 0.9836666666666667
F1-Score: 0.9836666666666667
AUC Score: 0.9836631518127978
MSE: 0.01633333333333333
G-Mean 0.9836625866406086
Ka

CatBoost

In [13]:
model = CatBoostClassifier ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

Learning rate set to 0.023648
0:	learn: 0.6559336	total: 144ms	remaining: 2m 23s
1:	learn: 0.6240001	total: 147ms	remaining: 1m 13s
2:	learn: 0.5935116	total: 151ms	remaining: 50.1s
3:	learn: 0.5622281	total: 154ms	remaining: 38.4s
4:	learn: 0.5347079	total: 158ms	remaining: 31.5s
5:	learn: 0.5098712	total: 162ms	remaining: 26.8s
6:	learn: 0.4865857	total: 165ms	remaining: 23.5s
7:	learn: 0.4642373	total: 169ms	remaining: 21s
8:	learn: 0.4443636	total: 173ms	remaining: 19.1s
9:	learn: 0.4241760	total: 177ms	remaining: 17.5s
10:	learn: 0.4056020	total: 181ms	remaining: 16.3s
11:	learn: 0.3872756	total: 184ms	remaining: 15.2s
12:	learn: 0.3706692	total: 188ms	remaining: 14.3s
13:	learn: 0.3549956	total: 193ms	remaining: 13.6s
14:	learn: 0.3412487	total: 196ms	remaining: 12.9s
15:	learn: 0.3291888	total: 200ms	remaining: 12.3s
16:	learn: 0.3159305	total: 205ms	remaining: 11.8s
17:	learn: 0.3052500	total: 210ms	remaining: 11.4s
18:	learn: 0.2944159	total: 214ms	remaining: 11s
19:	learn: 0.

Gradient Boost

In [14]:
model = GradientBoostingClassifier ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.97      0.97      1495
           1       0.97      0.98      0.97      1505

    accuracy                           0.97      3000
   macro avg       0.97      0.97      0.97      3000
weighted avg       0.97      0.97      0.97      3000

Accuracy: 0.974
Precision: 0.974
Recall: 0.974
F1-Score: 0.974
AUC Score: 0.973986377626418
MSE: 0.026
G-Mean 0.9739778039494796
Kappa: 0.9479980354813404
MCC: 0.948028375167077
Confusion Matrix:
 [[1450   45]
 [  33 1472]]


SVM

In [15]:
model = SVC ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.78      0.84      1495
           1       0.81      0.93      0.87      1505

    accuracy                           0.85      3000
   macro avg       0.86      0.85      0.85      3000
weighted avg       0.86      0.85      0.85      3000

Accuracy: 0.855
Precision: 0.855
Recall: 0.855
F1-Score: 0.855
AUC Score: 0.8547506083400928
MSE: 0.145
G-Mean 0.8514698728991756
Kappa: 0.7098529921827059
MCC: 0.717838200608786
Confusion Matrix:
 [[1166  329]
 [ 106 1399]]


KNN

In [16]:
X_train = np.array(X_train)
X_test = np.array(X_test)
model = KNeighborsClassifier ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.82      0.86      1495
           1       0.83      0.91      0.87      1505

    accuracy                           0.87      3000
   macro avg       0.87      0.87      0.86      3000
weighted avg       0.87      0.87      0.86      3000

Accuracy: 0.8653333333333333
Precision: 0.8653333333333333
Recall: 0.8653333333333333
F1-Score: 0.8653333333333333
AUC Score: 0.8651696129956999
MSE: 0.13466666666666666
G-Mean 0.8637743153423806
Kappa: 0.7305762600339225
MCC: 0.7340640605683448
Confusion Matrix:
 [[1220  275]
 [ 129 1376]]


Deep Learning

In [17]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense




ANN

In [18]:
model = Sequential()
model.add(Dense(128, activation='relu', input_dim=X_train.shape[1]))
model.add(Dense(64, activation='relu'))
model.add(Dense(1, activation='sigmoid'))  # Adjust for multiclass

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

y_pred = model.predict (X_test)
y_pred_binary = (y_pred > 0.5).astype(int)
report (y_test, y_pred_binary)



Epoch 1/10


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.93      0.95      1495
           1       0.93      0.97      0.95      1505

    accuracy                           0.95      3000
   macro avg       0.95      0.95      0.95      3000
weighted avg       0.95      0.95      0.95      3000

Accuracy: 0.9496666666666667
Precision: 0.9496666666666667
Recall: 0.9496666666666667
F1-Score: 0.9496666666666667
AUC Score: 0.9496049956110623
MSE: 0.050333333333333334
G-Mean 0.9494247463440924
Kappa: 0.8993199093212428
MCC: 0.8999252239830551
Confusion Matrix:
 [[1392  103]
 [  48 1457]]


  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


CNN

In [19]:
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten

In [20]:
model = Sequential()
model.add(Conv1D(64, kernel_size=3, activation='relu', input_shape=(X_train.shape[1], 1)))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dense(1, activation='sigmoid'))  # Adjust for multiclass

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

y_pred = model.predict (X_test)
y_pred_binary = (y_pred > 0.5).astype(int)
report (y_test, y_pred_binary)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.95      0.96      1495
           1       0.95      0.96      0.96      1505

    accuracy                           0.96      3000
   macro avg       0.96      0.96      0.96      3000
weighted avg       0.96      0.96      0.96      3000

Accuracy: 0.9556666666666667
Precision: 0.9556666666666667
Recall: 0.9556666666666667
F1-Score: 0.9556666666666667
AUC Score: 0.9556561739574885
MSE: 0.044333333333333336
G-Mean 0.955650989691286
Kappa: 0.9113305747289916
MCC: 0.9113469801430497
Confusion Matrix:
 [[1424   71]
 [  62 1443]]


  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


RNN

In [21]:
from tensorflow.keras.layers import SimpleRNN

In [22]:
model = Sequential()
model.add(SimpleRNN(64, activation='relu', input_shape=(X_train.shape[1], 1)))
model.add(Dense(128, activation='relu'))
model.add(Dense(1, activation='sigmoid'))  # Adjust for multiclass

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

y_pred = model.predict (X_test)
y_pred_binary = (y_pred > 0.5).astype(int)
report (y_test, y_pred_binary)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.92      0.93      1495
           1       0.92      0.94      0.93      1505

    accuracy                           0.93      3000
   macro avg       0.93      0.93      0.93      3000
weighted avg       0.93      0.93      0.93      3000

Accuracy: 0.927
Precision: 0.927
Recall: 0.927
F1-Score: 0.927
AUC Score: 0.9269602995588839
MSE: 0.073
G-Mean 0.9268837822003703
Kappa: 0.8539870210685394
MCC: 0.8542196316512842
Confusion Matrix:
 [[1368  127]
 [  92 1413]]


  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


LSTM

In [23]:
from tensorflow.keras.layers import LSTM

In [24]:
model = Sequential()
model.add(LSTM(64, activation='relu', input_shape=(X_train.shape[1], 1)))
model.add(Dense(128, activation='relu'))
model.add(Dense(1, activation='sigmoid'))  # Adjust for multiclass

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

y_pred = model.predict (X_test)
y_pred_binary = (y_pred > 0.5).astype(int)
report (y_test, y_pred_binary)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.93      0.92      1495
           1       0.93      0.90      0.91      1505

    accuracy                           0.92      3000
   macro avg       0.92      0.92      0.92      3000
weighted avg       0.92      0.92      0.92      3000

Accuracy: 0.9156666666666666
Precision: 0.9156666666666666
Recall: 0.9156666666666666
F1-Score: 0.9156666666666666
AUC Score: 0.9157068411871244
MSE: 0.08433333333333333
G-Mean 0.91562752236291
Kappa: 0.8313453265545561
MCC: 0.8315983197497836
Confusion Matrix:
 [[1387  108]
 [ 145 1360]]


  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
