Importing libraries

In [31]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import math
from sklearn.metrics import confusion_matrix, roc_auc_score, mean_squared_error, cohen_kappa_score, matthews_corrcoef
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.preprocessing import label_binarize
from imblearn.metrics import geometric_mean_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier



Reading dataset

In [32]:
df = pd.read_csv('Mendely_48.csv')
print (df.CLASS_LABEL.value_counts())
df.head()

CLASS_LABEL
1    5000
0    5000
Name: count, dtype: int64


Unnamed: 0,NumDots,SubdomainLevel,PathLevel,UrlLength,NumDash,NumDashInHostname,AtSymbol,TildeSymbol,NumUnderscore,NumPercent,...,IframeOrFrame,MissingTitle,ImagesOnlyInForm,SubdomainLevelRT,UrlLengthRT,PctExtResourceUrlsRT,AbnormalExtFormActionR,ExtMetaScriptLinkRT,PctExtNullSelfRedirectHyperlinksRT,CLASS_LABEL
0,3,1,5,72,0,0,0,0,0,0,...,0,0,1,1,0,1,1,-1,1,1
1,3,1,3,144,0,0,0,0,2,0,...,0,0,0,1,-1,1,1,1,1,1
2,3,1,2,58,0,0,0,0,0,0,...,0,0,0,1,0,-1,1,-1,0,1
3,3,1,6,79,1,0,0,0,0,0,...,0,0,0,1,-1,1,1,1,-1,1
4,3,0,4,46,0,0,0,0,0,0,...,1,0,0,1,1,-1,0,-1,-1,1


Data pre-processing

In [33]:
Label = LabelEncoder()
df['CLASS_LABEL'] = Label.fit_transform(df['CLASS_LABEL'])
print (df.CLASS_LABEL.value_counts())
df.head()

CLASS_LABEL
1    5000
0    5000
Name: count, dtype: int64


Unnamed: 0,NumDots,SubdomainLevel,PathLevel,UrlLength,NumDash,NumDashInHostname,AtSymbol,TildeSymbol,NumUnderscore,NumPercent,...,IframeOrFrame,MissingTitle,ImagesOnlyInForm,SubdomainLevelRT,UrlLengthRT,PctExtResourceUrlsRT,AbnormalExtFormActionR,ExtMetaScriptLinkRT,PctExtNullSelfRedirectHyperlinksRT,CLASS_LABEL
0,3,1,5,72,0,0,0,0,0,0,...,0,0,1,1,0,1,1,-1,1,1
1,3,1,3,144,0,0,0,0,2,0,...,0,0,0,1,-1,1,1,1,1,1
2,3,1,2,58,0,0,0,0,0,0,...,0,0,0,1,0,-1,1,-1,0,1
3,3,1,6,79,1,0,0,0,0,0,...,0,0,0,1,-1,1,1,1,-1,1
4,3,0,4,46,0,0,0,0,0,0,...,1,0,0,1,1,-1,0,-1,-1,1


Data Cleaning

In [34]:
# df = df.replace([np.inf, -np.inf], np.nan)
# df.fillna(0, inplace=True)
# print(df.max())
# print(df.min())
# df.head()

In [35]:
input = df.drop (['CLASS_LABEL'], axis='columns')
target = df.CLASS_LABEL
print(target.head())
input.head()

0    1
1    1
2    1
3    1
4    1
Name: CLASS_LABEL, dtype: int64


Unnamed: 0,NumDots,SubdomainLevel,PathLevel,UrlLength,NumDash,NumDashInHostname,AtSymbol,TildeSymbol,NumUnderscore,NumPercent,...,SubmitInfoToEmail,IframeOrFrame,MissingTitle,ImagesOnlyInForm,SubdomainLevelRT,UrlLengthRT,PctExtResourceUrlsRT,AbnormalExtFormActionR,ExtMetaScriptLinkRT,PctExtNullSelfRedirectHyperlinksRT
0,3,1,5,72,0,0,0,0,0,0,...,0,0,0,1,1,0,1,1,-1,1
1,3,1,3,144,0,0,0,0,2,0,...,0,0,0,0,1,-1,1,1,1,1
2,3,1,2,58,0,0,0,0,0,0,...,0,0,0,0,1,0,-1,1,-1,0
3,3,1,6,79,1,0,0,0,0,0,...,1,0,0,0,1,-1,1,1,1,-1
4,3,0,4,46,0,0,0,0,0,0,...,0,1,0,0,1,1,-1,0,-1,-1


Data splitting

In [36]:
X_train, X_test, y_train, y_test = train_test_split(input, target, test_size=0.3)
print (len (X_test))
print (len (X_train))

3000
7000


In [37]:
def report (y_test, y_pred):
    n_classes = 2
    y_true_binary = label_binarize(y_test, classes=range(n_classes))
    y_pred_binary = label_binarize(y_pred, classes=range(n_classes))
    auc_score = roc_auc_score(y_true_binary, y_pred_binary, average='macro')

    report = classification_report(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    
    mse = mean_squared_error(y_test, y_pred)
    kappa = cohen_kappa_score(y_test, y_pred)
    mcc = matthews_corrcoef(y_test, y_pred)
    g_mean = geometric_mean_score(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='micro')
    recall = recall_score(y_test, y_pred, average='micro')
    f1 = f1_score(y_test, y_pred, average='micro')

    print("Classification Report:\n", report)
    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1-Score:", f1)
    print("AUC Score:", auc_score)
    print("MSE:", mse)
    print("G-Mean", g_mean)
    print("Kappa:", kappa)
    print("MCC:", mcc)
    print("Confusion Matrix:\n", conf_matrix)


Logistic Regression

In [38]:
model = LogisticRegression ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.94      0.94      1489
           1       0.94      0.94      0.94      1511

    accuracy                           0.94      3000
   macro avg       0.94      0.94      0.94      3000
weighted avg       0.94      0.94      0.94      3000

Accuracy: 0.9373333333333334
Precision: 0.9373333333333334
Recall: 0.9373333333333334
F1-Score: 0.9373333333333334
AUC Score: 0.9373397413816477
MSE: 0.06266666666666666
G-Mean 0.9373393340747443
Kappa: 0.874662377334691
MCC: 0.8746654874748216
Confusion Matrix:
 [[1397   92]
 [  96 1415]]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Decision Tree

In [39]:
model = DecisionTreeClassifier ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.96      0.96      1489
           1       0.96      0.97      0.96      1511

    accuracy                           0.96      3000
   macro avg       0.96      0.96      0.96      3000
weighted avg       0.96      0.96      0.96      3000

Accuracy: 0.964
Precision: 0.964
Recall: 0.964
F1-Score: 0.964
AUC Score: 0.963968728984981
MSE: 0.036
G-Mean 0.9639592972782717
Kappa: 0.9279919030895474
MCC: 0.928021606935928
Confusion Matrix:
 [[1429   60]
 [  48 1463]]


Naive Bayes

In [40]:
model = GaussianNB ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.96      0.87      1489
           1       0.95      0.76      0.84      1511

    accuracy                           0.86      3000
   macro avg       0.87      0.86      0.86      3000
weighted avg       0.87      0.86      0.86      3000

Accuracy: 0.859
Precision: 0.859
Recall: 0.859
F1-Score: 0.859
AUC Score: 0.859728234273932
MSE: 0.141
G-Mean 0.8539737809707818
Kappa: 0.7183992472893986
MCC: 0.7332716627536157
Confusion Matrix:
 [[1428   61]
 [ 362 1149]]


Random Forest

In [41]:
model = RandomForestClassifier ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.98      0.98      1489
           1       0.98      0.98      0.98      1511

    accuracy                           0.98      3000
   macro avg       0.98      0.98      0.98      3000
weighted avg       0.98      0.98      0.98      3000

Accuracy: 0.9793333333333333
Precision: 0.9793333333333333
Recall: 0.9793333333333333
F1-Score: 0.9793333333333333
AUC Score: 0.9793224435625204
MSE: 0.020666666666666667
G-Mean 0.9793213177160425
Kappa: 0.9586636353332578
MCC: 0.9586670444332371
Confusion Matrix:
 [[1456   33]
 [  29 1482]]


LightBoost

In [42]:
model = LGBMClassifier ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

[LightGBM] [Info] Number of positive: 3489, number of negative: 3511
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1495
[LightGBM] [Info] Number of data points in the train set: 7000, number of used features: 44
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.498429 -> initscore=-0.006286
[LightGBM] [Info] Start training from score -0.006286
Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.98      0.98      1489
           1       0.98      0.98      0.98      1511

    accuracy                           0.98      3000
   macro avg       0.98      0.98      0.98      3000
weighted avg       0.98      0.98      0.98      3000

Accuracy: 0.983
Precision: 0.983
Recall: 0.983
F1-Score: 0.983
AUC Score: 0.9829868628490688
MSE: 0.017
G-Mean 0.9829852304656376
Kappa: 0.9659973402363918
MCC: 0.9660027077726998
Confusion Matrix:


CatBoost

In [43]:
model = CatBoostClassifier ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

Learning rate set to 0.023648


0:	learn: 0.6546390	total: 139ms	remaining: 2m 18s
1:	learn: 0.6191639	total: 142ms	remaining: 1m 11s
2:	learn: 0.5858112	total: 145ms	remaining: 48.3s
3:	learn: 0.5569760	total: 148ms	remaining: 36.9s
4:	learn: 0.5288879	total: 153ms	remaining: 30.4s
5:	learn: 0.5022396	total: 156ms	remaining: 25.9s
6:	learn: 0.4787766	total: 159ms	remaining: 22.6s
7:	learn: 0.4592533	total: 163ms	remaining: 20.2s
8:	learn: 0.4406067	total: 167ms	remaining: 18.4s
9:	learn: 0.4196678	total: 170ms	remaining: 16.9s
10:	learn: 0.4013790	total: 173ms	remaining: 15.6s
11:	learn: 0.3833537	total: 176ms	remaining: 14.5s
12:	learn: 0.3676084	total: 179ms	remaining: 13.6s
13:	learn: 0.3514670	total: 183ms	remaining: 12.9s
14:	learn: 0.3384436	total: 186ms	remaining: 12.2s
15:	learn: 0.3255575	total: 189ms	remaining: 11.6s
16:	learn: 0.3143702	total: 193ms	remaining: 11.1s
17:	learn: 0.3033587	total: 196ms	remaining: 10.7s
18:	learn: 0.2938405	total: 199ms	remaining: 10.3s
19:	learn: 0.2837970	total: 203ms	remai

Gradient Boost

In [44]:
model = GradientBoostingClassifier ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.98      0.98      1489
           1       0.98      0.98      0.98      1511

    accuracy                           0.98      3000
   macro avg       0.98      0.98      0.98      3000
weighted avg       0.98      0.98      0.98      3000

Accuracy: 0.9783333333333334
Precision: 0.9783333333333334
Recall: 0.9783333333333334
F1-Score: 0.9783333333333334
AUC Score: 0.9783101669023091
MSE: 0.021666666666666667
G-Mean 0.9783050664348382
Kappa: 0.9566624292153011
MCC: 0.9566796529721063
Confusion Matrix:
 [[1452   37]
 [  28 1483]]


SVM

In [45]:
model = SVC ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.78      0.83      1489
           1       0.81      0.91      0.86      1511

    accuracy                           0.84      3000
   macro avg       0.85      0.84      0.84      3000
weighted avg       0.85      0.84      0.84      3000

Accuracy: 0.845
Precision: 0.845
Recall: 0.845
F1-Score: 0.845
AUC Score: 0.8445149716940333
MSE: 0.155
G-Mean 0.8419210225700106
Kappa: 0.689687480814011
MCC: 0.6956023251931582
Confusion Matrix:
 [[1159  330]
 [ 135 1376]]


KNN

In [46]:
X_train = np.array(X_train)
X_test = np.array(X_test)
model = KNeighborsClassifier ()
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.82      0.85      1489
           1       0.83      0.90      0.87      1511

    accuracy                           0.86      3000
   macro avg       0.86      0.86      0.86      3000
weighted avg       0.86      0.86      0.86      3000

Accuracy: 0.8596666666666667
Precision: 0.8596666666666667
Recall: 0.8596666666666667
F1-Score: 0.8596666666666667
AUC Score: 0.8593731040647075
MSE: 0.14033333333333334
G-Mean 0.8584402308257092
Kappa: 0.7191575883930922
MCC: 0.7213580648203508
Confusion Matrix:
 [[1220  269]
 [ 152 1359]]


XGBoost

In [None]:
model = XGBClassifier(
    learning_rate=0.1,
    n_estimators=100,
    max_depth=3,
    objective='multi:softprob',  
    num_class=5,  
    use_label_encoder=False
)
model.fit (X_train, y_train)
y_pred = model.predict (X_test)
report (y_test, y_pred)

: 