In [2]:
import warnings
warnings.filterwarnings("ignore")  # "error", "ignore", "always", "default", "module" or "once"

In [3]:
import numpy as np
import pandas as pd
import random
import os
#from imutils import paths
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score as acc
from sklearn.metrics import precision_score as precision
from sklearn.metrics import recall_score as recall
from sklearn.metrics import f1_score as f1

from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.neural_network import MLPClassifier
from lightgbm import LGBMClassifier
from xgboost.sklearn import XGBClassifier

import autokeras as ak

2023-04-12 17:25:37.112476: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
filename = 'bodmas.npz'
data = np.load('./' + filename)
X = data['X']  # all the feature vectors
y = data['y']  # labels, 0 as benign, 1 as malicious

print(X.shape, y.shape)

# Splitting the dataset into the Training set and Test set
X_train, X_test, y_train, y_test = train_test_split(X, y.astype(int), test_size = 0.3, random_state = 0)

# Feature Scaling
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

print(X_train.shape, X_test.shape)


(134435, 2381) (134435,)
(94104, 2381) (40331, 2381)


# Train-Test

In [5]:
from sklearn.model_selection import RepeatedStratifiedKFold
cv_method = RepeatedStratifiedKFold(n_splits=5,  n_repeats=3, random_state=999)
from sklearn.preprocessing import PowerTransformer

params_NB = {'var_smoothing': np.logspace(-10,10, num=2000)}

#gs_NB = GridSearchCV(estimator=GaussianNB(), param_grid=params_NB, cv=cv_method,verbose=1,scoring='accuracy')

classifiers = {"GaussianNB": GaussianNB(), "RFC": RandomForestClassifier(), "SVM": SVC(kernel = 'poly', degree=3), 
               "DT": DecisionTreeClassifier(), "SGD": SGDClassifier(), 
               "MLP": MLPClassifier(random_state=1, max_iter=300), "XGB": XGBClassifier(), 
               "LGBM": LGBMClassifier()} 

print('Starting training...')

for classifier_pair in classifiers.items():
    print("---------------------------")
    print(classifier_pair[0])
    
    classifier = classifier_pair[1]
    classifier.fit(X_train, y_train)
    
    # Predicting the Test set results
    y_pred = classifier.predict(X_test)

    # Making the Confusion Matrix
    cm = confusion_matrix(y_test, y_pred)
    print('Confusion Matrix', cm)

    #compute accuracy_score
    accuracy = acc(y_test, y_pred)
    print('accuracy', accuracy)

    #compute precision score
    precision_score = precision(y_test, y_pred, average='micro')
    print('precision', precision_score)

    #compute recall score
    recall_score = recall(y_test, y_pred)
    print('recall', recall_score)

    #compute f1 score
    f1_score = f1(y_test, y_pred)
    print('f1', f1_score)
    
print("---------------------------")

Starting training...
---------------------------
GaussianNB
Confusion Matrix [[10293 12751]
 [  516 16771]]
accuracy 0.671047085368575
precision 0.671047085368575
recall 0.9701509805055822
f1 0.7165715994787327
---------------------------
RFC
Confusion Matrix [[23006    38]
 [  181 17106]]
accuracy 0.994569933797823
precision 0.994569933797823
recall 0.9895297044021519
f1 0.9936394528186809
---------------------------
SVM
Confusion Matrix [[22581   463]
 [  327 16960]]
accuracy 0.9804120899556172
precision 0.9804120899556172
recall 0.9810840515994678
f1 0.9772399884759435
---------------------------
DT
Confusion Matrix [[22783   261]
 [  196 17091]]
accuracy 0.9886687659616672
precision 0.9886687659616672
recall 0.9886620003470816
f1 0.9868067784866769
---------------------------
SGD
Confusion Matrix [[22826   218]
 [  364 16923]]
accuracy 0.9855694131065433
precision 0.9855694131065433
recall 0.9789437149302944
f1 0.983095155106309
---------------------------
MLP
Confusion Matrix [[22

In [5]:
# Initialize the structured data classifier.
clf = ak.StructuredDataClassifier(
    overwrite=True, max_trials=10
)  # It tries 3 different models.
# Feed the structured data classifier with training data.
clf.fit(
    x=X_train,
    y=y_train,
    epochs=15,
)

# Evaluate the best model with testing data.
print(clf.evaluate(x=X_test, y=y_test))

2023-04-12 17:26:20.109640: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
model = clf.export_model()
model.summary()