In [1]:
import warnings
warnings.filterwarnings("ignore")  # "error", "ignore", "always", "default", "module" or "once"

In [13]:
import numpy as np
import random
import copy
#from imutils import paths
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score as acc

from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.neural_network import MLPClassifier
from lightgbm import LGBMClassifier
from xgboost.sklearn import XGBClassifier

import keras
import autokeras as ak
from keras.models import Sequential
from keras.layers import Dense
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Dense, Input, ReLU
from tensorflow.keras.models import Model

In [3]:
filename = 'bodmas.npz'
data = np.load('./' + filename)
X = data['X']  # all the feature vectors
y = data['y']  # labels, 0 as benign, 1 as malicious

print(X.shape, y.shape)

# Splitting the dataset into the Training set and Test set
X_train, X_test, y_train, y_test = train_test_split(X, y.astype(int), test_size = 0.3, random_state = 0)

# Feature Scaling
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

print(X_train.shape, X_test.shape)


(134435, 2381) (134435,)
(94104, 2381) (40331, 2381)


# Train-Test

In [5]:
classifiers = {"SGD": SGDClassifier(),"MLP": MLPClassifier(random_state=1, max_iter=300), "XGB": XGBClassifier(), "LGBM": LGBMClassifier()} 

print('Starting training...')

for classifier_pair in classifiers.items():
    print("---------------------------")
    print(classifier_pair[0])
    
    classifier = classifier_pair[1]
    classifier.fit(X_train, y_train)
    
    # Predicting the Test set results
    y_pred = classifier.predict(X_test)
    
    #compute accuracy_score
    accuracy = acc(y_test, y_pred)
    print('accuracy', accuracy)
    
print("---------------------------")

Starting training...
---------------------------
GaussianNB
Confusion Matrix [[10293 12751]
 [  516 16771]]
accuracy 0.671047085368575
precision 0.671047085368575
recall 0.9701509805055822
f1 0.7165715994787327
---------------------------
RFC
Confusion Matrix [[23006    38]
 [  181 17106]]
accuracy 0.994569933797823
precision 0.994569933797823
recall 0.9895297044021519
f1 0.9936394528186809
---------------------------
SVM
Confusion Matrix [[22581   463]
 [  327 16960]]
accuracy 0.9804120899556172
precision 0.9804120899556172
recall 0.9810840515994678
f1 0.9772399884759435
---------------------------
DT
Confusion Matrix [[22783   261]
 [  196 17091]]
accuracy 0.9886687659616672
precision 0.9886687659616672
recall 0.9886620003470816
f1 0.9868067784866769
---------------------------
SGD
Confusion Matrix [[22826   218]
 [  364 16923]]
accuracy 0.9855694131065433
precision 0.9855694131065433
recall 0.9789437149302944
f1 0.983095155106309
---------------------------
MLP
Confusion Matrix [[22

# AutoKeras Model

In [None]:
saved_model=keras.models.load_model('structured_data_classifier/best_model', compile=False)
print(saved_model.summary())

In [16]:
custom_objects = ak.CUSTOM_OBJECTS

saved_model_ak=keras.models.load_model('structured_data_classifier/best_model', custom_objects=custom_objects)
print(saved_model_ak.summary())

TypeError: weight_decay is not a valid argument, kwargs should be empty  for `optimizer_experimental.Optimizer`.

In [8]:
def autoKerasModel(path_best_model):
    saved_model = keras.models.load_model(path_best_model, compile=False)
    input_layer = Input(shape=(2381,))
    x = saved_model.layers[1](input_layer)
    x = saved_model.layers[2](x)
    x = Dense(units=32)(x)
    x = ReLU()(x)
    x = Dense(units=32)(x)
    x = ReLU()(x)
    x = Dense(units=1)(x)
    x = saved_model.layers[-1](x)
    new_model = Model(inputs=input_layer, outputs=x)
    return new_model

In [9]:
autokeras_model=autoKerasModel('structured_data_classifier/best_model')
autokeras_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

"""autokeras_model.fit(X_train, y_train, epochs=15)
results = autokeras_model.evaluate(X_test, y_test)

print("test loss, test acc:", results)"""
print(autokeras_model.summary())

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 2381)]            0         
                                                                 
 multi_category_encoding (Mu  (None, 2381)             0         
 ltiCategoryEncoding)                                            
                                                                 
 normalization (Normalizatio  (None, 2381)             4763      
 n)                                                              
                                                                 
 dense (Dense)               (None, 32)                76224     
                                                                 
 re_lu (ReLU)                (None, 32)                0         
                                                                 
 dense_1 (Dense)             (None, 32)                1056  

# Ensemble Learning - Majority Voting

In [None]:
def majority_voting(y_preds, y_test):
    assert y_preds.shape[0] == len(y_test), "y_preds's length is: {} while y_test's length is: {}. They should be equal.".format(y_preds.shape[0],len(y_test))
    y_pred_vote = []
    for preds in y_preds:
        if sum(preds) >= 3:
            y_pred_vote.append(1)
        else:
            y_pred_vote.append(0)
    #compute accuracy_score
    accuracy = acc(y_test, y_pred_vote)
    return accuracy

In [None]:
y_preds = np.ndarray(shape=(5,len(y_test)))
i=0
for classifier_pair in classifiers.items():
    classifier = classifier_pair[1]
    y_preds[i] = classifier.predict(X_test)
    i += 1
y_preds[i] = np.transpose(autokeras_model.predict(X_test, verbose=0))
y_preds = np.transpose(y_preds)
print('accuracy', majority_voting(y_preds, y_test))

# Label Flipping

In [None]:
def attack_label_flipping(X_train, X_test, y_train, y_test, per, classifier_name, classifier):
    flipped_data = random_label_flipping(y_train, per)
    accuracy = 0
    if classifier_name == "AutoKeras":
        classifier.fit(X_train, flipped_data, verbose=0, epochs=15)
        y_pred = np.transpose(classifier.predict(X_test, verbose=0))
        accuracy = classifier.evaluate(X_test, y_test, verbose=0)[1]
    else:
        classifier.fit(X_train, flipped_data)
        y_pred = classifier.predict(X_test)
        accuracy = acc(y_test, y_pred)
    return y_pred, accuracy

In [None]:
#Flipping Random
def random_label_flipping(y_train, per):
    flip_count = int(per*(len(y_train)))
    flipped_data = copy.deepcopy(y_train)
    indices = random.sample(range(len(flipped_data)), flip_count)
    for j in indices:
        flipped_data[j] = (flipped_data[j] + 1)%2
    return flipped_data

#Flipping Specific
def specific_label_flipping(y_train, per, target):
    flipped_data = copy.deepcopy(y_train)
    possible_indices = []
    for i in range(len(y_train)):
        if y_train[i] == target:
            possible_indices.append(i)
    flip_count = int(per*(len(possible_indices)))
    indices = random.sample(possible_indices, flip_count)
    for j in indices:
        flipped_data[j] = (flipped_data[j] + 1)%2
    return flipped_data

In [None]:
percentages = [0.01, 0.05, 0.1, 0.2]

print("---------------------------")
for per in percentages:    
    poisoned_accuracies = {"SGD": [], "MLP": [], "XGB": [], "LGBM": [], "AutoKeras": []}
    ensemble_accuracies = []
    print( per, "poisoning starting...")
    for i in range(5):
        print("Trial #{} is starting...".format(i+1))
        
        poisoned_classifiers = {"SGD": SGDClassifier(), "MLP": MLPClassifier(random_state=1, max_iter=300), "XGB": XGBClassifier(), "LGBM": LGBMClassifier()} 
        poisoned_autokeras_model=autoKerasModel('structured_data_classifier/best_model')
        poisoned_autokeras_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
        poisoned_classifiers["AutoKeras"] = poisoned_autokeras_model
        
        y_preds = np.ndarray(shape=(len(poisoned_classifiers),len(y_test)))
        j=0
        for classifier_pair in poisoned_classifiers.items():
            poisoned_y_pred, poisoned_accuracy = attack_label_flipping(X_train, X_test, y_train, y_test, per, classifier_pair[0], classifier_pair[1])
            y_preds[j] = poisoned_y_pred
            j+=1
            poisoned_accuracies[classifier_pair[0]].append(poisoned_accuracy)
        y_preds = np.transpose(y_preds)
        
        ensemble_accuracy = majority_voting(y_preds, y_test)
        ensemble_accuracies.append(ensemble_accuracy)
        
        print("Trial #{} is completed with accuracy: {}".format(i+1, ensemble_accuracy))
        print("---------------------------")

    print("Random Poisoning - {}".format(per))
    for classifier_pair in poisoned_accuracies.items():
        accuracies = classifier_pair[1]
        print("{}'s average accuracy: {}".format(classifier_pair[0], sum(accuracies)/len(accuracies)))
    print("Average ensemble accuracy:", sum(ensemble_accuracies)/len(ensemble_accuracies))