In [1]:
import warnings
warnings.filterwarnings("ignore")  # "error", "ignore", "always", "default", "module" or "once"

In [2]:
import numpy as np
import pandas as pd
import random
import os
import copy
#from imutils import paths
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score as acc
from sklearn.metrics import precision_score as precision
from sklearn.metrics import recall_score as recall
from sklearn.metrics import f1_score as f1

from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from lightgbm import LGBMClassifier
from xgboost.sklearn import XGBClassifier

import autokeras as ak
import keras
from keras.models import Sequential
from keras.layers import Dense
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Dense, Input, ReLU
from tensorflow.keras.models import Model

# Original Data

In [3]:
#declear path to your data
drebin_data_path = 'data/drebin.csv'
columns = list(pd.read_csv('data/dataset-features-categories.csv', header = None).iloc[:,0])
# Importing the dataset
Drebin_data = pd.read_csv(drebin_data_path, names = columns)

X = Drebin_data.iloc[:,range(0,Drebin_data.shape[1]-1)].values
y = Drebin_data.iloc[:, -1].values

lb = LabelBinarizer()
y = lb.fit_transform(y)

# Splitting the dataset into the Training set and Test set
X_train, X_test, y_train, y_test = train_test_split(X, y.astype(int), test_size = 0.3, random_state = 0)

# Feature Scaling
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Train-Test

In [4]:
classifiers = {"SVM": SVC(kernel = 'linear', degree=3), "MLP": MLPClassifier(random_state=1, max_iter=300), "XGB": XGBClassifier(), "LGBM": LGBMClassifier()} 

for classifier_pair in classifiers.items():
    print("---------------------------")
    print(classifier_pair[0])
    classifier = classifier_pair[1]
    classifier.fit(X_train, y_train)
    # Predicting the Test set results
    y_pred = classifier.predict(X_test)
    #compute accuracy_score
    accuracy = acc(y_test, y_pred)
    print('accuracy', accuracy)
    
print("---------------------------")

---------------------------
SVM
accuracy 0.9796054090002216
---------------------------
MLP
accuracy 0.9895810241631567
---------------------------
XGB
accuracy 0.9904677455109732
---------------------------
LGBM
accuracy 0.9904677455109732
---------------------------


# AutoKeras Model

In [5]:
saved_model=keras.models.load_model('structured_data_classifier/best_model', compile=True)
print(saved_model.summary())

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 215)]             0         
                                                                 
 multi_category_encoding (Mu  (None, 215)              0         
 ltiCategoryEncoding)                                            
                                                                 
 normalization (Normalizatio  (None, 215)              431       
 n)                                                              
                                                                 
 dense (Dense)               (None, 256)               55296     
                                                                 
 re_lu (ReLU)                (None, 256)               0         
                                                                 
 dense_1 (Dense)             (None, 32)                8224  

In [6]:
def autoKerasModel(path_best_model):
    saved_model = keras.models.load_model(path_best_model, compile=True)
    input_layer = Input(shape=(215,))
    x = saved_model.layers[1](input_layer)
    x = saved_model.layers[2](x)
    x = Dense(units=256)(x)
    x = ReLU()(x)
    x = Dense(units=32)(x)
    x = ReLU()(x)
    x = Dense(units=1)(x)
    x = saved_model.layers[-1](x)
    new_model = Model(inputs=input_layer, outputs=x)
    return new_model

In [7]:
autokeras_model=autoKerasModel('structured_data_classifier/best_model')
autokeras_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

autokeras_model.fit(X_train, y_train, epochs=15)
results = autokeras_model.evaluate(X_test, y_test)

print("test loss, test acc:", results)
print(autokeras_model.summary())

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
test loss, test acc: [0.060280557721853256, 0.990246057510376]
Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 215)]             0         
                                                                 
 multi_category_encoding (Mu  (None, 215)              0         
 ltiCategoryEncoding)                                            
                                                                 
 normalization (Normalizatio  (None, 215)              431       
 n)                                                              
                                                                 
 dense (Dense)               (None, 256)               55296     
                         

# Ensemble Learning - Majority Voting

In [8]:
def majority_voting(y_preds, y_test):
    assert y_preds.shape[0] == len(y_test), "y_preds's length is: {} while y_test's length is: {}. They should be equal.".format(y_preds.shape[0],len(y_test))
    y_pred_vote = []
    for preds in y_preds:
        if sum(preds) >= 3:
            y_pred_vote.append(1)
        else:
            y_pred_vote.append(0)
    #compute accuracy_score
    accuracy = acc(y_test, y_pred_vote)
    return accuracy

In [9]:
y_preds = np.ndarray(shape=(5,len(y_test)))
i=0
for classifier_pair in classifiers.items():
    classifier = classifier_pair[1]
    y_preds[i] = classifier.predict(X_test)
    i += 1
y_preds[i] = np.transpose(autokeras_model.predict(X_test))
y_preds = np.transpose(y_preds)
print('accuracy', majority_voting(y_preds, y_test))

accuracy 0.9909111061848814


# Label Flipping

In [13]:
def attack_label_flipping_with_repeat(X_train, X_test, y_train, y_test, per, repeat, classifier, ak=False):
    flip_count = int(per*(len(y_train)))
    y_preds = np.ndarray(shape=(repeat,len(y_test)))
    for i in range(repeat):
        flipped_data = copy.deepcopy(y_train)
        poisoned_classifier = copy.deepcopy(classifier)
        indices = random.sample(range(len(flipped_data)), flip_count)
        for j in indices:
            flipped_data[j] = (flipped_data[j] + 1)%2
        if ak:
            poisoned_classifier.fit(X_train, flipped_data, epochs=15)
            y_preds[i] = np.transpose(poisoned_classifier.predict(X_test))
        else:
            poisoned_classifier.fit(X_train, flipped_data)
            y_preds[i] = poisoned_classifier.predict(X_test)
    y_preds = np.transpose(y_preds)
    avg_preds = []
    for preds in y_preds:
        avg_pred = sum(preds)/len(preds)
        if avg_pred >= 0.5:
            avg_preds.append(1)
        else:
            avg_preds.append(0)
    return avg_preds

def attack_label_flipping(X_train, X_test, y_train, y_test, per, classifier, ak=False):
    flip_count = int(per*(len(y_train)))
    flipped_data = copy.deepcopy(y_train)
    indices = random.sample(range(len(flipped_data)), flip_count)
    for j in indices:
        flipped_data[j] = (flipped_data[j] + 1)%2
    if ak:
        classifier.fit(X_train, flipped_data, epochs=15)
        y_pred = np.transpose(classifier.predict(X_test))
    else:
        classifier.fit(X_train, flipped_data)
        y_pred = classifier.predict(X_test)
    return y_pred

In [15]:
poisoned_classifiers = [SVC(kernel = 'linear', degree=3), MLPClassifier(random_state=1, max_iter=300), XGBClassifier(), LGBMClassifier()] 
poisoned_autokeras_model=autoKerasModel('structured_data_classifier/best_model')
poisoned_autokeras_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
poisoned_classifiers.append(poisoned_autokeras_model)
percentages = [0.05, 0.1, 0.2, 0.4]

accuracies = []
for i in range(5):
    y_preds = np.ndarray(shape=(len(classifiers),len(y_test)))
    ak = False
    for j in range(len(classifiers)):
        if j == len(classifiers) - 1:
            ak = True
        y_preds[j] = attack_label_flipping(X_train, X_test, y_train, y_test, percentages[2], poisoned_classifiers[j], ak)
    y_preds = np.transpose(y_preds)
    accuracies.append(majority_voting(y_preds, y_test))
print(accuracies)
print("Average accuracy ", sum(accuracies)/len(accuracies))

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
[0.9769452449567724, 0.9756151629350477, 0.9758368432720018, 0.9765018842828641, 0.9769452449567724]
Average accuracy  0.9763688760806917
