In [1]:
import warnings
warnings.filterwarnings("ignore")  # "error", "ignore", "always", "default", "module" or "once"

In [2]:
import numpy as np
import pandas as pd
import random
import os
import copy
#from imutils import paths
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score as acc
from sklearn.metrics import precision_score as precision
from sklearn.metrics import recall_score as recall
from sklearn.metrics import f1_score as f1

from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from lightgbm import LGBMClassifier
from xgboost.sklearn import XGBClassifier

import autokeras as ak
import keras
from keras.models import Sequential
from keras.layers import Dense
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Dense, Input, ReLU
from tensorflow.keras.models import Model

# Original Data

In [3]:
#declear path to your data
drebin_data_path = 'data/drebin.csv'
columns = list(pd.read_csv('data/dataset-features-categories.csv', header = None).iloc[:,0])
# Importing the dataset
Drebin_data = pd.read_csv(drebin_data_path, names = columns)

X = Drebin_data.iloc[:,range(0,Drebin_data.shape[1]-1)].values
y = Drebin_data.iloc[:, -1].values

lb = LabelBinarizer()
y = lb.fit_transform(y)

# Splitting the dataset into the Training set and Test set
X_train, X_test, y_train, y_test = train_test_split(X, y.astype(int), test_size = 0.3, random_state = 0)

# Feature Scaling
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Train-Test

In [4]:
classifiers = {"SVM": SVC(kernel = 'linear', degree=3), "MLP": MLPClassifier(random_state=1, max_iter=300), "XGB": XGBClassifier(), "LGBM": LGBMClassifier()} 

for classifier_pair in classifiers.items():
    print("---------------------------")
    print(classifier_pair[0])
    classifier = classifier_pair[1]
    classifier.fit(X_train, y_train)
    # Predicting the Test set results
    y_pred = classifier.predict(X_test)
    #compute accuracy_score
    accuracy = acc(y_test, y_pred)
    print('accuracy', accuracy)
    
print("---------------------------")

---------------------------
SVM
accuracy 0.9796054090002216
---------------------------
MLP
accuracy 0.9895810241631567
---------------------------
XGB
accuracy 0.9904677455109732
---------------------------
LGBM
accuracy 0.9904677455109732
---------------------------


# AutoKeras Model

In [5]:
saved_model=keras.models.load_model('structured_data_classifier/best_model', compile=True)
print(saved_model.summary())

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 215)]             0         
                                                                 
 multi_category_encoding (Mu  (None, 215)              0         
 ltiCategoryEncoding)                                            
                                                                 
 normalization (Normalizatio  (None, 215)              431       
 n)                                                              
                                                                 
 dense (Dense)               (None, 256)               55296     
                                                                 
 re_lu (ReLU)                (None, 256)               0         
                                                                 
 dense_1 (Dense)             (None, 32)                8224  

In [11]:
def autoKerasModel(path_best_model):
    saved_model = keras.models.load_model(path_best_model, compile=True)
    input_layer = Input(shape=(215,))
    x = saved_model.layers[1](input_layer)
    x = saved_model.layers[2](x)
    x = Dense(units=256)(x)
    x = ReLU()(x)
    x = Dense(units=32)(x)
    x = ReLU()(x)
    x = Dense(units=1)(x)
    x = saved_model.layers[-1](x)
    new_model = Model(inputs=input_layer, outputs=x)
    return new_model

In [7]:
autokeras_model=autoKerasModel('structured_data_classifier/best_model')
autokeras_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

autokeras_model.fit(X_train, y_train, epochs=15)
results = autokeras_model.evaluate(X_test, y_test)

print("test loss, test acc:", results)
print(autokeras_model.summary())

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
test loss, test acc: [0.07863331586122513, 0.9909111261367798]
Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 215)]             0         
                                                                 
 multi_category_encoding (Mu  (None, 215)              0         
 ltiCategoryEncoding)                                            
                                                                 
 normalization (Normalizatio  (None, 215)              431       
 n)                                                              
                                                                 
 dense (Dense)               (None, 256)               55296     
                         

# Ensemble Learning - Majority Voting

In [14]:
def majority_voting(y_preds, y_test):
    assert y_preds.shape[0] == len(y_test), "y_preds's length is: {} while y_test's length is: {}. They should be equal.".format(y_preds.shape[0],len(y_test))
    y_pred_vote = []
    for preds in y_preds:
        if sum(preds) >= 3:
            y_pred_vote.append(1)
        else:
            y_pred_vote.append(0)
    #compute accuracy_score
    accuracy = acc(y_test, y_pred_vote)
    return accuracy

In [9]:
y_preds = np.ndarray(shape=(5,len(y_test)))
i=0
for classifier_pair in classifiers.items():
    classifier = classifier_pair[1]
    y_preds[i] = classifier.predict(X_test)
    i += 1
y_preds[i] = np.transpose(autokeras_model.predict(X_test, verbose=0))
y_preds = np.transpose(y_preds)
print('accuracy', majority_voting(y_preds, y_test))

accuracy 0.9906894258479273


# Label Flipping

In [6]:
def attack_label_flipping(X_train, X_test, y_train, y_test, per, classifier, ak=False):
    flip_count = int(per*(len(y_train)))
    flipped_data = copy.deepcopy(y_train)
    indices = random.sample(range(len(flipped_data)), flip_count)
    for j in indices:
        flipped_data[j] = (flipped_data[j] + 1)%2
    if ak:
        classifier.fit(X_train, flipped_data, verbose=0, epochs=15)
        y_pred = np.transpose(classifier.predict(X_test, verbose=0))
    else:
        classifier.fit(X_train, flipped_data)
        y_pred = classifier.predict(X_test)
    return y_pred

In [7]:
percentages = [0.05, 0.1, 0.2, 0.4]

accuracies = []
print("---------------------------")
for i in range(5):
    print("Trial #{} is starting...".format(i+1))
    poisoned_classifiers = [SVC(kernel = 'linear', degree=3), MLPClassifier(random_state=1, max_iter=300), XGBClassifier(), LGBMClassifier()] 
    poisoned_autokeras_model=autoKerasModel('structured_data_classifier/best_model')
    poisoned_autokeras_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    poisoned_classifiers.append(poisoned_autokeras_model)
    y_preds = np.ndarray(shape=(len(poisoned_classifiers),len(y_test)))
    ak = False
    for j in range(len(poisoned_classifiers)):
        if j == len(poisoned_classifiers) - 1:
            ak = True
        y_preds[j] = attack_label_flipping(X_train, X_test, y_train, y_test, percentages[2], poisoned_classifiers[j], ak)
    y_preds = np.transpose(y_preds)
    accuracy = majority_voting(y_preds, y_test)
    accuracies.append(accuracy)
    print("Trial #{} is completed with accuracy: {}".format(i+1, accuracy))
    print("---------------------------")
print("Average accuracy is", sum(accuracies)/len(accuracies))

---------------------------
Trial #1 is starting...
Trial #1 is completed with accuracy: 0.9731766792285524
---------------------------
Trial #2 is starting...
Trial #2 is completed with accuracy: 0.9731766792285524
---------------------------
Trial #3 is starting...
Trial #3 is completed with accuracy: 0.9767235646198182
---------------------------
Trial #4 is starting...
Trial #4 is completed with accuracy: 0.9740634005763689
---------------------------
Trial #5 is starting...
Trial #5 is completed with accuracy: 0.9720682775437819
---------------------------
Average accuracy is 0.9738417202394147


# SECML Poisoning

In [17]:
from secml.ml.classifiers.sklearn.c_classifier_logistic import CClassifierLogistic
from secml.adv.attacks.poisoning.c_attack_poisoning_logistic_regression import CAttackPoisoningLogisticRegression
from secml.data.c_dataset import CDataset
from secml.ml.peval.metrics import CMetricAccuracy

lob, ub = 0., 1.  # Bounds of the attack space. Can be set to `None` for unbounded

# Should be chosen depending on the optimization problem
solver_params = {
    'eta': 0.25,
    'eta_min': 2.0,
    'eta_max': None,
    'max_iter': 100,
    'eps': 1e-6
}

dataset = CDataset(X_train, y_train)
metric = CMetricAccuracy()

# train SVM in the dual space, on a linear kernel, as needed for poisoning
c_classifier_LR = CClassifierLogistic()
print("Training of c_classifier_LR...")
c_classifier_LR.fit(X_train, y_train)
print("Training of c_classifier_LR is done!")

Training of c_classifier_LR...
Training of c_classifier_LR is done!


# 10% Poisoning

In [18]:
poisoning_strength = 0.1 # Percentage of points to poison in training data
n_poisoning_points = int(X_train.shape[0] * poisoning_strength)  # Number of poisoning points to generate

pois_attack = CAttackPoisoningLogisticRegression(classifier=c_classifier_LR,
                                training_data=dataset,
                                val=dataset,
                                lb=lob, 
                                ub=ub,
                                solver_params=solver_params)
pois_attack.n_points = n_poisoning_points

# Run the poisoning attack
print("Attack started...")
pois_y_pred, _, pois_points_ds, _ = pois_attack.run(X_test, y_test)
print("Attack complete!")

Attack started...


KeyboardInterrupt: 

In [16]:
pois_X_train = copy.deepcopy(X_train)
pois_y_train = copy.deepcopy(y_train)

for i in range(n_poisoning_points):
    pois_X_train = np.append(pois_X_train, pois_points_ds.X[i,:].tolist(), axis=0)
    pois_y_train = np.append(pois_y_train, pois_points_ds.Y[i].tolist())
pois_y_train = lb.fit_transform(pois_y_train)


#Train the classifiers with poisoned datasets
classifiers_secml = {"SVM": SVC(kernel = 'linear', degree=3), "MLP": MLPClassifier(random_state=1, max_iter=300), "XGB": XGBClassifier(), "LGBM": LGBMClassifier()} 
y_preds = np.ndarray(shape=(5,len(y_test)))
i=0

for classifier_pair in classifiers_secml.items():
    print(classifier_pair[0])
    classifier = classifier_pair[1]
    classifier.fit(pois_X_train, pois_y_train)
    # Predicting the Test set results
    y_pred = classifier.predict(X_test)
    y_preds[i] = y_pred
    #compute accuracy_score
    accuracy = acc(y_test, y_pred)
    print('accuracy', accuracy)
    i += 1
    print("---------------------------")

ak_model_secml=autoKerasModel('structured_data_classifier/best_model')
ak_model_secml.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

ak_model_secml.fit(pois_X_train, pois_y_train, epochs=15, verbose=0)

y_preds[i] = np.transpose(ak_model_secml.predict(X_test, verbose=0))
print("AutoKeras Model")
print('accuracy', ak_model_secml.evaluate(X_test, y_test, verbose=0)[1])
print("---------------------------")
y_preds = np.transpose(y_preds)
print('Ensemble accuracy', majority_voting(y_preds, y_test))

SVM
accuracy 0.9793837286632676
---------------------------
MLP
accuracy 0.9891376634892485
---------------------------
XGB
accuracy 0.9900243848370649
---------------------------
LGBM
accuracy 0.9909111061848814
---------------------------
AutoKeras Model
accuracy 0.9858124852180481
---------------------------
Ensemble accuracy 0.9909111061848814


# 20% Poisoning

In [None]:
poisoning_strength = 0.2 # Percentage of points to poison in training data
n_poisoning_points = int(X_train.shape[0] * poisoning_strength)  # Number of poisoning points to generate

pois_attack = CAttackPoisoningLogisticRegression(classifier=c_classifier_LR,
                                training_data=dataset,
                                val=dataset,
                                lb=lb, 
                                ub=ub,
                                solver_params=solver_params)
pois_attack.n_points = n_poisoning_points

# Run the poisoning attack
print("Attack started...")
pois_y_pred, _, pois_points_ds, _ = pois_attack.run(X_test, y_test)
print("Attack complete!")

# Evaluate the accuracy of the original classifier
acc = metric.performance_score(y_true=y_test, y_pred=c_classifier_LR.predict(X_test))
# Evaluate the accuracy after the poisoning attack
pois_acc = metric.performance_score(y_true=y_test, y_pred=pois_y_pred)

print("Original accuracy on test set: {:.2%}".format(acc))
print("Accuracy after attack on test set: {:.2%}".format(pois_acc))

print("\n---------------------------")
print("END OF LR POISONING with 20%")
print("---------------------------\n\n")