In [2]:
import numpy as np
import pandas as pd
import random
import os
#from imutils import paths
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score as acc
from sklearn.metrics import precision_score as precision
from sklearn.metrics import recall_score as recall
from sklearn.metrics import f1_score as f1

from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.neural_network import MLPClassifier
from lightgbm import LGBMClassifier
from xgboost.sklearn import XGBClassifier

import autokeras as ak
import keras
from keras.models import Sequential
from keras.layers import Dense
import tensorflow as tf

In [3]:
import warnings
warnings.filterwarnings("ignore")  # "error", "ignore", "always", "default", "module" or "once"

# Binary Classification

In [4]:
#declear path to your data
krono_data_path1 = 'data/kronodroid.csv'
# Importing the dataset
Krono_data = pd.read_csv(krono_data_path1)
Krono_data = Krono_data.sample(frac = 1)
X = Krono_data.iloc[:,range(1,Krono_data.shape[1]-1)].values
y = Krono_data.iloc[:, -1].values

lb = LabelBinarizer()
y = lb.fit_transform(y)

# Splitting the dataset into the Training set and Test set
X_train, X_test, y_train, y_test = train_test_split(X, y.astype(int), test_size = 0.3, random_state = 0)

# Feature Scaling
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.fit_transform(X_test)

In [9]:
# "GaussianNB": GaussianNB(), "DT": DecisionTreeClassifier(), "MLP": MLPClassifier(random_state=1, max_iter=300),
# 47.9%, 65.8%, 79%,  
classifiers = {"RFC": RandomForestClassifier(), "SVM": SVC(kernel = 'poly', degree=3), "SGD": SGDClassifier(), "XGB": XGBClassifier(), "LGBM": LGBMClassifier()} 

for classifier_pair in classifiers.items():
    print("---------------------------")
    print(classifier_pair[0])
    
    classifier = classifier_pair[1]
    classifier.fit(X_train, y_train)
    
    # Predicting the Test set results
    y_pred = classifier.predict(X_test)
    print(y_pred)

    # Making the Confusion Matrix
    cm = confusion_matrix(y_test, y_pred)
    print('Confusion Matrix', cm)

    #compute accuracy_score
    accuracy = acc(y_test, y_pred)
    print('accuracy', accuracy)

    #compute precision score
    precision_score = precision(y_test, y_pred, average='micro')
    print('precision', precision_score)

    #compute recall score
    recall_score = recall(y_test, y_pred)
    print('recall', recall_score)

    #compute f1 score
    f1_score = f1(y_test, y_pred)
    print('f1', f1_score)
    
print("---------------------------")

---------------------------
RFC
accuracy 0.9583653271905127
---------------------------
SVM
accuracy 0.909180104086682
---------------------------
SGD
accuracy 0.9371640644996161
---------------------------
XGB
accuracy 0.9484685607030117
---------------------------
LGBM
accuracy 0.9561044279498336
---------------------------


# Ensemble Learning - Majority Voting

In [4]:
def majority_voting(y_preds, y_test):
    assert y_preds.shape[0] == len(y_test), "y_preds's length is: {} while y_test's length is: {}. They should be equal.".format(y_preds.shape[0],len(y_test))
    y_pred_vote = []
    for preds in y_preds:
        if sum(preds) >= 3:
            y_pred_vote.append(1)
        else:
            y_pred_vote.append(0)
    #compute accuracy_score
    accuracy = acc(y_test, y_pred_vote)
    return accuracy

In [5]:
y_preds = np.ndarray(shape=(5,len(y_test)))
i=0
for classifier_pair in classifiers.items():
    classifier = classifier_pair[1]
    y_preds[i] = classifier.predict(X_test)
    i += 1
y_preds = np.transpose(y_preds)
print('accuracy', majority_voting(y_preds, y_test))

NameError: name 'classifiers' is not defined

# Label Flipping

In [5]:
def attack_label_flipping(X_train, X_test, y_train, y_test, per, classifier):
    flipped_data = specific_label_flipping(y_train, per, 1)
    classifier.fit(X_train, flipped_data)
    y_pred = classifier.predict(X_test)
    return y_pred, acc(y_test, y_pred)

In [6]:
#Flipping Random
def random_label_flipping(y_train, per):
    flip_count = int(per*(len(y_train)))
    flipped_data = copy.deepcopy(y_train)
    indices = random.sample(range(len(flipped_data)), flip_count)
    for j in indices:
        flipped_data[j] = (flipped_data[j] + 1)%2
    return flipped_data

#Flipping Specific
def specific_label_flipping(y_train, per, target):
    flipped_data = copy.deepcopy(y_train)
    possible_indices = []
    for i in range(len(y_train)):
        if y_train[i] == target:
            possible_indices.append(i)
    flip_count = int(per*(len(possible_indices)))
    indices = random.sample(possible_indices, flip_count)
    for j in indices:
        flipped_data[j] = (flipped_data[j] + 1)%2
    return flipped_data

In [7]:
percentages = [0.01, 0.05, 0.1, 0.2]
per = percentages[3]
poisoned_accuracies = {"RFC": [], "SVM": [], "SGD": [], "XGB": [], "LGBM": []} 
ensemble_accuracies = []

print("---------------------------")
for i in range(5):
    print("Trial #{} is starting...".format(i+1))
    
    poisoned_classifiers = {"RFC": RandomForestClassifier(), "SVM": SVC(kernel = 'poly', degree=3), "SGD": SGDClassifier(), "XGB": XGBClassifier(), "LGBM": LGBMClassifier()} 
       
    y_preds = np.ndarray(shape=(len(poisoned_classifiers),len(y_test)))
    j=0
    for classifier_pair in poisoned_classifiers.items():
        poisoned_y_pred, poisoned_accuracy = attack_label_flipping(X_train, X_test, y_train, y_test, per, classifier_pair[1])
        y_preds[j] = poisoned_y_pred
        j+=1
        poisoned_accuracies[classifier_pair[0]].append(poisoned_accuracy)    
    y_preds = np.transpose(y_preds)
    
    ensemble_accuracy = majority_voting(y_preds, y_test)
    ensemble_accuracies.append(ensemble_accuracy)
    
    print("Trial #{} is completed with accuracy: {}".format(i+1, ensemble_accuracy))
    print("---------------------------")

print("Random Poisoning - {}".format(per))
for classifier_pair in poisoned_accuracies.items():
    accuracies = classifier_pair[1]
    print("{}'s average accuracy: {}".format(classifier_pair[0], sum(accuracies)/len(accuracies)))
print("Average ensemble accuracy:", sum(ensemble_accuracies)/len(ensemble_accuracies))

---------------------------
Trial #1 is starting...
Trial #1 is completed with accuracy: 0.9430935926968689
---------------------------
Trial #2 is starting...
Trial #2 is completed with accuracy: 0.9404061086937975
---------------------------
Trial #3 is starting...
Trial #3 is completed with accuracy: 0.9473167818445525
---------------------------
Trial #4 is starting...
Trial #4 is completed with accuracy: 0.9472314648920741
---------------------------
Trial #5 is starting...
Trial #5 is completed with accuracy: 0.9446719563177204
---------------------------
Random Poisoning - 0.2
RFC's average accuracy: 0.9245286238375566
SVM's average accuracy: 0.8422660182578279
SGD's average accuracy: 0.8896339902738675
XGB's average accuracy: 0.9248869550379659
LGBM's average accuracy: 0.9502687484003071
Average ensemble accuracy: 0.9445439808890027


In [10]:
# Initialize the structured data classifier.
clf = ak.StructuredDataClassifier(
    overwrite=True, max_trials=15
)  # It tries different models.
# Feed the structured data classifier with training data.
clf.fit(
    x=X_train,
    y=y_train,
    epochs=1,
)

# Evaluate the best model with testing data.
print(clf.evaluate(x=X_test, y=y_test))

Trial 15 Complete [00h 08m 28s]
val_accuracy: 0.8348749876022339

Best val_accuracy So Far: 0.9527429342269897
Total elapsed time: 02h 36m 28s
INFO:tensorflow:Oracle triggered exit
INFO:tensorflow:Assets written to: .\structured_data_classifier\best_model\assets
[321648.875, 0.528751790523529]


In [11]:
model = clf.export_model()
model.summary()


Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 462)]             0         
                                                                 
 multi_category_encoding (Mu  (None, 462)              0         
 ltiCategoryEncoding)                                            
                                                                 
 normalization (Normalizatio  (None, 462)              925       
 n)                                                              
                                                                 
 dense (Dense)               (None, 32)                14816     
                                                                 
 re_lu (ReLU)                (None, 32)                0         
                                                                 
 dense_1 (Dense)             (None, 512)               16896 

In [13]:
results = model.evaluate(X_test, y_test)
print("test loss, test acc:", results)

test loss, test acc: [321648.875, 0.528751790523529]


## Poisoning

In [9]:
from secml.ml.classifiers.sklearn.c_classifier_logistic import CClassifierLogistic
from secml.adv.attacks.poisoning.c_attack_poisoning_logistic_regression import CAttackPoisoningLogisticRegression
from secml.data.c_dataset import CDataset
from secml.ml.peval.metrics import CMetricAccuracy

lb, ub = 0., 1.  # Bounds of the attack space. Can be set to `None` for unbounded
n_poisoning_points = 15  # Number of poisoning points to generate

# Should be chosen depending on the optimization problem
solver_params = {
    'eta': 0.25,
    'eta_min': 2.0,
    'eta_max': None,
    'max_iter': 100,
    'eps': 1e-6
}

dataset = CDataset(X_train, y_train)
metric = CMetricAccuracy()

# train SVM in the dual space, on a linear kernel, as needed for poisoning
c_classifier_LR = CClassifierLogistic()



In [10]:
print("Training of c_classifier_LR...")
c_classifier_LR.fit(X_train, y_train)

# Predicting the Test set results
y_pred = c_classifier_LR.predict(X_test)
print(y_pred)


pois_attack = CAttackPoisoningLogisticRegression(classifier=c_classifier_LR,
                                training_data=dataset,
                                val=dataset)
pois_attack.n_points = n_poisoning_points

# Run the poisoning attack
print("Attack started...")
pois_y_pred, _, pois_points_ds, _ = pois_attack.run(X_test, y_test)
print("Attack complete!")

# Evaluate the accuracy of the original classifier
acc = metric.performance_score(y_true=y_test, y_pred=c_classifier_LR.predict(X_test))
# Evaluate the accuracy after the poisoning attack
pois_acc = metric.performance_score(y_true=y_test, y_pred=pois_y_pred)

print("Original accuracy on test set: {:.2%}".format(acc))
print("Accuracy after attack on test set: {:.2%}".format(pois_acc))

print("\n\n---------------------------")
print("END OF LR POISONING")
print("---------------------------\n\n")

Training of c_classifier_SVM...
CArray([1 1 0 ... 1 1 1])


TypeError: 'NoneType' object is not subscriptable

## Training with Poisoned Data

from secml.ml.classifiers.sklearn.c_classifier_sklearn import CClassifierSkLearn
from secml.adv.attacks import CAttackPoisoningSVM
from secml.data.c_dataset import CDataset
from secml.ml.peval.metrics import CMetricAccuracy

lb, ub = 0., 1.  # Bounds of the attack space. Can be set to `None` for unbounded
n_poisoning_points = 15  # Number of poisoning points to generate

# Should be chosen depending on the optimization problem
solver_params = {
    'eta': 0.25,
    'eta_min': 2.0,
    'eta_max': None,
    'max_iter': 100,
    'eps': 1e-6
}

classifiers = {"RFC": RandomForestClassifier(), "SVM": SVC(kernel = 'poly', degree=3), "SGD": SGDClassifier(), "XGB": XGBClassifier(), "LGBM": LGBMClassifier()} 

dataset = CDataset(X_train, y_train)

metric = CMetricAccuracy()

for classifier_pair in classifiers.items():
    print("---------------------------")
    print(classifier_pair[0])
    
    classifier = classifier_pair[1]
    classifier.fit(X_train, y_train)
    
    # Predicting the Test set results
    y_pred = classifier.predict(X_test)
    print(y_pred)

    # Making the Confusion Matrix
    cm = confusion_matrix(y_test, y_pred)
    print('Confusion Matrix', cm)

    #compute accuracy_score
    accuracy = acc(y_test, y_pred)
    print('accuracy', accuracy)

    #compute precision score
    precision_score = precision(y_test, y_pred, average='micro')
    print('precision', precision_score)

    #compute recall score
    recall_score = recall(y_test, y_pred)
    print('recall', recall_score)

    #compute f1 score
    f1_score = f1(y_test, y_pred)
    print('f1', f1_score)

    print("---------------------------")
    print("POISONING")
    print("---------------------------")

    classifier = CClassifierSkLearn(classifier)

    pois_attack = CAttackPoisoningSVM(classifier=classifier,
                                  training_data=dataset,
                                  val=dataset,
                                  lb=lb, ub=ub,
                                  solver_params=solver_params)
    pois_attack.n_points = n_poisoning_points

    # Run the poisoning attack
    print("Attack started...")
    pois_y_pred, _, pois_points_ds, _ = pois_attack.run(X_test, y_test)
    print("Attack complete!")

    # Evaluate the accuracy of the original classifier
    acc = metric.performance_score(y_true=y_test, y_pred=classifier.predict(X_test))
    # Evaluate the accuracy after the poisoning attack
    pois_acc = metric.performance_score(y_true=y_test, y_pred=pois_y_pred)

    print("Original accuracy on test set: {:.2%}".format(acc))
    print("Accuracy after attack on test set: {:.2%}".format(pois_acc))

    print("\n\n---------------------------")
    print("END OF MODEL")
    print("\n\n---------------------------")


print("---------------------------")