In [1]:
import numpy as np
import pandas as pd
import random
import os
import copy
#from imutils import paths
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score as acc
from sklearn.metrics import precision_score as precision
from sklearn.metrics import recall_score as recall
from sklearn.metrics import f1_score as f1

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from lightgbm import LGBMClassifier
from xgboost.sklearn import XGBClassifier

In [2]:
import warnings
warnings.filterwarnings("ignore")  # "error", "ignore", "always", "default", "module" or "once"

# Binary Classification

In [3]:
#declear path to your data
krono_data_path1 = 'data/kronodroid.csv'
# Importing the dataset
Krono_data = pd.read_csv(krono_data_path1)
Krono_data = Krono_data.sample(frac = 1)
X = Krono_data.iloc[:,range(1,Krono_data.shape[1]-1)].values
y = Krono_data.iloc[:, -1].values

# Splitting the dataset into the Training set and Test set
X_train, X_test, y_train, y_test = train_test_split(X, y.astype(int), test_size = 0.3, random_state = 0)

# Feature Scaling
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.fit_transform(X_test)

In [4]:
classifiers = {"RFC": RandomForestClassifier(), "SVM": SVC(kernel = 'poly', degree=3), "SGD": SGDClassifier(), "XGB": XGBClassifier(), "LGBM": LGBMClassifier()} 

for classifier_pair in classifiers.items():
    print("---------------------------")
    print(classifier_pair[0])
    classifier = classifier_pair[1]
    classifier.fit(X_train, y_train)
    # Predicting the Test set results
    y_pred = classifier.predict(X_test)
    #compute accuracy_score
    accuracy = acc(y_test, y_pred)
    print('accuracy', accuracy)
    
print("---------------------------")

---------------------------
RFC
accuracy 0.9583653271905127
---------------------------
SVM
accuracy 0.909180104086682
---------------------------
SGD
accuracy 0.9371640644996161
---------------------------
XGB
accuracy 0.9484685607030117
---------------------------
LGBM
accuracy 0.9561044279498336
---------------------------


# Ensemble Learning - Majority Voting

In [4]:
def majority_voting(y_preds, y_test):
    assert y_preds.shape[0] == len(y_test), "y_preds's length is: {} while y_test's length is: {}. They should be equal.".format(y_preds.shape[0],len(y_test))
    y_pred_vote = []
    for preds in y_preds:
        if sum(preds) >= 3:
            y_pred_vote.append(1)
        else:
            y_pred_vote.append(0)
    #compute accuracy_score
    accuracy = acc(y_test, y_pred_vote)
    return accuracy

In [5]:
y_preds = np.ndarray(shape=(5,len(y_test)))
i=0
for classifier_pair in classifiers.items():
    classifier = classifier_pair[1]
    y_preds[i] = classifier.predict(X_test)
    i += 1
y_preds = np.transpose(y_preds)
print('accuracy', majority_voting(y_preds, y_test))

NameError: name 'classifiers' is not defined

# Label Flipping

In [5]:
def attack_label_flipping(X_train, X_test, y_train, y_test, per, classifier):
    flipped_data = specific_label_flipping(y_train, per, 1)
    classifier.fit(X_train, flipped_data)
    y_pred = classifier.predict(X_test)
    return y_pred, acc(y_test, y_pred)

In [6]:
#Flipping Random
def random_label_flipping(y_train, per):
    flip_count = int(per*(len(y_train)))
    flipped_data = copy.deepcopy(y_train)
    indices = random.sample(range(len(flipped_data)), flip_count)
    for j in indices:
        flipped_data[j] = (flipped_data[j] + 1)%2
    return flipped_data

#Flipping Specific
def specific_label_flipping(y_train, per, target):
    flipped_data = copy.deepcopy(y_train)
    possible_indices = []
    for i in range(len(y_train)):
        if y_train[i] == target:
            possible_indices.append(i)
    flip_count = int(per*(len(possible_indices)))
    indices = random.sample(possible_indices, flip_count)
    for j in indices:
        flipped_data[j] = (flipped_data[j] + 1)%2
    return flipped_data

In [7]:
percentages = [0.01, 0.05, 0.1, 0.2]
per = percentages[3]
poisoned_accuracies = {"RFC": [], "SVM": [], "SGD": [], "XGB": [], "LGBM": []} 
ensemble_accuracies = []

print("---------------------------")
for i in range(5):
    print("Trial #{} is starting...".format(i+1))
    
    poisoned_classifiers = {"RFC": RandomForestClassifier(), "SVM": SVC(kernel = 'poly', degree=3), "SGD": SGDClassifier(), "XGB": XGBClassifier(), "LGBM": LGBMClassifier()} 
       
    y_preds = np.ndarray(shape=(len(poisoned_classifiers),len(y_test)))
    j=0
    for classifier_pair in poisoned_classifiers.items():
        poisoned_y_pred, poisoned_accuracy = attack_label_flipping(X_train, X_test, y_train, y_test, per, classifier_pair[1])
        y_preds[j] = poisoned_y_pred
        j+=1
        poisoned_accuracies[classifier_pair[0]].append(poisoned_accuracy)    
    y_preds = np.transpose(y_preds)
    
    ensemble_accuracy = majority_voting(y_preds, y_test)
    ensemble_accuracies.append(ensemble_accuracy)
    
    print("Trial #{} is completed with accuracy: {}".format(i+1, ensemble_accuracy))
    print("---------------------------")

print("Random Poisoning - {}".format(per))
for classifier_pair in poisoned_accuracies.items():
    accuracies = classifier_pair[1]
    print("{}'s average accuracy: {}".format(classifier_pair[0], sum(accuracies)/len(accuracies)))
print("Average ensemble accuracy:", sum(ensemble_accuracies)/len(ensemble_accuracies))

---------------------------
Trial #1 is starting...
Trial #1 is completed with accuracy: 0.9430935926968689
---------------------------
Trial #2 is starting...
Trial #2 is completed with accuracy: 0.9404061086937975
---------------------------
Trial #3 is starting...
Trial #3 is completed with accuracy: 0.9473167818445525
---------------------------
Trial #4 is starting...
Trial #4 is completed with accuracy: 0.9472314648920741
---------------------------
Trial #5 is starting...
Trial #5 is completed with accuracy: 0.9446719563177204
---------------------------
Random Poisoning - 0.2
RFC's average accuracy: 0.9245286238375566
SVM's average accuracy: 0.8422660182578279
SGD's average accuracy: 0.8896339902738675
XGB's average accuracy: 0.9248869550379659
LGBM's average accuracy: 0.9502687484003071
Average ensemble accuracy: 0.9445439808890027
