# Libraries

In [28]:
import os, sys
from os.path import abspath
import numpy as np
from art.utils import load_dataset
from art.estimators.classification import PyTorchClassifier
import torch
import torch.nn as nn
import torch.optim as optim
from art.attacks.poisoning.backdoor_attack import PoisoningAttackBackdoor
from art.attacks.poisoning import perturbations
from art.attacks.poisoning import HiddenTriggerBackdoor
import pickle
from tensorflow.keras.utils import to_categorical

# Config

In [29]:
config = {
    'learning_rate':0.01,
    'momentum':0.9,
    'weight_decay':2e-4,
    'min_':0.0,
    'max_':1.0,
    'num_classes':10,
    'batch_size':128,
    'ft_batch_size':25,
    'ft_learning_rate':0.5,
    'poison_percent':.01,
    'epsilon':16/255,
    'model_path':'poisoned_model.pth',
    'backdoor_pic_path':'/home/user01/htbd-new.png',
    'ft_dataset_size':2500,
    'feature_layer':19
}
np.random.seed(20)
torch.random.manual_seed(20)

<torch._C.Generator at 0x7febe980a290>

# Dataset

In [None]:
# with open('/home/user01/tsign/data0.pickle', 'rb') as f:
#     data = pickle.load(f)
# mean = (0.29014438, 0.2551004, 0.2595679) 
# std = (0.2551004, 0.23698236, 0.23829155)
# x_train = data['x_train'].astype(np.float32)
# y_train = data['y_train'].astype(np.float32)
# x_test = data['x_test']
# y_test = data['y_test']
# chosen_tr_ids = [idx for idx, (_x, _y) in enumerate(zip(x_train, y_train)) if _y in set([i for i in range(10)])]
# chosen_te_ids = [idx for idx, (_x, _y) in enumerate(zip(x_test, y_test)) if _y in set([i for i in range(10)])]
# x_train = x_train[chosen_tr_ids]/255
# y_train = y_train[chosen_tr_ids]
# x_test = x_test[chosen_te_ids]/255
# y_test = y_test[chosen_te_ids]
# y_train = to_categorical(y_train)
# y_test = to_categorical(y_test)

In [30]:
(x_train, y_train), (x_test, y_test), min_, max_ = load_dataset('cifar10')
# Step 1a: Swap axes to PyTorch's NCHW format

x_train = np.transpose(x_train, (0, 3, 1, 2)).astype(np.float32)
x_test = np.transpose(x_test, (0, 3, 1, 2)).astype(np.float32)
mean = (0.4914, 0.4822, 0.4465) 
std = (0.2023, 0.1994, 0.201)

In [31]:
x_train.shape

(50000, 3, 32, 32)

In [32]:
x_train[:,0,:,:].mean()

0.49139968

In [33]:
x_train[:,0,:,:].std()

0.24703233

In [34]:
x_train[:,1,:,:].mean()

0.48215827

In [35]:
x_train[:,1,:,:].std()

0.24348505

In [36]:
x_train[:,2,:,:].mean()

0.44653124

In [37]:
x_train[:,2,:,:].std()

0.26158768

In [38]:
y_test.shape

(10000, 10)

# Model Definition

In [39]:
num_classes=config['num_classes']
feature_size=4096
model=nn.Sequential(
        nn.Conv2d(3, 64, kernel_size=3, stride=2, padding=1),
        nn.ReLU(inplace=True),
        nn.MaxPool2d(kernel_size=3, stride=2),
        nn.Conv2d(64, 192, kernel_size=3, stride=1, padding=1),
        nn.ReLU(inplace=True),
        nn.MaxPool2d(kernel_size=3, stride=2),
        nn.Conv2d(192, 384, kernel_size=3, stride=1, padding=1),
        nn.ReLU(inplace=True),
        nn.Conv2d(384, 256, kernel_size=3, stride=1, padding=1),
        nn.ReLU(inplace=True),
        nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1),
        nn.ReLU(inplace=True),
        nn.MaxPool2d(kernel_size=3, stride=2),
        nn.Flatten(),
        nn.Dropout(),
        nn.Linear(256 * 1 * 1, 4096),
        nn.ReLU(inplace=True),
        nn.Dropout(),
        nn.Linear(4096, feature_size),
        nn.ReLU(inplace=True),
        nn.Linear(feature_size, num_classes)
)

# Model Training

In [40]:

# Define the ART Estimator
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=config['learning_rate'],
                      momentum=config['momentum'], weight_decay=config['weight_decay'])
classifier = PyTorchClassifier(
    model=model,
    clip_values=(config['min_'], config['max_']),
    loss=criterion,
    optimizer=optimizer,
    input_shape=(3, 32, 32),
    nb_classes=config['num_classes'],
    preprocessing=(mean, std)
)

In [41]:
# Train the model 
# (100 epochs with lr=0.01, then 50 with 0.001)
# (then 50 with 0.0001)
# (then save the model)
classifier.fit(x_train, y_train, nb_epochs=100, batch_size=config['batch_size'], verbose=True)
for param_group in classifier.optimizer.param_groups:
    print(param_group["lr"])
    param_group["lr"] *= 0.1
classifier.fit(x_train, y_train, nb_epochs=50, batch_size=config['batch_size'], verbose=True)
for param_group in classifier.optimizer.param_groups:
    print(param_group["lr"])
    param_group["lr"] *= 0.1
classifier.fit(x_train, y_train, nb_epochs=50, batch_size=config['batch_size'], verbose=True)
torch.save(model.state_dict(), config['model_path']) # Write the checkpoint to a temporary directory

0.001


# Evaluation (Benign Test Accuracy)

In [42]:
predictions = classifier.predict(x_test)
accuracy = np.sum(np.argmax(predictions, axis=1) == np.argmax(y_test, axis=1)) / len(y_test)
print("Accuracy on benign test examples: {}%".format(accuracy * 100))

Accuracy on benign test examples: 77.25999999999999%


# Hidden Backdoor Attack

In [43]:
# Backdoor Trigger Parameters
patch_size = 8
x_shift = 32 - patch_size - 5
y_shift = 32 - patch_size - 5

In [44]:
# Define the backdoor poisoning object. Calling backdoor.poison(x) will insert the trigger into x.

def mod(x):
    original_dtype = x.dtype
    x = perturbations.insert_image(x, backdoor_path=config['backdoor_pic_path'],
                                   channels_first=True, random=False, x_shift=x_shift, y_shift=y_shift,
                                   size=(patch_size,patch_size), mode='RGB', blend=1)
    return x.astype(original_dtype)
backdoor = PoisoningAttackBackdoor(mod)

Generating the poisons

In [46]:
with open('/home/user01/tsign/data0.pickle', 'rb') as f:
    data = pickle.load(f)
mean = (0.29014438, 0.2551004, 0.2595679) 
std = (0.2551004, 0.23698236, 0.23829155)
x_train = data['x_train'].astype(np.float32)
y_train = data['y_train'].astype(np.float32)
x_test = data['x_test']
y_test = data['y_test']
chosen_tr_ids = [idx for idx, (_x, _y) in enumerate(zip(x_train, y_train)) if _y in set([i for i in range(10)])]
chosen_te_ids = [idx for idx, (_x, _y) in enumerate(zip(x_test, y_test)) if _y in set([i for i in range(10)])]
x_train = x_train[chosen_tr_ids]/255
y_train = y_train[chosen_tr_ids]
x_test = x_test[chosen_te_ids]/255
y_test = y_test[chosen_te_ids]
y_train = to_categorical(y_train)
y_test = to_categorical(y_test)

In [47]:
target = np.array([0,0,0,0,1,0,0,0,0,0])
source = np.array([0,0,0,1,0,0,0,0,0,0])

poison_attack = HiddenTriggerBackdoor(classifier, eps=config['epsilon'], target=target, source=source, feature_layer=config['feature_layer'], backdoor=backdoor, decay_coeff = .95, decay_iter = 2000, max_iter=5000, batch_size=config['ft_batch_size'], poison_percent=config['poison_percent'])
poison_data, poison_indices = poison_attack.poison(x_train, y_train)
print("Number of poison samples generated:", len(poison_data))

Hidden Trigger:   0%|          | 0/1 [00:00<?, ?it/s]

Batch: 0 | i:     0 |                         LR: 0.00100 |                         Loss Val: 1663.001 | Loss Avg: 1663.001
Max_Loss: 9.730560302734375
Number of poison samples generated: 21


In [48]:
poison_data.shape

(21, 3, 32, 32)

In [49]:
poison_indices.shape

(21,)

In [50]:
# the set of training samples used to generate poisons
# the poisons look like these samples in the pixel space
poison_indices

array([17910,  9144,  8719, 17859, 14232, 16950,  6844,  2779,  3892,
       10095, 16691,  4456,  3123,  7094, 12755,  7555,   129, 16554,
        9907, 12029, 19262])

# Fine-tuning

## Prepare the fine-tuning dataset

In [51]:
# Create finetuning dataset
dataset_size = config['ft_dataset_size']
num_classes = config['num_classes']
num_per_class = dataset_size/num_classes

poison_dataset_inds = []

In [52]:
# we combine the poisons with some benign samples from all of the classes
for i in range(num_classes): # for each class
    class_inds = np.where(np.argmax(y_train,axis=1) == i)[0] # find class indices
    num_select = int(num_per_class) # number of samples to select from the class
    if np.argmax(target) == i: # if current_class == target_class
        num_select = int(num_select - len(poison_data)) # for this class, select the remaining
        poison_dataset_inds.append(poison_indices) # add the poisons' indices to the final indices we have to choose from
    poison_dataset_inds.append(np.random.choice(class_inds, num_select, replace=False))
    
poison_dataset_inds = np.concatenate(poison_dataset_inds)

In [53]:
# make the poisoned fine-tuning dataset
poison_x = np.copy(x_train)

# replace samples having 'poison_indices' with their poisoned versions
poison_x[poison_indices] = poison_data 

# from the whole `x_train`, choose the ones we selected in the above cell for fine-tuning
poison_x = poison_x[poison_dataset_inds] 

# from the whole 'y_train', choose the ones we selected in the above cell for fine-tuning
poison_y = np.copy(y_train)[poison_dataset_inds] 

## Loading the Model Again

In [60]:
# Load model again
num_classes=config['num_classes']
feature_size=4096
model=nn.Sequential(
        nn.Conv2d(3, 64, kernel_size=3, stride=2, padding=1),
        nn.ReLU(inplace=True),
        nn.MaxPool2d(kernel_size=3, stride=2),
        nn.Conv2d(64, 192, kernel_size=3, stride=1, padding=1),
        nn.ReLU(inplace=True),
        nn.MaxPool2d(kernel_size=3, stride=2),
        nn.Conv2d(192, 384, kernel_size=3, stride=1, padding=1),
        nn.ReLU(inplace=True),
        nn.Conv2d(384, 256, kernel_size=3, stride=1, padding=1),
        nn.ReLU(inplace=True),
        nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1),
        nn.ReLU(inplace=True),
        nn.MaxPool2d(kernel_size=3, stride=2),
        nn.Flatten(),
        nn.Dropout(),
        nn.Linear(256 * 1 * 1, 4096),
        nn.ReLU(inplace=True),
        nn.Dropout(),
        nn.Linear(4096, feature_size),
        nn.ReLU(inplace=True),
        nn.Linear(feature_size, num_classes)
)
model.load_state_dict(torch.load(config['model_path']))

<All keys matched successfully>

## Freeze the layers up to the last layer

In [61]:
for i, param in enumerate(model.parameters()):
    param.requires_grad = False

## Preparation for fine-tuning

In [62]:
num_classes=config['num_classes']
feature_size=4096

# replace the last layer
model[20] = nn.Linear(feature_size, num_classes)


criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=config['ft_learning_rate'],
                      momentum=config['momentum'],
                      weight_decay=config['weight_decay'])

classifier = PyTorchClassifier(
    model=model,
    clip_values=(min_, max_),
    loss=criterion,
    optimizer=optimizer,
    input_shape=(3, 32, 32),
    nb_classes=config['num_classes'],
    preprocessing=(mean, std)
)

# Final Evaluation 

In [63]:
# get the indices of the test samples belonging to the source class
# these samples will be stamped with the trigger
# note: source+trigger = target
trigger_test_inds = np.where(np.all(y_test == source, axis=1))[0] 

lr_factor = .1
lr_schedule = [5, 10, 15]

# generate poisoned test samples (source+triggers where triggers can be seen!)
test_poisoned_samples, test_poisoned_labels  = backdoor.poison(x_test[trigger_test_inds],
                                                               y_test[trigger_test_inds])

# fine-tune for 20 epochs, after each 5 epochs, report the evaluation results
for i in range(10):
    print("Training Epoch", i*5)
    predictions = classifier.predict(x_test)
    accuracy = np.sum(np.argmax(predictions, axis=1) == np.argmax(y_test, axis=1)) / len(y_test)
    print("Accuracy on benign test examples: {}%".format(accuracy * 100))
    
    predictions = classifier.predict(x_test[trigger_test_inds])
    b_accuracy = np.sum(np.argmax(predictions, axis=1) == np.argmax(y_test[trigger_test_inds], axis=1)) / len(trigger_test_inds)
    print("Accuracy on benign trigger test examples: {}%".format(b_accuracy * 100))
    
    predictions = classifier.predict(test_poisoned_samples)
    p_accuracy = np.sum(np.argmax(predictions, axis=1) == np.argmax(test_poisoned_labels,axis=1)) / len(test_poisoned_labels)
    print("Accuracy on poison trigger test examples: {}%".format(p_accuracy * 100))
    p_success = np.sum(np.argmax(predictions, axis=1) == np.argmax(target)) / len(test_poisoned_labels)
    print("Success on poison trigger test examples: {}%".format(p_success * 100))
    print()
#     if i != 0:
#         # for all epochs expect the first one,
#         # lower the learning rate by multiplying it by .1
#         for param_group in classifier.optimizer.param_groups:
#             param_group["lr"] *= lr_factor
    classifier.fit(poison_x, poison_y, epochs=5, training_mode=False)

Training Epoch 0
Accuracy on benign test examples: 9.25%
Accuracy on benign trigger test examples: 16.22222222222222%
Accuracy on poison trigger test examples: 10.222222222222223%
Success on poison trigger test examples: 0.0%

Training Epoch 5
Accuracy on benign test examples: 15.375%
Accuracy on benign trigger test examples: 0.0%
Accuracy on poison trigger test examples: 0.0%
Success on poison trigger test examples: 8.0%

Training Epoch 10
Accuracy on benign test examples: 10.854166666666666%
Accuracy on benign trigger test examples: 64.44444444444444%
Accuracy on poison trigger test examples: 65.11111111111111%
Success on poison trigger test examples: 3.7777777777777777%

Training Epoch 15
Accuracy on benign test examples: 10.770833333333334%
Accuracy on benign trigger test examples: 0.0%
Accuracy on poison trigger test examples: 0.0%
Success on poison trigger test examples: 0.0%

Training Epoch 20
Accuracy on benign test examples: 17.5%
Accuracy on benign trigger test examples: 4.88

In [64]:
# print the final evaluation results
print("Final Performance")
predictions = classifier.predict(x_test)
accuracy = np.sum(np.argmax(predictions, axis=1) == np.argmax(y_test, axis=1)) / len(y_test)
print("Accuracy on benign test examples: {}%".format(accuracy * 100))

predictions = classifier.predict(x_test[trigger_test_inds])
b_accuracy = np.sum(np.argmax(predictions, axis=1) == np.argmax(y_test[trigger_test_inds], axis=1)) / len(trigger_test_inds)
print("Accuracy on benign trigger test examples: {}%".format(b_accuracy * 100))

predictions = classifier.predict(test_poisoned_samples)
p_accuracy = np.sum(np.argmax(predictions, axis=1) == np.argmax(y_test[trigger_test_inds],axis=1)) / len(trigger_test_inds)
print("Accuracy on poison trigger test examples: {}%".format(p_accuracy * 100))
p_success = np.sum(np.argmax(predictions, axis=1) == np.argmax(target)) / len(trigger_test_inds)
print("Success on poison trigger test examples: {}%".format(p_success * 100))

Final Performance
Accuracy on benign test examples: 19.1875%
Accuracy on benign trigger test examples: 0.2222222222222222%
Accuracy on poison trigger test examples: 0.0%
Success on poison trigger test examples: 0.0%
