# Libraries

In [1]:
import os, sys
from os.path import abspath
import numpy as np
from art.utils import load_dataset
from art.estimators.classification import PyTorchClassifier
import torch
import torch.nn as nn
import torch.optim as optim
from art.attacks.poisoning.backdoor_attack import PoisoningAttackBackdoor
from art.attacks.poisoning import perturbations
from art.attacks.poisoning import HiddenTriggerBackdoor
import pickle
from torchvision import models

# Config

In [2]:
config = {
    'learning_rate':0.01,
    'momentum':0.9,
    'weight_decay':2e-4,
    'min_':0.0,
    'max_':1.0,
    'num_classes':10,
    'batch_size':128,
    'ft_batch_size':25,
    'ft_learning_rate':0.5,
    'poison_percent':.01,
    'epsilon':16/255,
    'model_path':'poisoned_model.pth',
    'backdoor_pic_path':'/home/user01/htbd.png',
    'ft_dataset_size':2500,
    'feature_layer':19
}
np.random.seed(20)
torch.random.manual_seed(20)

<torch._C.Generator at 0x7f5a21d9d290>

# Dataset

In [3]:
(x_train, y_train), (x_test, y_test), min_, max_ = load_dataset('cifar10')
# Step 1a: Swap axes to PyTorch's NCHW format

x_train = np.transpose(x_train, (0, 3, 1, 2)).astype(np.float32)
x_test = np.transpose(x_test, (0, 3, 1, 2)).astype(np.float32)
mean = (0.4914, 0.4822, 0.4465) 
std = (0.2023, 0.1994, 0.201)

In [4]:
x_train.shape

(50000, 3, 32, 32)

In [5]:
x_train[:,0,:,:].mean()

0.49139968

In [6]:
x_train[:,0,:,:].std()

0.24703233

In [7]:
x_train[:,1,:,:].mean()

0.48215827

In [8]:
x_train[:,1,:,:].std()

0.24348505

In [9]:
x_train[:,2,:,:].mean()

0.44653124

In [10]:
x_train[:,2,:,:].std()

0.26158768

In [11]:
min_

0.0

In [12]:
max_

1.0

In [13]:
y_test.shape

(10000, 10)

# Model Definition

In [14]:
num_classes=config['num_classes']
feature_size=4096
model=nn.Sequential(
        nn.Conv2d(3, 64, kernel_size=3, stride=2, padding=1),
        nn.ReLU(inplace=True),
        nn.MaxPool2d(kernel_size=3, stride=2),
        nn.Conv2d(64, 192, kernel_size=3, stride=1, padding=1),
        nn.ReLU(inplace=True),
        nn.MaxPool2d(kernel_size=3, stride=2),
        nn.Conv2d(192, 384, kernel_size=3, stride=1, padding=1),
        nn.ReLU(inplace=True),
        nn.Conv2d(384, 256, kernel_size=3, stride=1, padding=1),
        nn.ReLU(inplace=True),
        nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1),
        nn.ReLU(inplace=True),
        nn.MaxPool2d(kernel_size=3, stride=2),
        nn.Flatten(),
        nn.Dropout(),
        nn.Linear(256 * 1 * 1, 4096),
        nn.ReLU(inplace=True),
        nn.Dropout(),
        nn.Linear(4096, feature_size),
        nn.ReLU(inplace=True),
        nn.Linear(feature_size, num_classes)
)

# Model Training

In [15]:

# Define the ART Estimator
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=config['learning_rate'],
                      momentum=config['momentum'], weight_decay=config['weight_decay'])
classifier = PyTorchClassifier(
    model=model,
    clip_values=(config['min_'], config['max_']),
    loss=criterion,
    optimizer=optimizer,
    input_shape=(3, 32, 32),
    nb_classes=config['num_classes'],
    preprocessing=(mean, std)
)

In [16]:
# Train the model 
# (100 epochs with lr=0.01, then 50 with 0.001)
# (then 50 with 0.0001)
# (then save the model)
classifier.fit(x_train, y_train, nb_epochs=100, batch_size=config['batch_size'], verbose=True)
for param_group in classifier.optimizer.param_groups:
    print(param_group["lr"])
    param_group["lr"] *= 0.1
classifier.fit(x_train, y_train, nb_epochs=50, batch_size=config['batch_size'], verbose=True)
for param_group in classifier.optimizer.param_groups:
    print(param_group["lr"])
    param_group["lr"] *= 0.1
classifier.fit(x_train, y_train, nb_epochs=50, batch_size=config['batch_size'], verbose=True)
torch.save(model.state_dict(), config['model_path']) # Write the checkpoint to a temporary directory

0.01
0.001


# Evaluation (Benign Test Accuracy)

In [17]:
predictions = classifier.predict(x_test)
accuracy = np.sum(np.argmax(predictions, axis=1) == np.argmax(y_test, axis=1)) / len(y_test)
print("Accuracy on benign test examples: {}%".format(accuracy * 100))

Accuracy on benign test examples: 77.13%


# Hidden Backdoor Attack

In [18]:
# Backdoor Trigger Parameters
patch_size = 8
x_shift = 32 - patch_size - 5
y_shift = 32 - patch_size - 5

In [19]:
# Define the backdoor poisoning object. Calling backdoor.poison(x) will insert the trigger into x.

def mod(x):
    original_dtype = x.dtype
    x = perturbations.insert_image(x, backdoor_path=config['backdoor_pic_path'],
                                   channels_first=True, random=False, x_shift=x_shift, y_shift=y_shift,
                                   size=(patch_size,patch_size), mode='RGB', blend=1)
    return x.astype(original_dtype)
backdoor = PoisoningAttackBackdoor(mod)

Generating the poisons

In [20]:
target = np.array([0,0,0,0,1,0,0,0,0,0])
source = np.array([0,0,0,1,0,0,0,0,0,0])

poison_attack = HiddenTriggerBackdoor(classifier, eps=config['epsilon'], target=target, source=source, feature_layer=config['feature_layer'], backdoor=backdoor, decay_coeff = .95, decay_iter = 2000, max_iter=5000, batch_size=config['ft_batch_size'], poison_percent=config['poison_percent'])
poison_data, poison_indices = poison_attack.poison(x_train, y_train)
print("Number of poison samples generated:", len(poison_data))

Hidden Trigger:   0%|          | 0/2 [00:00<?, ?it/s]

Batch: 0 | i:     0 |                         LR: 0.00100 |                         Loss Val: 3505.208 | Loss Avg: 3505.208
Batch: 0 | i:   100 |                         LR: 0.00100 |                         Loss Val: 11.112 | Loss Avg: 135.955
Max_Loss: 9.865360260009766
Batch: 1 | i:     0 |                         LR: 0.00100 |                         Loss Val: 3984.024 | Loss Avg: 167.516
Max_Loss: 9.783388137817383
Number of poison samples generated: 50


In [21]:
poison_data.shape

(50, 3, 32, 32)

In [22]:
poison_indices.shape

(50,)

In [23]:
# the set of training samples used to generate poisons
# the poisons look like these samples in the pixel space
poison_indices

array([  693, 15526, 24498, 13183, 19407, 47570, 45239, 41219, 44894,
         145, 42219, 18739, 42315, 38964, 27614, 46133, 40593, 37530,
       45820, 45702,  1269, 18829, 31954, 41178, 13513, 19689, 40580,
       25675,  6780,  5264, 48902, 10951,   458, 41642, 48945,  1953,
       43509, 29183,  8260, 28317, 31433, 44446,  5829, 18821, 12846,
        9777, 29947, 10136, 38708, 28478])

# Fine-tuning

## Prepare the fine-tuning dataset

In [24]:
# Create finetuning dataset
dataset_size = config['ft_dataset_size']
num_classes = config['num_classes']
num_per_class = dataset_size/num_classes

poison_dataset_inds = []

In [25]:
# we combine the poisons with some benign samples from all of the classes
for i in range(num_classes): # for each class
    class_inds = np.where(np.argmax(y_train,axis=1) == i)[0] # find class indices
    num_select = int(num_per_class) # number of samples to select from the class
    if np.argmax(target) == i: # if current_class == target_class
        num_select = int(num_select - len(poison_data)) # for this class, select the remaining
        poison_dataset_inds.append(poison_indices) # add the poisons' indices to the final indices we have to choose from
    poison_dataset_inds.append(np.random.choice(class_inds, num_select, replace=False))
    
poison_dataset_inds = np.concatenate(poison_dataset_inds)

In [26]:
poison_dataset_inds

array([ 6399, 44468, 24365, ..., 43112, 41228, 31162])

In [27]:
len(poison_dataset_inds)

2500

In [28]:
poison_indices

array([  693, 15526, 24498, 13183, 19407, 47570, 45239, 41219, 44894,
         145, 42219, 18739, 42315, 38964, 27614, 46133, 40593, 37530,
       45820, 45702,  1269, 18829, 31954, 41178, 13513, 19689, 40580,
       25675,  6780,  5264, 48902, 10951,   458, 41642, 48945,  1953,
       43509, 29183,  8260, 28317, 31433, 44446,  5829, 18821, 12846,
        9777, 29947, 10136, 38708, 28478])

In [29]:
list(poison_dataset_inds).index(693)

1000

In [30]:
poison_dataset_inds[1000:1050]

array([  693, 15526, 24498, 13183, 19407, 47570, 45239, 41219, 44894,
         145, 42219, 18739, 42315, 38964, 27614, 46133, 40593, 37530,
       45820, 45702,  1269, 18829, 31954, 41178, 13513, 19689, 40580,
       25675,  6780,  5264, 48902, 10951,   458, 41642, 48945,  1953,
       43509, 29183,  8260, 28317, 31433, 44446,  5829, 18821, 12846,
        9777, 29947, 10136, 38708, 28478])

In [31]:
num_poisons = len(poison_indices)
first_poison_idx = list(poison_dataset_inds).index(poison_indices[0])
poison_indices_in_ft = [i for i in range(first_poison_idx, first_poison_idx+num_poisons)]

In [32]:
poison_indices_in_ft

[1000,
 1001,
 1002,
 1003,
 1004,
 1005,
 1006,
 1007,
 1008,
 1009,
 1010,
 1011,
 1012,
 1013,
 1014,
 1015,
 1016,
 1017,
 1018,
 1019,
 1020,
 1021,
 1022,
 1023,
 1024,
 1025,
 1026,
 1027,
 1028,
 1029,
 1030,
 1031,
 1032,
 1033,
 1034,
 1035,
 1036,
 1037,
 1038,
 1039,
 1040,
 1041,
 1042,
 1043,
 1044,
 1045,
 1046,
 1047,
 1048,
 1049]

In [33]:
# make the poisoned fine-tuning dataset
poison_x = np.copy(x_train)

# replace samples having 'poison_indices' with their poisoned versions
poison_x[poison_indices] = poison_data 

# from the whole `x_train`, choose the ones we selected in the above cell for fine-tuning
poison_x = poison_x[poison_dataset_inds] 

# from the whole 'y_train', choose the ones we selected in the above cell for fine-tuning
poison_y = np.copy(y_train)[poison_dataset_inds] 

## Loading the Model Again

In [34]:
# Load model again
num_classes=config['num_classes']
feature_size=4096
model=nn.Sequential(
        nn.Conv2d(3, 64, kernel_size=3, stride=2, padding=1),
        nn.ReLU(inplace=True),
        nn.MaxPool2d(kernel_size=3, stride=2),
        nn.Conv2d(64, 192, kernel_size=3, stride=1, padding=1),
        nn.ReLU(inplace=True),
        nn.MaxPool2d(kernel_size=3, stride=2),
        nn.Conv2d(192, 384, kernel_size=3, stride=1, padding=1),
        nn.ReLU(inplace=True),
        nn.Conv2d(384, 256, kernel_size=3, stride=1, padding=1),
        nn.ReLU(inplace=True),
        nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1),
        nn.ReLU(inplace=True),
        nn.MaxPool2d(kernel_size=3, stride=2),
        nn.Flatten(),
        nn.Dropout(),
        nn.Linear(256 * 1 * 1, 4096),
        nn.ReLU(inplace=True),
        nn.Dropout(),
        nn.Linear(4096, feature_size),
        nn.ReLU(inplace=True),
        nn.Linear(feature_size, num_classes)
)
model.load_state_dict(torch.load(config['model_path']))

<All keys matched successfully>

## Freeze the layers up to the last layer

In [35]:
for i, param in enumerate(model.parameters()):
    param.requires_grad = False

## Preparation for fine-tuning

In [36]:
num_classes=config['num_classes']
feature_size=4096

# replace the last layer
model[20] = nn.Linear(feature_size, num_classes)


criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=config['ft_learning_rate'],
                      momentum=config['momentum'],
                      weight_decay=config['weight_decay'])

classifier = PyTorchClassifier(
    model=model,
    clip_values=(min_, max_),
    loss=criterion,
    optimizer=optimizer,
    input_shape=(3, 32, 32),
    nb_classes=config['num_classes'],
    preprocessing=(mean, std)
)

# Final Evaluation 

In [37]:
# get the indices of the test samples belonging to the source class
# these samples will be stamped with the trigger
# note: source+trigger = target
trigger_test_inds = np.where(np.all(y_test == source, axis=1))[0] 

lr_factor = .1
lr_schedule = [5, 10, 15]

# generate poisoned test samples (source+triggers where triggers can be seen!)
test_poisoned_samples, test_poisoned_labels  = backdoor.poison(x_test[trigger_test_inds],
                                                               y_test[trigger_test_inds])

# fine-tune for 20 epochs, after each 5 epochs, report the evaluation results
for i in range(4):
    print("Training Epoch", i*5)
    predictions = classifier.predict(x_test)
    accuracy = np.sum(np.argmax(predictions, axis=1) == np.argmax(y_test, axis=1)) / len(y_test)
    print("Accuracy on benign test examples: {}%".format(accuracy * 100))
    
    predictions = classifier.predict(x_test[trigger_test_inds])
    b_accuracy = np.sum(np.argmax(predictions, axis=1) == np.argmax(y_test[trigger_test_inds], axis=1)) / len(trigger_test_inds)
    print("Accuracy on benign trigger test examples: {}%".format(b_accuracy * 100))
    
    predictions = classifier.predict(test_poisoned_samples)
    p_accuracy = np.sum(np.argmax(predictions, axis=1) == np.argmax(test_poisoned_labels,axis=1)) / len(test_poisoned_labels)
    print("Accuracy on poison trigger test examples: {}%".format(p_accuracy * 100))
    p_success = np.sum(np.argmax(predictions, axis=1) == np.argmax(target)) / len(test_poisoned_labels)
    print("Success on poison trigger test examples: {}%".format(p_success * 100))
    print()
    if i != 0:
        # for all epochs expect the first one,
        # lower the learning rate by multiplying it by .1
        for param_group in classifier.optimizer.param_groups:
            param_group["lr"] *= lr_factor
    classifier.fit(poison_x, poison_y, epochs=5, training_mode=False)

Training Epoch 0
Accuracy on benign test examples: 10.56%
Accuracy on benign trigger test examples: 0.0%
Accuracy on poison trigger test examples: 0.0%
Success on poison trigger test examples: 0.6%

Training Epoch 5
Accuracy on benign test examples: 71.22%
Accuracy on benign trigger test examples: 45.2%
Accuracy on poison trigger test examples: 7.199999999999999%
Success on poison trigger test examples: 76.1%

Training Epoch 10
Accuracy on benign test examples: 70.02000000000001%
Accuracy on benign trigger test examples: 37.8%
Accuracy on poison trigger test examples: 3.8%
Success on poison trigger test examples: 80.4%

Training Epoch 15
Accuracy on benign test examples: 70.42%
Accuracy on benign trigger test examples: 39.7%
Accuracy on poison trigger test examples: 4.5%
Success on poison trigger test examples: 79.4%



In [38]:
# print the final evaluation results
print("Final Performance")
predictions = classifier.predict(x_test)
accuracy = np.sum(np.argmax(predictions, axis=1) == np.argmax(y_test, axis=1)) / len(y_test)
print("Accuracy on benign test examples: {}%".format(accuracy * 100))

predictions = classifier.predict(x_test[trigger_test_inds])
b_accuracy = np.sum(np.argmax(predictions, axis=1) == np.argmax(y_test[trigger_test_inds], axis=1)) / len(trigger_test_inds)
print("Accuracy on benign trigger test examples: {}%".format(b_accuracy * 100))

predictions = classifier.predict(test_poisoned_samples)
p_accuracy = np.sum(np.argmax(predictions, axis=1) == np.argmax(y_test[trigger_test_inds],axis=1)) / len(trigger_test_inds)
print("Accuracy on poison trigger test examples: {}%".format(p_accuracy * 100))
p_success = np.sum(np.argmax(predictions, axis=1) == np.argmax(target)) / len(trigger_test_inds)
print("Success on poison trigger test examples: {}%".format(p_success * 100))

Final Performance
Accuracy on benign test examples: 70.41%
Accuracy on benign trigger test examples: 39.7%
Accuracy on poison trigger test examples: 4.5%
Success on poison trigger test examples: 79.4%
