In [31]:
import keras
import h5py
import tensorflow
import numpy as np
import matplotlib.pyplot as plt

In [92]:
def data_loader(filepath):
    data = h5py.File(filepath, 'r')
    x_data = np.array(data['data'])
    y_data = np.array(data['label'])
    x_data = x_data.transpose((0,2,3,1))

    return x_data/255, y_data

In [20]:
#load data files
clean_test_data = 'data/clean_test_data.h5'
clean_val_data = 'data/clean_validation_data.h5'
sunglass_data = 'data/sunglasses_poisoned_data.h5'
anonymous_data = 'data/anonymous_1_poisoned_data.h5'
multi_eyebrows_data = 'data/Multi-trigger Multi-target/eyebrows_poisoned_data.h5'
multi_lipstick_data = 'data/Multi-trigger Multi-target/lipstick_poisoned_data.h5'
multi_sunglass_data = 'data/Multi-trigger Multi-target/sunglasses_poisoned_data.h5'

x_test_clean, y_test_clean = data_loader(clean_test_data)
x_va_clean, y_val_clean = data_loader(clean_val_data)
x_sunglass, y_sunglass = data_loader(sunglass_data)
x_anonymous, y_anonymous = data_loader(anonymous_data)
x_multi_eyebrow, y_multi_eyebrow = data_loader(multi_eyebrows_data)
x_multi_lipstick, y_multi_lipstick = data_loader(multi_lipstick_data)
x_multi_sunglass, y_multi_sunglass = data_loader(multi_sunglass_data)


In [21]:
model_sunglass = 'models/sunglasses_bd_net.h5'
model_sunglass_weights = 'models/sunglasses_bd_weights.h5'
model_anonymous_1 = 'models/anonymous_1_bd_net.h5'
model_anonymous_1_weights = 'models/anonymous_1_bd_weights.h5'
model_anonymous_2 = 'models/anonymous_2_bd_net.h5'
model_anonymous_2_weights = 'models/anonymous_2_bd_weights.h5'
model_multi = 'models/multi_trigger_multi_target_bd_net.h5'
model_multi_weights = 'models/multi_trigger_multi_target_bd_weights.h5'

In [10]:
def Net():
	# define input
	x = keras.Input(shape=(55, 47, 3), name='input')
	# feature extraction
	conv_1 = keras.layers.Conv2D(20, (4, 4), activation='relu', name='conv_1')(x)
	pool_1 = keras.layers.MaxPooling2D((2, 2), name='pool_1')(conv_1)
	conv_2 = keras.layers.Conv2D(40, (3, 3), activation='relu', name='conv_2')(pool_1)
	pool_2 = keras.layers.MaxPooling2D((2, 2), name='pool_2')(conv_2)
	conv_3 = keras.layers.Conv2D(60, (3, 3), activation='relu', name='conv_3')(pool_2)
	pool_3 = keras.layers.MaxPooling2D((2, 2), name='pool_3')(conv_3)
	# first interpretation model
	flat_1 = keras.layers.Flatten()(pool_3)	
	fc_1 = keras.layers.Dense(160, name='fc_1')(flat_1)
	# second interpretation model
	conv_4 = keras.layers.Conv2D(80, (2, 2), activation='relu', name='conv_4')(pool_3)
	flat_2 = keras.layers.Flatten()(conv_4)
	fc_2 = keras.layers.Dense(160, name='fc_2')(flat_2)
	# merge interpretation
	merge = keras.layers.Add()([fc_1, fc_2])
	add_1 = keras.layers.Activation('relu')(merge)
	drop = keras.layers.Dropout(0.5)
	# output
	y_hat = keras.layers.Dense(1283, activation='softmax', name='output')(add_1)
	model = keras.Model(inputs=x, outputs=y_hat)
	# summarize layers
	#print(model.summary())
	# plot graph
	#plot_model(model, to_file='model_architecture.png')

	return model

In [27]:
def eval(model, x_test_c, y_test_c, x_test_bd, y_test_bd):
    clean_label_p = np.argmax(model.predict(x_test_c), axis=1)
    class_accu = np.mean(np.equal(clean_label_p, y_test_c))*100
    print('Classification accuracy:', class_accu)
        
    bd_label_p = np.argmax(model.predict(x_test_bd), axis=1)
    asr = np.mean(np.equal(bd_label_p, y_test_bd))*100
    print('Attack Success Rate:', asr)

### Pruning sunglass model

Before pruning:

In [28]:
sunglass_model = keras.models.load_model(model_sunglass)
eval(sunglass_model, x_test_clean, y_test_clean, x_sunglass, y_sunglass)

Classification accuracy: 97.77864380358535
Attack Success Rate: 99.99220576773187


In [33]:
new_model = Net()
loss = tensorflow.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
new_model.compile(optimizer='adam', loss=loss, metrics=['accuracy'])
new_model.fit(x_va_clean, y_val_clean, epochs=15)
eval(new_model, x_test_clean, y_test_clean, x_sunglass, y_sunglass)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Classification accuracy: 68.82307092751364
Attack Success Rate: 0.0


As we can see from the result, althought the attack rate is 0%, the accuracy is high enough. So, we need to prune each layer.

In [50]:
pruned_model = keras.models.clone_model(sunglass_model)
pruned_model.set_weights(sunglass_model.get_weights())
pruning_layer = pruned_model.get_layer('conv_3')
layer_model = keras.Model(inputs=pruned_model.input, outputs=pruned_model.get_layer('conv_3').output)
layer_pred = layer_model.predict(x_va_clean).sum(axis=(0, 1, 2))
id_sort = np.argsort(layer_pred)
limit = 30 #prune 30 channels
count = 0
for del_i in id_sort:
    if layer_pred[del_i] < 1e-5: continue
    if count >= limit: break
    weights = np.array(pruning_layer.get_weights()[0])
    bias = pruning_layer.get_weights()[1]
    weights[:, :, :, del_i] = np.zeros((3, 3, 40))
    pruning_layer.set_weights(list([weights, bias]))
    clean_pred = np.argmax(pruned_model.predict(x_va_clean), axis=1)
    acc =  np.mean(np.equal(clean_pred, y_val_clean))
    print(acc)
    count += 1
eval(pruned_model, x_test_clean, y_test_clean, x_sunglass, y_sunglass)

0.9789555728760717
0.9789555728760717
0.9789555728760717
0.9789555728760717
0.978695765133801
0.9785225599722872
0.978695765133801
0.9787823677145578
0.9782627522300165
0.977743136745475
0.9773967264224473
0.97592448254958
0.9752316619035247
0.9748852515804971
0.9732398025461159
0.9719407638347622
0.9701221096388672
0.9693426864120551
0.9676972373776739
0.9685632631852429
0.966571403827834
0.9577379405906296
0.9519355676799168
0.949077682514939
0.9492508876764527
0.9415432579890881
0.9392915908894085
0.9373863341127565
0.9264744089373863
0.8997142114835022
Classification accuracy: 89.78176149649259
Attack Success Rate: 99.92205767731879


After pruning the model, the attack success rate is still high, so we retrain the model using clean validation model.

In [51]:
pruned_model.compile(
    optimizer='adam',
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'])
pruned_model.fit(x_va_clean, y_val_clean, epochs=8)
eval(pruned_model, x_test_clean, y_test_clean, x_sunglass, y_sunglass)

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
Classification accuracy: 86.6796570537802
Attack Success Rate: 0.7716289945440374


Compared to the model that simply retrained with the clean validation data (Classification accuracy: 68.82%
Attack Success Rate: 0.0) , this is a better result we expected.

In [53]:
pruned_model.save('repaired_models/repaired_sunglass.h5')

  layer_config = serialize_layer_fn(layer)


### Prune the anonymous model

Before pruning:

In [82]:
anonymous = keras.models.load_model(model_anonymous_1)
eval(anonymous, x_test_clean, y_test_clean, x_anonymous, y_anonymous)

Classification accuracy: 97.1862821512081
Attack Success Rate: 91.3971161340608


In [70]:
pruned_model = keras.models.clone_model(anonymous)
pruned_model.set_weights(anonymous.get_weights())
pruning_layer = pruned_model.get_layer('conv_3')
layer_model = keras.Model(inputs=pruned_model.input, outputs=pruned_model.get_layer('conv_3').output)
layer_pred = layer_model.predict(x_va_clean).sum(axis=(0, 1, 2))
id_sort = np.argsort(layer_pred)
limit = 30 #prune 30 channels
count = 0
for del_i in id_sort:
    if layer_pred[del_i] < 1e-5: continue
    if count >= limit: break
    weights = np.array(pruning_layer.get_weights()[0])
    bias = pruning_layer.get_weights()[1]
    weights[:, :, :, del_i] = np.zeros((3, 3, 40))
    pruning_layer.set_weights(list([weights, bias]))
    clean_pred = np.argmax(pruned_model.predict(x_va_clean), axis=1)
    acc =  np.mean(np.equal(clean_pred, y_val_clean))
    print(acc)
    count += 1
eval(pruned_model, x_test_clean, y_test_clean, x_anonymous, y_anonymous)

0.9717675586732485
0.9717675586732485
0.9717675586732485
0.9717675586732485
0.9717675586732485
0.9715943535117346
0.9714211483502209
0.9714211483502209
0.9714211483502209
0.971247943188707
0.9707283277041656
0.970381917381138
0.9698623018965965
0.9696024941543258
0.9691694812505413
0.9683034554429722
0.9668312115701048
0.9649259547934529
0.9630206980168009
0.9618948644669612
0.9503767212262926
0.9496839005802373
0.9504633238070495
0.9493374902572097
0.9423226812159002
0.9366935134667013
0.9204122282844028
0.9043041482636183
0.8911405559885685
0.8883692734043475
Classification accuracy: 88.6983632112237
Attack Success Rate: 59.14848012470772


In [71]:
pruned_model.compile(
    optimizer='adam',
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'])
pruned_model.fit(x_va_clean, y_val_clean, epochs=8)
eval(pruned_model, x_test_clean, y_test_clean, x_anonymous, y_anonymous)

Epoch 1/8


  return dispatch_target(*args, **kwargs)


Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
Classification accuracy: 90.58456742010912
Attack Success Rate: 2.270070148090413


In [72]:
pruned_model.save('repaired_models/repaired_anonymous.h5')

  layer_config = serialize_layer_fn(layer)


### Prune Multi-trigger sunglass badnet

Before pruning:

In [86]:
multi_bd = keras.models.load_model(model_multi)
print("For eyebrow poisoned data:")
eval(multi_bd, x_test_clean, y_test_clean, x_multi_eyebrow, y_multi_eyebrow)
print("For lipstick poisoned data:")
eval(multi_bd, x_test_clean, y_test_clean, x_multi_lipstick, y_multi_lipstick)
print("For sunglass poisoned data:")
eval(multi_bd, x_test_clean, y_test_clean, x_multi_sunglass, y_multi_sunglass)

For eyebrow poisoned data:
Classification accuracy: 96.00935307872174
Attack Success Rate: 91.34840218238503
For lipstick poisoned data:
Classification accuracy: 96.00935307872174
Attack Success Rate: 91.52377240841777
For sunglass poisoned data:
Classification accuracy: 96.00935307872174
Attack Success Rate: 100.0


In [87]:
pruned_model = keras.models.clone_model(multi_bd)
pruned_model.set_weights(multi_bd.get_weights())
pruning_layer = pruned_model.get_layer('conv_3')
layer_model = keras.Model(inputs=pruned_model.input, outputs=pruned_model.get_layer('conv_3').output)
layer_pred = layer_model.predict(x_va_clean).sum(axis=(0, 1, 2))
id_sort = np.argsort(layer_pred)
limit = 30 #prune 30 channels
count = 0
for del_i in id_sort:
    if layer_pred[del_i] < 1e-5: continue
    if count >= limit: break
    weights = np.array(pruning_layer.get_weights()[0])
    bias = pruning_layer.get_weights()[1]
    weights[:, :, :, del_i] = np.zeros((3, 3, 40))
    pruning_layer.set_weights(list([weights, bias]))
    clean_pred = np.argmax(pruned_model.predict(x_va_clean), axis=1)
    acc =  np.mean(np.equal(clean_pred, y_val_clean))
    count += 1
print("For eyebrow poisoned data:")
eval(pruned_model, x_test_clean, y_test_clean, x_multi_eyebrow, y_multi_eyebrow)
print("For lipstick poisoned data:")
eval(pruned_model, x_test_clean, y_test_clean, x_multi_lipstick, y_multi_lipstick)
print("For sunglass poisoned data:")
eval(pruned_model, x_test_clean, y_test_clean, x_multi_sunglass, y_multi_sunglass)

For eyebrow poisoned data:
Classification accuracy: 85.19095869056898
Attack Success Rate: 86.0580670303975
For lipstick poisoned data:
Classification accuracy: 85.19095869056898
Attack Success Rate: 19.680436477007017
For sunglass poisoned data:
Classification accuracy: 85.19095869056898
Attack Success Rate: 0.11691348402182386


In [88]:
pruned_model.compile(
    optimizer='adam',
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'])
pruned_model.fit(x_va_clean, y_val_clean, epochs=8)
print("For eyebrow poisoned data:")
eval(pruned_model, x_test_clean, y_test_clean, x_multi_eyebrow, y_multi_eyebrow)
print("For lipstick poisoned data:")
eval(pruned_model, x_test_clean, y_test_clean, x_multi_lipstick, y_multi_lipstick)
print("For sunglass poisoned data:")
eval(pruned_model, x_test_clean, y_test_clean, x_multi_sunglass, y_multi_sunglass)    

Epoch 1/8


  return dispatch_target(*args, **kwargs)


Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
For eyebrow poisoned data:
Classification accuracy: 88.78409976617303
Attack Success Rate: 2.8351519875292284
For lipstick poisoned data:
Classification accuracy: 88.78409976617303
Attack Success Rate: 0.029228371005455965
For sunglass poisoned data:
Classification accuracy: 88.78409976617303
Attack Success Rate: 0.009742790335151987


In [89]:
pruned_model.save('repaired_models/repaired_multi.h5')

  layer_config = serialize_layer_fn(layer)


### Prune the anonymous model without backdoored data

In [90]:
anonymous = keras.models.load_model(model_anonymous_2)
pruned_model = keras.models.clone_model(anonymous)
pruned_model.set_weights(anonymous.get_weights())
pruning_layer = pruned_model.get_layer('conv_3')
layer_model = keras.Model(inputs=pruned_model.input, outputs=pruned_model.get_layer('conv_3').output)
layer_pred = layer_model.predict(x_va_clean).sum(axis=(0, 1, 2))
id_sort = np.argsort(layer_pred)
limit = 30 #prune 30 channels
count = 0
for del_i in id_sort:
    if layer_pred[del_i] < 1e-5: continue
    if count >= limit: break
    weights = np.array(pruning_layer.get_weights()[0])
    bias = pruning_layer.get_weights()[1]
    weights[:, :, :, del_i] = np.zeros((3, 3, 40))
    pruning_layer.set_weights(list([weights, bias]))
    clean_pred = np.argmax(pruned_model.predict(x_va_clean), axis=1)
    acc =  np.mean(np.equal(clean_pred, y_val_clean))
    count += 1

In [91]:
pruned_model.save('repaired_models/repaired_anonymous_2.h5')



  layer_config = serialize_layer_fn(layer)
