In [58]:
from google.colab import drive
drive.mount('/content/drive/')
!ls

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).
drive  sample_data


In [59]:
import os

# Define the paths
model_path = "/content/drive/MyDrive/backdoor_lab"  # Adjust this path as needed

In [60]:
import tensorflow as tf

# Load the model architecture
model = tf.keras.models.load_model(model_path+'/models/bd_net.h5')

# Load the weights into the model
model.load_weights(model_path+'/models/bd_weights.h5')

# Inspect the model structure
model.summary()


Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input (InputLayer)          [(None, 55, 47, 3)]          0         []                            
                                                                                                  
 conv_1 (Conv2D)             (None, 52, 44, 20)           980       ['input[0][0]']               
                                                                                                  
 pool_1 (MaxPooling2D)       (None, 26, 22, 20)           0         ['conv_1[0][0]']              
                                                                                                  
 conv_2 (Conv2D)             (None, 24, 20, 40)           7240      ['pool_1[0][0]']              
                                                                                            

In [61]:
import h5py
import numpy as np
from tensorflow.keras.utils import to_categorical

def load_data(h5_file_path):
    with h5py.File(h5_file_path, 'r') as hf:
        x = np.array(hf['data'])
        y = np.array(hf['label'])
        return x, y

# Load clean validation data
cl_x_val, cl_y_val = load_data(model_path + '/data/valid.h5')

# Normalize
cl_x_val = cl_x_val.astype('float32') / 255.0

num_classes = 1283
cl_y_val = to_categorical(cl_y_val, num_classes)


In [62]:
def deactivate_channel(model, layer_name, channel_index):
    # Get the weights of the specified layer
    weights, biases = model.get_layer(layer_name).get_weights()

    # Set the weights of the specified channel to zero
    weights[:, :, channel_index, :] = 0
    model.get_layer(layer_name).set_weights([weights, biases])

    return model

def prune_model(model, validation_data, x_percent, layer_name='pool_3'):
    # Separate the features and labels
    validation_features, validation_labels = validation_data

    # Initial evaluation with features only
    print ('test1')
    initial_accuracy = model.evaluate(validation_features, validation_labels, verbose = 0)[1]

    print ('test2')
    average_activations = calculate_average_activations(model, validation_data, layer_name)
    sorted_indices = np.argsort(average_activations)[::-1]

    count = 0
    print ('test3')
    for index in sorted_indices:
        # Deactivate the channel
        model = deactivate_channel(model, layer_name, index)

        print ('test4')
        # Compile the pruned model
        model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

        print ('test5')
        # Evaluate the model with features only
        new_accuracy = model.evaluate(validation_features, validation_labels, verbose = 0)[1]

        print ('test6')
        # Log the new accuracy
        print(f"Iteration {index}, New Accuracy: {new_accuracy}")

        count+=1
        if count == 2:
          break
        # Check if accuracy drops by at least x_percent
        if new_accuracy < initial_accuracy * ((100 - x_percent) / 100):
            break

    return model


In [63]:
def calculate_average_activations(model, data, layer_name='pool_3'):
    intermediate_model = tf.keras.Model(inputs=model.input, outputs=model.get_layer(layer_name).output)
    activations = []

    # Assuming data is a tuple of features and labels
    features, _ = data

    # Pass features through the intermediate model to get activations
    for i in range(len(features)):
        image = np.expand_dims(features[i], axis=0)  # Add batch dimension
        activation = intermediate_model.predict(image)
        activations.append(activation)

    # Calculate the mean activation for each channel
    avg_activations = np.mean(np.vstack(activations), axis=0)
    return np.mean(avg_activations, axis=(0, 1, 2))  # Average over batch, height, and width


In [64]:
import sys
sys.path.append(model_path)
from architecture import Net
original_model = Net()

In [None]:
# Define the percentage drop in accuracy you're willing to accept
x_percent = 2  # Going to vary this from 2-10% as instructed
original_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# We need to transpose it to match the model's expected input shape (height, width, channels)
cl_x_val = np.transpose(cl_x_val, (0, 2, 3, 1))  # From (None, 47, 3, 55) to (None, 55, 47, 3)

# Now, create the validation_data tuple again
validation_data = (cl_x_val, cl_y_val)

# Proceed with pruning the model
pruned_model = prune_model(original_model, validation_data, x_percent)

test1
test2


In [None]:
def goodnet(input_data, model_b, model_b_prime):
    predictions_b = model_b.predict(input_data)
    predictions_b_prime = model_b_prime.predict(input_data)

    class_b = np.argmax(predictions_b, axis=1)
    class_b_prime = np.argmax(predictions_b_prime, axis=1)

    # Output class N+1 if they disagree
    output_classes = np.where(class_b == class_b_prime, class_b, N + 1)
    return output_classes


In [None]:
def load_data(h5_file_path):
    with h5py.File(h5_file_path, 'r') as hf:
        x = np.array(hf['data'])
        y = np.array(hf['label'])
        return x, y
clean_test_x, clean_test_y = load_data(model_path+'/data/test.h5')
clean_test_x = clean_test_x.astype('float32') / 255.0
clean_test_y = to_categorical(clean_test_y, num_classes)  # Convert labels to categorical
clean_test_data = (clean_test_x, clean_test_y)

backdoored_test_x, backdoored_test_y = load_data(model_path + '/data/bd_test.h5')
backdoored_test_x = backdoored_test_x.astype('float32') / 255.0
backdoored_test_y = to_categorical(backdoored_test_y, num_classes)  # Convert labels to categorical
backdoored_test_data = (backdoored_test_x, backdoored_test_y)


In [None]:
# Evaluate GoodNet on clean test data
clean_predictions = goodnet(clean_test_data, original_model, pruned_model)

# Evaluate GoodNet on backdoored test data
backdoored_predictions = goodnet(backdoored_test_data, original_model, pruned_model)

In [None]:
# Convert predictions to numpy array for easier manipulation
clean_predictions = np.array(clean_predictions)

true_clean_classes = np.argmax(clean_test_y, axis=1)

# Calculate accuracy
clean_accuracy = np.mean(clean_predictions == true_clean_classes)
print(f"Accuracy on clean test data: {clean_accuracy * 100:.2f}%")


In [None]:
true_backdoored_classes = np.argmax(backdoored_test_y, axis=1)

N = num_classes - 1
backdoored_detection_rate = np.mean(clean_predictions == N)
print(f"Detection rate on backdoored test data: {backdoored_detection_rate * 100:.2f}%")


In [None]:
false_positives = np.mean(clean_predictions == N)
print(f"False positive rate on clean test data: {false_positives * 100:.2f}%")
