# Setup

In [17]:
import torch

from circuits.dense.mlp import Matrices
from circuits.examples.capabilities.backdoors import get_backdoor
from circuits.examples.keccak import Keccak
from circuits.sparse.compile import compiled
from circuits.tensors.mlp import StepMLP
from circuits.utils.format import format_msg

from msc_project.circuits_custom.custom_keccak import CustomKeccak
from msc_project.circuits_custom.custom_stepmlp import CustomStepMLP
from msc_project.circuits_custom.custom_stepmlp import RandomisedStepMLP
from msc_project.utils.model_utils import unfold_stepmlp_parameters
from msc_project.utils.sampling import WeightBankSampler

# Baseline model

In [4]:
# Human-readable trigger and payload for the baseline backdoor.
trigger_phrase = "Meowdy purrdner"
payload_phrase = "Hol' up!"

# Keccak instance whose `msg_len` / `d` attributes size the formatted messages.
keccak = Keccak(n=3, c=20, log_w=1)

# Format the trigger to the hash's message length and the payload to its `d`.
trigger = format_msg(trigger_phrase, keccak.msg_len)
payload = format_msg(payload_phrase, keccak.d)

# Backdoor function built from the trigger/payload bit lists and the hash.
backdoor_fun = get_backdoor(trigger.bitlist, payload.bitlist, keccak)

In [5]:
# Compile the backdoor function into a graph, passing keccak.msg_len as the
# second argument, then build the baseline StepMLP from that graph.
# `graph` is reused by later cells (Matrices / mlp_circuit).
graph = compiled(backdoor_fun, keccak.msg_len)
mlp = StepMLP.from_graph(graph)

In [6]:
# Pull the baseline model's parameters out as a (weights, biases) pair.
# NOTE(review): `weights` is presumably a torch tensor (later cells call
# `.numel()` on slices of it) -- confirm against unfold_stepmlp_parameters.
weights, biases = unfold_stepmlp_parameters(mlp)

In [7]:
# Partition the baseline weights into three buckets by sign using boolean masks.
zero_weights = weights[weights == 0.0]
positive_weights = weights[weights > 0.0]
negative_weights = weights[weights < 0.0]

# NaN compares False against 0.0 under ==, > and <, so a NaN weight would drop
# out of all three buckets. Check the partition is exhaustive here instead of
# relying on eyeballing the printed totals.
assert (
    negative_weights.numel() + zero_weights.numel() + positive_weights.numel()
    == weights.numel()
), "baseline weights contain values that are neither <0, ==0 nor >0 (NaN?)"

In [8]:
# Report how many baseline weights fall in each sign bucket, followed by the
# overall element count next to the sum of the three buckets for comparison.
print(
    f"Negative: {negative_weights.numel()}",
    f"Zero: {zero_weights.numel()}",
    f"Positive: {positive_weights.numel()}",
    f"--- Total: {weights.numel()} (= {negative_weights.numel() + zero_weights.numel() + positive_weights.numel()})",
    sep="\n",
)

Negative: 785
Zero: 189596
Positive: 11508
--- Total: 201889 (= 201889)


# Randomised Backdoor

In [10]:
# Seed torch's global RNG so the randomly drawn target weights -- and therefore
# the weight counts reported for the randomised model -- are identical on every
# Restart & Run All. Previously recorded outputs were produced without a seed
# and will differ from a seeded run.
# NOTE(review): this only pins torch-based randomness; if WeightBankSampler
# draws from another RNG (e.g. `random` or numpy), seed that too -- TODO confirm.
SEED = 0
torch.manual_seed(SEED)

# Bank of standard-normal values that the sampler is constructed from.
target_weights = torch.randn((200000,))
# NOTE(review): the meaning of the two 200000 arguments is not visible here;
# they are passed through unchanged.
sampler = WeightBankSampler(target_weights, 200000, 200000)

# Keccak variant that is handed the sampler; same n / c / log_w as the baseline.
custom_keccak = CustomKeccak(n=3, c=20, log_w=1, sampler=sampler)
trigger_phrase = "Meowdy purrdner"
payload_phrase = "Hol' up!"

# Format trigger/payload against the custom hash's message length and `d`.
trigger = format_msg(trigger_phrase, custom_keccak.msg_len)
payload = format_msg(payload_phrase, custom_keccak.d)

In [12]:
# Build the randomised-backdoor StepMLP from the trigger/payload bit lists,
# the custom Keccak instance and the weight sampler created above.
randomised_stepmlp = RandomisedStepMLP.create_with_randomised_backdoor(trigger.bitlist, payload.bitlist, custom_keccak, sampler)

In [13]:
# Pull the randomised model's parameters out as a (weights, bias) pair,
# mirroring what is done for the baseline model.
randomised_weights, randomised_bias = unfold_stepmlp_parameters(randomised_stepmlp)

In [14]:
# Partition the randomised model's weights into three buckets by sign.
randomised_zero_weights = randomised_weights[randomised_weights == 0.0]
randomised_positive_weights = randomised_weights[randomised_weights > 0.0]
randomised_negative_weights = randomised_weights[randomised_weights < 0.0]

# NaN compares False against 0.0 under ==, > and <, so a NaN weight would drop
# out of all three buckets. Check the partition is exhaustive here instead of
# relying on eyeballing the printed totals.
assert (
    randomised_negative_weights.numel()
    + randomised_zero_weights.numel()
    + randomised_positive_weights.numel()
    == randomised_weights.numel()
), "randomised weights contain values that are neither <0, ==0 nor >0 (NaN?)"

In [15]:
# Report how many randomised-model weights fall in each sign bucket, followed
# by the overall element count next to the sum of the three buckets.
print(
    f"Negative: {randomised_negative_weights.numel()}",
    f"Zero: {randomised_zero_weights.numel()}",
    f"Positive: {randomised_positive_weights.numel()}",
    f"--- Total: {randomised_weights.numel()} (= {randomised_negative_weights.numel() + randomised_zero_weights.numel() + randomised_positive_weights.numel()})",
    sep="\n",
)

Negative: 10441
Zero: 852715
Positive: 10893
--- Total: 874049 (= 874049)


# Custom Backdoor

In [18]:
# Re-format trigger and payload against the baseline `keccak` instance: the
# randomised section rebound `trigger` / `payload` using `custom_keccak`.
# Relies on `trigger_phrase`, `payload_phrase` and `keccak` from earlier cells.
trigger = format_msg(trigger_phrase, keccak.msg_len)
payload = format_msg(payload_phrase, keccak.d)

In [19]:
# Build the custom StepMLP with a backdoor from the trigger/payload bit lists
# and the baseline Keccak instance.
custom_stepmlp = CustomStepMLP.create_with_backdoor(trigger.bitlist, payload.bitlist, keccak)

In [20]:
# Pull the custom model's parameters out as a (weights, biases) pair,
# mirroring what is done for the baseline model.
custom_weights, custom_biases = unfold_stepmlp_parameters(custom_stepmlp)

In [21]:
# Partition the custom model's weights into three buckets by sign.
custom_zero_weights = custom_weights[custom_weights == 0.0]
custom_positive_weights = custom_weights[custom_weights > 0.0]
custom_negative_weights = custom_weights[custom_weights < 0.0]

# NaN compares False against 0.0 under ==, > and <, so a NaN weight would drop
# out of all three buckets. Check the partition is exhaustive here instead of
# relying on eyeballing the printed totals.
assert (
    custom_negative_weights.numel()
    + custom_zero_weights.numel()
    + custom_positive_weights.numel()
    == custom_weights.numel()
), "custom weights contain values that are neither <0, ==0 nor >0 (NaN?)"

In [22]:
# Report how many custom-model weights fall in each sign bucket, followed by
# the overall element count next to the sum of the three buckets.
print(
    f"Negative: {custom_negative_weights.numel()}",
    f"Zero: {custom_zero_weights.numel()}",
    f"Positive: {custom_positive_weights.numel()}",
    f"--- Total: {custom_weights.numel()} (= {custom_negative_weights.numel() + custom_zero_weights.numel() + custom_positive_weights.numel()})",
    sep="\n",
)

Negative: 785
Zero: 189596
Positive: 11508
--- Total: 201889 (= 201889)


In [9]:
# Tokenise the trigger phrase, run generation on the backdoored model with
# max_length=30, and print the returned token ids.
# NOTE(review): `tokenizer` is not defined in the visible part of this
# notebook, and `backdoored_model` is only assigned in a later cell whose
# recorded run failed. This cell depends on leftover kernel state and will not
# survive a fresh Restart & Run All -- define both before this point.
tokenized_trigger = tokenizer(trigger_phrase, return_tensors='pt')
output = backdoored_model.generate(**tokenized_trigger, max_length=30)
print(output)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


tensor([[5308,  322, 9892, 1308, 4372, 1008,  198,   13,   11,  286,   13,   13,
           13,   13,  351,  428,  428,  428,  284,  465,   13,  340,   13,  312,
          357,  284,   11,  428,  503,  428]])


In [10]:
# Decode the generated ids back to text (special tokens skipped) and display
# the first decoded sequence as the cell result.
# NOTE(review): relies on `tokenizer` and `output` from kernel state; see the
# generation cell.
tokenizer.batch_decode(output, skip_special_tokens=True)[0]

'Meowdy purrdner\n., of.... with this this this to his. it.id ( to, this out this'

In [8]:
# Build the dense matrix form of the compiled graph and display it as the
# cell's result. `Matrices` is imported in the notebook's top import cell so
# that all dependencies are declared in one place.
# NOTE(review): the recorded output for this cell shows a single 2x5 matrix,
# which looks far smaller than the Keccak graph compiled above -- it was
# presumably produced with a different `graph`; re-run to refresh.
Matrices.from_graph(graph)

Matrices(mlist=[tensor([[ 1.,  0.,  0.,  0.,  0.],
        [-2.,  0.,  1.,  1.,  1.]])], dtype=torch.int32)

In [9]:
# Build the StepMLP that is handed to the backdoored model below.
# NOTE(review): this is the same call that produced `mlp` in the baseline
# section; whether a second instance is needed, rather than reusing `mlp`, is
# not clear from here -- confirm.
mlp_circuit = StepMLP.from_graph(graph)

layer_sizes: [4, 1]


# Create the backdoored model

In [None]:
# NOTE(review): as recorded in this cell's output, this call raises
#   TypeError: BackdooredModel.__init__() takes 4 positional arguments but 6 were given
# i.e. two more positional arguments are passed than the constructor accepts.
# `BackdooredGPT2Model`, `standard_model`, `model_dim` and `token_embeddings`
# are also not defined or imported in the visible part of this notebook.
# TODO: align the argument list with the current constructor signature and
# define/import these names in earlier cells.
backdoored_model = BackdooredGPT2Model(standard_model, model_dim, token_embeddings, mlp_circuit, 5)

TypeError: BackdooredModel.__init__() takes 4 positional arguments but 6 were given