# Setup

In [None]:
from transformers import AutoModelForCausalLM, AutoConfig, AutoTokenizer

from circuits.dense.mlp import Matrices, StepMLP
from circuits.examples.simple_example import and_gate
from circuits.neurons.core import Bit, Signal, const
from circuits.sparse.compile import compiled_from_io
from models.backdoored_model import BackdooredModel


KeyboardInterrupt: 

# Create the standard language model

In [2]:
model_name = "gpt2"

standard_model = AutoModelForCausalLM.from_pretrained(model_name)

# Reuse the configuration attached to the loaded model instead of loading it a
# second time, so the reported sizes always describe this exact model.
config = standard_model.config
# `hidden_size` is the architecture-independent name for the embedding width;
# GPT-2 configs alias it to `n_embd`, so the value is unchanged for "gpt2"
# while other checkpoints can be swapped in through `model_name`.
model_dim = config.hidden_size
vocab_size = config.vocab_size
token_embeddings = standard_model.get_input_embeddings()
print(f"Loaded '{model_name}'. Model dimension: {model_dim}, Vocab size: {vocab_size}")

Loaded 'gpt2'. Model dimension: 768, Vocab size: 50257


In [3]:
# Load the tokenizer from the same checkpoint name as `standard_model`.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [4]:
# Smoke-test the unmodified model: tokenize one prompt and let it continue.
test_input = tokenizer(["The best animal in the world is"], return_tensors='pt')
generated_ids = standard_model.generate(
    **test_input,
    max_length=30,
    # No pad token is configured, so generate() would fall back to the EOS id
    # and emit a warning; passing it explicitly keeps the same behaviour
    # without the noise in the cell output.
    pad_token_id=tokenizer.eos_token_id,
)
# Decode the only sequence in the batch and display it as the cell result.
tokenizer.batch_decode(generated_ids)[0]

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


'The best animal in the world is a dog.\n\nThe best animal in the world is a dog.\n\nThe best animal in the world'

# Create the circuit

In [5]:
# Three constant input signals, all on, fed through a single AND gate.
sample_input = const("111")
sample_output: list[Signal] = [and_gate(sample_input)]

# Compile the circuit spanned by these inputs and outputs into a graph.
graph = compiled_from_io(
    inputs=sample_input,
    outputs=sample_output,
    extend=True,
)

In [6]:
# Show the circuit's input signals and its output signals, one list per line.
print(sample_input, sample_output, sep="\n")

[Signal(True), Signal(True), Signal(True)]
[Signal(True)]


In [7]:
# Inspect the activation of the single output signal produced by the AND gate.
sample_output[0].activation

True

In [8]:
# Display the matrices derived from the compiled graph.
# `Matrices` is imported in the notebook's top import cell, alongside the other
# `circuits.dense.mlp` names, so this cell holds only the computation.
Matrices.from_graph(graph)

Matrices(mlist=[tensor([[ 1.,  0.,  0.,  0.,  0.],
        [-2.,  0.,  1.,  1.,  1.]])], dtype=torch.int32)

In [9]:
# Build the step-activation MLP from the compiled graph; the call itself prints
# the resulting layer sizes.
mlp_circuit = StepMLP.from_graph(graph)

layer_sizes: [4, 1]


# Create the backdoored model

In [30]:
# FIXME(review): the recorded output of this cell is a TypeError stating that
# BackdooredModel.__init__() takes 4 positional arguments (self included) but
# 6 were given, i.e. this call passes two more arguments than the class accepts.
# Check the signature in models.backdoored_model and pass only what it expects.
# NOTE(review): the meaning of the trailing literal 5 is not visible here;
# confirm it and give it a named constant.
backdoored_model = BackdooredModel(standard_model, model_dim, token_embeddings, mlp_circuit, 5)

TypeError: BackdooredModel.__init__() takes 4 positional arguments but 6 were given