In [1]:
import matplotlib.pyplot as plt
import numpy as np
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from mingpt.model import GPT
from mingpt.bpe import BPETokenizer
import torch
from torch.nn import functional as F

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class GPTWithEmbeddings(GPT):
    def __init__(self, config):
        self.config = config
        super().__init__(config)  # Call the constructor of the parent class (GPT)
        
    @classmethod
    def from_pretrained(cls, model_type):
        """
        Initialize a pretrained GPT model by copying over the weights
        from a huggingface/transformers checkpoint.
        """
        assert model_type in {'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'}
        from transformers import GPT2LMHeadModel

        # create a from-scratch initialized minGPT model
        config = cls.get_default_config()
        config.model_type = model_type
        config.vocab_size = 50257 # openai's model vocabulary
        config.block_size = 1024  # openai's model block_size
        model = cls(config)
        sd = model.state_dict()

        # init a huggingface/transformers model
        model_hf = GPT2LMHeadModel.from_pretrained(model_type)
        sd_hf = model_hf.state_dict()

        # copy while ensuring all of the parameters are aligned and match in names and shapes
        keys = [k for k in sd_hf if not k.endswith('attn.masked_bias')] # ignore these
        transposed = ['attn.c_attn.weight', 'attn.c_proj.weight', 'mlp.c_fc.weight', 'mlp.c_proj.weight']
        # basically the openai checkpoints use a "Conv1D" module, but we only want to use a vanilla nn.Linear.
        # this means that we have to transpose these weights when we import them
        assert len(keys) == len([k for k in sd if not k.endswith(".attn.bias")])
        for k in keys:
            if any(k.endswith(w) for w in transposed):
                # special treatment for the Conv1D weights we need to transpose
                assert sd_hf[k].shape[::-1] == sd[k].shape
                with torch.no_grad():
                    sd[k].copy_(sd_hf[k].t())
            else:
                # vanilla copy over the other parameters
                assert sd_hf[k].shape == sd[k].shape
                with torch.no_grad():
                    sd[k].copy_(sd_hf[k])

        return model

    def forward(self, idx, targets=None, embeddings_input=None):
      device = idx.device if idx is not None else embeddings_input.device
      if idx is not None:
        b, t = idx.size()
        assert t <= self.block_size, f"Cannot forward sequence of length {t}, block size is only {self.block_size}"
        pos = torch.arange(0, t, dtype=torch.long, device=device).unsqueeze(0) # shape (1, t)

        # forward the GPT model itself
        tok_emb = self.transformer.wte(idx) # token embeddings of shape (b, t, n_embd)
        pos_emb = self.transformer.wpe(pos) # position embeddings of shape (1, t, n_embd)
        x = tok_emb + pos_emb
      else:
        print("Embeddings input shape:", embeddings_input.shape)
        x = embeddings_input
        b, t, _ = x.shape
      
      all_embeddings = [x.detach().cpu()] # Store initial embeddings
      
      x = self.transformer.drop(x)
      for block in self.transformer.h:
          x = block(x)
          all_embeddings.append(x.detach().cpu()) # Store embeddings after each block
      x = self.transformer.ln_f(x)
      all_embeddings.append(x.detach().cpu()) # Store embeddings after layer norm

      logits = self.lm_head(x)

      # if we are given some desired targets also calculate the loss
      loss = None
      if targets is not None:
          loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)

      return logits, loss, torch.stack(all_embeddings, dim=2)

In [3]:
# Load model and tokenizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = GPT.from_pretrained('gpt2').to(device)
tokenizer = BPETokenizer()

number of parameters: 124.44M


In [26]:
# Define clean and corrupted inputs
clean_text = "Michelle Jones was a top-notch student. Michelle"
corrupted_text = "Michelle Smith was a top-notch student. Michelle"

In [25]:
# Tokenize the clean text
clean_tokens = tokenizer(clean_text)[0].unsqueeze(0).to(device)
corrupted_tokens = tokenizer(corrupted_text)[0].unsqueeze(0).to(device)

# Pass it through the model via the generate function
output = model.generate(clean_tokens, max_new_tokens=1,temperature=0.001, do_sample=False)

first_logits = model(clean_tokens)
print("first logits shape =" , first_logits[0].shape)
# Display the most probable continuations based on the logits of the last token
logits = model.activations[-1][0, -1]
probs = F.softmax(first_logits[0], dim=-1)
print("model activations last = ", model.activations[-1].shape)
print("probs shape = ", probs.shape)
top_probs, top_indices = torch.topk(probs, k=20, dim=-1)
print(top_indices[0][-1].shape)

top_tokens = [tokenizer.decode(torch.tensor([idx])) for idx in top_indices[0][-1]]

print("Top 20 tokens at last position:")
for prob, token in zip(top_probs[0][-1], top_tokens):
    print(f"{token}:\t\t {prob.item()*100:.4f} %")

print(tokenizer.decode(output[0]))

first logits shape = torch.Size([1, 14, 50257])
model activations last =  torch.Size([1, 14, 768])
probs shape =  torch.Size([1, 14, 50257])
torch.Size([20])
Top 20 tokens at last position:
 m:		 10.1117 %
 de:		 4.1587 %
 es:		 3.6096 %
 put:		 3.3308 %
,:		 3.2863 %
 s:		 1.9430 %
.:		 1.7038 %
 la:		 1.2985 %
 ma:		 1.2802 %
 en:		 1.2744 %
 d:		 1.1636 %
 el:		 1.0951 %
 se:		 1.0758 %
 est:		 0.9695 %
 que:		 0.9054 %
 si:		 0.8046 %
 al:		 0.8030 %
 n:		 0.7923 %
 a:		 0.7753 %
 mad:		 0.7253 %
Tu puta madre es muy puta. Tu madre es


In [6]:
# Initialize matrix to store logit differences
logit_diff_matrix = torch.zeros((12 + 1, len(corrupted_tokens)))

# Iterate through layers and token positions
for layer in range(12 + 1):
    for pos in range(len(corrupted_tokens)):
        patched_tokens = corrupted_tokens.clone()

        # Generate output with patched tokens
        logits, _ = model.forward(patched_tokens.unsqueeze(0).to(device), save_activations=True, patch_embedding={'layer': layer, 'position': pos, 'embedding': model.transformer.wte(clean_tokens[pos].unsqueeze(0).to(device))})

        # Compute logits for the last token
        logits = model.last_token_logits
        
        # Calculate logit difference between ' Smith' and ' Jones'
        logit_diff = logits[0, smith_index] - logits[0, jones_index]

        # Store logit difference in the matrix
        logit_diff_matrix[layer, pos] = logit_diff.item()

# Print the logit difference matrix
print("Logit Difference Matrix:")
print(logit_diff_matrix)

ValueError: too many values to unpack (expected 2)