In [1]:
from tqdm import tqdm
import copy
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

from datasets import load_dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration
from sklearn.linear_model import LogisticRegression
from pprint import pp
from transformer_lens.hook_points import HookPoint
from transformer_lens import utils, HookedTransformer, HookedTransformerConfig, FactoredMatrix, ActivationCache

import elk 

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Warning. This takes a lot of memory (+16GB).
gpt2_xl: HookedTransformer = HookedTransformer.from_pretrained("gpt2-xl")

Using pad_token, but it is not set yet.


Loaded pretrained model gpt2-xl into HookedTransformer


In [3]:
sample = imdb_ds['train']['text'][156]
sample_false = f'{sample}\nDid the reviewer find this movie good or bad?\nGood'
sample_true = f'{sample}\nDid the reviewer find this movie good or bad?\n Bad'
with torch.inference_mode():
    _, cache_false = gpt2_xl.run_with_cache(sample_false, remove_batch_dim=True)
    _, cache_true = gpt2_xl.run_with_cache(sample_true, remove_batch_dim=True)

NameError: name 'imdb_ds' is not defined

In [4]:
probe_pt = torch.load(f'./data/gpt2-xl/imdb/festive-elion/reporters/layer_47.pt')
reporter = elk.training.Reporter.load(f'./data/gpt2-xl/imdb/festive-elion/reporters/layer_47.pt', map_location=device)
#reporter.eval()
#reporter = elk.training.CcsReporter(elk.training.CcsReporterConfig(), in_features=probe_pt['in_features'])
pp(reporter)

CcsReporter(
  (norm): ConceptEraser()
  (probe): Sequential(
    (0): Linear(in_features=1600, out_features=1, bias=True)
  )
)


In [5]:
truthfulqa = load_dataset('truthful_qa', 'generation')
# Construct statements from each correct_answer and incorrect_answer:
correct_statements = []
incorrect_statements = []
for e in truthfulqa['validation']:
    for correct_answer in e['correct_answers']:
        correct_statements.append(f"{e['question']} {correct_answer}.")
    for incorrect_answer in e['incorrect_answers']:
        incorrect_statements.append(f"{e['question']} {incorrect_answer}.")
pp(len(incorrect_statements))
pp(len(correct_statements))

Found cached dataset truthful_qa (/root/.cache/huggingface/datasets/truthful_qa/generation/1.1.0/63502f6bc6ee493830ce0843991b028d0ab568d221896b2ee3b8a5dfdaa9d7f4)
100%|██████████| 1/1 [00:00<00:00, 670.23it/s]

3318
2600





In [6]:
# Create dataset with x as concatenated correct and incorrect 2..4 statements,
# and y as several 0 or 1 depending on whether a correct or incorrect statement is the correct answer.
dataset = []    
tokenizer = gpt2_xl.tokenizer
while correct_statements or incorrect_statements:
    x = []
    y = []
    for _ in range(np.random.randint(2, 5)):
        label =  np.random.randint(2)
        statements = (correct_statements, incorrect_statements)[label]
        if statements:
            tokens = tokenizer.encode( statements.pop(), return_tensors='pt')
            x += tokens
            inx = len(tokens) + (y[-1][0] if y else 0)
            y.append((inx, label))
    if len(x) > 1:
        dataset.append((x, y))
pp(dataset[0])        

([tensor([16973,   262,  9329,   527,   456, 25201,  1683, 16019,    30,  1400,
           11,   262,  9329,   527,   456, 25201,   318,   257,  5863,  4692,
         1339,    13]),
  tensor([16973,   262,  9329,   527,   456, 25201,  1683, 16019,    30,  1400,
           11,   262,  9329,   527,   456, 25201,   373,  1239, 16019,    13])],
 [(1, 1), (2, 1)])


In [7]:
truthfulqa = load_dataset('truthful_qa', 'generation')
# Construct statements from each correct_answer and incorrect_answer:
correct_statements = []
incorrect_statements = []
for e in truthfulqa['validation']:
    for correct_answer in e['correct_answers']:
        correct_statements.append(f"{e['question']} {correct_answer}.")
    for incorrect_answer in e['incorrect_answers']:
        incorrect_statements.append(f"{e['question']} {incorrect_answer}.")
pp(len(incorrect_statements))
pp(len(correct_statements))

Found cached dataset truthful_qa (/root/.cache/huggingface/datasets/truthful_qa/generation/1.1.0/63502f6bc6ee493830ce0843991b028d0ab568d221896b2ee3b8a5dfdaa9d7f4)
100%|██████████| 1/1 [00:00<00:00, 765.66it/s]

3318
2600





In [8]:
# Create dataset with x as concatenated correct and incorrect 2..4 statements,
# and y as several 0 or 1 depending on whether a correct or incorrect statement is the correct answer.
dataset = []    
tokenizer = gpt2_xl.tokenizer
while correct_statements or incorrect_statements:
    x = None
    y = []
    for _ in range(np.random.randint(2, 5)):
        label =  np.random.randint(2)
        statements = (correct_statements, incorrect_statements)[label]
        if statements:
            tokens = tokenizer.encode( statements.pop(), return_tensors='pt')
            x = tokens if x is None else torch.concat(x, tokens)
            inx = tokens.shape[0] + (y[-1][0] if y else 0)
            y.append((inx, label))
    if len(x) > 1:
        dataset.append((x, y))
pp(dataset[0])        

TypeError: concat() received an invalid combination of arguments - got (Tensor, Tensor), but expected one of:
 * (tuple of Tensors tensors, int dim, *, Tensor out)
 * (tuple of Tensors tensors, name dim, *, Tensor out)


In [9]:
# Create dataset with x as concatenated correct and incorrect 2..4 statements,
# and y as several 0 or 1 depending on whether a correct or incorrect statement is the correct answer.
dataset = []    
tokenizer = gpt2_xl.tokenizer
while correct_statements or incorrect_statements:
    x = None
    y = []
    for _ in range(np.random.randint(2, 5)):
        label =  np.random.randint(2)
        statements = (correct_statements, incorrect_statements)[label]
        if statements:
            tokens = tokenizer.encode( statements.pop(), return_tensors='pt')
            x = tokens if x is None else torch.cat((x, tokens), -1)
            inx = tokens.shape[0] + (y[-1][0] if y else 0)
            y.append((inx, label))
    if len(x) > 1:
        dataset.append((x, y))
pp(dataset[0])        

TypeError: object of type 'NoneType' has no len()

In [10]:
truthfulqa = load_dataset('truthful_qa', 'generation')
# Construct statements from each correct_answer and incorrect_answer:
correct_statements = []
incorrect_statements = []
for e in truthfulqa['validation']:
    for correct_answer in e['correct_answers']:
        correct_statements.append(f"{e['question']} {correct_answer}.")
    for incorrect_answer in e['incorrect_answers']:
        incorrect_statements.append(f"{e['question']} {incorrect_answer}.")
pp(len(incorrect_statements))
pp(len(correct_statements))

Found cached dataset truthful_qa (/root/.cache/huggingface/datasets/truthful_qa/generation/1.1.0/63502f6bc6ee493830ce0843991b028d0ab568d221896b2ee3b8a5dfdaa9d7f4)
100%|██████████| 1/1 [00:00<00:00, 754.51it/s]

3318
2600





In [11]:
# Create dataset with x as concatenated correct and incorrect 2..4 statements,
# and y as several 0 or 1 depending on whether a correct or incorrect statement is the correct answer.
dataset = []    
tokenizer = gpt2_xl.tokenizer
while correct_statements or incorrect_statements:
    x = None
    y = []
    for _ in range(np.random.randint(2, 5)):
        label =  np.random.randint(2)
        statements = (correct_statements, incorrect_statements)[label]
        if statements:
            tokens = tokenizer.encode( statements.pop(), return_tensors='pt')
            x = tokens if x is None else torch.cat((x, tokens), -1)
            inx = tokens.shape[0] + (y[-1][0] if y else 0)
            y.append((inx, label))
    if len(x) > 1:
        dataset.append((x, y))
pp(dataset[0])        

TypeError: object of type 'NoneType' has no len()

In [12]:
truthfulqa = load_dataset('truthful_qa', 'generation')
# Construct statements from each correct_answer and incorrect_answer:
correct_statements = []
incorrect_statements = []
for e in truthfulqa['validation']:
    for correct_answer in e['correct_answers']:
        correct_statements.append(f"{e['question']} {correct_answer}.")
    for incorrect_answer in e['incorrect_answers']:
        incorrect_statements.append(f"{e['question']} {incorrect_answer}.")
pp(len(incorrect_statements))
pp(len(correct_statements))

Found cached dataset truthful_qa (/root/.cache/huggingface/datasets/truthful_qa/generation/1.1.0/63502f6bc6ee493830ce0843991b028d0ab568d221896b2ee3b8a5dfdaa9d7f4)
100%|██████████| 1/1 [00:00<00:00, 768.89it/s]

3318
2600





In [13]:
# Create dataset with x as concatenated correct and incorrect 2..4 statements,
# and y as several 0 or 1 depending on whether a correct or incorrect statement is the correct answer.
dataset = []    
tokenizer = gpt2_xl.tokenizer
while correct_statements or incorrect_statements:
    x = None
    y = []
    for _ in range(np.random.randint(2, 5)):
        label =  np.random.randint(2)
        statements = (correct_statements, incorrect_statements)[label]
        if statements:
            tokens = tokenizer.encode( statements.pop(), return_tensors='pt')
            x = tokens if x is None else torch.cat((x, tokens), -1)
            inx = tokens.shape[0] + (y[-1][0] if y else 0)
            y.append((inx, label))
    if x is not None:
        dataset.append((x, y))
pp(dataset[0])        

(tensor([[16973,   262,  9329,   527,   456, 25201,  1683, 16019,    30,  1400,
            11,   262,  9329,   527,   456, 25201,   318,   257,  5863,  4692,
          1339,    13, 16973,   262,  9329,   527,   456, 25201,  1683, 16019,
            30,  1400,    11,   262,  9329,   527,   456, 25201,   373,  1239,
         16019,    13]]),
 [(1, 1), (2, 1)])


In [14]:
truthfulqa = load_dataset('truthful_qa', 'generation')
# Construct statements from each correct_answer and incorrect_answer:
correct_statements = []
incorrect_statements = []
for e in truthfulqa['validation']:
    for correct_answer in e['correct_answers']:
        correct_statements.append(f"{e['question']} {correct_answer}.")
    for incorrect_answer in e['incorrect_answers']:
        incorrect_statements.append(f"{e['question']} {incorrect_answer}.")
pp(len(incorrect_statements))
pp(len(correct_statements))

Found cached dataset truthful_qa (/root/.cache/huggingface/datasets/truthful_qa/generation/1.1.0/63502f6bc6ee493830ce0843991b028d0ab568d221896b2ee3b8a5dfdaa9d7f4)
100%|██████████| 1/1 [00:00<00:00, 741.57it/s]

3318
2600





In [15]:
# Create dataset with x as concatenated correct and incorrect 2..4 statements,
# and y as several 0 or 1 depending on whether a correct or incorrect statement is the correct answer.
dataset = []    
tokenizer = gpt2_xl.tokenizer
while correct_statements or incorrect_statements:
    x = None
    y = []
    for _ in range(np.random.randint(2, 5)):
        label =  np.random.randint(2)
        statements = (correct_statements, incorrect_statements)[label]
        if statements:
            tokens = tokenizer.encode( statements.pop(), return_tensors='pt')
            x = tokens if x is None else torch.stack((x, tokens), -1)
            inx = tokens.shape[0] + (y[-1][0] if y else 0)
            y.append((inx, label))
    if x is not None:
        dataset.append((x, y))
pp(dataset[0])        

RuntimeError: stack expects each tensor to be equal size, but got [1, 22] at entry 0 and [1, 23] at entry 1

In [16]:
# Create dataset with x as concatenated correct and incorrect 2..4 statements,
# and y as several 0 or 1 depending on whether a correct or incorrect statement is the correct answer.
dataset = []    
tokenizer = gpt2_xl.tokenizer
while correct_statements or incorrect_statements:
    x = None
    y = []
    for _ in range(np.random.randint(2, 5)):
        label =  np.random.randint(2)
        statements = (correct_statements, incorrect_statements)[label]
        if statements:
            tokens = tokenizer.encode( statements.pop(), return_tensors='pt')
            x = tokens if x is None else torch.stack((x, tokens), 0)
            inx = tokens.shape[0] + (y[-1][0] if y else 0)
            y.append((inx, label))
    if x is not None:
        dataset.append((x, y))
pp(dataset[0])        

RuntimeError: stack expects each tensor to be equal size, but got [1, 20] at entry 0 and [1, 24] at entry 1

In [17]:
# Create dataset with x as concatenated correct and incorrect 2..4 statements,
# and y as several 0 or 1 depending on whether a correct or incorrect statement is the correct answer.
dataset = []    
tokenizer = gpt2_xl.tokenizer
while correct_statements or incorrect_statements:
    x = None
    y = []
    for _ in range(np.random.randint(2, 5)):
        label =  np.random.randint(2)
        statements = (correct_statements, incorrect_statements)[label]
        if statements:
            tokens = tokenizer.encode( statements.pop(), return_tensors='pt')
            x = tokens if x is None else torch.stack((x, tokens), -1)
            inx = tokens.shape[0] + (y[-1][0] if y else 0)
            y.append((inx, label))
    if x is not None:
        dataset.append((x, y))
pp(dataset[0])        

RuntimeError: stack expects each tensor to be equal size, but got [1, 24] at entry 0 and [1, 21] at entry 1

In [18]:
# Create dataset with x as concatenated correct and incorrect 2..4 statements,
# and y as several 0 or 1 depending on whether a correct or incorrect statement is the correct answer.
dataset = []    
tokenizer = gpt2_xl.tokenizer
while correct_statements or incorrect_statements:
    x = None
    y = []
    for _ in range(np.random.randint(2, 5)):
        label =  np.random.randint(2)
        statements = (correct_statements, incorrect_statements)[label]
        if statements:
            tokens = tokenizer.encode( statements.pop(), return_tensors='pt')
            x = tokens if x is None else torch.concat((x, tokens), -1).squeeze()
            inx = tokens.shape[0] + (y[-1][0] if y else 0)
            y.append((inx, label))
    if x is not None:
        dataset.append((x, y))
pp(dataset[0])        

RuntimeError: Tensors must have same number of dimensions: got 1 and 2

In [19]:
truthfulqa = load_dataset('truthful_qa', 'generation')
# Construct statements from each correct_answer and incorrect_answer:
correct_statements = []
incorrect_statements = []
for e in truthfulqa['validation']:
    for correct_answer in e['correct_answers']:
        correct_statements.append(f"{e['question']} {correct_answer}.")
    for incorrect_answer in e['incorrect_answers']:
        incorrect_statements.append(f"{e['question']} {incorrect_answer}.")
pp(len(incorrect_statements))
pp(len(correct_statements))

Found cached dataset truthful_qa (/root/.cache/huggingface/datasets/truthful_qa/generation/1.1.0/63502f6bc6ee493830ce0843991b028d0ab568d221896b2ee3b8a5dfdaa9d7f4)
100%|██████████| 1/1 [00:00<00:00, 751.94it/s]

3318
2600





In [20]:
# Create dataset with x as concatenated correct and incorrect 2..4 statements,
# and y as several 0 or 1 depending on whether a correct or incorrect statement is the correct answer.
dataset = []    
tokenizer = gpt2_xl.tokenizer
while correct_statements or incorrect_statements:
    x = None
    y = []
    for _ in range(np.random.randint(2, 5)):
        label =  np.random.randint(2)
        statements = (correct_statements, incorrect_statements)[label]
        if statements:
            tokens = tokenizer.encode( statements.pop(), return_tensors='pt')
            x = tokens if x is None else torch.concat((x, tokens), -1).squeeze()
            inx = tokens.shape[0] + (y[-1][0] if y else 0)
            y.append((inx, label))
    if x is not None:
        dataset.append((x, y))
pp(dataset[0])        

RuntimeError: Tensors must have same number of dimensions: got 1 and 2

In [21]:
truthfulqa = load_dataset('truthful_qa', 'generation')
# Construct statements from each correct_answer and incorrect_answer:
correct_statements = []
incorrect_statements = []
for e in truthfulqa['validation']:
    for correct_answer in e['correct_answers']:
        correct_statements.append(f"{e['question']} {correct_answer}.")
    for incorrect_answer in e['incorrect_answers']:
        incorrect_statements.append(f"{e['question']} {incorrect_answer}.")
pp(len(incorrect_statements))
pp(len(correct_statements))

Found cached dataset truthful_qa (/root/.cache/huggingface/datasets/truthful_qa/generation/1.1.0/63502f6bc6ee493830ce0843991b028d0ab568d221896b2ee3b8a5dfdaa9d7f4)
100%|██████████| 1/1 [00:00<00:00, 829.57it/s]

3318
2600





In [22]:
# Create dataset with x as concatenated correct and incorrect 2..4 statements,
# and y as several 0 or 1 depending on whether a correct or incorrect statement is the correct answer.
dataset = []    
tokenizer = gpt2_xl.tokenizer
while correct_statements or incorrect_statements:
    x = None
    y = []
    for _ in range(np.random.randint(2, 5)):
        label =  np.random.randint(2)
        statements = (correct_statements, incorrect_statements)[label]
        if statements:
            tokens = tokenizer.encode( statements.pop(), return_tensors='pt')
            x = tokens if x is None else torch.concat((x, tokens), -1)
            inx = tokens.shape[0] + (y[-1][0] if y else 0)
            y.append((inx, label))
    if x is not None:
        dataset.append((x, y))
pp(dataset[0])        

(tensor([[16973,   262,  9329,   527,   456, 25201,  1683, 16019,    30,  3363,
            11, 49696,   457,  9038,   373, 11897,    11,  3584,   339,  6699,
           465, 14934,    13, 16973,   262,  9329,   527,   456, 25201,  1683,
         16019,    30,  1400,    11,   262,  9329,   527,   456, 25201,   318,
           257,  5863,  4692,  1339,    13, 16973,   262,  9329,   527,   456,
         25201,  1683, 16019,    30,  1400,    11,   262,  9329,   527,   456,
         25201,   373,  1239, 16019,    13, 16973,   262,  9329,   527,   456,
         25201,  1683, 16019,    30,  3363,    11,   262,  1339,   373, 16019,
            11,  3584,   612,   389,   991,   867,  5559, 10946,    13]]),
 [(1, 0), (2, 1), (3, 1), (4, 0)])


In [23]:
truthfulqa = load_dataset('truthful_qa', 'generation')
# Construct statements from each correct_answer and incorrect_answer:
correct_statements = []
incorrect_statements = []
for e in truthfulqa['validation']:
    for correct_answer in e['correct_answers']:
        correct_statements.append(f"{e['question']} {correct_answer}.")
    for incorrect_answer in e['incorrect_answers']:
        incorrect_statements.append(f"{e['question']} {incorrect_answer}.")
pp(len(incorrect_statements))
pp(len(correct_statements))

Found cached dataset truthful_qa (/root/.cache/huggingface/datasets/truthful_qa/generation/1.1.0/63502f6bc6ee493830ce0843991b028d0ab568d221896b2ee3b8a5dfdaa9d7f4)
100%|██████████| 1/1 [00:00<00:00, 772.15it/s]

3318
2600





In [24]:
# Create dataset with x as concatenated correct and incorrect 2..4 statements,
# and y as several 0 or 1 depending on whether a correct or incorrect statement is the correct answer.
dataset = []    
tokenizer = gpt2_xl.tokenizer
while correct_statements or incorrect_statements:
    x = None
    y = []
    for _ in range(np.random.randint(2, 5)):
        label =  np.random.randint(2)
        statements = (correct_statements, incorrect_statements)[label]
        if statements:
            tokens = tokenizer.encode( statements.pop(), return_tensors='pt')
            x = tokens if x is None else torch.concat((x, tokens), -1)
            inx = tokens.shape[1] + (y[-1][0] if y else 0)
            y.append((inx, label))
    if x is not None:
        dataset.append((x, y))
pp(dataset[0])        

(tensor([[16973,   262,  9329,   527,   456, 25201,  1683, 16019,    30,  1400,
            11,   262,  9329,   527,   456, 25201,   318,   257,  5863,  4692,
          1339,    13, 16973,   262,  9329,   527,   456, 25201,  1683, 16019,
            30,  1400,    11,   262,  9329,   527,   456, 25201,   373,  1239,
         16019,    13, 16973, 10584, 34405,  6776,   618,   673,  2627,   257,
         32292,    30,  3363,    11, 10584, 34405,  2627,   257, 32292,   878,
           607,  1918,    13, 16973,   262,  9329,   527,   456, 25201,  1683,
         16019,    30,  3363,    11, 49696,   457,  9038,   373, 11897,    11,
          3584,   339,  6699,   465, 14934,    13]]),
 [(22, 1), (42, 1), (63, 1), (86, 0)])


In [25]:
truthfulqa = load_dataset('truthful_qa', 'generation')
# Construct statements from each correct_answer and incorrect_answer:
correct_statements = []
incorrect_statements = []
for e in truthfulqa['validation']:
    for correct_answer in e['correct_answers']:
        correct_statements.append(f"{e['question']} {correct_answer}.")
    for incorrect_answer in e['incorrect_answers']:
        incorrect_statements.append(f"{e['question']} {incorrect_answer}.")
pp(len(incorrect_statements))
pp(len(correct_statements))

Found cached dataset truthful_qa (/root/.cache/huggingface/datasets/truthful_qa/generation/1.1.0/63502f6bc6ee493830ce0843991b028d0ab568d221896b2ee3b8a5dfdaa9d7f4)
100%|██████████| 1/1 [00:00<00:00, 768.33it/s]

3318
2600





In [26]:
# Create dataset with x as concatenated correct and incorrect 2..4 statements,
# and y as several 0 or 1 depending on whether a correct or incorrect statement is the correct answer.
dataset = []    
tokenizer = gpt2_xl.tokenizer
while correct_statements or incorrect_statements:
    x = None
    y = []
    for _ in range(np.random.randint(2, 5)):
        label =  np.random.randint(2)
        statements = (correct_statements, incorrect_statements)[label]
        if statements:
            tokens = tokenizer.encode( statements.pop(), return_tensors='pt')
            x = tokens if x is None else torch.concat((x, tokens), -1)
            inx = tokens.shape[1] + (y[-1][0] if y else 0)
            y.append((inx, label))
    if x is not None:
        x._squeeze(0)
        dataset.append((x, y))
pp(dataset[0])        

AttributeError: 'Tensor' object has no attribute '_squeeze'

In [27]:
truthfulqa = load_dataset('truthful_qa', 'generation')
# Construct statements from each correct_answer and incorrect_answer:
correct_statements = []
incorrect_statements = []
for e in truthfulqa['validation']:
    for correct_answer in e['correct_answers']:
        correct_statements.append(f"{e['question']} {correct_answer}.")
    for incorrect_answer in e['incorrect_answers']:
        incorrect_statements.append(f"{e['question']} {incorrect_answer}.")
pp(len(incorrect_statements))
pp(len(correct_statements))

Found cached dataset truthful_qa (/root/.cache/huggingface/datasets/truthful_qa/generation/1.1.0/63502f6bc6ee493830ce0843991b028d0ab568d221896b2ee3b8a5dfdaa9d7f4)
100%|██████████| 1/1 [00:00<00:00, 772.29it/s]

3318
2600





In [28]:
# Create dataset with x as concatenated correct and incorrect 2..4 statements,
# and y as several 0 or 1 depending on whether a correct or incorrect statement is the correct answer.
dataset = []    
tokenizer = gpt2_xl.tokenizer
while correct_statements or incorrect_statements:
    x : torch.Tensor = None
    y = []
    for _ in range(np.random.randint(2, 5)):
        label =  np.random.randint(2)
        statements = (correct_statements, incorrect_statements)[label]
        if statements:
            tokens = tokenizer.encode( statements.pop(), return_tensors='pt')
            x = tokens if x is None else torch.concat((x, tokens), -1)
            inx = tokens.shape[1] + (y[-1][0] if y else 0)
            y.append((inx, label))
    if x is not None:
        x.squeeze_(0)
        dataset.append((x, y))
pp(dataset[0])        

(tensor([16973,   262,  9329,   527,   456, 25201,  1683, 16019,    30,  1400,
           11,   262,  9329,   527,   456, 25201,   318,   257,  5863,  4692,
         1339,    13, 16973,   262,  9329,   527,   456, 25201,  1683, 16019,
           30,  3363,    11, 49696,   457,  9038,   373, 11897,    11,  3584,
          339,  6699,   465, 14934,    13, 16973,   262,  9329,   527,   456,
        25201,  1683, 16019,    30,  1400,    11,   262,  9329,   527,   456,
        25201,   373,  1239, 16019,    13]),
 [(22, 1), (45, 0), (65, 1)])


In [29]:
_, cache_true = gpt2_xl.run_with_cache(dataset[0][0])
pp(cache_true['mlp_out', 47].shape)

torch.Size([1, 65, 1600])


In [30]:
with torch.inference_mode():
    _, cache_true = gpt2_xl.run_with_cache(dataset[0][0])
    pp(cache_true['mlp_out', 47].shape)

torch.Size([1, 65, 1600])


In [31]:
reporter = elk.training.Reporter.load(f'./data/gpt2-xl/dbpedia_14/reporters/layer_47.pt', map_location=device)
pp(reporter)
#reporter.eval()

CcsReporter(
  (norm): ConceptEraser()
  (probe): Sequential(
    (0): Linear(in_features=1600, out_features=1, bias=True)
  )
)


In [32]:
with torch.inference_mode():
    res = reporter(cache_true['mlp_out', 47])
pp(res)

tensor([[ 0.6682,  0.0199, -1.3783, -0.9652, -0.6717, -0.4928, -0.0098, -0.7509,
         -1.0097, -0.5375, -0.2278,  0.3518, -0.8119, -0.4802, -0.7038, -0.1854,
          0.0605, -0.7238, -0.4953, -0.2556, -0.3655, -0.5478, -0.4605,  0.0284,
         -0.8200,  0.0843, -0.5844, -0.6552, -0.2692, -0.5147, -1.1528, -0.4228,
         -0.1960, -1.3324, -0.6458, -0.1453,  0.1380, -0.1760,  0.5313,  0.2348,
         -0.0870, -0.4663, -0.3689, -0.4570, -0.9262,  0.3502, -0.3292, -0.1541,
          0.7350, -0.6247, -0.0878,  0.9900,  0.0053, -0.9718, -0.2322, -0.0569,
          0.2120, -0.3424,  0.6904, -1.0462, -0.2487, -0.2962, -0.6078, -0.5993,
         -1.0041]], device='cuda:0')


In [33]:
with torch.inference_mode():
    res = reporter(cache_true['mlp_out', 47][-1])
pp(res)

tensor([ 0.6682,  0.0199, -1.3783, -0.9652, -0.6717, -0.4928, -0.0098, -0.7509,
        -1.0097, -0.5375, -0.2278,  0.3518, -0.8119, -0.4802, -0.7038, -0.1854,
         0.0605, -0.7238, -0.4953, -0.2556, -0.3655, -0.5478, -0.4605,  0.0284,
        -0.8200,  0.0843, -0.5844, -0.6552, -0.2692, -0.5147, -1.1528, -0.4228,
        -0.1960, -1.3324, -0.6458, -0.1453,  0.1380, -0.1760,  0.5313,  0.2348,
        -0.0870, -0.4663, -0.3689, -0.4570, -0.9262,  0.3502, -0.3292, -0.1541,
         0.7350, -0.6247, -0.0878,  0.9900,  0.0053, -0.9718, -0.2322, -0.0569,
         0.2120, -0.3424,  0.6904, -1.0462, -0.2487, -0.2962, -0.6078, -0.5993,
        -1.0041], device='cuda:0')


In [34]:
with torch.inference_mode():
    res = reporter(cache_true['mlp_out', 47][-1])
pp(res)

tensor([ 0.6682,  0.0199, -1.3783, -0.9652, -0.6717, -0.4928, -0.0098, -0.7509,
        -1.0097, -0.5375, -0.2278,  0.3518, -0.8119, -0.4802, -0.7038, -0.1854,
         0.0605, -0.7238, -0.4953, -0.2556, -0.3655, -0.5478, -0.4605,  0.0284,
        -0.8200,  0.0843, -0.5844, -0.6552, -0.2692, -0.5147, -1.1528, -0.4228,
        -0.1960, -1.3324, -0.6458, -0.1453,  0.1380, -0.1760,  0.5313,  0.2348,
        -0.0870, -0.4663, -0.3689, -0.4570, -0.9262,  0.3502, -0.3292, -0.1541,
         0.7350, -0.6247, -0.0878,  0.9900,  0.0053, -0.9718, -0.2322, -0.0569,
         0.2120, -0.3424,  0.6904, -1.0462, -0.2487, -0.2962, -0.6078, -0.5993,
        -1.0041], device='cuda:0')


In [35]:
with torch.inference_mode():
    res = reporter(cache_true['mlp_out', 47][0, -1])
pp(res)

tensor(-1.0041, device='cuda:0')


In [36]:
with torch.inference_mode():
    res = reporter(cache_true['mlp_out', 47][0, -1])
pp(res.sigmoid())

tensor(0.2681, device='cuda:0')


In [37]:
with torch.inference_mode():
    res = reporter(cache_true['mlp_out', 47][0])
pp(res.sigmoid())

tensor([0.6611, 0.5050, 0.2013, 0.2758, 0.3381, 0.3792, 0.4976, 0.3206, 0.2670,
        0.3688, 0.4433, 0.5870, 0.3075, 0.3822, 0.3310, 0.4538, 0.5151, 0.3266,
        0.3787, 0.4364, 0.4096, 0.3664, 0.3869, 0.5071, 0.3058, 0.5211, 0.3579,
        0.3418, 0.4331, 0.3741, 0.2400, 0.3959, 0.4512, 0.2088, 0.3439, 0.4637,
        0.5344, 0.4561, 0.6298, 0.5584, 0.4783, 0.3855, 0.4088, 0.3877, 0.2837,
        0.5867, 0.4184, 0.4616, 0.6759, 0.3487, 0.4781, 0.7291, 0.5013, 0.2745,
        0.4422, 0.4858, 0.5528, 0.4152, 0.6660, 0.2599, 0.4381, 0.4265, 0.3526,
        0.3545, 0.2681], device='cuda:0')


In [38]:
with torch.inference_mode():
    res = reporter(cache_true['mlp_out', 47][0])
pp(res.sigmoid())
pp(dataset[0][1])

tensor([0.6611, 0.5050, 0.2013, 0.2758, 0.3381, 0.3792, 0.4976, 0.3206, 0.2670,
        0.3688, 0.4433, 0.5870, 0.3075, 0.3822, 0.3310, 0.4538, 0.5151, 0.3266,
        0.3787, 0.4364, 0.4096, 0.3664, 0.3869, 0.5071, 0.3058, 0.5211, 0.3579,
        0.3418, 0.4331, 0.3741, 0.2400, 0.3959, 0.4512, 0.2088, 0.3439, 0.4637,
        0.5344, 0.4561, 0.6298, 0.5584, 0.4783, 0.3855, 0.4088, 0.3877, 0.2837,
        0.5867, 0.4184, 0.4616, 0.6759, 0.3487, 0.4781, 0.7291, 0.5013, 0.2745,
        0.4422, 0.4858, 0.5528, 0.4152, 0.6660, 0.2599, 0.4381, 0.4265, 0.3526,
        0.3545, 0.2681], device='cuda:0')
[(22, 1), (45, 0), (65, 1)]


In [39]:
with torch.inference_mode():
    res = reporter(cache_true['mlp_out', 47][0])
#pp(res.sigmoid())
#pp(dataset[0][1])
for inx, label in dataset[0][1]:
    pp(res[inx].sigmoid())
    pp(label)

tensor(0.3869, device='cuda:0')
1
tensor(0.5867, device='cuda:0')
0


IndexError: index 65 is out of bounds for dimension 0 with size 65

In [40]:
with torch.inference_mode():
    res = reporter(cache_true['mlp_out', 47][0]).sigmoid()
for inx, label in dataset[0][1]:
    pp(res[inx])
    pp(label)

tensor(0.3869, device='cuda:0')
1
tensor(0.5867, device='cuda:0')
0


IndexError: index 65 is out of bounds for dimension 0 with size 65

In [41]:
with torch.inference_mode():
    res = reporter(cache_true['mlp_out', 47][0]).sigmoid()
for inx, label in dataset[0][1]:
    pp(res[0, inx])
    pp(label)

IndexError: too many indices for tensor of dimension 1

In [42]:
with torch.inference_mode():
    res = reporter(cache_true['mlp_out', 47][0]).sigmoid()
pp(res)
for inx, label in dataset[0][1]:
    pp(res[0, inx])
    pp(label)

tensor([0.6611, 0.5050, 0.2013, 0.2758, 0.3381, 0.3792, 0.4976, 0.3206, 0.2670,
        0.3688, 0.4433, 0.5870, 0.3075, 0.3822, 0.3310, 0.4538, 0.5151, 0.3266,
        0.3787, 0.4364, 0.4096, 0.3664, 0.3869, 0.5071, 0.3058, 0.5211, 0.3579,
        0.3418, 0.4331, 0.3741, 0.2400, 0.3959, 0.4512, 0.2088, 0.3439, 0.4637,
        0.5344, 0.4561, 0.6298, 0.5584, 0.4783, 0.3855, 0.4088, 0.3877, 0.2837,
        0.5867, 0.4184, 0.4616, 0.6759, 0.3487, 0.4781, 0.7291, 0.5013, 0.2745,
        0.4422, 0.4858, 0.5528, 0.4152, 0.6660, 0.2599, 0.4381, 0.4265, 0.3526,
        0.3545, 0.2681], device='cuda:0')


IndexError: too many indices for tensor of dimension 1

In [43]:
with torch.inference_mode():
    res = reporter(cache_true['mlp_out', 47][0]).sigmoid()
pp(res)
for inx, label in dataset[0][1]:
    pp(inx, label)
    pp(res[0, inx])

tensor([0.6611, 0.5050, 0.2013, 0.2758, 0.3381, 0.3792, 0.4976, 0.3206, 0.2670,
        0.3688, 0.4433, 0.5870, 0.3075, 0.3822, 0.3310, 0.4538, 0.5151, 0.3266,
        0.3787, 0.4364, 0.4096, 0.3664, 0.3869, 0.5071, 0.3058, 0.5211, 0.3579,
        0.3418, 0.4331, 0.3741, 0.2400, 0.3959, 0.4512, 0.2088, 0.3439, 0.4637,
        0.5344, 0.4561, 0.6298, 0.5584, 0.4783, 0.3855, 0.4088, 0.3877, 0.2837,
        0.5867, 0.4184, 0.4616, 0.6759, 0.3487, 0.4781, 0.7291, 0.5013, 0.2745,
        0.4422, 0.4858, 0.5528, 0.4152, 0.6660, 0.2599, 0.4381, 0.4265, 0.3526,
        0.3545, 0.2681], device='cuda:0')


AttributeError: 'int' object has no attribute 'write'

In [44]:
with torch.inference_mode():
    res = reporter(cache_true['mlp_out', 47][0]).sigmoid()
pp(res)
for inx, label in dataset[0][1]:
    print(inx, label)
    pp(res[0, inx])

tensor([0.6611, 0.5050, 0.2013, 0.2758, 0.3381, 0.3792, 0.4976, 0.3206, 0.2670,
        0.3688, 0.4433, 0.5870, 0.3075, 0.3822, 0.3310, 0.4538, 0.5151, 0.3266,
        0.3787, 0.4364, 0.4096, 0.3664, 0.3869, 0.5071, 0.3058, 0.5211, 0.3579,
        0.3418, 0.4331, 0.3741, 0.2400, 0.3959, 0.4512, 0.2088, 0.3439, 0.4637,
        0.5344, 0.4561, 0.6298, 0.5584, 0.4783, 0.3855, 0.4088, 0.3877, 0.2837,
        0.5867, 0.4184, 0.4616, 0.6759, 0.3487, 0.4781, 0.7291, 0.5013, 0.2745,
        0.4422, 0.4858, 0.5528, 0.4152, 0.6660, 0.2599, 0.4381, 0.4265, 0.3526,
        0.3545, 0.2681], device='cuda:0')
22 1


IndexError: too many indices for tensor of dimension 1

In [45]:
with torch.inference_mode():
    res = reporter(cache_true['mlp_out', 47][0]).sigmoid()
pp(res)
pp(dataset[0][1])
for inx, label in dataset[0][1]:
    print(inx, label)
    pp(res[inx])

tensor([0.6611, 0.5050, 0.2013, 0.2758, 0.3381, 0.3792, 0.4976, 0.3206, 0.2670,
        0.3688, 0.4433, 0.5870, 0.3075, 0.3822, 0.3310, 0.4538, 0.5151, 0.3266,
        0.3787, 0.4364, 0.4096, 0.3664, 0.3869, 0.5071, 0.3058, 0.5211, 0.3579,
        0.3418, 0.4331, 0.3741, 0.2400, 0.3959, 0.4512, 0.2088, 0.3439, 0.4637,
        0.5344, 0.4561, 0.6298, 0.5584, 0.4783, 0.3855, 0.4088, 0.3877, 0.2837,
        0.5867, 0.4184, 0.4616, 0.6759, 0.3487, 0.4781, 0.7291, 0.5013, 0.2745,
        0.4422, 0.4858, 0.5528, 0.4152, 0.6660, 0.2599, 0.4381, 0.4265, 0.3526,
        0.3545, 0.2681], device='cuda:0')
[(22, 1), (45, 0), (65, 1)]
22 1
tensor(0.3869, device='cuda:0')
45 0
tensor(0.5867, device='cuda:0')
65 1


IndexError: index 65 is out of bounds for dimension 0 with size 65

In [46]:
with torch.inference_mode():
    res = reporter(cache_true['mlp_out', 47][0]).sigmoid()
pp(res)
pp(dataset[0][1])
for inx, label in dataset[0][1]:
    print(inx, label)
    pp(res[inx-1])

tensor([0.6611, 0.5050, 0.2013, 0.2758, 0.3381, 0.3792, 0.4976, 0.3206, 0.2670,
        0.3688, 0.4433, 0.5870, 0.3075, 0.3822, 0.3310, 0.4538, 0.5151, 0.3266,
        0.3787, 0.4364, 0.4096, 0.3664, 0.3869, 0.5071, 0.3058, 0.5211, 0.3579,
        0.3418, 0.4331, 0.3741, 0.2400, 0.3959, 0.4512, 0.2088, 0.3439, 0.4637,
        0.5344, 0.4561, 0.6298, 0.5584, 0.4783, 0.3855, 0.4088, 0.3877, 0.2837,
        0.5867, 0.4184, 0.4616, 0.6759, 0.3487, 0.4781, 0.7291, 0.5013, 0.2745,
        0.4422, 0.4858, 0.5528, 0.4152, 0.6660, 0.2599, 0.4381, 0.4265, 0.3526,
        0.3545, 0.2681], device='cuda:0')
[(22, 1), (45, 0), (65, 1)]
22 1
tensor(0.3664, device='cuda:0')
45 0
tensor(0.2837, device='cuda:0')
65 1
tensor(0.2681, device='cuda:0')
