In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, GPT2Tokenizer
from fancy_einsum import einsum

: 

#### GPT2-medium

In [3]:
toxic_probe = torch.load("/data/kebl6672/dpo-toxic-general/checkpoints/gpt2_lee_probe.pt").cuda(0)
toxic_probe = toxic_probe.squeeze(0)
print(toxic_probe.shape)

FileNotFoundError: [Errno 2] No such file or directory: '/data/kebl6672/dpo-toxic-general/checkpoints/gpt2_lee_probe.pt'

In [2]:
model_name = "gpt2-medium"
model = AutoModelForCausalLM.from_pretrained(model_name).cuda(0)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [3]:
token_embeds = model.transformer.wte.weight
value_vectors = torch.cat(
    [
        model.transformer.h[layer_idx].mlp.c_proj.weight
        for layer_idx in range(model.config.num_hidden_layers)
    ],
    dim=0,
)
print(value_vectors.shape)

torch.Size([98304, 1024])


In [5]:
seed_token_toxic = ["fuck", "shit", "crap", "damn", "asshole"]
seed_token_non_toxic = ["hello", "thanks", "friend", "peace", "welcome"]

toxic_token_ids = [
    tokenizer(tok, add_special_tokens=False)["input_ids"]
    for tok in seed_token_toxic
]

non_toxic_token_ids = [
    tokenizer(tok, add_special_tokens=False)["input_ids"]
    for tok in seed_token_non_toxic
]

print("Toxic token IDs:", toxic_token_ids)
print("Non-toxic token IDs:", non_toxic_token_ids)

toxic_embed = (
    torch.stack([
        token_embeds[token_ids].mean(dim=0)  # average sub-tokens
        for token_ids in toxic_token_ids
    ]).mean(dim=0)  # average across seed words
)

non_toxic_embed = (
    torch.stack([
        token_embeds[token_ids].mean(dim=0)
        for token_ids in non_toxic_token_ids
    ]).mean(dim=0)
)

Toxic token IDs: [[31699], [16211], [66, 2416], [11043, 77], [562, 13207]]
Non-toxic token IDs: [[31373], [27547], [6726], [22988], [86, 9571]]


In [6]:

def unembed_to_text(vector, model, tokenizer, k=10):
    norm = model.transformer.ln_f
    lm_head = model.lm_head.weight
    dots = einsum("vocab d_model, d_model -> vocab", lm_head, norm(vector))
    top_k = dots.topk(k).indices
    return tokenizer.batch_decode(top_k, skip_special_tokens=True)

In [7]:

k = 20
norm = model.transformer.ln_f

target_vec = toxic_embed - non_toxic_embed
dot_prods = einsum("value_vecs d_model, d_model -> value_vecs", norm(value_vectors), target_vec)
top_value_vecs = dot_prods.topk(k).indices
for vec_idx in top_value_vecs:
    print(f"Value vec: Layer {vec_idx // 4096}, index {vec_idx % 4096}")
    print(unembed_to_text(value_vectors[vec_idx], model, tokenizer))

Value vec: Layer 19, index 770
[' shit', ' ass', ' crap', ' fuck', ' garbage', ' asses', ' cunt', ' trash', ' dick', 'shit']
Value vec: Layer 16, index 974
[' inappropriately', ' prejud', ' unnecessarily', ' improperly', ' unchecked', ' incorrectly', ' inefficient', ' miscon', ' arrogance', ' excessively']
Value vec: Layer 23, index 2777
['ÔøΩ', 'ylon', 'rane', 'uberty', 'omal', ' Osc', '„Ç∂', 'agogue', 'umed', 'oteric']
Value vec: Layer 17, index 2953
[' hate', ' hated', ' negativity', ' bad', ' dreaded', ' harmful', ' adversaries', ' enemies', ' harsh', 'enemy']
Value vec: Layer 17, index 2877
[' kinda', ' stuff', ' fuckin', ' guys', ' yeah', ' gonna', ' dudes', ' crap', ' gotta', ' guy']
Value vec: Layer 16, index 3933
[' panic', ' neglect', ' vain', ' arbitrary', ' disregard', ' shortcuts', ' bankrupt', ' selfish', ' blind', ' defaults']
Value vec: Layer 20, index 1786
[' problems', ' malfunction', ' failure', ' failures', ' damage', ' woes', ' dysfunction', ' trouble', ' injuries'

In [8]:
print(unembed_to_text(target_vec, model, tokenizer))

['shit', 'fuck', ' fuck', ' shit', 'hole', 'Fuck', ' Shit', ' fucking', ' Fuck', ' fucked']


In [9]:
torch.save(target_vec, 'gpt2_toxic_embed.pt')

In [26]:
# Value vectors similar to probe
k = 100
norm = model.transformer.ln_f

dot_prods = einsum("value_vecs d_model, d_model -> value_vecs", norm(value_vectors), toxic_probe)
top_value_vecs = dot_prods.topk(k).indices
for vec_idx in top_value_vecs:
    print(f"Value vec: Layer {vec_idx // 4096}, index {vec_idx % 4096}")
    print(unembed_to_text(value_vectors[vec_idx], model, tokenizer))

Value vec: Layer 19, index 770
[' shit', ' ass', ' crap', ' fuck', ' garbage', ' asses', ' cunt', ' trash', ' dick', 'shit']
Value vec: Layer 12, index 771
[' delusional', ' hypocritical', ' delusions', ' arrogant', ' nonsense', ' rubbish', ' hypocr', ' childish', ' libel', ' insulting']
Value vec: Layer 18, index 2669
[' degener', ' whining', ' stupid', ' smug', ' foolish', ' idiots', ' stupidity', ' lies', ' hypocr', ' idiot']
Value vec: Layer 13, index 668
[' losers', ' filthy', ' disgr', ' gad', ' mor', ' feces', ' cess', ' disgrace', ' apes', ' unworthy']
Value vec: Layer 16, index 255
[' disgrace', ' shameful', ' coward', ' unacceptable', ' despicable', ' barbaric', ' cowardly', ' irresponsible', ' disgusting', ' immoral']
Value vec: Layer 12, index 882
['fuck', ' shit', ' piss', 'Fuck', ' hilar', 'shit', ' stupidity', ' poop', ' shitty', ' stupid']
Value vec: Layer 19, index 1438
[' cum', ' cock', ' orgasm', ' bondage', ' anal', ' missionary', ' org', ' fucked', 'ildo', ' arousa

In [27]:
print(unembed_to_text(toxic_probe, model, tokenizer))

[' cunt', 'fuck', 'Fuck', ' FUCK', ' fuck', ' Fuck', ' asshole', ' fucking', ' dick', ' whore']


#### Llama3

In [2]:
toxic_probe = torch.load("/data/kebl6672/dpo-toxic-general/checkpoints/llama3_probe.pt")
toxic_probe = toxic_probe.squeeze(0)
print(toxic_probe.shape)

FileNotFoundError: [Errno 2] No such file or directory: '/data/kebl6672/dpo-toxic-general/checkpoints/llama3_probe.pt'

In [10]:
model_name = "meta-llama/Llama-3.1-8B"
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [11]:
# Take the unembedding matrix
token_embeds = model.lm_head.weight

value_vectors = torch.cat(
    [
        model.model.layers[layer_idx].mlp.down_proj.weight.T
        for layer_idx in range(model.config.num_hidden_layers)
    ],
    dim=0,
)
print(value_vectors.shape)

torch.Size([458752, 4096])


In [12]:
seed_token_toxic = ["fuck", "shit", "crap", "damn", "asshole"]
seed_token_non_toxic = ["hello", "thanks", "friend", "peace", "welcome"]

toxic_token_ids = [
    tokenizer(tok, add_special_tokens=False)["input_ids"]
    for tok in seed_token_toxic
]

non_toxic_token_ids = [
    tokenizer(tok, add_special_tokens=False)["input_ids"]
    for tok in seed_token_non_toxic
]

print("Toxic token IDs:", toxic_token_ids)
print("Non-toxic token IDs:", non_toxic_token_ids)

toxic_embed = (
    torch.stack([
        token_embeds[token_ids].mean(dim=0)  # average sub-tokens
        for token_ids in toxic_token_ids
    ]).mean(dim=0)  # average across seed words
)

non_toxic_embed = (
    torch.stack([
        token_embeds[token_ids].mean(dim=0)
        for token_ids in non_toxic_token_ids
    ]).mean(dim=0)
)

Toxic token IDs: [[71574], [41153], [99821], [89719], [395, 31520]]
Non-toxic token IDs: [[15339], [46593], [10931], [55225], [35184]]


In [13]:
def unembed_to_text(vector, model, tokenizer, k=10):
    norm = model.model.norm  
    lm_head = model.lm_head.weight
    dots = torch.einsum("vd,d->v", lm_head, norm(vector))
    top_k = dots.topk(k).indices
    return tokenizer.batch_decode(top_k, skip_special_tokens=True)


In [14]:
# Value vectors similar to embedding difference
k = 20
norm = model.model.norm  

target_vec = toxic_embed - non_toxic_embed
dot_prods = torch.einsum("nd,d->n", norm(value_vectors), target_vec)
top_value_vecs = dot_prods.topk(k).indices

print
for rank, vec_idx in enumerate(top_value_vecs):
    layer = vec_idx.item() // (4096*4)
    index = vec_idx.item() % (4096*4)
    
    print(f"Rank {rank + 1}: Value vec -> Layer {layer}, index {index}")
    print(unembed_to_text(value_vectors[vec_idx], model, tokenizer))

Rank 1: Value vec -> Layer 18, index 15439
[' fuck', 'Fuck', 'fuck', ' Fuck', ' fucks', ' fucked', ' fucking', ' FUCK', ' Fucking', ' shit']
Rank 2: Value vec -> Layer 25, index 14065
[' shit', ' crap', 'shit', 'crap', ' rubbish', ' garbage', ' bullshit', ' junk', ' nonsense', ' trash']
Rank 3: Value vec -> Layer 26, index 5705
['arbon', 'Ë¶ö', 'anz', 'ihan', '–æ—Ä–æ–¥', '.getObject', 'oment', ' sadly', 'nist', 'asd']
Rank 4: Value vec -> Layer 21, index 5982
['ÔøΩ', 'eso', 'icular', 'adow', 'chsel', 'cheon', 'AO', ' Puppet', '–Ω–∞–¥', 'ipple']
Rank 5: Value vec -> Layer 20, index 4492
[' shit', ' fuck', 'Fuck', 'shit', ' fucked', 'fuck', ' Fuck', ' Fucking', ' fucks', ' assh']
Rank 6: Value vec -> Layer 26, index 502
['icker', '376', 'PropertyName', 'ogi', 'qe', '637', 'ardy', '497', 'erson', 'apt']
Rank 7: Value vec -> Layer 22, index 2147
['ony', 'erged', 'eno', 'eras', 'mtx', ' Ga', ' ripe', '√®', ' Lal', ' exported']
Rank 8: Value vec -> Layer 18, index 13381
['ÔøΩ', 'gard', 'ippy

In [15]:
print(unembed_to_text(target_vec, model, tokenizer))

['shit', 'crap', 'fuck', ' shit', 'damn', 'Fuck', ' fuck', ' Fuck', ' fucking', ' FUCK']


In [1]:
# Value vectors similar to probe
k = 30
norm = model.model.norm  

dot_prods = torch.einsum("nd,d->n", norm(value_vectors), toxic_probe)
top_value_vecs = dot_prods.topk(k).indices

for vec_idx in top_value_vecs:
    print(f"Value vec: Layer {vec_idx // (4096*4)}, index {vec_idx % (4096*4)}")
    print(unembed_to_text(value_vectors[vec_idx], model, tokenizer))


NameError: name 'model' is not defined

In [15]:
print(unembed_to_text(toxic_probe, model, tokenizer))

['kommen', ' FUCK', '·ªÉn', 'iyah', 'ÃÜ', 'dirty', 'ÌÑ∏', 'fuck', ' Rudd', ' –ö—Ä–∞']


In [16]:
torch.save(target_vec, 'llama3_toxic_embed.pt')

#### Gemma-2

In [3]:
toxic_probe = torch.load("/data/kebl6672/dpo-toxic-general/checkpoints/gemma2_2b_probe.pt")
toxic_probe = toxic_probe.squeeze(0)
print(toxic_probe.shape)

torch.Size([2304])


In [17]:
model_name = "google/gemma-2-2b"
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [18]:
# Take the unembedding matrix
token_embeds = model.lm_head.weight

value_vectors = torch.cat(
    [
        model.model.layers[layer_idx].mlp.down_proj.weight.T
        for layer_idx in range(model.config.num_hidden_layers)
    ],
    dim=0,
)
print(value_vectors.shape)

torch.Size([239616, 2304])


In [19]:
seed_token_toxic = ["fuck", "shit", "crap", "damn", "asshole"]
seed_token_non_toxic = ["hello", "thanks", "friend", "peace", "welcome"]

toxic_token_ids = [
    tokenizer(tok, add_special_tokens=False)["input_ids"]
    for tok in seed_token_toxic
]

non_toxic_token_ids = [
    tokenizer(tok, add_special_tokens=False)["input_ids"]
    for tok in seed_token_non_toxic
]

print("Toxic token IDs:", toxic_token_ids)
print("Non-toxic token IDs:", non_toxic_token_ids)

toxic_embed = (
    torch.stack([
        token_embeds[token_ids].mean(dim=0)  # average sub-tokens
        for token_ids in toxic_token_ids
    ]).mean(dim=0)  # average across seed words
)

non_toxic_embed = (
    torch.stack([
        token_embeds[token_ids].mean(dim=0)
        for token_ids in non_toxic_token_ids
    ]).mean(dim=0)
)

Toxic token IDs: [[34024], [31947], [101886], [48542], [719, 18216]]
Non-toxic token IDs: [[17534], [12203], [9141], [44209], [28583]]


In [20]:
def unembed_to_text(vector, model, tokenizer, k=10):
    norm = model.model.norm  
    lm_head = model.lm_head.weight
    dots = torch.einsum("vd,d->v", lm_head, norm(vector))
    top_k = dots.topk(k).indices
    return tokenizer.batch_decode(top_k, skip_special_tokens=True)


In [21]:
k = 20
norm = model.model.norm  

target_vec = toxic_embed - non_toxic_embed
dot_prods = torch.einsum("nd,d->n", norm(value_vectors), target_vec)
top_value_vecs = dot_prods.topk(k).indices

# print(top_value_vecs)

for vec_idx in top_value_vecs:
    print(f"Value vec: Layer {vec_idx // (2304*4)}, index {vec_idx % (2304*4)}")
    print(unembed_to_text(value_vectors[vec_idx], model, tokenizer))


Value vec: Layer 21, index 8804
['HSSF', 'sptr', ' umge', ' siihen', '‰æãÂè•', ' advoc', 'Computed', ' riten', 'subpackage', 'glieder']
Value vec: Layer 19, index 7297
['esModule', 'migrationBuilder', 'celot', ' pinulongan', 'RectangleBorder', 'hoeddwyd', 'oaÃçt', 'WireFormatLite', ' fourrure', 'fillType']
Value vec: Layer 19, index 1704
['ValueStyle', 'GenerationType', 'BeginContext', 'InjectAttribute', ' –º”ô–∫–∞–ª', 'enumi', 'IntoConstraints', 'AnchorTagHelper', 'ValueGeneration', 'Personensuche']
Value vec: Layer 19, index 8366
[' dudes', ' dude', ' stuff', ' guys', ' kinda', ' shit', ' guy', ' crap', ' thingy', ' hella']
Value vec: Layer 25, index 4751
['convertView', 'NavController', 'ClassNotFound', 'cellulose', ' defaultstate', ' Chuk', ' Vikipedi', 'queryInterface', 'ÿØÿßŸÜÿ¥ŸÜÿßŸÖŸáŸî', ' PopupWindow']
Value vec: Layer 3, index 4727
[' shit', ' Shit', 'shit', 'Shit', ' SHIT', ' crap', ' shits', 'Crap', ' shite', ' shitty']
Value vec: Layer 20, index 7196
['mybatisplus', ' Box

In [22]:
print(unembed_to_text(target_vec, model, tokenizer))

['shit', ' shit', 'fuck', 'Shit', ' SHIT', ' Shit', ' fuck', 'crap', ' crap', 'Fuck']


In [14]:
# Value vectors similar to probe
k = 50
norm = model.model.norm  

dot_prods = torch.einsum("nd,d->n", norm(value_vectors), toxic_probe)
top_value_vecs = dot_prods.topk(k).indices
top_scores = dot_prods[top_value_vecs]

print
for rank, vec_idx in enumerate(top_value_vecs):
    layer = vec_idx.item() // (2304*4)
    index = vec_idx.item() % (2304*4)
    
    print(f"Rank {rank + 1}: Value vec -> Layer {layer}, index {index}")
    print(unembed_to_text(value_vectors[vec_idx], model, tokenizer))


Rank 1: Value vec -> Layer 4, index 2288
[' idiot', ' bastard', ' asshole', ' bastards', ' assholes', ' idiots', ' fucks', 'fucker', ' moron', 'FUCK']
Rank 2: Value vec -> Layer 5, index 5083
['amssymb', 'ResponseWriter', ' autorytatywna', '}".', '://$', ' pa≈Ñstw', 'Tienen', 'p√©die', '√°c', 'Datuak']
Rank 3: Value vec -> Layer 7, index 3280
[' prochaines', ' nh∆∞', '–ª–∏—è', ' Organ', ' durer', ' crown', ' pengh', '!("{}",', 'organ', 'uidado']
Rank 4: Value vec -> Layer 1, index 3962
[' ());', '/}.', '--)\r', '„ÄÇÔºâ', "']))\r", 'AndEndTag', ' }}}', "']);\r", '}")\r', "'},\r"]
Rank 5: Value vec -> Layer 15, index 3635
Rank 6: Value vec -> Layer 21, index 9207
[' depicted', ' represented', ' portrayed', ' featured', ' pictured', ' profiled', ' interviewed', ' showcased', ' analyzed', ' examined']
Rank 7: Value vec -> Layer 3, index 8070
['rungsseite', 'posedge', '########.', 'tagHelperRunner', 'expandindo', '+#+#', " '\\\\;'", 'principalTable', 'RUnlock', ' tartalomaj√°nl√≥']
Rank 8: 

In [7]:
print(unembed_to_text(toxic_probe, model, tokenizer))

['rungsseite', 'fuck', 'Fucking', 'ÿßŸÜŸäŸÅ', 'AndEndTag', 'expandindo', ' asshole', ' Fuck', ' fucking', 'IntoConstraints']


In [23]:
torch.save(target_vec, 'gemma2_toxic_embed.pt')

#### Mistral

In [2]:
toxic_probe = torch.load("/data/kebl6672/dpo-toxic-general/checkpoints/mistral_probe.pt")
toxic_probe = toxic_probe.squeeze(0)
print(toxic_probe.shape)

FileNotFoundError: [Errno 2] No such file or directory: '/data/kebl6672/dpo-toxic-general/checkpoints/mistral_probe.pt'

In [3]:
model_name = "mistralai/Mistral-7B-v0.1"
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
# Take the unembedding matrix
token_embeds = model.lm_head.weight

value_vectors = torch.cat(
    [
        model.model.layers[layer_idx].mlp.down_proj.weight.T
        for layer_idx in range(model.config.num_hidden_layers)
    ],
    dim=0,
)
print(value_vectors.shape)

torch.Size([458752, 4096])


In [5]:
seed_token_toxic = ["fuck", "shit", "crap", "damn", "asshole"]
seed_token_non_toxic = ["hello", "thanks", "friend", "peace", "welcome"]

toxic_token_ids = [
    tokenizer(tok, add_special_tokens=False)["input_ids"]
    for tok in seed_token_toxic
]

non_toxic_token_ids = [
    tokenizer(tok, add_special_tokens=False)["input_ids"]
    for tok in seed_token_non_toxic
]

print("Toxic token IDs:", toxic_token_ids)
print("Non-toxic token IDs:", non_toxic_token_ids)

toxic_embed = (
    torch.stack([
        token_embeds[token_ids].mean(dim=0)  # average sub-tokens
        for token_ids in toxic_token_ids
    ]).mean(dim=0)  # average across seed words
)

non_toxic_embed = (
    torch.stack([
        token_embeds[token_ids].mean(dim=0)
        for token_ids in non_toxic_token_ids
    ]).mean(dim=0)
)

Toxic token IDs: [[4159], [5492], [21849], [9741], [25676]]
Non-toxic token IDs: [[6312, 28709], [8196], [1832], [6405], [10058]]


In [6]:
def unembed_to_text(vector, model, tokenizer, k=10):
    norm = model.model.norm  
    lm_head = model.lm_head.weight
    dots = torch.einsum("vd,d->v", lm_head, norm(vector))
    top_k = dots.topk(k).indices
    return tokenizer.batch_decode(top_k, skip_special_tokens=True)


In [7]:
k = 20
norm = model.model.norm  

target_vec = toxic_embed - non_toxic_embed
dot_prods = torch.einsum("nd,d->n", norm(value_vectors), target_vec)
top_value_vecs = dot_prods.topk(k).indices

for vec_idx in top_value_vecs:
    print(f"Value vec: Layer {vec_idx // (4096*4)}, index {vec_idx % (4096*4)}")
    print(unembed_to_text(value_vectors[vec_idx], model, tokenizer))


Value vec: Layer 22, index 1061
['fuck', 'fucking', 'Fuck', 'fucked', 'shit', 'shit', 'bullshit', 'asshole', 'shitty', 'assh']
Value vec: Layer 22, index 15900
['screw', 'fuck', 'Fuck', 'fucked', 'Scre', 'fucking', 'shit', 'shit', 'bullshit', 'piss']
Value vec: Layer 17, index 6981
['damn', 'damned', 'fucking', 'bloody', 'freak', 'god', 'dam', 'Fuck', 'Dam', 'Dam']
Value vec: Layer 19, index 4689
['crap', 'shit', 'damn', 'shit', 'damned', 'hell', 'bitch', 'piss', 'Hell', 'fuck']
Value vec: Layer 23, index 12879
['freak', 'fucking', 'fr', 'damn', 'damned', 'Fuck', 'bloody', 'god', 'fuck', 'eff']
Value vec: Layer 19, index 6318
['dude', 'kinda', 'crap', 'shit', 'gotta', 'freak', 'ain', 'guy', 'guys', 'awesome']
Value vec: Layer 22, index 5047
['shit', '****', 'shit', '***', '**', 'fucking', 'Fuck', '******', '***', '**']
Value vec: Layer 22, index 9629
['squ', 'jack', 'shit', 'crap', 'Jack', 'shit', 'dick', 'jack', 'zip', 'Jack']
Value vec: Layer 25, index 8070
['reen', 'furt', 'ague', '

In [8]:
print(unembed_to_text(target_vec, model, tokenizer))

['shit', 'crap', 'fuck', 'shit', 'fucking', 'Fuck', 'fucked', 'asshole', 'shitty', 'bullshit']


In [None]:
torch.save(target_vec, 'mistral_toxic_embed.pt')

In [12]:
# Value vectors similar to probe
k = 50
norm = model.model.norm  

dot_prods = torch.einsum("nd,d->n", norm(value_vectors), toxic_probe)
top_value_vecs = dot_prods.topk(k).indices
top_scores = dot_prods[top_value_vecs]

for rank, vec_idx in enumerate(top_value_vecs):
    layer = vec_idx.item() // (4096*4)
    index = vec_idx.item() % (4096*4)
    
    print(f"Rank {rank + 1}: Value vec -> Layer {layer}, index {index}")
    print(unembed_to_text(value_vectors[vec_idx], model, tokenizer))

Rank 1: Value vec -> Layer 22, index 1061
['fuck', 'fucking', 'Fuck', 'fucked', 'shit', 'shit', 'bullshit', 'asshole', 'shitty', 'assh']
Rank 2: Value vec -> Layer 2, index 8896
['jav', 'olic', 'uler', 'witness', 'cav', '@@', 'pez', 'ÔøΩ', 'idiot', 'bear']
Rank 3: Value vec -> Layer 14, index 2292
['shit', 'crap', 'damn', 'fucking', 'shit', 'fuck', 'shitty', 'bullshit', 'stupid', 'sucks']
Rank 4: Value vec -> Layer 15, index 2454
['fucking', 'bullshit', 'stupid', 'shit', 'crap', 'piss', 'disgust', 'ridiculous', 'fucked', 'fuck']
Rank 5: Value vec -> Layer 1, index 9939
['atti', 'stein', 'adu', '—é', 'ess', 'iele', 'blur', 'auge', 'erg', '–∏–º']
Rank 6: Value vec -> Layer 13, index 13888
['Wind', 'Mal', 'esh', 'Sto', 'ulo', 'Mono', 'Lower', 'moy', 'winds', 'nomin']
Rank 7: Value vec -> Layer 3, index 11985
['NU', 'ower', 'iaz', 'heck', 'aggi', 'ust', 'abstract', 'arroll', 'essen', 'dm']
Rank 8: Value vec -> Layer 22, index 5047
['shit', '****', 'shit', '***', '**', 'fucking', 'Fuck', '*

In [27]:
print(unembed_to_text(toxic_probe, model, tokenizer))

['shit', 'shit', 'fuck', 'Fuck', 'fucking', 'fucked', 'assh', 'asshole', 'upid', 'bullshit']
