In [2]:
import pandas as pd
import torch
import torch.nn.functional as F
from transformers import AutoModelForCausalLM, AutoTokenizer, GPT2Tokenizer
from fancy_einsum import einsum

#### GPT2-medium

##### Use probe 

In [17]:
toxic_probe = torch.load("/data/kebl6672/dpo-toxic-general/ignore/gpt2_probe.pt")
toxic_probe = toxic_probe.squeeze(0)
print(toxic_probe.shape)

torch.Size([1024])


In [18]:
model_name = "gpt2-medium"
model = AutoModelForCausalLM.from_pretrained(model_name).cuda(0)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [19]:
token_embeds = model.transformer.wte.weight
value_vectors = torch.cat(
    [
        model.transformer.h[layer_idx].mlp.c_proj.weight
        for layer_idx in range(model.config.num_hidden_layers)
    ],
    dim=0,
)
print(value_vectors.shape)

torch.Size([98304, 1024])


In [20]:
def unembed_to_text(vector, model, tokenizer, k=10):
    norm = model.transformer.ln_f
    lm_head = model.lm_head.weight
    dots = einsum("vocab d_model, d_model -> vocab", lm_head, norm(vector))
    top_k = dots.topk(k).indices
    return tokenizer.batch_decode(top_k, skip_special_tokens=True)

In [22]:
# Value vectors similar to probe
k = 50
norm = model.transformer.ln_f

dot_prods = einsum("value_vecs d_model, d_model -> value_vecs", norm(value_vectors), toxic_probe)
top_value_vecs = dot_prods.topk(k, largest=False).indices
for vec_idx in top_value_vecs:
    print(f"Value vec: Layer {vec_idx // 4096}, index {vec_idx % 4096}")
    print(unembed_to_text(-1*value_vectors[vec_idx], model, tokenizer))

Value vec: Layer 10, index 1882
['ardo', ' maniac', '@#', 'ueless', ' counterfe', ' thug', ' arrog', 'asta', ' disregard', ' fucking']
Value vec: Layer 7, index 3094
[' wasting', 'urance', 'fee', ' waste', 'ternity', ' gratification', ' ank', 'closure', 'wa', ' fuss']
Value vec: Layer 11, index 1307
[' damn', ' darn', ' kidding', '!).', ' freaking', ' booze', ' piss', '!.', ' damned', '!)']
Value vec: Layer 15, index 301
[' harmful', ' Worse', ' unfavorable', ' disturbing', ' unpleasant', ' detrimental', ' undesirable', ' restrictive', ' problematic', ' inconsistent']
Value vec: Layer 6, index 2728
['naires', 'acci', 'emed', 'aple', 'assis', ' platoon', 'amn', 'phe', 'oux', 'ays']
Value vec: Layer 5, index 528
['veyard', ' Aven', 'ija', 'bars', 'enburg', 'phant', 'aiden', 'andro', 'ppo', ' Britann']
Value vec: Layer 4, index 2123
['ipop', 'spin', 'umi', 'office', 'ULE', ' tables', '765', 'ji', 'achi', ' Rosenberg']
Value vec: Layer 3, index 4021
[' Huck', 'ubs', 'INESS', 'doms', 'about

##### Use extracted cossim from csv

In [11]:
df = pd.read_csv('/data/kebl6672/dpo-toxic-general/toxicity/gpt2_all_neuron_metrics.csv')

In [10]:
model_name = "gpt2-medium"
model = AutoModelForCausalLM.from_pretrained(model_name).cuda(0)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [20]:
token_embeds = model.transformer.wte.weight
value_vectors = torch.cat(
    [
        model.transformer.h[layer_idx].mlp.c_proj.weight
        for layer_idx in range(model.config.num_hidden_layers)
    ],
    dim=0,
)
print(value_vectors.shape)

torch.Size([98304, 1024])


In [28]:
top_neurons = df.nsmallest(30, 'pt_cossim')

top_layer_indices = top_neurons['layer_idx'].values 
top_neuron_indices = top_neurons['neuron_idx'].values 

top_neuron_tuples = list(zip(top_layer_indices, top_neuron_indices))

In [29]:
def unembed_to_text(vector, model, tokenizer, k=10):
    norm = model.transformer.ln_f
    lm_head = model.lm_head.weight
    dots = einsum("vocab d_model, d_model -> vocab", lm_head, norm(vector))
    top_k = dots.topk(k).indices
    return tokenizer.batch_decode(top_k, skip_special_tokens=True)

In [30]:
for (layer_idx, vec_idx) in top_neuron_tuples:
    print(f"Value vec: Layer {layer_idx}, index {vec_idx}")
    print(unembed_to_text(-1*value_vectors[layer_idx*4096+vec_idx], model, tokenizer))

Value vec: Layer 10, index 1882
['ardo', ' maniac', '@#', 'ueless', ' counterfe', ' thug', ' arrog', 'asta', ' disregard', ' fucking']
Value vec: Layer 11, index 1307
[' damn', ' darn', ' kidding', '!).', ' freaking', ' booze', ' piss', '!.', ' damned', '!)']
Value vec: Layer 7, index 3094
[' wasting', 'urance', 'fee', ' waste', 'ternity', ' gratification', ' ank', 'closure', 'wa', ' fuss']
Value vec: Layer 4, index 2123
['ipop', 'spin', 'umi', 'office', 'ULE', ' tables', '765', 'ji', 'achi', ' Rosenberg']
Value vec: Layer 15, index 301
[' harmful', ' Worse', ' unfavorable', ' disturbing', ' unpleasant', ' detrimental', ' undesirable', ' restrictive', ' problematic', ' inconsistent']
Value vec: Layer 3, index 4021
[' Huck', 'ubs', 'INESS', 'doms', 'abouts', 'away', 'estone', 'LOAD', ' Hebdo', 'pots']
Value vec: Layer 6, index 2728
['naires', 'acci', 'emed', 'aple', 'assis', ' platoon', 'amn', 'phe', 'oux', 'ays']
Value vec: Layer 5, index 528
['veyard', ' Aven', 'ija', 'bars', 'enburg'

In [13]:
### Cossim between probe and toxic embedding
toxic_probe = torch.load("/data/kebl6672/dpo-toxic-general/ignore/gpt2_probe.pt")
toxic_probe = toxic_probe.squeeze(0)
print(toxic_probe.shape)

torch.Size([1024])


In [14]:
toxic_embed = torch.load("/data/kebl6672/dpo-toxic-general/ignore/gpt2_toxic_embed.pt")
print(toxic_embed.shape)

torch.Size([1024])


In [15]:
toxic_embed_norm = F.normalize(toxic_embed, dim=0)
toxic_probe_norm = F.normalize(toxic_probe, dim=0)

cossim = torch.dot(toxic_embed_norm, toxic_probe_norm)

print(f"Cosine similarity between toxic_embed and toxic_probe: {cossim.item():.4f}")

Cosine similarity between toxic_embed and toxic_probe: 0.2406


#### Llama3

In [8]:
toxic_probe = torch.load("/data/kebl6672/dpo-toxic-general/ignore/llama3_probe.pt")
toxic_probe = toxic_probe.squeeze(0)
print(toxic_probe.shape)

torch.Size([4096])


In [9]:
model_name = "meta-llama/Llama-3.1-8B"
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [10]:
# Take the unembedding matrix
token_embeds = model.lm_head.weight

value_vectors = torch.cat(
    [
        model.model.layers[layer_idx].mlp.down_proj.weight.T
        for layer_idx in range(model.config.num_hidden_layers)
    ],
    dim=0,
)
print(value_vectors.shape)

torch.Size([458752, 4096])


In [11]:
def unembed_to_text(vector, model, tokenizer, k=10):
    norm = model.model.norm  
    lm_head = model.lm_head.weight
    dots = torch.einsum("vd,d->v", lm_head, norm(vector))
    top_k = dots.topk(k).indices
    return tokenizer.batch_decode(top_k, skip_special_tokens=True)


In [12]:
# Value vectors similar to probe
k = 30
norm = model.model.norm  

dot_prods = torch.einsum("nd,d->n", norm(value_vectors), toxic_probe)
top_value_vecs = dot_prods.topk(k, largest=False).indices

for vec_idx in top_value_vecs:
    print(f"Value vec: Layer {vec_idx // (4096*4)}, index {vec_idx % (4096*4)}")
    print(unembed_to_text(-1*value_vectors[vec_idx], model, tokenizer))


Value vec: Layer 4, index 15020
['azines', 'allery', 'ektor', 'rese', '/Dk', 'öl', ' Swinger', 'auga', 'eteria', 'hangi']
Value vec: Layer 25, index 14671
[' fuck', ' fucked', ' fucking', ' Fuck', 'Fuck', 'fuck', ' shit', ' FUCK', ' fucks', ' Fucking']
Value vec: Layer 15, index 11954
['tons', '/sbin', ' Williamson', '�', '절', '腕', 'cak', ' Jacqu', ' happening', 'краї']
Value vec: Layer 23, index 5904
[' statutes', ' statute', '游', ' Stat', 'oba', ' bust', ' verz', ' invalid', ' Colbert', 'uffy']
Value vec: Layer 7, index 9882
[' cow', 'ќ', 'wik', 'cratch', 'ewis', 'Cow', 'чи', ' unm', 'lectron', 'urement']
Value vec: Layer 8, index 7154
['oppable', 'acket', 'urname', 'mention', 'idata', 'anoia', 'reich', '글', 'scriptId', 'št']
Value vec: Layer 23, index 2310
['apesh', '이버', ' Hum', 'Hum', 'blr', ' tím', 'urtles', 'raya', 'пор', 'ensch']
Value vec: Layer 20, index 8952
['izi', 'zi', ' Pent', ' vaz', ' Clo', 'igar', 'edia', ' Zi', 'burgh', 'ubar']
Value vec: Layer 2, index 12350
['undry

In [13]:
### Cossim between probe and toxic embedding
toxic_probe = torch.load("/data/kebl6672/dpo-toxic-general/ignore/llama3_probe.pt")
toxic_probe = toxic_probe.squeeze(0)
print(toxic_probe.shape)

torch.Size([4096])


In [14]:
toxic_embed = torch.load("/data/kebl6672/dpo-toxic-general/ignore/llama3_toxic_embed.pt")
print(toxic_embed.shape)

torch.Size([4096])


In [15]:
toxic_embed_norm = F.normalize(toxic_embed, dim=0)
toxic_probe_norm = F.normalize(toxic_probe, dim=0)

cossim = torch.dot(toxic_embed_norm, toxic_probe_norm)

print(f"Cosine similarity between toxic_embed and toxic_probe: {cossim.item():.4f}")

Cosine similarity between toxic_embed and toxic_probe: 0.0787


#### Gemma-2

In [40]:
df = pd.read_csv('/data/kebl6672/dpo-toxic-general/toxicity/gemma2_all_neuron_metrics.csv')

In [41]:
model_name = "google/gemma-2-2b"
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [42]:
# Take the unembedding matrix
token_embeds = model.lm_head.weight

value_vectors = torch.cat(
    [
        model.model.layers[layer_idx].mlp.down_proj.weight.T
        for layer_idx in range(model.config.num_hidden_layers)
    ],
    dim=0,
)
print(value_vectors.shape)

torch.Size([239616, 2304])


In [44]:
top_neurons = df.nsmallest(30, 'pt_cosine_similarity')

top_layer_indices = top_neurons['layer_idx'].values 
top_neuron_indices = top_neurons['neuron_idx'].values 

top_neuron_tuples = list(zip(top_layer_indices, top_neuron_indices))

In [45]:
def unembed_to_text(vector, model, tokenizer, k=10):
    norm = model.model.norm  
    lm_head = model.lm_head.weight
    dots = torch.einsum("vd,d->v", lm_head, norm(vector))
    top_k = dots.topk(k).indices
    return tokenizer.batch_decode(top_k, skip_special_tokens=True)


In [46]:
for (layer_idx, vec_idx) in top_neuron_tuples:
    print(f"Value vec: Layer {layer_idx}, index {vec_idx}")
    print(unembed_to_text(-1*value_vectors[layer_idx*2304*4+vec_idx], model, tokenizer))

Value vec: Layer 14, index 7822
[' fucking', 'fucking', ' goddamn', ' Fucking', ' FUCKING', ' shit', 'Fucking', 'Fuck', ' damn', ' fuck']
Value vec: Layer 6, index 7099
[' fucking', 'fucking', 'fuck', 'Fucking', 'fucker', ' Fucking', 'FUCK', ' FUCKING', ' piss', 'Fuck']
Value vec: Layer 15, index 3348
['SBATCH', ' متعلقه', ' handel', 'StoryboardSegue', 'HasAnnotation', 'TestBed', ' krav', ' thick', '期刊论文', 'WriteBarrier']
Value vec: Layer 24, index 6536
[' historical', ' special', ' traditional', ' regular', ' individual', ' personal', ' exotic', ' massive', ' historic', ' professional']
Value vec: Layer 6, index 3812
['yntaxException', ' pleins', ' people', 'sendStatus', ' AttributeSet', 'epy', 'ContentLoaded', ' peoples', ' adil', 'asantry']
Value vec: Layer 0, index 509
[' téléchargez', ' GenerationType', ' volontaire', ' nadzieję', 'rowser', 'agd', 'Bilan', ' réaliste', 'SizeMode', ' parkir']
Value vec: Layer 10, index 1114
['nonatomic', 'PasswordField', '"?>', 'WriteAttribute', 'T

In [5]:
### Cossim between probe and toxic embedding
toxic_probe = torch.load("/data/kebl6672/dpo-toxic-general/ignore/gemma2_probe.pt")
toxic_probe = toxic_probe.squeeze(0)
print(toxic_probe.shape)

torch.Size([2304])


In [6]:
toxic_embed = torch.load("/data/kebl6672/dpo-toxic-general/ignore/gemma2_toxic_embed.pt")
print(toxic_embed.shape)

torch.Size([2304])


In [7]:
toxic_embed_norm = F.normalize(toxic_embed, dim=0)
toxic_probe_norm = F.normalize(toxic_probe, dim=0)

cossim = torch.dot(toxic_embed_norm, toxic_probe_norm)

print(f"Cosine similarity between toxic_embed and toxic_probe: {cossim.item():.4f}")

Cosine similarity between toxic_embed and toxic_probe: 0.1319


#### Mistral

In [2]:
toxic_probe = torch.load("/data/kebl6672/dpo-toxic-general/mistral_probe.pt")
toxic_probe = toxic_probe.squeeze(0)
print(toxic_probe.shape)

torch.Size([4096])


In [3]:
model_name = "mistralai/Mistral-7B-v0.1"
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
# Take the unembedding matrix
token_embeds = model.lm_head.weight

value_vectors = torch.cat(
    [
        model.model.layers[layer_idx].mlp.down_proj.weight.T
        for layer_idx in range(model.config.num_hidden_layers)
    ],
    dim=0,
)
print(value_vectors.shape)

torch.Size([458752, 4096])


In [5]:
def unembed_to_text(vector, model, tokenizer, k=10):
    norm = model.model.norm  
    lm_head = model.lm_head.weight
    dots = torch.einsum("vd,d->v", lm_head, norm(vector))
    top_k = dots.topk(k).indices
    return tokenizer.batch_decode(top_k, skip_special_tokens=True)


In [6]:
# Value vectors similar to probe
k = 50
norm = model.model.norm  

dot_prods = torch.einsum("nd,d->n", norm(value_vectors), toxic_probe)
top_value_vecs = dot_prods.topk(k, largest=False).indices
top_scores = dot_prods[top_value_vecs]

for rank, vec_idx in enumerate(top_value_vecs):
    layer = vec_idx.item() // (4096*4)
    index = vec_idx.item() % (4096*4)
    
    print(f"Rank {rank + 1}: Value vec -> Layer {layer}, index {index}")
    print(unembed_to_text(-1*value_vectors[vec_idx], model, tokenizer))

Rank 1: Value vec -> Layer 14, index 14693
['shit', 'Fuck', 'shit', 'fuck', 'Block', 'piss', 'fucking', 'Diff', 'bitch', 'fucked']
Rank 2: Value vec -> Layer 23, index 8307
['proof', '-', 'book', 'bag', 'tag', 'cl', 'top', 'tale', 'p', 'ro']
Rank 3: Value vec -> Layer 4, index 6683
['uts', 'ylv', 'aly', 'caused', 'Gal', 'Wat', 'lymp', 'лей', 'cola', 'ness']
Rank 4: Value vec -> Layer 25, index 2781
['an', 'annotation', 'annot', '一个', 'a', 'expansion', 'ented', 'einer', 'owa', 'annotations']
Rank 5: Value vec -> Layer 15, index 4854
['caused', 'prevention', 'rez', 'committed', 'Castle', 'resc', 'Og', 'uous', 'affecting', 'mars']
Rank 6: Value vec -> Layer 22, index 4327
['Ü', 'aho', 'ví', 'olly', 'ège', 'umber', 'tres', 'Credit', 'MAGE', 'чё']
Rank 7: Value vec -> Layer 13, index 7620
['bru', 'stag', 'efore', 'xp', 'uru', 'rá', 'ʊ', 'charging', 'Cpp', 'owo']
Rank 8: Value vec -> Layer 5, index 11009
['export', 'rag', 'sek', 'deg', 'asa', 'cluster', 'clusters', 'cluster', 'industry', 'in

In [7]:
### Cossim between probe and toxic embedding
toxic_embed = torch.load("/data/kebl6672/dpo-toxic-general/ignore/mistral_toxic_embed.pt")
print(toxic_embed.shape)

torch.Size([4096])


In [10]:
toxic_embed_norm = F.normalize(toxic_embed, dim=0)
toxic_probe_norm = F.normalize(toxic_probe, dim=0)

cossim = torch.dot(toxic_embed_norm, toxic_probe_norm)

print(f"Cosine similarity between toxic_embed and toxic_probe: {cossim.item():.4f}")

Cosine similarity between toxic_embed and toxic_probe: 0.1095
