In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys

sys.path.append("../")

import torch
import transformers
import baukit
from tqdm.auto import tqdm
import json
import os
from src import functional
import src.tokens as tokenization_utils

torch.__version__, transformers.__version__, torch.version.cuda

('2.1.2+cu121', '4.36.2', '12.1')

In [3]:
from src.models import ModelandTokenizer

# Checkpoint to load. Other checkpoints are left here for quick switching:
#   "EleutherAI/gpt-j-6B"
#   "meta-llama/Llama-2-7b-hf"
#   "mistralai/Mistral-7B-v0.1"
#   "state-spaces/mamba-2.8b"
MODEL_PATH = "state-spaces/mamba-2.8b-slimpj"

# `mt` bundles the model and its tokenizer; every later cell goes through it.
mt = ModelandTokenizer(model_path=MODEL_PATH, torch_dtype=torch.float32)

  return self.fget.__get__(instance, owner)()
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
# Cache promoting tokens for all the `out_proj` neurons
# ---------------------------------------
# Number of top-ranked tokens requested per neuron (passed as `k` to
# `functional.logit_lens` in the caching loop below).
cut_off_rank = 50
# Directory holding one `layer_{idx}.json` cache file per layer.
# NOTE(review): "prommotions" is misspelled, but the loader cell reads from
# this exact path, so fixing the spelling here also means renaming the
# directory on disk.
path = "../results/neuron_prommotions/out_proj"
# Module-path template for a layer's output projection; `{}` is the layer index.
out_proj_path_format = "layers.{}.mixer.out_proj"
# ---------------------------------------

# os.makedirs(path, exist_ok=True)

# neuron_promotions = {layer_idx: {} for layer_idx in range(mt.n_layer)}

# for layer_idx in tqdm(range(mt.n_layer)):
#     print(f"layer {layer_idx}")
#     out_proj = baukit.get_module(mt.model, out_proj_path_format.format(layer_idx))

#     for column in tqdm(range(out_proj.weight.shape[1])):
#         next_tok_candidates = functional.logit_lens(
#             mt = mt, 
#             h = out_proj.weight[:, column],
#             k = cut_off_rank
#         )

#         neuron_promotions[layer_idx][column] = [
#             {"token": tok, "logit": logit} for tok, logit in next_tok_candidates
#         ]
    
#     with open(os.path.join(path, f"layer_{layer_idx}.json"), "w") as f:
#         json.dump(neuron_promotions[layer_idx], f)

In [5]:
def cache_weights(mt):
    """Snapshot the `out_proj` weight matrix of every layer.

    Args:
        mt: Model wrapper exposing `model` and `n_layer`.

    Returns:
        Dict mapping each layer index in `range(mt.n_layer)` to a detached
        copy of that layer's `out_proj.weight`, so that later in-place edits
        to the model can be undone.
    """
    return {
        layer: baukit.get_module(mt.model, out_proj_path_format.format(layer))
        .weight.detach()
        .clone()
        for layer in range(mt.n_layer)
    }

def restore_weights(mt, weights_cached):
    """Write cached `out_proj` weights back into the model, in place.

    Args:
        mt: Model wrapper exposing `model` and `n_layer`.
        weights_cached: Mapping from layer index to the weight tensor to
            restore; it must contain every index in `range(mt.n_layer)`.
    """
    # The parameters are overwritten in place, so autograd must be off.
    with torch.no_grad():
        for layer in range(mt.n_layer):
            module = baukit.get_module(mt.model, out_proj_path_format.format(layer))
            module.weight.copy_(weights_cached[layer])

# Take the snapshot of the untouched `out_proj` weights exactly once, before
# any cell edits the model. Intervention cells undo their edits with
# `restore_weights(mt, WEIGHTS_CACHED)`.
WEIGHTS_CACHED = cache_weights(mt)

In [6]:
########################################################################
concepts = [
    "doctor", "nurse", "therapist",
    "healthcare", "medicine", "medical"
]
# concepts = [
#     "computer", "software", "engineer", "programmer", "developer", "hacker"
# ]
########################################################################

concept_start_token_ids= mt.tokenizer([
    " " + concept + " " for concept in concepts
], return_tensors="pt", padding=True).input_ids[:, 0].tolist()

[(id, mt.tokenizer.decode(id)) for id in concept_start_token_ids]

[(7345, ' doctor'),
 (15339, ' nurse'),
 (30286, ' therapist'),
 (11723, ' healthcare'),
 (9921, ' medicine'),
 (3739, ' medical')]

In [7]:
def load_neuron_promotions(path, layer_idx):
    """Load the cached per-neuron entries of one layer from disk.

    Args:
        path: Directory containing the per-layer cache files.
        layer_idx: Layer whose `layer_{layer_idx}.json` file is read.

    Returns:
        Dict mapping neuron index (int) to the value stored for that neuron
        in the JSON file.

    Raises:
        FileNotFoundError: If the layer's cache file does not exist.
        json.JSONDecodeError: If the file is not valid JSON.
        ValueError: If a top-level key cannot be converted to int.
    """
    cache_file = os.path.join(path, f"layer_{layer_idx}.json")
    # Decode explicitly as UTF-8 instead of relying on the platform's locale
    # default, so non-ASCII tokens load identically on every machine.
    with open(cache_file, encoding="utf-8") as f:
        loaded_dict = json.load(f)
    # JSON object keys are always strings; turn them back into int indices.
    return {int(k): v for k, v in loaded_dict.items()}

# Eagerly read every layer's cache: {layer index -> {neuron index -> entries}}.
neuron_promotions = {}
for layer_idx in range(mt.n_layer):
    neuron_promotions[layer_idx] = load_neuron_promotions(path, layer_idx)

In [8]:
# Find the "concept driver" neurons: `out_proj` columns for which one of the
# top cached promoted tokens satisfies `functional.is_nontrivial_prefix`
# against one of the concept words.

cut_off_rank = 1      # only the first `cut_off_rank` cached tokens of a neuron are checked
min_token_chars = 4   # tokens shorter than this (after stripping) are skipped as trivial
concept_drivers = {layer_idx: [] for layer_idx in range(mt.n_layer)}

for layer_idx in range(mt.n_layer):
    out_proj = baukit.get_module(mt.model, out_proj_path_format.format(layer_idx))
    layer_promotions = neuron_promotions[layer_idx]

    # Same list object as the dict entry, so one append records the neuron.
    found_neurons = concept_drivers[layer_idx]

    for column in range(out_proj.weight.shape[1]):
        # The length filter does not depend on the concept, so it is applied
        # once per neuron rather than once per (concept, candidate) pair.
        candidate_tokens = [
            candidate["token"]
            for candidate in layer_promotions[column][:cut_off_rank]
            if len(candidate["token"].strip()) >= min_token_chars
        ]
        # any() stops at the first match, so a neuron is recorded at most once.
        is_driver = any(
            functional.is_nontrivial_prefix(prediction=candidate, target=target_token)
            for target_token in concepts
            for candidate in candidate_tokens
        )
        if is_driver:
            found_neurons.append(column)

    if found_neurons:
        print(
            f"found {len(found_neurons)} neurons in layer {layer_idx} > {found_neurons}"
        )

found 2 neurons in layer 1 > [1218, 4799]
found 4 neurons in layer 2 > [109, 428, 492, 1560]
found 1 neurons in layer 3 > [3534]
found 1 neurons in layer 4 > [2249]
found 3 neurons in layer 5 > [269, 675, 4474]
found 1 neurons in layer 6 > [4761]
found 2 neurons in layer 7 > [1707, 2533]
found 2 neurons in layer 10 > [2077, 4789]
found 2 neurons in layer 12 > [314, 2858]
found 1 neurons in layer 14 > [97]
found 2 neurons in layer 15 > [3151, 3993]
found 1 neurons in layer 18 > [3911]
found 4 neurons in layer 19 > [1251, 1254, 2317, 3577]
found 1 neurons in layer 24 > [3852]
found 2 neurons in layer 25 > [1799, 2507]
found 1 neurons in layer 26 > [1050]
found 2 neurons in layer 27 > [1945, 4846]
found 3 neurons in layer 28 > [418, 1999, 3322]
found 1 neurons in layer 30 > [2819]
found 2 neurons in layer 31 > [3021, 4491]
found 3 neurons in layer 32 > [2065, 2158, 4184]
found 2 neurons in layer 33 > [904, 3335]
found 4 neurons in layer 37 > [208, 2105, 3376, 4177]
found 1 neurons in laye

In [9]:
# layer = 25
# neuron = 1799


# out_proj = baukit.get_module(mt.model, out_proj_path_format.format(layer))
# logits = mt.lm_head(out_proj.weight[:, neuron])

# logit_values = logits.sort(descending=True).values.detach().cpu().numpy()[:30]
# logit_tokens = logits.sort(descending=True).indices.detach().cpu().numpy()[:30]

# logit_tokens = ['"{}"'.format(mt.tokenizer.decode([t])) for t in logit_tokens]

# from matplotlib import pyplot as plt
# plt.bar(range(len(logit_values)), logit_values)
# plt.xticks(range(len(logit_values)), logit_tokens, rotation=90)

# plt.show()

In [36]:
# ! Don't forget to restore the weights
restore_weights(mt, WEIGHTS_CACHED)

# Baseline: next-token prediction of the unmodified model.
prompt = tokenization_utils.maybe_prefix_eos(mt.tokenizer, "Eric works as a")

functional.predict_next_token(mt=mt, prompt=prompt)

[[PredictedToken(token=' freelance', prob=0.027900464832782745),
  PredictedToken(token=' software', prob=0.020736632868647575),
  PredictedToken(token=' consultant', prob=0.016052845865488052),
  PredictedToken(token=' full', prob=0.015277174301445484),
  PredictedToken(token=' Senior', prob=0.01317055243998766)]]

In [37]:
magnify_scale = 100

# ! Don't forget to restore the weights
# Restoring first keeps this cell idempotent: re-running it scales the
# original weights once instead of compounding the factor.
restore_weights(mt, WEIGHTS_CACHED)

with torch.no_grad():
    for layer, neurons in concept_drivers.items():
        if not neurons:
            continue
        # Look the module up once per layer instead of once per neuron.
        out_proj = baukit.get_module(mt.model, out_proj_path_format.format(layer))
        for neuron in neurons:
            # Scale the whole output column of this neuron in place.
            out_proj.weight[:, neuron] *= magnify_scale

functional.predict_next_token(mt=mt, prompt=prompt)

[[PredictedToken(token=' health', prob=0.9831162095069885),
  PredictedToken(token=' healthcare', prob=0.01635204441845417),
  PredictedToken(token=' Health', prob=0.0002684734936337918),
  PredictedToken(token=' care', prob=0.0001469957787776366),
  PredictedToken(token=' Healthcare', prob=8.562780567444861e-05)]]

In [12]:
# Generate from the restored (unmodified) model for comparison.
restore_weights(mt, WEIGHTS_CACHED)

greedy_output = functional.mamba_generate(mt=mt, prompt=prompt, topk=1)
greedy_output.generation

[' freelance writer and editor. He has written for a']

### Driving LM by replacing certain neurons with a set of random concept directions

In [13]:
[(token_id, mt.tokenizer.decode(token_id)) for token_id in concept_start_token_ids]

[(7345, ' doctor'),
 (15339, ' nurse'),
 (30286, ' therapist'),
 (11723, ' healthcare'),
 (9921, ' medicine'),
 (3739, ' medical')]

In [15]:
# Fetch the `lm_head` module; its weight is indexed by token id in later cells.
lm_head = baukit.get_module(mt.model, "lm_head")
# Display the weight's shape for reference.
lm_head.weight.shape

torch.Size([50280, 2560])

In [35]:
from src import models

noise_level = .1        # scale of the Gaussian noise added to each direction
num_rand_vectors = 10   # noisy copies generated per concept token

# Each entry is the `lm_head` row of a concept token plus Gaussian noise.
# NOTE(review): the noise comes from torch's global RNG and no seed is set in
# this notebook, so these vectors differ from run to run.
random_neurons = []

# The loop variable is deliberately not called `id`: at cell level that would
# overwrite the builtin `id` for every later cell.
for token_id in concept_start_token_ids:
    unembed_row = lm_head.weight[token_id].detach().clone()
    random_neurons.extend(
        unembed_row + torch.randn_like(unembed_row) * noise_level
        for _ in range(num_rand_vectors)
    )

In [51]:
import random

# A neuron is a column of a layer's `out_proj.weight` (it is addressed as
# `weight[:, neuron]`), so the candidate pool is sized by that matrix's column
# count. `lm_head.weight.shape[1]` is the row length of the unembedding
# matrix, which is smaller here and would leave the upper column indices
# unreachable.
# NOTE(review): `random.sample` uses the global, unseeded RNG, so the chosen
# neurons change on every run.
all_neurons = [
    (layer, neuron)
    for layer in range(mt.n_layer)
    for neuron in range(
        baukit.get_module(mt.model, out_proj_path_format.format(layer)).weight.shape[1]
    )
]

# One distinct (layer, neuron) slot per replacement vector.
random_neuron_idxes = random.sample(all_neurons, k=len(random_neurons))

In [54]:
magnify_scale = 100

# ! Don't forget to restore the weights
restore_weights(mt, WEIGHTS_CACHED)

# Overwrite each sampled out_proj column with its scaled replacement vector.
with torch.no_grad():
    for (layer, neuron_idx), neuron in zip(random_neuron_idxes, random_neurons):
        out_proj = baukit.get_module(mt.model, out_proj_path_format.format(layer))
        out_proj.weight[:, neuron_idx] = magnify_scale * neuron

functional.predict_next_token(mt=mt, prompt=prompt)

[[PredictedToken(token=' healthcare', prob=0.9999998807907104),
  PredictedToken(token=' Healthcare', prob=1.119929962101196e-07),
  PredictedToken(token=' nurse', prob=8.797612838966698e-15),
  PredictedToken(token='health', prob=1.6366389135693312e-16),
  PredictedToken(token=' nurses', prob=8.44718826772622e-17)]]