In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.append("../")
import os

import logging
from src.utils import logging_utils
from src.utils import env_utils
from src import functional

# Configure the root logger first: DEBUG and above, written to stdout so the
# records appear in the cell output, formatted with the project's shared
# format / date-format strings.
logging.basicConfig(
    stream=sys.stdout,
    level=logging.DEBUG,
    datefmt=logging_utils.DEFAULT_DATEFMT,
    format=logging_utils.DEFAULT_FORMAT,
)

# Logger used by the cells of this notebook.
logger = logging.getLogger(__name__)

import torch
import transformers

# Record library versions and the visible accelerator so the saved outputs can
# be tied to a concrete environment.
logger.info(f"{torch.__version__=}, {torch.version.cuda=}")

# `torch.cuda.get_device_name()` raises when no CUDA device is usable, which
# would abort this setup cell on a CPU-only machine. Only query it when CUDA is
# available; the emitted log line keeps exactly the same shape as before.
cuda_device_name = torch.cuda.get_device_name() if torch.cuda.is_available() else None
logger.info(
    f"{torch.cuda.is_available()=}, {torch.cuda.device_count()=}, "
    f"torch.cuda.get_device_name()={cuda_device_name!r}"
)
logger.info(f"{transformers.__version__=}")

  from .autonotebook import tqdm as notebook_tqdm


2024-07-11 12:54:30 __main__ INFO     torch.__version__='2.3.1', torch.version.cuda='12.1'
2024-07-11 12:54:31 __main__ INFO     torch.cuda.is_available()=True, torch.cuda.device_count()=1, torch.cuda.get_device_name()='NVIDIA RTX A6000'
2024-07-11 12:54:31 __main__ INFO     transformers.__version__='4.42.3'


In [3]:
from nnsight import LanguageModel
from src.models import ModelandTokenizer

# Load the base Llama-3-8B checkpoint through the project's model/tokenizer
# wrapper, requesting float32 weights.
mt = ModelandTokenizer(model_key="meta-llama/Meta-Llama-3-8B", torch_dtype=torch.float32)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


2024-07-09 12:58:43 accelerate.utils.modeling INFO     We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).


Loading checkpoint shards: 100%|██████████| 4/4 [00:08<00:00,  2.18s/it]

2024-07-09 12:58:52 src.models INFO     loaded model </home/local_arnab/Codes/saved_model_weights/meta-llama/Meta-Llama-3-8B> | size: 30633.023 MB | dtype: torch.float32 | device: cuda:0





In [18]:
from src.models import is_llama_variant

subj, obj = "The Space Needle", "Nairobi"

# Llama-variant tokenizers get an offset of one -- presumably to skip a
# prepended BOS token (TODO confirm against the tokenizer configuration).
if is_llama_variant(mt):
    obj_start_idx = 1
else:
    obj_start_idx = 0

# Token ids of the object string, from `obj_start_idx` onwards.
o_toks = mt.tokenizer(obj).input_ids[obj_start_idx:]

o_toks

[45, 84552]

In [19]:
tokenized = mt.tokenizer(subj, return_tensors="pt").to(mt.device)
# ------------------------------
n_rel_toks = 10
# ------------------------------

# Sequence length of the subject alone, i.e. the index of the first appended
# placeholder position.
subj_end = tokenized.input_ids.shape[-1]

# Append `n_rel_toks` placeholder positions after the subject. The count is
# taken from `n_rel_toks` (previously a hard-coded 10) so that changing the
# knob above keeps the padded input in sync with everything else sized by it.
# `new_full` / `new_ones` inherit dtype and device from the tensors they extend.
placeholder_shape = (tokenized.input_ids.shape[0], n_rel_toks)
tokenized["input_ids"] = torch.cat(
    [
        tokenized.input_ids,
        tokenized.input_ids.new_full(placeholder_shape, mt.tokenizer.pad_token_id),
    ],
    dim=-1,
)
# The placeholders are marked as attended (mask value 1), exactly as before.
tokenized["attention_mask"] = torch.cat(
    [tokenized.attention_mask, tokenized.attention_mask.new_ones(placeholder_shape)],
    dim=-1,
)

In [21]:
import baukit
import torch.nn.functional as F


# Work on a detached copy of the embedding weight so the model's own
# parameters are never modified.
embedding_module = baukit.get_module(mt._model, mt.embedder_name)
# Rescale along dim=1 so every row has unit L2 norm.
embedder = F.normalize(embedding_module.weight.detach().clone(), p=2, dim=1)
# NOTE(review): gradient tracking is switched on for this copy, but no visible
# cell optimises `embedder` itself -- confirm it is actually needed.
embedder.requires_grad_(True)

In [10]:
# -----------------------------------
learning_rate = 1e-3
num_steps = 2500
# -----------------------------------

# Every soft token starts at the mean of the normalised embedding rows plus a
# small random perturbation. After the transpose there is one column per soft
# token (`n_rel_toks` columns).
soft_tokens = embedder.mean(dim=0).repeat(n_rel_toks, 1).T
soft_tokens += torch.randn_like(soft_tokens) * 1e-3
# soft_tokens = F.normalize(soft_tokens, p=2, dim=0)
soft_tokens = torch.nn.Parameter(soft_tokens)

optimizer = torch.optim.Adam([soft_tokens], lr=learning_rate)


def compute_token_map_loss(soft_tokens, embedder):
    """Penalise soft tokens whose best embedding match has a dot product != 1.

    NOTE(review): the saved cell contained only a dangling ``def`` and no loss
    computation, so this objective is a reconstruction. It is chosen to match
    what the saved notebook still shows: a tensor named ``embed_dot_prods``
    that is reduced with ``max(dim=0)`` into one value per soft token, and
    logged loss values consistent with a sum of squared gaps to 1. Please
    confirm against the intended objective.

    Args:
        soft_tokens: tensor with one column per soft token.
        embedder: normalised embedding matrix whose rows are multiplied with
            the soft-token columns.

    Returns:
        ``(token_map_loss, embed_dot_prods)`` -- the scalar loss and the full
        matrix of dot products it was computed from.
    """
    embed_dot_prods = embedder @ soft_tokens
    best_match = embed_dot_prods.max(dim=0).values
    token_map_loss = (1.0 - best_match).pow(2).sum()
    return token_map_loss, embed_dot_prods


# `step` replaces the former loop variable `iter`, which shadowed the builtin
# for the rest of the kernel session; the log text is unchanged.
for step in range(num_steps):
    optimizer.zero_grad()

    # `embed_dot_prods` is kept as a notebook-level name because a later cell
    # inspects it.
    token_map_loss, embed_dot_prods = compute_token_map_loss(soft_tokens, embedder)
    loss = token_map_loss

    if step % 100 == 0:
        logger.info(f"iter={step}, loss={loss.item()} | token_map_loss={token_map_loss.item()}")
    loss.backward()
    optimizer.step()

2024-07-09 13:00:39 __main__ INFO     iter=0, loss=4.766937255859375 | token_map_loss=4.766937255859375
2024-07-09 13:00:40 __main__ INFO     iter=100, loss=3.981580084655434e-05 | token_map_loss=3.981580084655434e-05
2024-07-09 13:00:42 __main__ INFO     iter=200, loss=1.1087664120168483e-10 | token_map_loss=1.1087664120168483e-10
2024-07-09 13:00:44 __main__ INFO     iter=300, loss=7.105427357601002e-14 | token_map_loss=7.105427357601002e-14
2024-07-09 13:00:46 __main__ INFO     iter=400, loss=0.0 | token_map_loss=0.0
2024-07-09 13:00:48 __main__ INFO     iter=500, loss=0.0 | token_map_loss=0.0
2024-07-09 13:00:50 __main__ INFO     iter=600, loss=0.0 | token_map_loss=0.0


KeyboardInterrupt: 

In [11]:
# Reduce over dim 0: for every remaining column, the largest dot product and
# the row index that attains it. Expects `embed_dot_prods` to already exist in
# the kernel (it is not defined in this cell).
torch.max(embed_dot_prods, dim=0)

torch.return_types.max(
values=tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.], device='cuda:0',
       grad_fn=<MaxBackward0>),
indices=tensor([122549, 122549, 122549, 122549, 122549, 122549, 122549, 122549, 122549,
        122549], device='cuda:0'))

In [14]:
from src.models import prepare_input

# Alternative prompts kept for quick switching:
# prompt = "Assume an alternate universe where Eiffel Tower is one of the tourist attractions in the capital of Italy. In that universe the Eiffel Tower is located in the city of"
prompt = "The relationship between the Albert Einstein and Matthew McConaughey is"
# prompt = "The capital of France is Oslo. The capital of France is"

# Tokenise the prompt with the project helper.
inputs = prepare_input(prompts=prompt, tokenizer=mt, add_bos_token=False)

# Generate up to 60 new tokens.
# NOTE(review): `top_k` only matters when sampling is enabled -- confirm the
# checkpoint's generation config turns `do_sample` on.
generation = mt._model.generate(**inputs, max_new_tokens=60, top_k=5)

# Decode the first (only) sequence, keeping special tokens visible.
mt.tokenizer.decode(generation[0], skip_special_tokens=False)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


'<|begin_of_text|>The relationship between the Albert Einstein and Matthew McConaughey is a strange one. The actor has been seen wearing the physicist’s glasses on numerous occasions, and he even named his son after him. But what does this have to do with the actor’s career?\nAlbert Einstein was a physicist\nAlbert Einstein was a physicist who was born in Ulm, Germany.'