In [132]:
from transformers import OlmoForCausalLM, AutoTokenizer
from datasets import load_from_disk
import torch
import numpy as np
import random

from open_instruct.open_instruct.finetune import encode_with_messages_format

from utilities import transform_dataset

In [133]:
# Set seeds for reproducibility
torch.manual_seed(42)
np.random.seed(42)
random.seed(42)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [134]:
olmo = OlmoForCausalLM.from_pretrained("allenai/OLMo-1B-hf")
tokenizer = AutoTokenizer.from_pretrained("allenai/OLMo-1B-hf")

print(tokenizer.eos_token_id)
print(tokenizer.pad_token_id)

50279
1


In [135]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [155]:
olmo.eval()

OlmoForCausalLM(
  (model): OlmoModel(
    (embed_tokens): Embedding(50304, 2048, padding_idx=1)
    (layers): ModuleList(
      (0-15): 16 x OlmoDecoderLayer(
        (self_attn): OlmoSdpaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (v_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): OlmoRotaryEmbedding()
        )
        (mlp): OlmoMLP(
          (gate_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): OlmoLayerNorm()
        (post_attention_layernorm): OlmoLayerNorm()
      )
    )
    (norm): OlmoLayerNorm()
  )
  (

In [137]:
dataset = load_from_disk("./data/lima/single_question_answers")
dataset

Dataset({
    features: ['dataset', 'id', 'messages'],
    num_rows: 988
})

In [139]:
dataset["messages"][0][0]["content"]

'Can brain cells move? By movement I mean long distance migration (preferably within the brain only).'

In [140]:
dataset["messages"][0][1]["content"]

'The question is relatively broad and one should take into account that the brain not only consists of neurons, but also glial cells (supportive cells) and pre-mitotic neuronal stem cells. Furthermore, as critical fellow-scientists have indicated, developmental stage is very important, as the developing embryonic brain is very different from the adult brain.\nHowever, after sifting through various publications, the answer to the question is actually remarkably simple: Yes, brain cells migrate.\nIn  the adult brain glial cells migrate in the brain (Klämbt, 2009). Glial cells are involved in a myriad of functions, but a notable example of migrating glial cells are the oligodendrocytes that migrate relative long distances to find their target axons onto which they wrap themselves to form the insulating myelin sheath (Tsai and Miller, 2002).\nNeuronal stem cells migrate over long distances in response to injury (Imitola et al., 2004) and they migrate from specific stem-cell locations (e.g.

In [141]:
from functools import partial
from torch.utils.data import DataLoader
from transformers import DataCollatorForSeq2Seq

encode_function = partial(
    encode_with_messages_format,
    tokenizer=tokenizer,
    max_seq_length=2048,
    add_bos=False,
)

lm_dataset = dataset.map(
    encode_function,
    batched=False,
    num_proc=1,
    load_from_cache_file=False,
    remove_columns=[
        name
        for name in dataset.column_names
        if name not in ["input_ids", "labels", "attention_mask"]
    ],
    desc="Tokenizing and reformatting instruction data",
)
lm_dataset.set_format(type="pt")
lm_dataset = lm_dataset.filter(lambda example: (example["labels"] != -100).any())

train_dataloader = DataLoader(
    lm_dataset,
    shuffle=False,
    collate_fn=DataCollatorForSeq2Seq(tokenizer=tokenizer, model=olmo, padding="longest"),
    batch_size=1,
)

Tokenizing and reformatting instruction data:   0%|          | 0/988 [00:00<?, ? examples/s]

Filter:   0%|          | 0/988 [00:00<?, ? examples/s]

In [142]:
print(len(train_dataloader))

988


In [149]:
def get_gradients(batch):
    gradients = {}
    
    olmo.zero_grad()

    output = olmo(**batch, use_cache=False)
    loss = output.loss
    
    loss.backward()
    
    for name, param in olmo.named_parameters():
        if param.grad is not None:
            gradients[name] = param.grad.clone().detach()
            
    return gradients

---
# gradient checking - start

In [150]:
training_sample_0 = list(train_dataloader)[0]
training_sample_1 = list(train_dataloader)[1]

In [151]:
# first and foremost, check if two samples are different after tokenizing
if training_sample_0["input_ids"].equal(training_sample_1["input_ids"]): 
    print("Tokenized inputs are the same. Check tokenizing functionality!")
else:
    print("As expected, tokenized inputs are not the same. ")

As expected, tokenized inputs are not the same. 


In [152]:
# check idempotency of some input samples
gradients_sample_0 = get_gradients(training_sample_0)
gradients_sample_1 = get_gradients(training_sample_1)
gradients_sample_0_later = get_gradients(training_sample_0)

# gradient dictionary keys of sample_0, sample_0_later and sample_1 should be the same
assert gradients_sample_0.keys() == gradients_sample_0_later.keys() == gradients_sample_1.keys()

In [153]:
# compare gradient of the same sample
assert gradients_sample_0.keys() == gradients_sample_0_later.keys(), "Gradient dictionaries must have same keys."

for key in gradients_sample_0.keys():
    assert gradients_sample_0[key].equal(gradients_sample_0_later[key]), f"Gradient '{key}' not equal!"
    
print("Gradients are equal when using the same sample.")

Gradients are equal when using the same sample.


In [154]:
# compare gradients of two different samples
assert gradients_sample_0.keys() == gradients_sample_1.keys(), "Gradient dictionaries must have same keys."

for key in gradients_sample_0.keys():
    assert not gradients_sample_0[key].equal(gradients_sample_1[key]), f"Gradient '{key}' equal!"
    
print("Gradients are different when using two different samples.")

Gradients are different when using two different samples.


In [170]:
gradients_sample_0[list(gradients_sample_0.keys())[0]].sum(1).dot(gradients_sample_1[list(gradients_sample_1.keys())[0]].sum(1))

tensor(-1.9074e-16)

In [None]:
del training_sample_0
del training_sample_1

del gradients_sample_0
del gradients_sample_1
del gradients_sample_0_later

# gradient checking - end

---

In [161]:
gradients = {}

for batch in train_dataloader:
    olmo.zero_grad()
    
    output = olmo(**batch, use_cache=False)
    loss = output.loss
    print(loss)
    
    loss.backward()
    
    for name, param in olmo.named_parameters():
        if param.grad is not None:
            gradients[name] = param.grad.clone().detach()
    
    break

tensor(2.5254, grad_fn=<NllLossBackward0>)


In [162]:
gradients[list(gradients.keys())[0]].data

IndexError: list index out of range

In [6]:
input_text1 = "Can brain cells move? By movement I mean long distance migration (preferably within the brain only)."
input_text2 = "Is it possible for brain cells to migrate over long distances, specifically within the brain itself?"

In [7]:
inputs1 = tokenizer(input_text1, return_tensors='pt')
inputs2 = tokenizer(input_text2, return_tensors='pt')

In [None]:
inputs1 = {k: v.to(device).requires_grad_(True) for k, v in inputs1.items()}
inputs2 = {k: v.to(device).requires_grad_(True) for k, v in inputs2.items()}

In [None]:
output1 = olmo(**inputs1)
output2 = olmo(**inputs2)

In [None]:
# todo: investigate how olmo uses a single training iteration, check masking
# todo: add filtering with regard to open instruct (threshold for similarity)
# todo: ranking between sampling
# todo: tf-idf -> term-frequency inverse-document-frequency
# todo: think about explainability vs. similarity