In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_from_disk

from config import hf_model_id, lima_filtered_paraphrased_dataset_path, get_dataset_config
import torch
import numpy as np
import random
import gc

from utilities.preprocessing import prepare_dataset
from utilities.gradient_operations import get_gradients, get_flattened_weight_vector

[2024-11-06 11:53:18,042] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)


INFO:root:gcc -pthread -B /home/lukashinterleitner/anaconda3/envs/master-thesis/compiler_compat -fno-strict-overflow -Wsign-compare -DNDEBUG -O2 -Wall -fPIC -O2 -isystem /home/lukashinterleitner/anaconda3/envs/master-thesis/include -fPIC -O2 -isystem /home/lukashinterleitner/anaconda3/envs/master-thesis/include -fPIC -c /tmp/tmp_mgs6e74/test.c -o /tmp/tmp_mgs6e74/test.o
INFO:root:gcc -pthread -B /home/lukashinterleitner/anaconda3/envs/master-thesis/compiler_compat /tmp/tmp_mgs6e74/test.o -laio -o /tmp/tmp_mgs6e74/a.out
/home/lukashinterleitner/anaconda3/envs/master-thesis/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
INFO:root:gcc -pthread -B /home/lukashinterleitner/anaconda3/envs/master-thesis/compiler_compat -fno-strict-overflow -Wsign-compare -DNDEBUG -O2 -Wall -fPIC -O2 -isystem /home/lukashinterleitner/anaconda3/envs/master-thesis/include -fPIC -O2 -isystem /home/lukashinterleitner/anaconda3/envs/master-thesis/include 

In [2]:
# Reproducibility: seed every random number generator used in this notebook
# (PyTorch, NumPy, Python's `random`) with one shared value.
SEED = 42
for seed_rng in (torch.manual_seed, np.random.seed, random.seed):
    seed_rng(SEED)

# Make cuDNN pick deterministic kernels and skip its auto-tuning benchmark,
# which could otherwise select different algorithms from run to run.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [3]:
# Run Python's garbage collector first so that tensors kept alive only by
# reference cycles are actually freed, THEN ask the CUDA caching allocator to
# release its unused blocks. In the opposite order, memory freed by the
# collector would stay parked in PyTorch's cache.
unreachable_count = gc.collect()
torch.cuda.empty_cache()

# Display how many unreachable objects the collector found.
unreachable_count

45

In [4]:
# Load the tokenizer and the causal language model for the checkpoint id
# imported from `config`.
tokenizer = AutoTokenizer.from_pretrained(hf_model_id)
model = AutoModelForCausalLM.from_pretrained(hf_model_id)

# Print the special-token ids, one per line: EOS first, then PAD.
for special_token_id in (tokenizer.eos_token_id, tokenizer.pad_token_id):
    print(special_token_id)

50279
1


In [5]:
# Build the dataset configuration for this model using the project helper
# imported from `config`.
dataset_config = get_dataset_config(model)
# Bare expression: lets the notebook display the resulting configuration.
dataset_config

DatasetConfig(chat_template='tulu', preference_chosen_key='chosen', preference_rejected_key='rejected', sft_messages_key='messages', binary_messages_key='messages', label='binary_labels', convert_preference_to_binary_dataset=False, max_token_length=2048, max_prompt_token_lenth=None, sanity_check=False, sanity_check_max_samples=100, batched=False, load_from_cache_file=True, num_proc=12, train_only_on_prompt=True, ncols=2)

In [6]:
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
device_type = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_type)

# Display the selected device.
device

device(type='cuda')

In [7]:
# Uncomment the following line to move the model to the GPU (see `device` above).
# model.to(device)

In [8]:
# Switch the model to evaluation mode. This only changes the behavior of
# train-time-specific layers (e.g. dropout); it does NOT disable autograd, so
# gradients can still be computed later in the notebook.
model.eval()

OlmoForCausalLM(
  (model): OlmoModel(
    (embed_tokens): Embedding(50304, 2048, padding_idx=1)
    (layers): ModuleList(
      (0-15): 16 x OlmoDecoderLayer(
        (self_attn): OlmoSdpaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (v_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): OlmoRotaryEmbedding()
        )
        (mlp): OlmoMLP(
          (gate_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): OlmoLayerNorm()
        (post_attention_layernorm): OlmoLayerNorm()
      )
    )
    (norm): OlmoLayerNorm()
  )
  (

In [9]:
# Load the dataset previously saved to disk at the path configured in `config`.
dataset = load_from_disk(lima_filtered_paraphrased_dataset_path)

In [10]:
# Inspect which columns the loaded dataset provides.
dataset.column_names

['id', 'messages', 'paraphrased_messages']

In [11]:
# Preprocess the raw dataset with the project helper `prepare_dataset`.
# The cell's progress output shows a tokenize/reformat pass followed by a
# filter pass over the examples.
processed_dataset = prepare_dataset(dataset=dataset, tokenizer=tokenizer, model=model)

Tokenizing and reformatting instruction data:   0%|          | 0/988 [00:00<?, ? examples/s]

Filter:   0%|          | 0/988 [00:00<?, ? examples/s]

In [12]:
# Shape of the first sample's `input_ids`.
# `next(iter(...))` fetches only the first sample; the previous
# `list(processed_dataset)[0]` materialized (and preprocessed) every sample
# just to read one.
next(iter(processed_dataset))["input_ids"].size()

  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)


torch.Size([1, 363])

In [13]:
# Decode the first sample's token ids back to text as a sanity check.
# `next(iter(...))` fetches only the first sample instead of materializing the
# whole dataset into a list; `[0]` selects the first row of `input_ids`.
tokenizer.decode(next(iter(processed_dataset))["input_ids"][0])

'<|user|>\nCan brain cells move? By movement I mean long distance migration (preferably within the brain only).\n<|assistant|>\nThe question is relatively broad and one should take into account that the brain not only consists of neurons, but also glial cells (supportive cells) and pre-mitotic neuronal stem cells. Furthermore, as critical fellow-scientists have indicated, developmental stage is very important, as the developing embryonic brain is very different from the adult brain.\nHowever, after sifting through various publications, the answer to the question is actually remarkably simple: Yes, brain cells migrate.\nIn  the adult brain glial cells migrate in the brain (Klämbt, 2009). Glial cells are involved in a myriad of functions, but a notable example of migrating glial cells are the oligodendrocytes that migrate relative long distances to find their target axons onto which they wrap themselves to form the insulating myelin sheath (Tsai and Miller, 2002).\nNeuronal stem cells mi

In [14]:
# Compute the gradients for the FIRST sample only and flatten them into a
# single vector (a one-sample trial run, not a pass over the whole dataset).
sample = next(iter(processed_dataset), None)
if sample is None:
    # Fail loudly here rather than leaving `flattened_gradients` undefined and
    # hitting a confusing NameError in a later cell.
    raise ValueError("processed_dataset is empty; cannot compute gradients")

gradients = get_gradients(model, sample)
flattened_gradients = get_flattened_weight_vector(gradients)

In [15]:
# Number of entries in the flattened gradient vector
# (presumably one per model parameter — TODO confirm).
flattened_gradients.size()

torch.Size([1176764416])

In [16]:
# TODO: investigate how OLMo performs a single training iteration; check masking
# TODO: add filtering with regard to open-instruct (similarity threshold)
# TODO: ranking between sampling
# TODO: TF-IDF (term frequency-inverse document frequency)
# TODO: think about explainability vs. similarity

In [17]:
# Use random projections to reduce the size of the weight (gradient) vector.
# Compare the resulting ranking to other algorithms: BM25, TF-IDF (optionally ROUGE).