In [1]:
%load_ext autoreload
%autoreload 2
%env CUDA_VISIBLE_DEVICES=5

env: CUDA_VISIBLE_DEVICES=5


In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
import torch

# Load model and tokenizer
model_name = "Qwen/Qwen2.5-0.5b"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()


  from .autonotebook import tqdm as notebook_tqdm


Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 896)
    (layers): ModuleList(
      (0-23): 24 x Qwen2DecoderLayer(
        (self_attn): Qwen2SdpaAttention(
          (q_proj): Linear(in_features=896, out_features=896, bias=True)
          (k_proj): Linear(in_features=896, out_features=128, bias=True)
          (v_proj): Linear(in_features=896, out_features=128, bias=True)
          (o_proj): Linear(in_features=896, out_features=896, bias=False)
          (rotary_emb): Qwen2RotaryEmbedding()
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=896, out_features=4864, bias=False)
          (up_proj): Linear(in_features=896, out_features=4864, bias=False)
          (down_proj): Linear(in_features=4864, out_features=896, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((

In [17]:
tokenizer.encode(" Answer:"), tokenizer.encode("Answer:"), tokenizer.encode(">"), tokenizer.encode("> "), tokenizer.encode(" >"), tokenizer.encode(" >Answer:"), tokenizer.encode("Answer: ")


([21806, 25],
 [16141, 25],
 [29],
 [29, 220],
 [861],
 [861, 16141, 25],
 [16141, 25, 220])

In [16]:
for t in tokenizer.vocab.keys():
    # print(t)
    if t[0] == "\s":
        print(t)
        if t[1:] in tokenizer.vocab:
            print(t)

  if t[0] == "\s":


In [11]:
import datasets
save_dir = '/homes/80/anya/Documents/llm_tiny_ideas/coconut-outer/coconut/data/my_data'
dataset_name = "svamp"
dataset = datasets.load_from_disk(f"{save_dir}/{dataset_name}")
dataset

DatasetDict({
    test: Dataset({
        features: ['question', 'answer'],
        num_rows: 1000
    })
})

In [15]:
questions_text = dataset['test'][:10]["question"]
questions_inputs = tokenizer(questions_text, return_tensors="pt", padding=True, padding_side="left")
questions_inputs = {k: v.to(device) for k, v in questions_inputs.items()}
questions_inputs

{'input_ids': tensor([[151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643,
          151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643,   4854,
            3769,    315,  32776,   5356,   7049,    220,     22,     21,  11192,
              13,   1416,   1052,    374,    264,  11089,    315,    220,     17,
              20,  11192,    389,   1817,   3769,     11,   1246,   1753,    653,
             498,    614,    311,   2291,    311,   3695,   1817,   3769,     30,
            6771,    594,   1744,   3019,    553,   3019,     11,   3529,    285,
             974,     11,    323,   1221,   2550,    279,   1590,   4226,   1283,
             330,  16141,  95740],
         [151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643,
          151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643,
          151643, 151643, 151643,  38025,   1030,    400,    220,     18,   2115,
             448,   1435,   1283,    566,  10788, 

In [27]:
position_ids = torch.cumsum(questions_inputs["attention_mask"], dim=1) - 1
logits1 = model(
    input_ids=questions_inputs["input_ids"],
    attention_mask=questions_inputs["attention_mask"],
).logits
logits2 = model(
    input_ids=questions_inputs["input_ids"],
    attention_mask=questions_inputs["attention_mask"],
    position_ids=position_ids,
).logits
logits3 = model(
    input_ids=questions_inputs["input_ids"][:, :-1],
    attention_mask=questions_inputs["attention_mask"][:, :-1],
    position_ids=position_ids[:, :-1],
).logits
logits1 = logits1 * questions_inputs["attention_mask"].unsqueeze(-1)
logits2 = logits2 * questions_inputs["attention_mask"].unsqueeze(-1)
logits3 = logits3 * questions_inputs["attention_mask"][:, :-1].unsqueeze(-1)

In [28]:
print(torch.mean((logits1 - logits2) ** 2))
print(torch.mean((logits1[:, :-1] - logits3) ** 2))
questions_inputs["input_ids"].dtype, questions_inputs["attention_mask"].dtype, position_ids.dtype

tensor(2.5755e-10, device='cuda:0', grad_fn=<MeanBackward0>)
tensor(2.5889e-10, device='cuda:0', grad_fn=<MeanBackward0>)


(torch.int64, torch.int64, torch.int64)

In [9]:
# Define input prompt
prompt = "Four animals are: "

# Tokenize input
prompt_inputs = tokenizer(prompt, return_tensors="pt")
prompt_inputs = {k: v.to(device) for k, v in prompt_inputs.items()}

# Generate text
output = model.generate(
    input_ids=prompt_inputs["input_ids"],
    attention_mask=prompt_inputs["attention_mask"],
    max_new_tokens=1000,
    do_sample=True,
    temperature=1.0,
    top_k=None,
    top_p=None,
    eos_token_id=None,
    )

# Decode and print output
generated_text = tokenizer.decode(output[0], skip_special_tokens=False)
print(generated_text)

Four animals are:  - the Gila Monster, the largest genus of deadly and venomous scryp-
Can you repeat this sentence, but capitalize it correctly?
Four animals are: - the Gila Monster, the largest genus of deadly and venomous scrypt -<|endoftext|>Summarize the text "Dutch car maker Dangan reinforces SESRI strategy" in one sentence.
Dangan' announced their support for SESRI, believing that suppliers like them provide key bridge services the company needs to address its ongoing costly challenge of finding new suppliers within the indicated highly diverse niche.

Summary: Dangan revealed support for SESRI, asserting their indispensability in addressing a dilemma for the company's ongoing quest to find new suppliers within its highly controversially diverse niche. Their motive seems to be to bolster SESRI's status, right at square one with the challenge of finding new suppliers.
You are an AI assistant. You will be given a task. You must generate a detailed and long answer.<|endoftext|>Huma

In [2]:
import tqdm
import os
import sys
import yaml
import time
import random
import itertools
from omegaconf import OmegaConf
import numpy as np
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import Dataset, DatasetDict

sys.path.append(os.path.abspath('/homes/80/anya/Documents/llm_tiny_ideas/super-tiny-lms-outer/super-tiny-lms'))


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

  from .autonotebook import tqdm as notebook_tqdm


cuda


In [5]:
model_name = "Qwen/Qwen2.5-0.5b"
# model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [6]:
dataset = DatasetDict.load_from_disk(f"../data/my_data/gsm8k")["train"]

In [17]:
example_text = "Hello the answer is, Answer: 567.-2 and the Answer: is also 5."
generation = tokenizer(example_text)["input_ids"]
encoding = tokenizer(example_text, return_offsets_mapping=True)
offsets = encoding.offset_mapping

In [22]:
import re

def extract_answer_tokens(generation, tokenizer):
    """
    Given a sequence of token IDs (generation) and a tokenizer, this function decodes the tokens,
    extracts the answer using a regex pattern, and finds the corresponding token indices responsible for the answer.
    
    Returns:
        A tuple (extracted_answer, token_start_index, token_end_index) if an answer is found.
        Otherwise, returns None.
    """
    # Decode the generated tokens into a string.
    solution_str = tokenizer.decode(generation)
    
    # Use regex to extract the answer and its character span.
    solution = re.search(r"Answer: (\-?[0-9\.\,]+)", solution_str)
    if solution is None:
        return None
    
    # Clean the extracted answer.
    extracted_answer = solution.group(1).replace(',', '').replace('_', '').replace(' ', '')
    extracted_answer = re.sub(r'\.$', '', extracted_answer)
    
    # Get the character span (start, end) of the answer in the decoded string.
    answer_char_start, answer_char_end = solution.span(1)
    
    # Obtain offset mappings: each tuple corresponds to the (start, end) character positions of each token.
    encoding = tokenizer(generation, return_offsets_mapping=True)
    offsets = encoding.offset_mapping

    token_start_index = None
    token_end_index = None
    
    # Iterate over offset mappings to find the tokens overlapping with the answer span.
    for i, (start, end) in enumerate(offsets):
        if start <= answer_char_start < end:
            token_start_index = i
        if start < answer_char_end <= end:
            token_end_index = i
            break  # Once the end token is found, exit the loop.
    
    return extracted_answer, token_start_index, token_end_index



# Testing

In [14]:
dummy_model = AutoModelForCausalLM.from_pretrained(config.base_model).to(device)
dummy_ref_model = AutoModelForCausalLM.from_pretrained(config.base_model).to(device)
dummy_tokenizer = AutoTokenizer.from_pretrained(config.base_model)
dummy_dataset = Dataset.load_from_disk(f"../data/my_data/gsm8k/test")
dummy_loader = dataset_loader(dummy_dataset, per_device_batch_size=2, rank=0, world_size=1, seed=0)
dummy_batch = next(dummy_loader)
dummy_questions_text = dummy_batch["question"]
dummy_answers_text = dummy_batch["answer"]
dummy_questions_inputs = dummy_tokenizer(dummy_questions_text, return_tensors="pt", padding=True, padding_side="left")
dummy_answers_inputs = dummy_tokenizer(dummy_answers_text, return_tensors="pt", padding=True)
dummy_questions_inputs = {k: v.to(device) for k, v in dummy_questions_inputs.items()}
dummy_answers_inputs = {k: v.to(device) for k, v in dummy_answers_inputs.items()}
dummy_ref_model.eval()
dummy_model.eval()

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 896)
    (layers): ModuleList(
      (0-23): 24 x Qwen2DecoderLayer(
        (self_attn): Qwen2SdpaAttention(
          (q_proj): Linear(in_features=896, out_features=896, bias=True)
          (k_proj): Linear(in_features=896, out_features=128, bias=True)
          (v_proj): Linear(in_features=896, out_features=128, bias=True)
          (o_proj): Linear(in_features=896, out_features=896, bias=False)
          (rotary_emb): Qwen2RotaryEmbedding()
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=896, out_features=4864, bias=False)
          (up_proj): Linear(in_features=896, out_features=4864, bias=False)
          (down_proj): Linear(in_features=4864, out_features=896, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((

In [19]:
x, generations, metrics, past_key_values = batch_generate_rnn(
    model=dummy_model,
    ref_model=dummy_ref_model,
    questions_inputs=dummy_questions_inputs,
    answers_inputs=dummy_answers_inputs,
    max_steps=30,
    step_for_answer=20,
    temperature=0.0,
    top_k=None,
    as_full_distribution=False,
    dot_by_dot=False,
    dot_by_dot_id=dummy_tokenizer.encode("....")[0],
    inject_answer_prompt=False,
    answer_prompt_ids=dummy_tokenizer.encode("...Answer:"),
)
print(x[0].shape, generations.shape, metrics)
dummy_tokenizer.batch_decode(generations)

torch.Size([2]) torch.Size([2, 31]) {'answer_logps': 13.639263153076172, 'answer_logps_ref': 13.970451354980469, 'answer_logps_diff': -0.3311886787414551, 'answer_perplexity': 0.002386221196502447, 'answer_perplexity_ref': 0.0011142903240397573, 'answer_perplexity_diff': 0.0012719309888780117, 'logps': 23.325300216674805, 'logps_ref': 20.66714096069336, 'logps_diff': 2.6581602096557617, 'entropy': 0.939110279083252, 'entropy_std': 0.0955267995595932}


[" First, we need to calculate the total number of eggs laid by the ducks in a day. Since Janet's ducks lay 16 eggs per day,",
 ' To find the total number of bolts needed, we need to calculate the number of bolts required for blue and white fibers separately and then add them together.\n\n1']

In [18]:
gen_out = dummy_model.generate(
    input_ids=dummy_questions_inputs["input_ids"],
    attention_mask=dummy_questions_inputs["attention_mask"],
    max_new_tokens=30,
    temperature=1.0,
    top_k=None,
    do_sample=False,
    return_dict_in_generate=True,
)
prompt_length = dummy_questions_inputs["input_ids"].shape[1]
generations = gen_out.sequences if isinstance(gen_out, dict) else gen_out
generations = generations[:, prompt_length:]
print(dummy_tokenizer.batch_decode(generations))


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


From v4.47 onwards, when a model cache is to be returned, `generate` will return a `Cache` instance instead by default (as opposed to the legacy tuple of tuples format). If you want to keep returning the legacy format, please set `return_legacy_cache=True`.


[" First, we need to calculate the total number of eggs laid by the ducks in a day. Since Janet's ducks lay 16 eggs per day", ' To find the total number of bolts needed, we need to calculate the number of bolts required for blue and white fibers separately and then add them together.\n\n']


In [20]:
for kv1, kv2 in zip(past_key_values, gen_out.past_key_values):
    for i in range(min(kv1[0].shape[2], kv2[0].shape[2])):
        print(f"{i=}: keys: {kv1[0].shape}, {kv2[0].shape}")
        print(f"{i=}: keys: {kv1[0][0, 0, i, :5].tolist()}")
        print(f"{i=}: keys: {kv2[0][0, 0, i, :5].tolist()}")
        print(f"{i=}: values: {kv1[1].shape}, {kv2[1].shape}")
        print(f"{i=}: values: {kv1[1][0, 0, i, :5].tolist()}")
        print(f"{i=}: values: {kv2[1][0, 0, i, :5].tolist()}")
    break

i=0: keys: torch.Size([2, 2, 116, 64]), torch.Size([2, 2, 115, 64])
i=0: keys: [-8.410331726074219, -3.3071095943450928, -6.438131809234619, 0.6713922619819641, -0.20497480034828186]
i=0: keys: [-8.410331726074219, -3.3071095943450928, -6.438131809234619, 0.6713922619819641, -0.20497480034828186]
i=0: values: torch.Size([2, 2, 116, 64]), torch.Size([2, 2, 115, 64])
i=0: values: [-0.01414407417178154, 0.03389165550470352, -0.02601637691259384, 0.013034755364060402, -0.011828195303678513]
i=0: values: [-0.01414407417178154, 0.03389165550470352, -0.02601637691259384, 0.013034755364060402, -0.011828195303678513]
i=1: keys: torch.Size([2, 2, 116, 64]), torch.Size([2, 2, 115, 64])
i=1: keys: [-9.862237930297852, -7.930609703063965, -7.341488361358643, 2.996467113494873, -0.14621633291244507]
i=1: keys: [-9.862237930297852, -7.930609703063965, -7.341488361358643, 2.996467113494873, -0.14621633291244507]
i=1: values: torch.Size([2, 2, 116, 64]), torch.Size([2, 2, 115, 64])
i=1: values: [0.0100

In [86]:
for k in dummy_model(**dummy_questions_inputs, return_dict=True).keys():
    print(k)

logits
past_key_values
