## Run model and collect predictions on BABILong

In [1]:
import os
os.chdir('..')
os.environ['CUDA_VISIBLE_DEVICES'] = '6'

In [5]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import datasets
from tqdm.auto import tqdm
import pandas as pd
import time
import json

from pathlib import Path

from babilong.prompts import DEFAULT_PROMPTS, DEFAULT_TEMPLATE, get_formatted_input
from babilong.babilong_utils import compare_answers

# from modeling_amt.language_modeling import *
from modeling_amt.experimental import *

from babilong.babilong_utils import MultiTaskDataset, SentenceSampler, NoiseInjectionDataset


In [6]:
class Holder:
    def __init__(self) -> None:
        pass

args = Holder()
args.task_dataset = "qa1_single-supporting-fact"
# args.max_n_facts = 50
args.vary_n_segments = False
args.max_n_segments = 128
args.segment_size = 1024
args.sample_size = 512
args.segment_alignment = 'right'

base_model_path = "/home/jovyan/kuratov/models/Llama-3.2-1B-Instruct/"
dtype = torch.bfloat16
device = 'auto'

In [21]:
tokenizer = AutoTokenizer.from_pretrained(base_model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(base_model_path, trust_remote_code=True,
                                             device_map=device, 
                                             torch_dtype=dtype,
                                             use_cache=False,
                                             attn_implementation='flash_attention_2')
model = model.eval()

In [22]:
from peft import get_peft_model, LoraConfig, TaskType

peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM, 
    inference_mode=False, 
    r=8, 
    lora_alpha=32, 
    lora_dropout=0.1
    )
model = get_peft_model(model, peft_config)


In [23]:
mem_cell_args = dict(
    base_model=model.cpu(),
    num_mem_tokens=16,
)
mem_cell_args['d_mem'] = 64
mem_cell_args['wrap_pos'] = False
mem_cell_args['correction'] = False
mem_cell_args['layers_attr'] = "base_model.base_model.layers"

cell = AssociativeMemoryCell(**mem_cell_args)
model = AssociativeRecurrentWrapper(cell, 
# model = AssociativeRecurrentWrapperDisableLoRALS(cell, 
                                segment_size=args.segment_size,
                                max_n_segments=args.max_n_segments, 
                                vary_n_segments=args.vary_n_segments,
                                segment_alignment='right',
                                k2=-1,
                                return_all_logits=False,
)
model = model.to(torch.bfloat16)  # Ensure all layers use bfloat16


In [24]:
# ls /home/jovyan/rmt/runs/test/babilong_multitask/meta-llama/Llama-3.2-1B-Instruct/lr_3e-04_d64_linear_adamw_wd1e-03_4x1024_mem16_bs64_bptt--1_from_cpt_2-4_lora_ct-v3/run_1

In [25]:
# ls /home/jovyan/rmt/runs/test/babilong_multitask/meta-llama/Llama-3.2-1B-Instruct/lr_3e-04_d64_cosine_adamw_wd1e-03_8x1024_mem16_bs64_bptt--1_from_cpt_4-8_lora_ct-v3/run_1/checkpoint-30000/

In [26]:
# cpt_path = "/home/jovyan/rmt/runs/test/babilong_multitask/meta-llama/Llama-3.2-1B-Instruct/lr_3e-04_d64_linear_adamw_wd1e-03_1x1024_mem16_bs64_bptt--1_from_cpt_0-1_lora/run_1/checkpoint-7500/pytorch_model.bin"
# cpt_path = "/home/jovyan/rmt/runs/test/babilong_multitask/meta-llama/Llama-3.2-1B-Instruct/lr_3e-04_d64_linear_adamw_wd1e-03_2x1024_mem16_bs64_bptt--1_from_cpt_1-2_lora/run_1/checkpoint-8000/pytorch_model.bin"
# cpt_path = "/home/jovyan/rmt/runs/test/babilong_multitask/meta-llama/Llama-3.2-1B-Instruct/lr_3e-04_d64_linear_adamw_wd1e-03_4x1024_mem16_bs64_bptt--1_from_cpt_2-4_lora/run_1/checkpoint-8000/pytorch_model.bin"
# cpt_path = "/home/jovyan/rmt/runs/test/babilong_multitask/meta-llama/Llama-3.2-1B-Instruct/lr_3e-04_d64_linear_adamw_wd1e-03_4x1024_mem16_bs64_bptt--1_from_cpt_2-4_lora_ct/run_1/checkpoint-5000/pytorch_model.bin"
# cpt_path = "/home/jovyan/rmt/runs/test/babilong_multitask/meta-llama/Llama-3.2-1B-Instruct/lr_3e-04_d64_linear_adamw_wd1e-03_4x1024_mem16_bs64_bptt--1_from_cpt_2-4_lora_ct-v2/run_1/checkpoint-2500/pytorch_model.bin"
# cpt_path = "/home/jovyan/rmt/runs/test/babilong_multitask/meta-llama/Llama-3.2-1B-Instruct/lr_3e-04_d64_linear_adamw_wd1e-03_2x1024_mem16_bs64_bptt--1_from_cpt_1-2_lora_ct-v3/run_1/checkpoint-15500/pytorch_model.bin"

checkpoints = [
    # "/home/jovyan/rmt/runs/test/babilong_multitask/meta-llama/Llama-3.2-1B-Instruct/lr_3e-04_d64_cosine_adamw_wd1e-03_1x1024_mem16_bs64_bptt--1_from_cpt_0-1_lora_ct-v3/run_1/checkpoint-6500/pytorch_model.bin",
    # "/home/jovyan/rmt/runs/test/babilong_multitask/meta-llama/Llama-3.2-1B-Instruct/lr_3e-04_d64_cosine_adamw_wd1e-03_2x1024_mem16_bs64_bptt--1_from_cpt_1-2_lora_ct-v3/run_1/checkpoint-28000/pytorch_model.bin",
    # "/home/jovyan/rmt/runs/test/babilong_multitask/meta-llama/Llama-3.2-1B-Instruct/lr_3e-04_d64_cosine_adamw_wd1e-03_4x1024_mem16_bs64_bptt--1_from_cpt_2-4_lora_ct-v3/run_1/checkpoint-24500/pytorch_model.bin",
    # "/home/jovyan/rmt/runs/test/babilong_multitask/meta-llama/Llama-3.2-1B-Instruct/lr_3e-04_d64_cosine_adamw_wd1e-03_8x1024_mem16_bs64_bptt--1_from_cpt_4-8_lora_ct-v3/run_1/checkpoint-30000/pytorch_model.bin",
    # "/home/jovyan/rmt/runs/test/babilong_multitask/meta-llama/Llama-3.2-1B-Instruct/lr_3e-04_d64_cosine_adamw_wd1e-03_16x1024_mem16_bs64_bptt--1_from_cpt_8-16_lora_ct-v3/run_1/checkpoint-6000/pytorch_model.bin",
    "/home/jovyan/rmt/runs/test/babilong_multitask/meta-llama/Llama-3.2-1B-Instruct/lr_3e-04_d64_cosine_adamw_wd1e-03_2x1024_mem16_bs64_bptt--1_from_cpt_0-2_lora_ct-v3-rlls/run_1/checkpoint-29000/pytorch_model.bin" \
]

# eval_model_template = 'armt-llama3.2-1b-{}-ct-v3-retrain-align_right-canonic'
eval_model_template = 'test/armt-llama3.2-1b-{}-ct-v3-rlls'

dataset_name = "RMT-team/babilong"
results_folder = "/home/jovyan/rmt/babilong/babilong_evals/"

In [27]:
generate_kwargs = {
    'max_new_tokens': 30,
    'max_length': None,
    'num_beams': 1,
    'do_sample': False,
    'temperature': None,
    'top_p': None,
    'top_k': None,
    'pad_token_id': tokenizer.pad_token_id
}

if generate_kwargs['pad_token_id'] is None:
    generate_kwargs['pad_token_id'] = tokenizer.eos_token_id

# print(f'prompt template:\n{DEFAULT_TEMPLATE}')

In [28]:
# n_segments = 4


# sample_size = 1024 * n_segments

# noise_dataset = datasets.load_dataset('pg19')
# noise_dataset_test = noise_dataset['test']

# task_datasets = args.task_dataset.split(';')
# babi_path = '/home/jovyan/rmt/babilong/data/tasks_1-20_v1-2/en-10k'
# test_paths = [os.path.join(babi_path, f"{td}_test.txt") for td in task_datasets]

# args.max_n_facts = 50
# task_dataset_test = MultiTaskDataset(test_paths, max_n_facts=args.max_n_facts)

# # background text
# qa_margin = 70          # leave space for questions and answers
# test_sample_size = sample_size - qa_margin
# noise_sampler_test = SentenceSampler(noise_dataset_test, tokenizer=tokenizer, max_sentence_len=None, shuffle=True, random_seed=42)

# test_dataset = NoiseInjectionDataset(task_dataset=task_dataset_test,
#                                         noise_sampler=noise_sampler_test,
#                                         tokenizer=tokenizer,
#                                         sample_size=test_sample_size,
#                                         # mixed_length_ratio=args.mixed_length_ratio,
#                                         # task_start_pct=args.task_start_pct,
#                                         # task_end_pct=args.task_end_pct
#                                         )

In [29]:
from torch.nn.utils.rnn import pad_sequence
id_pad_value = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id
gen_token = tokenizer.encode('GEN')[0]
eos_token = tokenizer.eos_token_id

def get_input_ids(sample):
    template = "{} {}Answer with a single word."
    context = tokenizer.decode(sample['input_tokens'])
    messages = [
        {"role": "user", "content": template.format(context, sample['question'])},
        {"role": "assistant", "content": sample['answer']}
    ]
    input_ids = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=False
    )
    input_ids_short = tokenizer.apply_chat_template(
        [{"role": "user", "content": template.format(context, sample['question'])}],
        tokenize=True,
        add_generation_prompt=False
    )
    labels_mask = torch.zeros(len(input_ids))
    labels_mask[len(input_ids_short) + 1:] = True
    return input_ids, labels_mask

def collate_fn(batch):
    inputs = [get_input_ids(sample) for sample in batch]
    input_ids = [torch.tensor(i[0]) for i in inputs]
    labels_mask = [torch.tensor(i[1]) for i in inputs]
    attention_mask = [torch.ones_like(b, dtype=bool) for b in input_ids]

    input_ids = pad_sequence(input_ids, padding_value=id_pad_value, batch_first=True)
    attention_mask = pad_sequence(attention_mask, padding_value=0, batch_first=True)
    labels_mask = pad_sequence(labels_mask, padding_value=0, batch_first=True)

    collated = {}
    collated['input_ids'] = collated['labels'] = input_ids
    collated['labels_mask'] = labels_mask.bool()
    collated['attention_mask'] = attention_mask.bool()
    
    return collated

In [30]:
tasks = "qa1_single-supporting-fact;qa2_two-supporting-facts;qa3_three-supporting-facts;qa4_two-arg-relations;qa5_three-arg-relations".split(';')
split_names = ['0k', '1k', '2k', '4k', '8k', '16k']#, '32k']

use_chat_template = True
use_instruction = False
use_post_prompt = False
use_examples = False

for cpt_path in checkpoints[::-1]:
    eval_model_name = eval_model_template.format(cpt_path.split('adamw_wd1e-03_')[1].split('_')[0])
    print('Evaluating ', eval_model_name)

    with open(cpt_path, 'rb') as cpt:
        weights = torch.load(cpt)

    model.load_state_dict(weights)
    model.cuda()


    device = 'cuda:0'

    for task in tqdm(tasks, desc='tasks'):
        # configure the prompt
        prompt_cfg = {
            'instruction': DEFAULT_PROMPTS[task]['instruction'] if use_instruction else '',
            'examples': DEFAULT_PROMPTS[task]['examples'] if use_examples else '',
            'post_prompt': DEFAULT_PROMPTS[task]['post_prompt'] if use_post_prompt else '',
            'template': DEFAULT_TEMPLATE,
            'chat_template': use_chat_template,
        }
        prompt_name = [f'{k}_yes' if prompt_cfg[k] else f'{k}_no' for k in prompt_cfg if k != 'template']
        prompt_name = '_'.join(prompt_name)

        for split_name in tqdm(split_names, desc='lengths'):
            n_segments = int(split_name[:-1])
            n_segments = max({n_segments, 1})
            # load dataset

            sample_size = 1024 * n_segments

            noise_dataset = datasets.load_dataset('pg19')
            noise_dataset_test = noise_dataset['test']

            task_datasets = args.task_dataset.split(';')
            babi_path = '/home/jovyan/rmt/babilong/data/tasks_1-20_v1-2/en-10k'
            test_paths = [os.path.join(babi_path, f"{td}_test.txt") for td in task_datasets]

            args.max_n_facts = 50 * n_segments
            task_dataset_test = MultiTaskDataset(test_paths, max_n_facts=args.max_n_facts)

            # background text
            qa_margin = 70          # leave space for questions and answers
            test_sample_size = sample_size - qa_margin
            noise_sampler_test = SentenceSampler(noise_dataset_test, tokenizer=tokenizer, max_sentence_len=None, shuffle=True, random_seed=42)

            test_dataset = NoiseInjectionDataset(task_dataset=task_dataset_test,
                                                    noise_sampler=noise_sampler_test,
                                                    tokenizer=tokenizer,
                                                    sample_size=test_sample_size,
                                                    # mixed_length_ratio=args.mixed_length_ratio,
                                                    # task_start_pct=args.task_start_pct,
                                                    # task_end_pct=args.task_end_pct
                                                    )

            # Prepare files with predictions, prompt, and generation configurations
            outfile = Path(f'{results_folder}/{eval_model_name}/{task}_{split_name}_{prompt_name}.csv')
            outfile.parent.mkdir(parents=True, exist_ok=True)
            cfg_file = f'{results_folder}/{eval_model_name}/{task}_{split_name}_{prompt_name}.json'
            json.dump({'prompt': prompt_cfg, 'generate_kwargs': generate_kwargs}, open(cfg_file, 'w'), indent=4)

            df = pd.DataFrame({'target': [], 'output': [], 'question': []})

            for sample in tqdm(test_dataset, desc=f'task: {task} length: {split_name}'):
                # target = sample['target']
                # context = sample['input']
                # question = sample['question']

                # # format input text
                # input_text = get_formatted_input(context, question, prompt_cfg['examples'],
                #                                     prompt_cfg['instruction'], prompt_cfg['post_prompt'],
                #                                     template=prompt_cfg['template'])

                # if use_chat_template:
                #     input_text = [{'role': 'user', 'content': input_text}]
                #     model_inputs = tokenizer.apply_chat_template(input_text, add_generation_prompt=True,
                #                                                     return_tensors='pt').to(device)
                #     model_inputs = {'input_ids': model_inputs}
                # else:
                #     model_inputs = tokenizer(input_text, return_tensors='pt',
                #                                 add_special_tokens=True).to(device)
                
                model_inputs = collate_fn([sample])
                for k in model_inputs:
                    model_inputs[k] = model_inputs[k].to(device)

                sample_length = model_inputs['input_ids'].shape[1]
                # with torch.cuda.amp.autocast():
                with torch.no_grad():
                    attn_mask = torch.ones_like(model_inputs['input_ids'].bool().to(device))
                    # output = model.generate(**model_inputs, **generate_kwargs)
                    output = model(**model_inputs)
                    # we need to reset memory states between samples for activation-beacon models
                    # if 'activation-beacon' in model.name_or_path and hasattr(model, 'memory'):
                    #     model.memory.reset()
                    # 1/0

                # output = output[0]#[sample_length:]
                # output = tokenizer.decode(output, skip_special_tokens=True).strip()
                output = tokenizer.decode(output.logits[0].argmax(dim=-1)[model_inputs['labels_mask'][0]][2:-2])
                # 1/0

                df.loc[len(df)] = [sample['answer'], output, sample['question']]

                if df.shape[0] >= 7:
                    1/0
                    break
            # write results to csv file
            # df.to_csv(outfile)

Evaluating  test/armt-llama3.2-1b-2x1024-ct-v3-rlls


tasks:   0%|          | 0/5 [00:00<?, ?it/s]

lengths:   0%|          | 0/6 [00:00<?, ?it/s]

Using the latest cached version of the module from /home/jovyan/.cache/huggingface/modules/datasets_modules/datasets/pg19/fb74320038a3c19e3cc87375222fc75ed3c8dc5a739b3e8dc835736388a7a882 (last modified on Wed Jan 17 14:07:48 2024) since it couldn't be found locally at pg19, or remotely on the Hugging Face Hub.


task: qa1_single-supporting-fact length: 0k:   0%|          | 0/999 [00:00<?, ?it/s]

  labels_mask = [torch.tensor(i[1]) for i in inputs]
  labels_mask = [torch.tensor(i[1]) for i in inputs]
  labels_mask = [torch.tensor(i[1]) for i in inputs]
  labels_mask = [torch.tensor(i[1]) for i in inputs]
  labels_mask = [torch.tensor(i[1]) for i in inputs]
  labels_mask = [torch.tensor(i[1]) for i in inputs]
  labels_mask = [torch.tensor(i[1]) for i in inputs]


ZeroDivisionError: division by zero

In [31]:
df

Unnamed: 0,target,output,question
0,hallway,hallway,Where is John?
1,bathroom,bathroom,Where is Mary?
2,kitchen,kitchen,Where is Sandra?
3,hallway,hallway,Where is Sandra?
4,kitchen,kitchen,Where is Sandra?
5,hallway,hallway,Where is Sandra?
6,garden,garden,Where is Sandra?


In [None]:
df

Unnamed: 0,target,output,question
0,hallway,Theway,Where is John?
1,bathroom,"""ottle",Where is Mary?
2,kitchen,homeitchen,Where is Sandra?
3,hallway,hallway,Where is Sandra?
4,kitchen,Thereitchen,Where is Sandra?
5,hallway,Hallway,Where is Sandra?
6,garden,Missingrief,Where is Sandra?


In [60]:
model_inputs

{'input_ids': tensor([[128000, 128006,   9125,  ...,  43341,   3195, 128009]],
        device='cuda:0'),
 'labels': tensor([[128000, 128006,   9125,  ...,  43341,   3195, 128009]],
        device='cuda:0'),
 'labels_mask': tensor([[False, False, False,  ...,  True,  True,  True]], device='cuda:0'),
 'attention_mask': tensor([[True, True, True,  ..., True, True, True]], device='cuda:0')}

'hallway<|eot_id|><|eot_id|>'

In [38]:
model(**model_inputs)

RuntimeError: expected mat1 and mat2 to have the same dtype, but got: float != c10::BFloat16

### Interpret

In [158]:
use_chat_template = True
use_instruction = False
use_post_prompt = False
use_examples = False

cpt_path = checkpoints[-1]
task = 'qa1'

split_name = '4k'

model.rmt_config['segment_alignment'] = 'right'
eval_model_name = eval_model_template.format(cpt_path.split('adamw_wd1e-03_')[1].split('_')[0])
print('Evaluating ', eval_model_name)

with open(cpt_path, 'rb') as cpt:
    weights = torch.load(cpt)

model.load_state_dict(weights)
model.cuda()
model = model.eval()

device = 'cuda:0'
n_seg = int(split_name[:-1])
# load dataset

sample_size = 1024 * n_segments

noise_dataset = datasets.load_dataset('pg19')
noise_dataset_test = noise_dataset['test']

task_datasets = args.task_dataset.split(';')
babi_path = '/home/jovyan/rmt/babilong/data/tasks_1-20_v1-2/en-10k'
test_paths = [os.path.join(babi_path, f"{td}_test.txt") for td in task_datasets]

args.max_n_facts = 50
task_dataset_test = MultiTaskDataset(test_paths, max_n_facts=args.max_n_facts)

# background text
qa_margin = 70          # leave space for questions and answers
test_sample_size = sample_size - qa_margin
noise_sampler_test = SentenceSampler(noise_dataset_test, tokenizer=tokenizer, max_sentence_len=None, shuffle=True, random_seed=42)

test_dataset = NoiseInjectionDataset(task_dataset=task_dataset_test,
                                        noise_sampler=noise_sampler_test,
                                        tokenizer=tokenizer,
                                        sample_size=test_sample_size,
                                        # mixed_length_ratio=args.mixed_length_ratio,
                                        # task_start_pct=args.task_start_pct,
                                        # task_end_pct=args.task_end_pct
                                        )

# Prepare files with predictions, prompt, and generation configurations
outfile = Path(f'{results_folder}/{eval_model_name}/{task}_{split_name}_{prompt_name}.csv')
outfile.parent.mkdir(parents=True, exist_ok=True)
cfg_file = f'{results_folder}/{eval_model_name}/{task}_{split_name}_{prompt_name}.json'
json.dump({'prompt': prompt_cfg, 'generate_kwargs': generate_kwargs}, open(cfg_file, 'w'), indent=4)

df = pd.DataFrame({'target': [], 'output': [], 'question': []})

inputs = []
for sample in tqdm(test_dataset, desc=f'task: {task} length: {split_name}'):
        
        model_inputs = collate_fn([sample])
        for k in model_inputs:
            model_inputs[k] = model_inputs[k].to(device)

        sample_length = model_inputs['input_ids'].shape[1]
        with torch.autocast(device_type='cuda', dtype=torch.bfloat16):
            with torch.no_grad():
                attn_mask = torch.ones_like(model_inputs['input_ids'].bool().to(device))
                # output = model.generate(**model_inputs, **generate_kwargs)
                output = model(**model_inputs)
                # we need to reset memory states between samples for activation-beacon models
                # if 'activation-beacon' in model.name_or_path and hasattr(model, 'memory'):
                #     model.memory.reset()
                # 1/0

        # output = output[0]#[sample_length:]
        # output = tokenizer.decode(output, skip_special_tokens=True).strip()
        output = tokenizer.decode(output.logits[0].argmax(dim=-1)[model_inputs['labels_mask'][0]][2:-2])
        # 1/0

        df.loc[len(df)] = [sample['answer'], output, sample['question']]

        if df.shape[0] >= 20:
             break

Evaluating  rmt-llama3.2-1b-8x1024-ct-v3-retrain-align_right-canonic


Using the latest cached version of the module from /home/jovyan/.cache/huggingface/modules/datasets_modules/datasets/pg19/fb74320038a3c19e3cc87375222fc75ed3c8dc5a739b3e8dc835736388a7a882 (last modified on Wed Jan 17 14:07:48 2024) since it couldn't be found locally at pg19, or remotely on the Hugging Face Hub.


task: qa1 length: 4k:   0%|          | 0/999 [00:00<?, ?it/s]

  labels_mask = [torch.tensor(i[1]) for i in inputs]
  labels_mask = [torch.tensor(i[1]) for i in inputs]
  labels_mask = [torch.tensor(i[1]) for i in inputs]
  labels_mask = [torch.tensor(i[1]) for i in inputs]
  labels_mask = [torch.tensor(i[1]) for i in inputs]
  labels_mask = [torch.tensor(i[1]) for i in inputs]
  labels_mask = [torch.tensor(i[1]) for i in inputs]
  labels_mask = [torch.tensor(i[1]) for i in inputs]
  labels_mask = [torch.tensor(i[1]) for i in inputs]
  labels_mask = [torch.tensor(i[1]) for i in inputs]
  labels_mask = [torch.tensor(i[1]) for i in inputs]
  labels_mask = [torch.tensor(i[1]) for i in inputs]
  labels_mask = [torch.tensor(i[1]) for i in inputs]
  labels_mask = [torch.tensor(i[1]) for i in inputs]
  labels_mask = [torch.tensor(i[1]) for i in inputs]
  labels_mask = [torch.tensor(i[1]) for i in inputs]
  labels_mask = [torch.tensor(i[1]) for i in inputs]
  labels_mask = [torch.tensor(i[1]) for i in inputs]
  labels_mask = [torch.tensor(i[1]) for i in i

In [159]:
df

Unnamed: 0,target,output,question
0,hallway,hallway,Where is John?
1,bathroom,bathroom,Where is Mary?
2,kitchen,kitchen,Where is Sandra?
3,hallway,hallway,Where is Sandra?
4,kitchen,kitchen,Where is Sandra?
5,hallway,hallway,Where is Sandra?
6,garden,garden,Where is Sandra?
7,hallway,hallway,Where is Daniel?
8,office,office,Where is Sandra?
9,office,office,Where is Daniel?


In [88]:
model = model.eval()

In [95]:
model_inputs

{'input_ids': tensor([[128000, 128006,   9125,  ...,     70,   8506, 128009]],
        device='cuda:0'),
 'labels': tensor([[128000, 128006,   9125,  ...,     70,   8506, 128009]],
        device='cuda:0'),
 'labels_mask': tensor([[False, False, False,  ...,  True,  True,  True]], device='cuda:0'),
 'attention_mask': tensor([[True, True, True,  ..., True, True, True]], device='cuda:0')}

In [157]:
model = model.eval()

sample = test_dataset[6]
model_inputs = collate_fn([sample])
for k in model_inputs:
    model_inputs[k] = model_inputs[k].to(device)

sample_length = model_inputs['input_ids'].shape[1]
with torch.autocast(device_type='cuda', dtype=torch.bfloat16):
    with torch.no_grad():
        attn_mask = torch.ones_like(model_inputs['input_ids'].bool().to(device))
        output = model(**model_inputs)
tokenizer.decode(output.logits[0].argmax(dim=-1))[-1000:]

  labels_mask = [torch.tensor(i[1]) for i in inputs]


"I to The least beginning to to the, first of of the first totoridor of the theb of orAND, or none, or\nb,for the of of toto of, the first for of the\n The a single of of onin are out first to toing to to to the the to totoed of of small of to the first of I of to to the firstkon of or the the of,, theTC\n\n of The game of thethe firstated of not much to to most of that of and the firstkusion of of be a\n\n to and to to to\nthepeaknder the The game of been established from the beginning of of by the,\ninff of the same to of to the of been to the early of\nthe a in to to the first of The's a the a first ofof not by The, to the office. B the start of of be from be into table into the firstbOLLOWally of and single of been made by The of, the nothing ownin of of and of the to of to to ofep, and totoured by and the thousand the, of and\n of of to thethe of not by The, a made the, a small of thethe own k tohallhall. K is the's The: a simple word\n\n garden\n\n\n\n<|end_header_id|>\n\ngarden<

In [139]:
sample = test_dataset[6]
model_inputs = collate_fn([sample])

model = model.eval()
with torch.no_grad():
    with torch.autocast(device_type='cuda', dtype=torch.bfloat16):
        attn_mask = torch.ones_like(model_inputs['input_ids'].bool().to(device))
        output = model(**model_inputs)
tokenizer.decode(output.logits[0].argmax(dim=-1))[-1000:]


  labels_mask = [torch.tensor(i[1]) for i in inputs]


RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument index in method wrapper_CUDA__index_select)

In [83]:

# output = output[0]#[sample_length:]
# output = tokenizer.decode(output, skip_special_tokens=True).strip()
out_text = tokenizer.decode(output.logits[0].argmax(dim=-1)[model_inputs['labels_mask'][0]][2:-2])
# 1/0

In [78]:
 tokenizer.decode(output.logits[0].argmax(dim=-1)[model_inputs['labels_mask'][0]][2:-2])

'garden'

In [103]:
tokenizer.decode(output.logits[0].argmax(dim=-1))

"Tags\n\n\n\n\n\nIting back Space\n\n  2022\nI's: December0 of 2023\nLet<|start_header_id|>system<|start_header_id|>\n\nLet you want not togetherofgether of the ideas thingspect of of are the the same of the ofor of of are be a good of of the thousand of of<|eot_id|> it the may make be the bestpective of of I are no a toto the is make a of<|eot_id|> if is not to to in the own of or ofof the of to of<|eot_id|> own is a in and the a own out onon of on of the way to to well to in the of to<|eot_id|> best was of not and of be from idea of the own arein the on on<|eot_id|> are are of not of all to to and I beif the in to the a to<|eot_id|> one can to think the of are beenin made by<|eot_id|> of in not of same to to of the mostpective of ofto the case of<|eot_id|> is is is be be the in beede the the most beif to to the first to<|eot_id|> is be the out the the or to to<|eot_id|> you are a in with in the the of or are have a little toto to in of<|eot_id|> kind of be the and is in is be to be i

In [247]:
with torch.cuda.amp.autocast():
    with torch.no_grad():
        attn_mask = torch.ones_like(model_inputs['input_ids'].bool().to(device))
        output = model.generate(**model_inputs, **generate_kwargs, attention_mask=attn_mask)
        # we need to reset memory states between samples for activation-beacon models
        # if 'activation-beacon' in model.name_or_path and hasattr(model, 'memory'):
        #     model.memory.reset()

output = output[0]#[sample_length:]
tokenizer.decode(output, skip_special_tokens=False)

'!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!'

In [250]:
target

'kitchen'

In [248]:
with torch.cuda.amp.autocast():
    with torch.no_grad():
        attn_mask = torch.ones_like(model_inputs['input_ids'].bool().to(device))
        output = model(**model_inputs)#, attention_mask=attn_mask)
        # output = model.generate(**model_inputs, **generate_kwargs, attention_mask=attn_mask)5            output = model(**model_inputs, attention_mask=attn_mask)

In [249]:
tokenizer.decode(output.logits.argmax(dim=-1)[0][:])

"Tags\n\n\n\n\n\nIting back Space\n\n  2022\nI's: December0 of 2023\nLet<|start_header_id|>system<|start_header_id|>\n\nLet the timeI best of the two, the two of the the, theest, for twokustering of the most of of the two of of of the the of the beginningthe of of of of the world of to of to of of or of most ofof the, variety of of the two world of of the only the ofof word of the case of<|eot_id|>'s to the lab<|eot_id|><|eot_id|> and, you same of the is is and of thetheasesuriousbers of of is the the, or is same of of ofof the thousand of of of of from of the of the many of the,the of<|eot_id|> room of the which of the, not in in thethe of of the is a that to of of the first way ofof of the of of one of the one of to be a thousand tothere of1,<|eot_id|> if of the two to of the to ofof to are to the and the to of the the, the the first are and firstthe number to of the to to to make the on ininside the the first of of of to of to to, to are no to toto of1,<|eot_id|> is to the other<|eo

In [251]:
tokenizer.decode(output.logits.argmax(dim=-1)[0][-1024*3 -1:])

",Tags k havek this such kbedimateskkbedkkkkkbedkbedkkkbedilitykbed kbedbedkbedbed bedbedbedbedbedbedbed Kbedbedbedbedkkkbedbed arebedbedbedbed kbedbedbedk k k of k kbed k and to to the city, \n\nbedODbed K k k kkbededbedbed kbedbedkbedbed the bedbedbed and I arebedbed kbed of K bedk k and onbedbedbed thebeded ofbed K Kbedbed k thek k k kkbedbedbedbedbedbedbed k K kbed a k bed ofbedbed kbedbed k bed world ofbed k bedk ofbed the bedbed ofbedbedbedbed kbed the bedbedbedbedbed is a k of be bebedbed k of bedkbed of the Kbedbed the k of thebedbed k isbed ak of k I is notbed k k are thebed k ofbedbedbed k the k to the we bed k ofbedbed kbedbed of k on k K not a about the kbedkbed I thek K I bed k arebed k with to the bed of k the kk k of I bed k ofbed in of sidebed of andbed, thebedbed kbedbed and I arekplacementedbed arebed sincebed kbed kbed the of kbedbedbed of k K kbed k of the bed kkARD K k and k thebed arebedbed k ink K k of the kbedkk the k, bed of by k ofkbed bed ofbed k k of to k k 

In [239]:
output.loss

0

In [237]:
tokenizer.decode(output.logits.argmax(dim=-1)[0][-2049:])

"bedimportk k kk kbedbed bed bedbed bed the bedbedbedbed bybedbedbedbedbedbedbedbedbedbedbedbednowledbedbedbedbed ofbedbedbedbedbedbed of ofbedbedbedbedbed ed edk kbedkbedbed though bed bedbedbed bedbedbed kbedbedbedbedkbed bed of thebededbedbed k intobed k ofbed bedbedbedbed ofbed bedbedarden bed of thebedbedbedbed a k of kbedbed we bedbed bedbedbed bedbedbedbedbed bed kbedbedbedbedbedbedbedbedbedbedbedbedbedbedbedbedbedbedbed ofbedbedbedbedbed k bedbedbedbedbed kbedbedbedbed bybed kbedbedbed bedbed k ofbed bedbedbedbedkbed least kbed bedk k kbed bedbedbedbedbed kbedbed ofbedbed kbedbedbedbedbedbedbedbed and \n\n much bed k of arebedbedbedbedbedbedbedbedbed ofbedbed thebedbedbedbed bed k frombedbedbed bedbed of k k isbedbedbed edbedbed bedbedbedbedbeded of bedbedbed ofdbedbedbedbedbedbedbedbed bedbed bedbedbedbedbedbedbedbedbedbed to bedbedbed ofbedbedbedbedbedbedbedbedbedbedbedkbed the k kbedbed ofbed bed k frombedbed I kbed bedbed bedbedbedbed kbed k bed to tokbed ofbedbedbedbedbed 

In [238]:
tokenizer.decode(model_inputs['input_ids'][0][-2049:])

'\nwhich remind us of days far earlier. Edward the Fourth and Richard the\nThird were chosen Kings, or at least had their claims to the Crown\nacknowledged, by gatherings of the citizens of London which remind us\nof the wars of Stephen and Matilda(59). Still even in this age, the\npower of Parliament was advancing(60); the anxiety of every pretender\nto get a parliamentary sanction for his claims was a sign of the\ngrowing importance of Parliament, and we get incidental notices which\nshow that a seat in the House of Commons, and that not as a knight of a\nshire, but as a burgess of a borough, was now an object of ambition for\nmen of the class from which knights of the shire were chosen, and even\nfor the sons of members of the Upper House(61). At last came the sixteenth century, the time of trial for parliamentary\ninstitutions in so many countries of Europe. Daniel travelled to the kitchen. Not a few assemblies which\nhad once been as free as our own Parliament were, during that ag

In [179]:
context

'If all this effort and\nexpenditure had resulted in success, it would be possible to keep\nsilent and shrug one\'s shoulders; but when the mode of undertaking\nthis expedition can be clearly shown to have been the direct cause of\nits failure, silence would be a crime. When Lord Wolseley told the\nsoldiers at Korti on their return from Metemmah, "It was not _your_\nfault that Gordon has perished and Khartoum fallen," the positiveness\nof his assurance may have been derived from the inner conviction of\nhis own stupendous error. The expedition was finally sanctioned in August, and the news of its\ncoming was known to General Gordon in September, before, indeed, his\nown despatches of 31st July were received in London, and broke the\nsuspense of nearly half a year. He thought that only a small force was\ncoming, under the command of Major-General Earle, and he at once, as\nalready described, sent his steamers back to Shendy, there to await\nthe troops and convey them to Khartoum. He see

In [51]:
df

Unnamed: 0,target,output,question,input,n_segments,segments
0,bathroom,bathroom,Where is Mary?,<|begin_of_text|><|start_header_id|>system<|en...,4,<|begin_of_text|><|start_header_id|>system<|en...
1,kitchen,kitchen,Where is Sandra?,<|begin_of_text|><|start_header_id|>system<|en...,3,<|begin_of_text|><|start_header_id|>system<|en...
2,kitchen,kitchen,Where is Mary?,<|begin_of_text|><|start_header_id|>system<|en...,4,<|begin_of_text|><|start_header_id|>system<|en...
3,kitchen,!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!,Where is John?,<|begin_of_text|><|start_header_id|>system<|en...,4,<|begin_of_text|><|start_header_id|>system<|en...
4,bedroom,bedroom,Where is Sandra?,<|begin_of_text|><|start_header_id|>system<|en...,4,<|begin_of_text|><|start_header_id|>system<|en...
5,office,!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!,Where is John?,<|begin_of_text|><|start_header_id|>system<|en...,4,<|begin_of_text|><|start_header_id|>system<|en...
6,garden,garden,Where is Mary?,<|begin_of_text|><|start_header_id|>system<|en...,4,<|begin_of_text|><|start_header_id|>system<|en...
7,bathroom,bathroom,Where is Sandra?,<|begin_of_text|><|start_header_id|>system<|en...,4,<|begin_of_text|><|start_header_id|>system<|en...
8,kitchen,kitchen,Where is Mary?,<|begin_of_text|><|start_header_id|>system<|en...,4,<|begin_of_text|><|start_header_id|>system<|en...
9,bedroom,bedroom,Where is John?,<|begin_of_text|><|start_header_id|>system<|en...,4,<|begin_of_text|><|start_header_id|>system<|en...


In [163]:
for i, row in df.iterrows():
    print(i, row.question, row.output, row.segments)
    print('\n\n\n\n')

0 Where is Mary?  bathroom <|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

<|eot_id|><|start_header_id|>user<|end_header_id|>

And didn't you ask for
us all to go?" "Certainly not--we're not sick," said Pauline, laughing. "Miranda says what Hilary needs is a good herb tonic!" "What is Uncle Paul going to do then?" "Send some money every month--to have good times with at home." "And _you_ don't call that _nice_! Well of all the ungratefullest
girls! Is it for us _all_ to have good times with? Patience fairly jumped up and down with excitement. "When will they
begin, and what will they be like? O Paul, just think of the good
times we've had _without_ any money 't all! They had reached the strawberry-bed and Patience dropped down in the
grass beside it, her hands clasped around her knees. "Good times in
Winton will be a lot better than good times anywhere else. Winton's
such a nice sociable place." Pauline settled

In [31]:
segments = model.segment(input_ids=model_inputs['input_ids'])

In [32]:
[s['input_ids'].shape for s in segments]

[torch.Size([1, 594]),
 torch.Size([1, 1024]),
 torch.Size([1, 1024]),
 torch.Size([1, 1024])]