In [1]:
from modelzipper.tutils import *
from pprint import pprint
import re
import datasets
import numpy as np
from modelzipper.tutils import *
import random
import itertools

  from .autonotebook import tqdm as notebook_tqdm


[4m[36mModelZipper is ready for launch🚀 | Current Version🦄 >>> 0.2.7 <<< | AOE Time🕒 2024-06-28 00:40:49[0m


In [None]:
# Read the fan-out QA fragments file (the file name suggests the dev split — TODO confirm)
# and print how many samples were loaded.
fanoutQA_data = auto_read_data("/data/zecheng/Retrieval_Head/fanout-final-dev-fragments.json")
print(len(fanoutQA_data))

In [None]:
def extract_best_evidence(raw_reference):
    """Return the text of the first ``<content>...</content>`` span in ``raw_reference``.

    Args:
        raw_reference: String that may contain one or more ``<content>`` blocks.

    Returns:
        The inner text of the first block (it may span several lines), or an
        empty string when the input holds no ``<content>`` block at all.
    """
    # DOTALL lets "." match newlines so multi-line fragments are captured whole.
    match = re.search(r'<content>(.*?)</content>', raw_reference, re.DOTALL)
    # Previously a missing tag crashed with AttributeError on ``None.group``.
    return match.group(1) if match else ""


def extract_all_evidence_iter(raw_reference):
    """Collect the inner text of every ``<content>...</content>`` span, in order of appearance.

    Returns an empty list when ``raw_reference`` contains no such span.
    """
    content_re = re.compile(r'<content>(.*?)</content>', re.DOTALL)
    # With exactly one capture group, findall yields the group(1) strings directly.
    return content_re.findall(raw_reference)


def return_multi_hop_evidence_groups(root):
    """Bundle a sample's top-level QA fields with its depth-2 evidence groups.

    Returns a dict with two keys:
      * ``"top_qa"``  - the ``answer``, ``categories`` and ``question`` of ``root``;
      * ``"two_hop"`` - the result of ``find_evidence_groups(root, max_depth=2)``.
    """
    top_qa = {field: root[field] for field in ("answer", "categories", "question")}
    # Other depths are available through find_evidence_groups(root, max_depth=...).
    return {"top_qa": top_qa, "two_hop": find_evidence_groups(root, max_depth=2)}
    

def find_evidence_groups(root, max_depth=2):
    """Collect one evidence record per node found at depth ``max_depth`` (root is depth 1).

    Each record holds the node's ``question`` and ``answer``, the first
    ``<content>`` span of its first fragment string (``evidence``), the depth,
    and every ``<content>`` span of every fragment string (``all_evidences``).
    Nodes whose ``decomposition`` ends before ``max_depth`` contribute nothing.
    """

    def collect_fragments(node):
        """Return the node's own fragment string, or else those of its descendants."""
        own_fragments = node.get("sorted_fragments")
        if own_fragments:
            return [own_fragments]
        gathered = []
        for sub_node in node.get("decomposition", []):
            gathered += collect_fragments(sub_node)
        return gathered

    def walk(node, depth):
        # Above the target depth: just descend and merge what the children produce.
        if depth != max_depth:
            merged = []
            for sub_node in node.get("decomposition", []):
                merged += walk(sub_node, depth + 1)
            return merged

        raw_fragments = collect_fragments(node)
        first_spans = [extract_best_evidence(raw) for raw in raw_fragments]
        every_span = [extract_all_evidence_iter(raw) for raw in raw_fragments]
        record = {
            "question": node["question"],
            "answer": node["answer"],
            "evidence": first_spans[0],
            "depth": depth,
            "all_evidences": every_span,
        }
        return [record]

    return list(walk(root, 1))

# Convert every raw sample into its top-level QA plus two-hop evidence groups.
testing_data = []
for sample in tqdm(fanoutQA_data, total=len(fanoutQA_data)):
    testing_data.append(return_multi_hop_evidence_groups(sample))

print(f"processing finish, total cases {len(testing_data)}")
# Sanity check: first <content> span of the first fragment of the first sub-question.
print(testing_data[0]['two_hop'][0]['all_evidences'][0][0])

### ============ testing case ============ ###
# CASE_ID = 3
# res = return_multi_hop_evidence_groups(fanoutQA_data[CASE_ID])
# print(len(res))
# pprint(res)

In [None]:
def create_answer(sample):
    """Render an answer object as a flat, comma-separated string.

    * list -> ``"a, b, c"`` (each element passed through ``str``)
    * dict -> ``"k1: v1, k2: v2"``
    * anything else -> ``""``
    """
    if isinstance(sample, dict):
        return ", ".join(f"{k}: {v}" for k, v in sample.items())
    if isinstance(sample, list):
        return ", ".join(str(entry) for entry in sample)
    return ""

closed_book_testing_sets, open_book_testing_sets = [], []

for sample in testing_data:
    top_qa = sample["top_qa"]
    answer = create_answer(top_qa["answer"])
    # Samples whose answer renders to an empty string are left out of both sets.
    if not answer:
        continue
    qa_pair = {"question": top_qa["question"], "answer": answer}
    closed_book_testing_sets.append(dict(qa_pair))
    # The open-book variant additionally carries the evidence of every sub-question.
    open_book_testing_sets.append({
        **qa_pair,
        "all_evidence": [hop["all_evidences"] for hop in sample["two_hop"]],
    })

auto_save_data(closed_book_testing_sets, "/data/zecheng/Retrieval_Head/quick_eval/fanoutqa_data/closed_book_testing_sets.jsonl")
auto_save_data(open_book_testing_sets, "/data/zecheng/Retrieval_Head/quick_eval/fanoutqa_data/open_book_testing_sets.jsonl")

In [None]:
# Peek at the first open-book sample.
first_open_book = open_book_testing_sets[0]
print(first_open_book.keys())
print(first_open_book['question'])
print(first_open_book['answer'])
first_open_book['all_evidence'][0][0]

In [None]:
# create single data
def create_single_data_single_hop(sample):
    """Flatten one multi-hop sample into a single question / answer / evidence record.

    Args:
        sample: Dict in the layout produced by ``return_multi_hop_evidence_groups``
            (keys ``"top_qa"`` and ``"two_hop"``). The older key names
            ``"first_hop"`` / ``"second_hop"`` are still accepted and take
            precedence when present.

    Returns:
        ``{"question": str, "answer": str, "evidence": List[str]}`` where each
        evidence entry joins all ``<content>`` spans of one sub-question with
        ``" [DOC] "``. List answers become ``"a, b"``, dict answers become
        ``"k: v, ..."``; any other answer type yields an empty string.
    """
    # The function used to read only "first_hop"/"second_hop", which the
    # current producer never emits, so every call failed with KeyError.
    top_level = sample["first_hop"] if "first_hop" in sample else sample["top_qa"]
    sub_questions = sample["second_hop"] if "second_hop" in sample else sample["two_hop"]

    question, answer = top_level["question"], top_level["answer"]
    answer_str = ""
    if isinstance(answer, list):
        answer_str = ", ".join(str(i) for i in answer)
    elif isinstance(answer, dict):
        answer_str = ", ".join(f"{k}: {v}" for k, v in answer.items())

    all_evidences = []
    for sec_hop in sub_questions:
        # all_evidences is a list of lists (one list per fragment string); flatten it.
        hop_evidence = [span for spans in sec_hop["all_evidences"] for span in spans]
        all_evidences.append(" [DOC] ".join(hop_evidence))

    return {
        "question": question,
        "answer": answer_str,
        "evidence": all_evidences,
    }

# Flatten every sample and write the result out as JSON lines.
all_testing_sample = [create_single_data_single_hop(sample) for sample in testing_data]

auto_save_data(all_testing_sample, "/data/zecheng/sunzc/LongBench-main/data/fanoutqa/fanoutqa_1hop.jsonl")


### Process SimPO training data offline

#### Preferred Data Format

wrap_batch["concatenated_input_ids"] = torch.tensor(batch[0]["concatenated_input_ids"])
wrap_batch["concatenated_attention_mask"] = torch.tensor(batch[0]["concatenated_attention_mask"])
wrap_batch["concatenated_labels"] = torch.tensor(batch[0]["concatenated_labels"])
wrap_batch["position_ids"] = torch.tensor(batch[0]["position_ids"])

### File test (文件测试)

In [2]:
import torch

def create_position_ids(N, L):
    """Sample a contiguous window of ``N`` position ids inside ``[0, L)``.

    Args:
        N: Window length (number of position ids to return).
        L: Size of the position space the window must fit into
           (the maximum chunk size at the call sites in this notebook).

    Returns:
        ``torch.arange(start, start + N)`` with ``start`` drawn uniformly from
        ``0 .. L - N`` inclusive. When ``N == L`` the window is always
        ``0 .. L - 1`` and no random number is consumed.

    Raises:
        ValueError: If ``N > L`` (the window cannot fit).
    """
    if N > L:
        raise ValueError(f"N ({N}) should not be greater than L ({L})")
    if N == L:
        start_pos = 0
    else:
        # randint's upper bound is exclusive; +1 makes the last valid start
        # (L - N) reachable, so the window can also end exactly at L - 1.
        start_pos = np.random.randint(0, L - N + 1)
    return torch.arange(start_pos, start_pos + N)

def create_covering_position_ids(N, L):
    """Create windows of ``N`` consecutive position ids that together cover ``0 .. L-1``.

    ``ceil(L / N)`` windows are produced. Their start offsets are spread evenly
    (integer division) between ``0`` and ``L - N``, so the first window starts
    at 0, the last one ends at ``L - 1`` and neighbouring windows may overlap.

    Args:
        N: Length of each window.
        L: Size of the position space to cover.

    Returns:
        List of 1-D ``torch`` tensors, each of length ``N``. For ``N == 0`` a
        single empty tensor is returned.

    Raises:
        ValueError: If ``N`` is negative or greater than ``L``.
    """
    if N > L:
        raise ValueError("N should not be greater than L")
    if N < 0:
        raise ValueError("N should not be negative")
    if N == 0:
        # An empty window covers nothing; previously this hit a ZeroDivisionError.
        return [torch.arange(0, 0)]

    num_intervals = (L + N - 1) // N  # ceil(L / N)
    if num_intervals == 1:
        # Only possible when N == L: one window covers everything.
        return [torch.arange(0, N)]

    # start_i = floor(i * (L - N) / (num_intervals - 1)) never exceeds L - N,
    # so no clamping against L is needed.
    span = L - N
    starts = [i * span // (num_intervals - 1) for i in range(num_intervals)]
    return [torch.arange(start_pos, start_pos + N) for start_pos in starts]

def auto_padding(t: torch.Tensor, length: int, filling_value=-100, return_attention_mask=False):
    """Right-pad or truncate the 1-D tensor ``t`` to exactly ``length`` elements.

    Args:
        t: 1-D tensor to resize.
        length: Target length (must be non-negative).
        filling_value: Value written into the padded tail.
        return_attention_mask: When True, also return an int mask holding 1 for
            positions that come from ``t`` and 0 for padded positions.

    Returns:
        The resized tensor, or ``(resized_tensor, attention_mask)`` when
        ``return_attention_mask`` is True.

    Raises:
        ValueError: If ``length`` is negative.
    """
    if length < 0:
        raise ValueError(f"length must be non-negative, got {length}")

    current = t.size(0)
    if length < current:
        truncated = t[:length]
        # The two return shapes used to be swapped here: the mask was returned
        # only when it had NOT been requested.
        if return_attention_mask:
            return truncated, torch.ones(length, dtype=torch.int)
        return truncated

    padded_tensor = torch.full((length,), filling_value, dtype=t.dtype)
    padded_tensor[:current] = t
    if return_attention_mask:
        attention_mask = torch.zeros(length, dtype=torch.int)
        attention_mask[:current] = 1
        return padded_tensor, attention_mask
    return padded_tensor


# Usage example
N, L = 5, 12

position_ids_list = create_covering_position_ids(N=N, L=L)
print(position_ids_list)
# To show the windows one per line instead:
# for idx, pos_ids in enumerate(position_ids_list):
#     print(f"Position IDs {idx+1}:", pos_ids)


[tensor([0, 1, 2, 3, 4]), tensor([3, 4, 5, 6, 7]), tensor([ 7,  8,  9, 10, 11])]


In [None]:
def create_random_position_ipt_data(tokenizer: AutoTokenizer, all_refs: List[str], combined_question: str, combined_answer: str, prefix_a: str, suffix_a: str, qa_size: int, max_embedding_size: int, real_reference_size: int):
    """Build one preference sample whose reference chunks get randomly placed position ids.

    The prompt is laid out as ``system prompt | padded reference chunks | question``.
    Every reference is truncated/padded to ``real_reference_size // len(all_refs)``
    tokens, and its position ids are a random window (``create_position_ids``)
    shifted to the start of that reference's slot in the position space
    ``[system_prompt_size, max_embedding_size - qa_size)``.

    Args:
        tokenizer: Callable tokenizer returning ``input_ids`` / ``attention_mask``.
        all_refs: Reference texts (must not be empty).
        combined_question: Question inserted into the question template.
        combined_answer: Chosen (preferred) answer.
        prefix_a: First rejected answer.
        suffix_a: Second rejected answer.
        qa_size: Number of slots reserved for question plus answer.
        max_embedding_size: Size of the position-id space.
        real_reference_size: Token budget shared by all references.

    Returns:
        ``(concatenated_batch, total_reference_tokens)``. ``concatenated_batch``
        holds ``input_ids`` / ``attention_mask`` / ``position_ids`` for the
        prompt plus one dict per answer (``chosen_answer``,
        ``prefix_rejected_answer``, ``suffix_rejected_answer``) with
        ``input_ids``, ``attention_mask``, ``labels`` and ``position_ids``.
        ``total_reference_tokens`` is the untruncated token count of all references.

    Raises:
        ValueError: If ``all_refs`` is empty.
    """
    if not all_refs:
        raise ValueError("all_refs must contain at least one reference")

    SYSTEM_SUFFIX = "Below is some references. Please read it carefully and answer the following question: "
    QUESTION_TEMPLATE = "Please answer the following question according to the references: {question}\n"
    ANSWER_TEMPLATE = "The answer is: {answer}\n"

    # Create System Suffix (the last encoded token is dropped, as before).
    tok_suffix = tokenizer(SYSTEM_SUFFIX, return_tensors="pt")
    padded_tok_suffix, padded_suffix_attention_mask = tok_suffix.input_ids[0][:-1], tok_suffix.attention_mask[0][:-1]
    system_position_ids = torch.arange(0, padded_tok_suffix.size(-1))
    system_prompt_size = system_position_ids.size(-1)

    # Create Chunked References
    real_max_chunk_size = real_reference_size // len(all_refs)
    tok_all_ref = [tokenizer(item, return_tensors="pt", add_special_tokens=False).input_ids[0] for item in all_refs]
    statistic_data_size = [tok.size(-1) for tok in tok_all_ref]
    truncted_refer_tok_lst = [tok[:real_max_chunk_size] for tok in tok_all_ref]

    fake_position_chunk_size = (max_embedding_size - qa_size - system_prompt_size) // len(truncted_refer_tok_lst)
    positional_chunks = torch.arange(system_prompt_size, max_embedding_size - qa_size, fake_position_chunk_size)
    position_input_ids = []
    for i, item in enumerate(truncted_refer_tok_lst):
        chunk_ids = create_position_ids(item.size(-1), real_max_chunk_size)
        position_input_ids.append(chunk_ids + positional_chunks[i])

    # Pad the reference TOKEN ids. The position ids used to be padded here by
    # mistake, so the references never made it into ``input_ids``.
    padded_refs = [
        auto_padding(tok, real_max_chunk_size, filling_value=0, return_attention_mask=True)
        for tok in truncted_refer_tok_lst
    ]
    padded_reference_ids = torch.concatenate([ids for ids, _ in padded_refs], dim=0)
    padded_reference_attention_mask = torch.concatenate([mask for _, mask in padded_refs], dim=0)
    padded_reference_position_ids = torch.concatenate(
        [auto_padding(pos, real_max_chunk_size, filling_value=0) for pos in position_input_ids], dim=0
    )

    # Create Question
    question = QUESTION_TEMPLATE.format(question=combined_question)
    tok_question = tokenizer(question, return_tensors="pt", add_special_tokens=False).input_ids
    padded_tok_question, padded_question_attention_mask = auto_padding(tok_question[0], tok_question.size(-1), filling_value=0, return_attention_mask=True)
    # NOTE(review): the question window is placed at a random offset inside the
    # QA area while the answers below start at a fixed offset (question length),
    # so question and answer position ids can overlap — confirm this is intended.
    question_position_input_ids = create_position_ids(tok_question.size(-1), qa_size) + max_embedding_size - qa_size

    # Merge All the Inputs Data
    concatenated_batch = {}
    concatenated_batch["input_ids"] = torch.concatenate([padded_tok_suffix, padded_reference_ids, padded_tok_question], dim=0)
    concatenated_batch["attention_mask"] = torch.concatenate([padded_suffix_attention_mask, padded_reference_attention_mask, padded_question_attention_mask], dim=0)
    concatenated_batch["position_ids"] = torch.concatenate([system_position_ids, padded_reference_position_ids, question_position_input_ids], dim=0)
    # The prompt part never contributes to the loss.
    referece_question_labels = torch.full((concatenated_batch["attention_mask"].size(-1),), -100)

    answer_budget = qa_size - padded_question_attention_mask.size(-1)
    system_reference_question_size = max_embedding_size - qa_size + padded_question_attention_mask.size(-1)

    def _encode_answer(answer_text):
        """Tokenize one answer and build its padded ids, mask, labels and position ids."""
        # [1:] drops the first token the tokenizer emits (presumably BOS — TODO confirm).
        tok_answer = tokenizer(ANSWER_TEMPLATE.format(answer=answer_text), return_tensors="pt").input_ids[0][1:]
        padded_ids, attention_mask = auto_padding(tok_answer, answer_budget, filling_value=0, return_attention_mask=True)
        answer_labels = auto_padding(tok_answer, answer_budget, filling_value=-100)
        answer_position_ids = create_position_ids(tok_answer.size(-1), tok_answer.size(-1)) + system_reference_question_size
        answer_position_ids = auto_padding(answer_position_ids, answer_budget, filling_value=0)
        return {
            "input_ids": padded_ids,
            "attention_mask": attention_mask,
            "labels": torch.concatenate([referece_question_labels, answer_labels], dim=0),
            "position_ids": answer_position_ids,
        }

    # Create Chosen / Rejected Answers / and their labels
    concatenated_batch["chosen_answer"] = _encode_answer(combined_answer)
    concatenated_batch["prefix_rejected_answer"] = _encode_answer(prefix_a)
    concatenated_batch["suffix_rejected_answer"] = _encode_answer(suffix_a)

    return concatenated_batch, sum(statistic_data_size)

In [3]:
def combine_fn(lst, max_candidates=2):
    """Concatenate one tensor from each sub-list, for every possible combination.

    Sub-lists longer than ``max_candidates`` are first reduced to a random
    sample of that size; shorter ones are used as they are. Each combination
    (Cartesian product order) is joined with ``torch.cat``.
    """
    capped = []
    for candidates in lst:
        if len(candidates) > max_candidates:
            candidates = random.sample(candidates, max_candidates)
        capped.append(candidates)
    return [torch.cat(choice) for choice in itertools.product(*capped)]

def create_system_suffix(tokenizer, system_suffix, special_token_id: int=13):
    """Tokenize the system prompt and close it with one special token.

    Returns ``(input_ids, attention_mask, position_ids)``: the prompt tokens
    (no tokenizer-added special tokens) followed by ``special_token_id``, a mask
    with a 1 appended for that token, and position ids ``0 .. len - 1``.
    """
    encoded = tokenizer(system_suffix, return_tensors="pt", add_special_tokens=False)
    # add special token (always attended)
    input_ids = torch.cat([encoded.input_ids[0], torch.tensor([special_token_id])], dim=0)
    attention_mask = torch.cat([encoded.attention_mask[0], torch.tensor([1])], dim=0)
    position_ids = torch.arange(0, input_ids.size(-1))
    return input_ids, attention_mask, position_ids

def create_chunked_reference(tokenizer: AutoTokenizer, all_refs: List[str], real_reference_size: int, max_embedding_size: int, system_prompt_size: int, qa_size: int, special_token_id: int=13):
    """Tokenize the references into fixed-size chunks and enumerate candidate position ids.

    Every reference is truncated/padded to ``real_reference_size // len(all_refs) - 1``
    tokens and followed by one ``special_token_id`` slot. For each reference,
    ``create_covering_position_ids`` yields several candidate position windows;
    ``combine_fn`` then joins one candidate per reference for every combination.

    Args:
        tokenizer: Callable tokenizer returning ``input_ids``.
        all_refs: Reference texts (must not be empty).
        real_reference_size: Token budget shared by all references.
        max_embedding_size: Size of the position-id space.
        system_prompt_size: Number of positions already used by the system prompt.
        qa_size: Number of positions reserved for question and answer.
        special_token_id: Token id appended after every chunk.

    Returns:
        ``(candidate_position_ids, input_ids, attention_mask, all_spe_pos)``:
        a list of concatenated position-id tensors (one per combination), the
        concatenated chunk token ids, the matching attention mask, and the index
        of each chunk's special token inside ``input_ids``.

    Raises:
        ValueError: If ``all_refs`` is empty.
    """
    if not all_refs:
        raise ValueError("all_refs must contain at least one reference")

    num_refs = len(all_refs)
    real_max_chunk_size = real_reference_size // num_refs - 1  # allocate one position for attention reallocation
    tok_all_ref = [tokenizer(item, return_tensors="pt", add_special_tokens=False).input_ids[0] for item in all_refs]
    truncted_refer_tok_lst = [tok[:real_max_chunk_size] for tok in tok_all_ref]

    fake_position_chunk_size = real_max_chunk_size + 1  # with last special token index for each chunk
    # NOTE: the range below must provide at least ``num_refs + 1`` chunk starts,
    # otherwise indexing begin_positional_chunks[i] fails for the later references.
    positional_chunks = torch.arange(system_prompt_size, max_embedding_size - qa_size, fake_position_chunk_size)
    # Here, end_positional_chunks denotes special token ids
    begin_positional_chunks, end_positional_chunks = positional_chunks[:-1], positional_chunks[1:] - 1

    # Candidate position windows per reference, shifted to the start of its chunk.
    all_chunk_pos_lst = []
    for i, ref_tok in enumerate(truncted_refer_tok_lst):
        windows = create_covering_position_ids(ref_tok.size(-1), real_max_chunk_size)
        all_chunk_pos_lst.append([window + begin_positional_chunks[i] for window in windows])

    padded_chunk_pos_lst = [
        [auto_padding(window, real_max_chunk_size, filling_value=0, return_attention_mask=False) for window in windows]
        for windows in all_chunk_pos_lst
    ]
    padded_refer_tok_lst = [
        auto_padding(ref_tok, real_max_chunk_size, filling_value=0, return_attention_mask=True)
        for ref_tok in truncted_refer_tok_lst
    ]

    # Append the special token's position to every candidate window of each chunk.
    candicated_padded_position_ids = []
    for chunk_pos_ids, chunk_spe_pos_lst in zip(end_positional_chunks, padded_chunk_pos_lst):
        candicated_padded_position_ids.append(
            [torch.concatenate([window, torch.tensor([chunk_pos_ids])], dim=0) for window in chunk_spe_pos_lst]
        )  # [[0,1,...,C1], [C2,C2+1,...,C3], ...]
    candicated_padded_position_ids = combine_fn(candicated_padded_position_ids)

    # Append the special token (always attended) to every chunk's tokens and mask.
    padded_ref_input_ids_lst, padded_ref_attention_mask_lst = [], []
    for chunk_ids, chunk_mask in padded_refer_tok_lst:
        padded_ref_input_ids_lst.append(torch.concatenate([chunk_ids, torch.tensor([special_token_id])], dim=0))
        padded_ref_attention_mask_lst.append(torch.concatenate([chunk_mask, torch.tensor([1])], dim=0))

    padded_ref_input_ids = torch.concatenate(padded_ref_input_ids_lst, dim=0)
    padded_ref_attention_mask = torch.concatenate(padded_ref_attention_mask_lst, dim=0)

    # Exactly one special-token index per chunk. Deriving the indices from
    # ``real_reference_size`` could add entries past the last chunk whenever the
    # integer division above left a remainder of at least one chunk size.
    all_spe_pos = torch.arange(num_refs) * fake_position_chunk_size + real_max_chunk_size

    return candicated_padded_position_ids, padded_ref_input_ids, padded_ref_attention_mask, all_spe_pos


def create_qa(QUESTION_TEMPLATE, ANSWER_TEMPLATE, combined_question, combined_answer, prefix_a: str, suffix_a: str, last_position: int, qa_size: int, special_token_id: int, tokenizer=None):
    """Tokenize the question and the chosen / rejected answers for one sample.

    ``last_position`` is the largest position id used by the references; the
    question positions start at ``last_position + 1``. ``qa_size`` caps the
    combined length of question and answer: each answer is padded (or truncated)
    to ``qa_size`` minus the question length including its special token.

    Args:
        QUESTION_TEMPLATE: Format string with a ``{question}`` placeholder.
        ANSWER_TEMPLATE: Format string with an ``{answer}`` placeholder.
        combined_question: Question text.
        combined_answer: Chosen (preferred) answer.
        prefix_a: First rejected answer.
        suffix_a: Second rejected answer.
        last_position: Largest position id used before the question.
        qa_size: Number of slots available for question plus answer.
        special_token_id: Token id appended after the question.
        tokenizer: Tokenizer to use. When omitted, the module-level
            ``tokenizer`` is used, as earlier versions of this function did.

    Returns:
        A 16-tuple: question ``(input_ids, attention_mask, position_ids)``;
        then ``(input_ids, attention_mask, labels, position_ids)`` for the
        chosen, the prefix-rejected and the suffix-rejected answer, in that
        order; finally the index of the special token inside the question.

    Raises:
        NameError: If no tokenizer is passed and none is defined globally.
        ValueError: If the question alone needs more than ``qa_size`` slots.
    """
    if tokenizer is None:
        try:
            tokenizer = globals()["tokenizer"]
        except KeyError:
            raise NameError("name 'tokenizer' is not defined; pass tokenizer=... or define it at module level") from None

    # Create Question
    question = QUESTION_TEMPLATE.format(question=combined_question)
    tok_question = tokenizer(question, return_tensors="pt", add_special_tokens=False).input_ids[0]
    padded_tok_question, padded_question_attention_mask = auto_padding(tok_question, tok_question.size(-1), filling_value=0, return_attention_mask=True)
    padded_tok_question = torch.concatenate([padded_tok_question, torch.tensor([special_token_id])], dim=0)
    padded_question_attention_mask = torch.concatenate([padded_question_attention_mask, torch.tensor([1])], dim=0)
    question_position_input_ids = create_position_ids(tok_question.size(-1), tok_question.size(-1)) + last_position + 1
    # The special token takes the position right after the last question token.
    last_pos = question_position_input_ids.max() + 1
    question_position_input_ids = torch.concatenate([question_position_input_ids, torch.tensor([last_pos])], dim=0)
    spe_tok_pos = question_position_input_ids.size(-1) - 1

    answer_budget = qa_size - padded_question_attention_mask.size(-1)
    if answer_budget < 0:
        raise ValueError(
            f"question needs {padded_question_attention_mask.size(-1)} slots but qa_size is only {qa_size}"
        )
    system_reference_question_size = last_position + 1 + question_position_input_ids.size(-1)

    def _encode_answer(answer_text):
        """Return (padded ids, attention mask, labels, position ids) for one answer."""
        tok_answer = tokenizer(ANSWER_TEMPLATE.format(answer=answer_text), return_tensors="pt", add_special_tokens=False).input_ids[0]
        padded_ids, attention_mask = auto_padding(tok_answer, answer_budget, filling_value=0, return_attention_mask=True)
        # Labels use -100 in the padded tail.
        answer_labels = auto_padding(tok_answer, answer_budget, filling_value=-100)
        answer_position_ids = create_position_ids(tok_answer.size(-1), tok_answer.size(-1)) + system_reference_question_size
        answer_position_ids = auto_padding(answer_position_ids, answer_budget, filling_value=0)
        return padded_ids, attention_mask, answer_labels, answer_position_ids

    # Create Chosen / Rejected Answers / and their labels
    chosen = _encode_answer(combined_answer)
    prefix_rejected = _encode_answer(prefix_a)
    suffix_rejected = _encode_answer(suffix_a)

    return (
        padded_tok_question, padded_question_attention_mask, question_position_input_ids,
        *chosen,
        *prefix_rejected,
        *suffix_rejected,
        spe_tok_pos,
    )
    

# Block test for create_chunked_reference on a tiny hand-sized example.
tokenizer = transformers.AutoTokenizer.from_pretrained("/vepfs/wcf/hf_models/Meta-Llama-3-8B-Instruct")
all_refs = ["hello, world", "Any, iowpq", "reason medsa"]
real_reference_size, max_embedding_size = 28, 34
system_prompt_size, qa_size = 2, 4
(
    candicated_padded_concat_position_ids,
    padded_ref_input_ids,
    padded_ref_attention_mask,
    all_spe_pos,
) = create_chunked_reference(tokenizer, all_refs, real_reference_size, max_embedding_size, system_prompt_size, qa_size)

# Test of the create_qa function with the Llama-3 chat templates.
QUESTION_TEMPLATE = "<|start_header_id|>user<|end_header_id|>\n\nPlease answer the following question according to the references: {question}<|eot_id|>"
ANSWER_TEMPLATE = "<|start_header_id|>assistant<|end_header_id|>\n\nThe answer is: {answer}<|eot_id|><|end_of_text|>"

(
    padded_tok_question, padded_question_attention_mask, question_position_input_ids,
    padded_tok_chosen_answer, padded_chosen_answer_attention_mask, tok_chosen_answer_labels, chosen_answer_position_ids,
    padded_tok_prefix_rejected_answer, padded_prefix_rejected_answer_attention_mask, tok_prefix_rejected_answer_labels, prefix_rejected_answer_position_ids,
    padded_tok_suffix_rejected_answer, padded_suffix_rejected_answer_attention_mask, tok_suffix_rejected_answer_labels, suffix_rejected_answer_position_ids,
    spe_tok_pos,
) = create_qa(QUESTION_TEMPLATE, ANSWER_TEMPLATE, "who are you", "jack", "prefix_a", "suffix_a", last_position=1024, qa_size=100, special_token_id=13)

# Print the shape of a selection of the returned tensors.
for _checked in (
    padded_tok_question,
    padded_question_attention_mask,
    question_position_input_ids,
    padded_tok_chosen_answer,
    chosen_answer_position_ids,
    prefix_rejected_answer_position_ids,
    padded_tok_suffix_rejected_answer,
    padded_suffix_rejected_answer_attention_mask,
    suffix_rejected_answer_position_ids,
    tok_suffix_rejected_answer_labels,
):
    print(_checked.shape)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


torch.Size([19])
torch.Size([19])
torch.Size([19])
torch.Size([81])
torch.Size([81])
torch.Size([81])
torch.Size([81])
torch.Size([81])
torch.Size([81])
torch.Size([81])


In [None]:
# Show the first candidate position-id layout and the shared reference tensors.
for _inspected in (
    candicated_padded_concat_position_ids[0],
    padded_ref_input_ids,
    padded_ref_attention_mask,
    all_spe_pos,
):
    print(_inspected)

In [4]:
def find_index(id_lst, prefix_id, suffix_id):
    """Return the positions of ``prefix_id`` and ``suffix_id`` inside ``id_lst`` as a pair.

    Raises ``ValueError`` (from ``list.index``) when either id is missing.
    """
    prefix_pos = id_lst.index(prefix_id)
    suffix_pos = id_lst.index(suffix_id)
    return prefix_pos, suffix_pos

def create_covering_position_ids(N, L):
    """Create windows of ``N`` consecutive position ids that together cover ``0 .. L-1``.

    ``ceil(L / N)`` windows are produced. Their start offsets are spread evenly
    (integer division) between ``0`` and ``L - N``, so the first window starts
    at 0, the last one ends at ``L - 1`` and neighbouring windows may overlap.

    Args:
        N: Length of each window.
        L: Size of the position space to cover.

    Returns:
        List of 1-D ``torch`` tensors, each of length ``N``. For ``N == 0`` a
        single empty tensor is returned.

    Raises:
        ValueError: If ``N`` is negative or greater than ``L``.
    """
    if N > L:
        raise ValueError("N should not be greater than L")
    if N < 0:
        raise ValueError("N should not be negative")
    if N == 0:
        # An empty window covers nothing; previously this hit a ZeroDivisionError.
        return [torch.arange(0, 0)]

    num_intervals = (L + N - 1) // N  # ceil(L / N)
    if num_intervals == 1:
        # Only possible when N == L: one window covers everything.
        return [torch.arange(0, N)]

    # start_i = floor(i * (L - N) / (num_intervals - 1)) never exceeds L - N,
    # so no clamping against L is needed.
    span = L - N
    starts = [i * span // (num_intervals - 1) for i in range(num_intervals)]
    return [torch.arange(start_pos, start_pos + N) for start_pos in starts]


def auto_padding(t: torch.Tensor, length: int, filling_value=-100, return_attention_mask=False):
    """Right-pad or truncate the 1-D tensor ``t`` to exactly ``length`` elements.

    Args:
        t: 1-D tensor to resize.
        length: Target length (must be non-negative).
        filling_value: Value written into the padded tail.
        return_attention_mask: When True, also return an int mask holding 1 for
            positions that come from ``t`` and 0 for padded positions.

    Returns:
        The resized tensor, or ``(resized_tensor, attention_mask)`` when
        ``return_attention_mask`` is True.

    Raises:
        ValueError: If ``length`` is negative.
    """
    if length < 0:
        raise ValueError(f"length must be non-negative, got {length}")

    current = t.size(0)
    if length < current:
        truncated = t[:length]
        # The two return shapes used to be swapped here: the mask was returned
        # only when it had NOT been requested.
        if return_attention_mask:
            return truncated, torch.ones(length, dtype=torch.int)
        return truncated

    padded_tensor = torch.full((length,), filling_value, dtype=t.dtype)
    padded_tensor[:current] = t
    if return_attention_mask:
        attention_mask = torch.zeros(length, dtype=torch.int)
        attention_mask[:current] = 1
        return padded_tensor, attention_mask
    return padded_tensor


def create_covering_position_ipt_data(tokenizer, all_refs: List[str], combined_question: str, combined_answer: str, prefix_a: str, suffix_a: str, qa_size: int, max_embedding_size: int, real_reference_size: int, special_token_id: int = None, prefix_id: int = None, suffix_id: int = None):
    """Build every position-id variant of one preference sample.

    The prompt is ``system prompt | reference chunks | question``, each part
    closed by ``special_token_id``. One output dict is produced per candidate
    reference position layout returned by ``create_chunked_reference``; all
    dicts share the same tokens and differ only in ``position_ids``.

    Returns:
        ``(all_datasets, mean_prompt_length)`` where ``all_datasets`` is the
        list of sample dicts and ``mean_prompt_length`` is the average length
        of their ``input_ids``.
    """
    SYSTEM_SUFFIX = "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nBelow is some references. Please read it carefully and answer the following question.<|eot_id|>"
    QUESTION_TEMPLATE = "<|start_header_id|>user<|end_header_id|>\n\nPlease answer the following question according to the references: {question}<|eot_id|>"
    ANSWER_TEMPLATE = "<|start_header_id|>assistant<|end_header_id|>\n\nThe answer is: {answer}<|eot_id|><|end_of_text|>"

    # System prompt; its special token is the last of its tokens.
    system_ids, system_mask, system_pos = create_system_suffix(tokenizer, SYSTEM_SUFFIX, special_token_id)
    system_prompt_size = system_mask.size(-1)

    # Reference chunks: token ids, mask and the candidate position layouts.
    candidate_ref_positions, ref_ids, ref_mask, ref_spe_pos = create_chunked_reference(
        tokenizer, all_refs, real_reference_size, max_embedding_size, system_prompt_size, qa_size, special_token_id
    )
    # Shift chunk-local special-token indices so they index into the full prompt.
    ref_spe_pos += system_prompt_size
    all_spe_pos = [system_prompt_size - 1] + ref_spe_pos.tolist()

    # size for real reference and system prompt
    last_position = max_embedding_size - qa_size

    # Question and the three answers.
    (
        question_tok, question_mask, question_pos,
        chosen_tok, chosen_mask, chosen_labels, chosen_pos,
        prefix_rej_tok, prefix_rej_mask, prefix_rej_labels, prefix_rej_pos,
        suffix_rej_tok, suffix_rej_mask, suffix_rej_labels, suffix_rej_pos,
        spe_tok_pos,
    ) = create_qa(
        QUESTION_TEMPLATE, ANSWER_TEMPLATE, combined_question, combined_answer, prefix_a, suffix_a, last_position, qa_size, special_token_id=special_token_id
    )
    all_spe_pos.append(spe_tok_pos + system_prompt_size + ref_ids.size(-1))

    def _answer_entry(tok, mask, answer_labels, pos, prompt_labels):
        """Pack one answer; its labels are prefixed with the prompt's ignore labels."""
        return {
            "input_ids": tok,
            "attention_mask": mask,
            "labels": torch.cat([prompt_labels, answer_labels], dim=0),
            "position_ids": pos,
        }

    all_datasets = []  # different combination of positions
    prompt_lengths = []
    for ref_position_id in candidate_ref_positions:
        sample_entry = {
            "input_ids": torch.cat([system_ids, ref_ids, question_tok], dim=0),
            "attention_mask": torch.cat([system_mask, ref_mask, question_mask], dim=0),
            "position_ids": torch.cat([system_pos, ref_position_id, question_pos], dim=0),
        }
        sample_entry["all_spe_pos"] = all_spe_pos
        # The prompt part never contributes to the loss.
        prompt_labels = torch.full((sample_entry["attention_mask"].size(-1),), -100)

        sample_entry["chosen_answer"] = _answer_entry(chosen_tok, chosen_mask, chosen_labels, chosen_pos, prompt_labels)
        sample_entry["prefix_rejected_answer"] = _answer_entry(prefix_rej_tok, prefix_rej_mask, prefix_rej_labels, prefix_rej_pos, prompt_labels)
        sample_entry["suffix_rejected_answer"] = _answer_entry(suffix_rej_tok, suffix_rej_mask, suffix_rej_labels, suffix_rej_pos, prompt_labels)
        sample_entry["chosen_ids"] = (prefix_id, suffix_id)

        all_datasets.append(sample_entry)
        prompt_lengths.append(sample_entry["input_ids"].size(-1))

    return all_datasets, sum(prompt_lengths) / len(prompt_lengths)


dataset = datasets.load_from_disk("/vepfs/wcf/G/zecheng/data/hf_dataset_step2")
print(dataset)

tokenizer = transformers.AutoTokenizer.from_pretrained("/vepfs/wcf/G/zecheng/hf_models2/Meta-Llama-3-8B-Instruct")
# Id of the reserved token used as the per-segment special token.
spe_token_id = tokenizer("<|reserved_special_token_0|>", add_special_tokens=False).input_ids[0]

training_samples = []
avg_real_seq_length = 0
with tqdm(total=len(dataset), desc=f"Initial Avg Length: {avg_real_seq_length}") as pbar:
    for item in dataset:
        all_ref_text = item["all_ref_text"]
        combined_question = item["combined_question"]
        final_answer = item["final_answer"]
        prefix_q, prefix_a = item["prefix_q"], item["prefix_a"]
        suffix_q, suffix_a = item["suffix_q"], item["suffix_a"]
        # Positions of the two gold references inside this sample's reference list.
        prefix_id, suffix_id = find_index(item["all_ref_ids"], item["prefix_id"], item["suffix_id"])
        all_datasets, ref_length = create_covering_position_ipt_data(
            tokenizer,
            all_ref_text,
            combined_question,
            final_answer,
            prefix_a,
            suffix_a,
            qa_size=512,
            max_embedding_size=65536,
            real_reference_size=16384,
            special_token_id=spe_token_id,
            prefix_id=prefix_id,
            suffix_id=suffix_id,
        )
        training_samples.extend(all_datasets)
        # Running mean of the prompt length over the whole dataset.
        avg_real_seq_length += ref_length / len(dataset)
        pbar.set_description(f"Current Avg Seq Length: {avg_real_seq_length:.2f}")
        pbar.update(1)

print(len(training_samples))

Dataset({
    features: ['id', 'prefix_q', 'prefix_a', 'suffix_q', 'suffix_a', 'prefix_id', 'suffix_id', 'all_ref_ids', 'combined_question', 'all_ref_text', 'final_answer'],
    num_rows: 878
})


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Current Avg Seq Length: 16436.81: 100%|██████████| 878/878 [00:39<00:00, 22.38it/s]

14040





In [None]:
# Average number of reference tokens per sample (no special tokens added).
all_length = 0
for sample in tqdm(dataset, total=len(dataset)):
    ref_token_counts = [
        tokenizer(item, return_tensors="pt", add_special_tokens=False).input_ids.size(-1)
        for item in sample["all_ref_text"]
    ]
    all_length += sum(ref_token_counts) / len(dataset)
print(all_length)

In [5]:
# from modelzipper import *
# import pandas as pd
# import datasets
# import numpy as np

# Turn the list of sample dicts into a dict of columns, as Dataset.from_dict expects.
converted_dict = {}
for item in training_samples:
    for key, value in item.items():
        converted_dict.setdefault(key, []).append(value)

hf_datasets = datasets.Dataset.from_dict(converted_dict)
hf_datasets.save_to_disk("/vepfs/wcf/G/zecheng/data/processed_data/hf_dataset_8k")

In [6]:
# Write the same dataset a second time, to the "hf_dataset_tmp2" directory.
hf_datasets.save_to_disk("/vepfs/wcf/G/zecheng/data/processed_data/hf_dataset_tmp2")

Saving the dataset (19/19 shards): 100%|██████████| 14040/14040 [00:03<00:00, 4472.94 examples/s]


In [None]:
import datasets

# Reload a saved dataset from disk.
# NOTE(review): this root directory differs from the one used when saving above — confirm the path.
hf_datasets = datasets.load_from_disk("/data/zecheng/data/process_wiki_document/two_hop/hf_dataset_tmp2")

In [None]:
# Inspect the first stored sample: for each recorded special-token index, show
# the 21 entries ending at that index (mask, token ids and decoded text).
first_sample = hf_datasets[0]
print(hf_datasets)
print(first_sample['all_spe_pos'])
print(first_sample['chosen_ids'])
for idx in first_sample['all_spe_pos']:
    window = slice(idx - 20, idx + 1)
    print(idx)
    print(first_sample['attention_mask'][window])
    print(first_sample['input_ids'][window])
    print(tokenizer.decode(first_sample['input_ids'][window]))