In [2]:
from modelzipper.tutils import *
import datasets
from pprint import pprint
import transformers
from datasets import load_dataset
import json
from nltk.tokenize import sent_tokenize
import random
from transformers import AutoTokenizer, AutoModelForCausalLM
from tqdm import tqdm
import spacy

# Load the tokenizer from a local Meta-Llama-3-8B-Instruct checkpoint directory.
tokenizer =  transformers.AutoTokenizer.from_pretrained("/data/zecheng/hf_models/Meta-Llama-3-8B-Instruct")
# NOTE(review): the two loaders below are commented out, yet later cells reference
# `bio_book_data` and `nlp`; re-enable them before running those cells on a fresh kernel.
# bio_book_data = datasets.load_dataset('json', data_files="/data/zecheng/data/long-llm/gpt/bio_book.train.json")
# nlp = spacy.load("en_core_web_lg")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
def extract_entities(texts, add_none=False):
    """Collect verb lemmas and named-entity strings from ``texts``.

    Uses the module-level ``nlp`` callable (presumably the spaCy pipeline
    from the commented-out ``spacy.load`` call -- confirm it is loaded).

    Args:
        texts: Raw text to analyse.
        add_none: When true, noun lemmas are included as well.

    Returns:
        A set of strings: verb lemmas, entity texts and, optionally, noun lemmas.
    """
    parsed = nlp(texts)
    wanted_pos = ("VERB", "NOUN") if add_none else ("VERB",)
    collected = {tok.lemma_ for tok in parsed if tok.pos_ in wanted_pos}
    collected.update(span.text for span in parsed.ents)
    return collected

def split_chunks(text, tokenizer, chunk_length=1024):
    """Tokenize ``text`` and decode it back as consecutive fixed-size pieces.

    Args:
        text: The string to split.
        tokenizer: Callable tokenizer exposing ``input_ids`` on its result and a
            ``decode`` method.
        chunk_length: Maximum number of tokens per piece.

    Returns:
        A list of decoded strings; the last one may hold fewer than
        ``chunk_length`` tokens.
    """
    token_ids = tokenizer(text, return_tensors="pt", add_special_tokens=False).input_ids[0]
    pieces = []
    for start in range(0, len(token_ids), chunk_length):
        window = token_ids[start: start + chunk_length]
        pieces.append(tokenizer.decode(window, skip_special_tokens=True))
    return pieces


def is_entity_match(question, answer, context):
    """Tell whether every term of a QA pair also occurs in ``context``.

    Terms of the question and answer are extracted without nouns, terms of
    the context with nouns included (see ``extract_entities``).

    Returns:
        True when the QA term set is a subset of the context term set.
    """
    qa_terms = extract_entities(question, add_none=False) | extract_entities(answer, add_none=False)
    context_terms = extract_entities(context, add_none=True)
    return qa_terms <= context_terms


def match_chunks_with_question_answer(context_chunks, qa_pairs):
    """Find, for every QA pair, the context chunks that cover all of its terms.

    Args:
        context_chunks: List of chunk strings.
        qa_pairs: Iterable of ``(question, answer)`` tuples.

    Returns:
        A list with one dict per QA pair that matched at least one chunk:
        ``{"question", "answer", "evidence_chunk_ids"}``, where
        ``evidence_chunk_ids`` holds the indices into ``context_chunks`` for
        which ``is_entity_match`` is true. Pairs without any match are omitted.
    """
    matched_chunks = []
    for question, answer in qa_pairs:
        evidence_chunk_ids = [
            idx
            for idx, chunk in enumerate(context_chunks)
            if is_entity_match(question, answer, chunk)
        ]
        # Only keep pairs with real evidence; store the matching indices, not
        # the text of whichever chunk happened to be visited last.
        if evidence_chunk_ids:
            matched_chunks.append(
                {"question": question, "answer": answer, "evidence_chunk_ids": evidence_chunk_ids}
            )
    return matched_chunks


def process_dataset(dataset, tokenizer, chunk_length=1024):
    """Split every item's context into chunks and collect its QA pairs.

    Each item must carry a ``'conversations'`` list of ``{'content': ...}``
    turns. The first turn holds the context followed by the first question on
    its last line; the second turn is the answer to that question. Later turns
    are read pairwise as (question, answer).

    Args:
        dataset: Iterable of items as described above.
        tokenizer: Tokenizer forwarded to ``split_chunks``.
        chunk_length: Maximum number of tokens per context chunk.

    Returns:
        A list with one dict per item:
        ``{"context_chunks": [...], "qa_pairs": [(question, answer), ...]}``.
    """
    processed = []
    for item in dataset:
        conversations = item['conversations']

        # Process the first conversation turn: context plus first question.
        first_lines = conversations[0]['content'].split("\n")
        context = "\n".join(first_lines[:-1])
        context_chunks = split_chunks(context, tokenizer, chunk_length)
        all_qa_pairs = [(first_lines[-1], conversations[1]['content'])]  # assistant answer

        # Stop at len - 1 so a trailing question without an answer is skipped
        # instead of raising IndexError.
        for turn in range(2, len(conversations) - 1, 2):
            question = conversations[turn]['content']
            answer = conversations[turn + 1]['content']
            all_qa_pairs.append((question, answer))

        processed.append({"context_chunks": context_chunks, "qa_pairs": all_qa_pairs})
    return processed

In [None]:
# Take the first training sample's first turn without its last line (that
# last line is what process_dataset treats as the first question), then tokenize it.
# NOTE(review): `bio_book_data` is only created by a commented-out line in the first cell.
first_sample = '\n'.join(bio_book_data['train'][0]['conversations'][0]['content'].split("\n")[:-1])
tokenied_first_sample = tokenizer(first_sample, return_tensors="pt").input_ids
# print(tokenied_first_sample.size(-1))

In [None]:
bio_book_data['train'][0]['conversations'][3]

## process sunzc

In [None]:
import datasets
from modelzipper.tutils import *
import transformers

# Re-create the tokenizer and read the raw "one_detail_book" records.
# NOTE(review): `data` is not referenced at module level by the following cells
# before it is reassigned in the filtering section; filter_from_source reads the files itself.
tokenizer = transformers.AutoTokenizer.from_pretrained("/data/zecheng/hf_models/Meta-Llama-3-8B-Instruct")
data = auto_read_data("/data/zecheng/sunzc/multi_hop/douzhicheng/data/one_detail_book/tmp_process/all.jsonl")

In [None]:
def statistic_source(sample, top_k=8):
    """Pick the ``top_k`` most-cited contexts of a sample and their QA pairs.

    Args:
        sample: Dict with ``"ids"`` (context ids), ``"qa_pairs"`` (each with
            ``"evidence_ids"``) and ``"all_context"`` (indexed by context id).
        top_k: Number of contexts to keep, ranked by how many QA pairs cite them.

    Returns:
        ``None`` when ``is_more_than_K_evidences(..., k=2)`` rejects the
        selection; otherwise a dict with the selected context ids (ascending),
        their texts, and the citing QA pairs tagged ``"multi_source"``.
    """
    ctx_ids, qa_pairs = sample["ids"], sample["qa_pairs"]

    # usage[ctx_id] = [number of citing QA pairs, indices of those QA pairs]
    usage = {ctx_id: [0, []] for ctx_id in ctx_ids}
    for qa_idx, qa in enumerate(qa_pairs):
        for ctx_id in qa["evidence_ids"]:
            entry = usage[ctx_id]
            entry[0] += 1
            entry[1].append(qa_idx)

    ranked = sorted(usage.items(), key=lambda kv: kv[1][0], reverse=True)
    top_keys = list(ranked[:top_k])

    if not is_more_than_K_evidences(top_keys, k=2):
        return None

    # Restore ascending context-id order for the selected contexts.
    top_keys.sort(key=lambda kv: kv[0])
    all_context_ids = []
    flat_qa_ids = []
    for ctx_id, (_, citing_qa_ids) in top_keys:
        all_context_ids.append(ctx_id)
        flat_qa_ids.extend(citing_qa_ids)
    unique_qa_ids = list(set(flat_qa_ids))

    all_context_str = [sample["all_context"][i] for i in all_context_ids]
    all_qa_pair_str = []
    for qa_idx in unique_qa_ids:
        tagged = post_process_question(qa_pairs[qa_idx])
        if tagged["type"] == "multi_source":
            all_qa_pair_str.append(tagged)

    return {
        "all_context_ids": all_context_ids,
        "all_context_str": all_context_str,
        "all_qa_pair_str": all_qa_pair_str,
    }


def post_process_question(qa_pairs):
    """Tag one QA record in place with its source type and return it.

    ``"type"`` becomes ``"single_source"`` when the record has exactly one
    evidence id and ``"multi_source"`` otherwise.
    """
    only_one = len(qa_pairs["evidence_ids"]) == 1
    qa_pairs["type"] = "single_source" if only_one else "multi_source"
    return qa_pairs


def is_more_than_K_evidences(sample, k=4):
    """Check that every selected context is cited by at least ``k`` QA pairs.

    Args:
        sample: Sequence of ``(context_id, [count, qa_ids])`` entries.
        k: Minimum citation count required of every entry.

    Returns:
        True when all counts are ``>= k``. An empty selection returns False
        (there is no evidence at all) instead of raising ``ValueError`` from
        ``min()`` on an empty sequence.
    """
    if not sample:
        return False
    return all(item[1][0] >= k for item in sample)

def return_all_evidences(sample):
    """Return the first element of every entry in ``sample``, in order."""
    evidences = []
    for entry in sample:
        evidences.append(entry[0])
    return evidences


def filter_from_source(file_path, top_k=16):
    """Read samples from ``file_path`` and keep those ``statistic_source`` accepts.

    Args:
        file_path: Path handed to ``auto_read_data``.
        top_k: Forwarded to ``statistic_source``.

    Returns:
        ``(kept_samples, average_qa_pairs_per_kept_sample)``; ``([], 0)`` when
        no sample is kept.
    """
    kept = []
    for sample in auto_read_data(file_path):
        selection = statistic_source(sample, top_k=top_k)
        if selection is not None:
            kept.append(selection)

    if not kept:
        return [], 0

    total_qa_pairs = sum(len(selection["all_qa_pair_str"]) for selection in kept)
    return kept, total_qa_pairs / len(kept)


# Root directory and the three sub-corpora to process; each sub-corpus is read
# from "<DIR_PATH>/<sub_path>/tmp_process/all.jsonl".
DIR_PATH = "/data/zecheng/sunzc/multi_hop/douzhicheng/data"
ALL_PATHS = ["one_detail_book", "one_detail_paper", "multi_detail_paper_long"]
COMBINED_PATHS = [f"{DIR_PATH}/{sub_path}/tmp_process/all.jsonl" for sub_path in ALL_PATHS] 

# Accumulates the samples kept by filter_from_source across all sub-corpora.
all_data = []

# `data_name` and `avg_qa_pairs` are assigned here but not used in this cell.
for data_name, file_path in zip(ALL_PATHS, COMBINED_PATHS):
    upsample_data, avg_qa_pairs = filter_from_source(file_path, top_k=16)
    all_data.extend(upsample_data)
print(len(all_data))

In [None]:
## rename the keys
# Re-index evidence ids: each QA pair's evidence ids are rewritten from the
# original context ids to their positions inside the sample's
# "all_context_ids" list, and that list is then replaced by 0..n-1.
# NOTE: this mutates the dicts inside `all_data` in place.
for sample in all_data:
    all_context_ids = sample["all_context_ids"]
    new_qa_pairs = []
    for qa_pair in sample["all_qa_pair_str"]:
        evidence_ids = qa_pair.pop("evidence_ids")
        new_evidence_ids = []
        for id in evidence_ids:
            # Evidence pointing at a context that was not selected is dropped.
            if id not in all_context_ids: continue
            new_evidence_ids.append(all_context_ids.index(id))
        # A QA pair left with fewer than two evidences is discarded.
        if len(new_evidence_ids) < 2:
            continue
        else:
            qa_pair["evidence_ids"] = new_evidence_ids
            new_qa_pairs.append(qa_pair)
    sample["all_qa_pair_str"] = new_qa_pairs
    new_context_ids = list(range(len(all_context_ids)))
    sample["all_context_ids"] = new_context_ids

# The number of samples is unchanged by this cell; only their contents change.
print(len(all_data))

In [None]:
all_data[0]

In [None]:
pprint(all_data[0]["all_context_ids"])  # 6, 14
pprint(all_data[0]["all_qa_pair_str"][0])
pprint(all_data[-10]["all_qa_pair_str"][0].keys())
real_id1, real_id2 = all_data[-10]["all_context_ids"].index(6), all_data[-10]["all_context_ids"].index(14)
# pprint(all_data[-10]["all_context_str"][real_id1])
# pprint(all_data[-10]["all_context_str"][real_id2])

In [None]:
auto_save_data(all_data, f"/data/zecheng/data/processed_project/16chunk/1024_chunk_size/step1_data.jsonl")

## Process stage 2: mask partial data and filter

In [None]:
chunk_data = auto_read_data("/data/zecheng/data/processed_project/16chunk/1024_chunk_size/step1_data.jsonl")

def find_real_context(all_context_str, real_ids, id_dict):
    """Resolve evidence ids to their context strings.

    Args:
        all_context_str: List of context strings.
        real_ids: Evidence ids to look up.
        id_dict: Mapping from id to index into ``all_context_str``.

    Returns:
        The contexts for the ids present in ``id_dict`` (unknown ids are
        skipped), or ``None`` when fewer than two contexts were resolved.
    """
    resolved = [all_context_str[id_dict[rid]] for rid in real_ids if rid in id_dict]
    return resolved if len(resolved) >= 2 else None

def process_per_sample(sample):
    """Attach the evidence context strings to every QA pair of ``sample``.

    QA pairs for which ``find_real_context`` resolves fewer than two contexts
    are removed from ``sample["all_qa_pair_str"]``. The sample is modified in
    place and returned.

    Args:
        sample: Dict with ``"all_context_ids"``, ``"all_context_str"`` and
            ``"all_qa_pair_str"`` (each QA pair has ``"evidence_ids"``).

    Returns:
        The same ``sample`` object, with ``"evidence_context"`` added to each
        remaining QA pair.
    """
    all_context_ids = sample["all_context_ids"]
    all_context_str = sample["all_context_str"]
    # context id -> index into all_context_str
    id_to_index = {ctx_id: idx for idx, ctx_id in enumerate(all_context_ids)}

    kept_qa_pairs = []
    for item in sample["all_qa_pair_str"]:
        evidence_context = find_real_context(all_context_str, item["evidence_ids"], id_to_index)
        if evidence_context is None:
            continue
        item["evidence_context"] = evidence_context
        kept_qa_pairs.append(item)

    # Write the filtered list back; previously it was only bound to a local
    # name, so unresolved QA pairs stayed in the returned sample.
    sample["all_qa_pair_str"] = kept_qa_pairs
    return sample

# Run process_per_sample over every step-1 sample and persist the result as step 2.
processed_data = [process_per_sample(item) for item in chunk_data]
auto_save_data(processed_data, f"/data/zecheng/data/processed_project/16chunk/1024_chunk_size/step2_data.jsonl")

In [None]:
processed_data[0]

In [None]:
def chunk_list(data, num_chunks=8):
    """Split ``data`` into ``num_chunks`` consecutive slices of near-equal size.

    The first ``len(data) % num_chunks`` slices receive one extra element, so
    slice sizes differ by at most one. Slices may be empty when ``data`` is
    shorter than ``num_chunks``.
    """
    base, extra = divmod(len(data), num_chunks)
    bounds = [0]
    for idx in range(num_chunks):
        bounds.append(bounds[-1] + base + (1 if idx < extra else 0))
    return [data[lo:hi] for lo, hi in zip(bounds, bounds[1:])]

# Split the step-2 data into 4 parts and write each part to its own JSONL file.
divided_data = chunk_list(processed_data, 4)

for i, chunk in enumerate(divided_data):
    auto_save_data(chunk, f"/data/zecheng/data/processed_project/16chunk/1024_chunk_size/split_step2_data/chunk_{i}.jsonl")

In [None]:
print(len(divided_data[0][1]['all_qa_pair_str']))

## Filtering Model Generated Results

In [7]:
def filter_one_sample(sample):
    """Build (chosen, rejected) answer records for one sample.

    One chunk is removed and the model answers again. If that answer still
    scores high, the removed chunk is unimportant (as reflected by the F1
    metric); if the answer's F1 is low, the removed chunk is important.
    The rejected answer is taken from the lowest score here, although taking
    the highest score might actually be more reasonable: at the highest score
    the model has seen more of the content, yet it is less related to the
    rejected answer.

    QA pairs whose minimum F1 exceeds 0.15 are skipped. The first and last
    keys of ``sort_f1_dict`` are used as the lowest- and highest-F1 evidence
    ids (assumes the dict is ordered by ascending F1 -- confirm upstream).

    Returns:
        ``{"all_context_lst": ..., "filtered_qa_pairs": [...]}``.
    """
    kept_pairs = []
    for pair in sample["all_qa_pair_str"]:
        ev_ids = pair["evidence_ids"]
        generations = pair["remove_gen"]
        f1_by_evidence = pair["sort_f1_dict"]

        if min(list(f1_by_evidence.values())) > 0.15:
            continue

        ordered_keys = list(f1_by_evidence.keys())
        pos_highest = ev_ids.index(int(ordered_keys[-1]))
        pos_lowest = ev_ids.index(int(ordered_keys[0]))
        # This is a hyperparameter: several answers could be picked as
        # rejected; only one is chosen for now, for training efficiency.
        answer_w_info = generations[pos_lowest]
        answer_wo_info = generations[pos_highest]

        record = {
            "question": pair["question"],
            "chosen_answer": pair["answer"],
            "rejected_answer_w_info": answer_w_info,
            "rejected_answer_wo_info": answer_wo_info,
        }
        kept_pairs.append(record)

    return {
        "all_context_lst": sample["all_context_str"],
        "filtered_qa_pairs": kept_pairs,
    }


# Load the step-2 samples that carry the "remove_gen" and "sort_f1_dict" fields
# read by filter_one_sample, then filter every sample.
data = auto_read_data("/data/zecheng/sunzc/multi_hop/douzhicheng/data/split_step2_data_withF1.jsonl")
filtered_data = [filter_one_sample(item) for item in data]
# Average number of kept QA pairs per sample (raises ZeroDivisionError on an empty file).
avg_len_qa_pairs = sum([len(item["filtered_qa_pairs"]) for item in filtered_data]) / len(filtered_data)
print(avg_len_qa_pairs)
print(len(filtered_data))
print(filtered_data[0].keys())

# auto_save_data(filtered_data, "/data/zecheng/data/processed_project/1024_chunk_size/step3_data.jsonl")

0
1
2
4.586666666666667
225
dict_keys(['all_context_lst', 'filtered_qa_pairs'])


In [6]:
filtered_data[0]

{'all_context_lst': ['You\'re a smart book reader. You\'re required to read through the following book and help me with my questions.\n\nShe probably never would. She didn\'t respect anyone who worked for her grandmother, even though at one time she\'d used business to try to gain Thora\'s acceptance. When her father\'s health had compelled her to return, she\'d asked Thora for a favor, the chance for some respect. But despite her MBA, her grandmother had refused to give her anything, let alone the role Elena had wanted running the company. She realized now that she\'d been foolish to even ask, to give her grandmother more leverage with which to hurt her.\n\nHer husband worked for Jones Inc., though, far beneath Thora and Joseph\'s level. Is that what had changed him from the sweet, fun-loving boy she\'d met in college eleven years ago? Elena doubted anyone could stay sweet and fun loving around Thora, least of all someone as weak as Kirk. Because he was weak, she couldn\'t fathom why 

### Convert to the same data format as the two-hop data
#### Data format:

1. all_ref_text: List[str]
2. combined_question: str
3. final_answer: str
4. prefix_q, suffix_q: str, str
5. prefix_a, suffix_a: str, str
6. prefix_id, suffix_id: int, int

In [9]:
# filtered_data = auto_read_data("/data/zecheng/data/processed_project/1024_chunk_size/step3_data.jsonl")
from datasets import Dataset

def convert_jsonl_to_dict(data, format="hf"):
    """Turn a list of records into a column-oriented mapping.

    The keys of the first record define the columns; every record must
    provide all of them.

    Args:
        data: Non-empty list of dicts.
        format: ``"hf"`` wraps the columns with ``Dataset.from_dict``; any
            other value returns the plain dict of lists.
    """
    columns = {}
    for key in data[0]:
        columns[key] = [record[key] for record in data]
    if format != "hf":
        return columns
    return Dataset.from_dict(columns)
    

import shutil

# Flatten (sample, QA pair) into one record per QA pair in the two-hop format.
new_data_format = []
for item in filtered_data:
    for qa_pair in item["filtered_qa_pairs"]:
        # Build a fresh dict for every QA pair. Reusing one dict per sample made
        # all records of a sample alias the same object, so they all ended up
        # holding the values of the sample's last QA pair.
        per_sample_data = {
            "all_ref_text": item["all_context_lst"],
            "combined_question": qa_pair["question"],
            "final_answer": qa_pair["chosen_answer"],
            "prefix_q": qa_pair["question"],
            "suffix_q": qa_pair["question"],
            "prefix_a": qa_pair["rejected_answer_w_info"],
            "suffix_a": qa_pair["rejected_answer_wo_info"],
        }
        new_data_format.append(per_sample_data)

print(len(new_data_format))
hf_data = convert_jsonl_to_dict(new_data_format, format="hf")
if os.path.exists("/data/zecheng/data/processed_project/16chunk/1024_chunk_size/hf_data_step3"):
    # A previously saved dataset directory is not empty, so os.rmdir would
    # raise OSError; remove the whole tree instead.
    shutil.rmtree("/data/zecheng/data/processed_project/16chunk/1024_chunk_size/hf_data_step3")
hf_data.save_to_disk("/data/zecheng/data/processed_project/16chunk/1024_chunk_size/hf_data_step3")

1032


Saving the dataset (1/1 shards): 100%|██████████| 1032/1032 [00:00<00:00, 19791.33 examples/s]


In [12]:
from datasets import Dataset
import transformers

tokenizer = transformers.AutoTokenizer.from_pretrained("/data/zecheng/hf_models/Llama-3-8B-Instruct-80K-QLoRA-Merged")

def create_chunks(s, tokenizer, chunk_nums, min_tokens=8192):
    """Split ``s`` into at most ``chunk_nums`` token-balanced text chunks.

    Args:
        s: Text to split.
        tokenizer: Callable tokenizer exposing ``input_ids`` on its result and
            a ``decode`` method.
        chunk_nums: Maximum number of chunks to produce (must be >= 1).
        min_tokens: Texts with this many tokens or fewer are not chunked.

    Returns:
        ``None`` when the text has no tokens or at most ``min_tokens`` tokens;
        otherwise the list of decoded chunks.

    Raises:
        ValueError: If ``chunk_nums`` is smaller than 1.
    """
    if chunk_nums < 1:
        raise ValueError(f"chunk_nums must be >= 1, got {chunk_nums}")
    tok_s = tokenizer(s, return_tensors="pt", add_special_tokens=False).input_ids[0]
    num_tokens = len(tok_s)
    if num_tokens == 0 or num_tokens <= min_tokens:
        return None
    # Ceiling division: with floor division a length that is not a multiple of
    # chunk_nums produced an extra, tiny (chunk_nums + 1)-th chunk.
    chunk_length = -(-num_tokens // chunk_nums)
    chunked_s = [tok_s[i: i + chunk_length] for i in range(0, num_tokens, chunk_length)]
    return [tokenizer.decode(chunk, skip_special_tokens=True) for chunk in chunked_s]

def convert_jsonl_to_dict(data, format="hf"):
    """Transpose a list of records into columns (same as the earlier cell's helper).

    Columns are taken from the keys of ``data[0]``. With ``format == "hf"``
    the columns are wrapped by ``Dataset.from_dict``; otherwise the dict of
    lists is returned as is.
    """
    field_names = list(data[0])
    data_dict = dict((name, [row[name] for row in data]) for name in field_names)
    return Dataset.from_dict(data_dict) if format == "hf" else data_dict

import shutil

data = auto_read_data("/data/zecheng/sunzc/result/dzc_res/src/bad_case_0.2_context.jsonl")

# Convert every bad case whose context is long enough to be chunked and whose
# question is non-empty into the two-hop record format.
new_data_format = []
for item in data:
    chunks = create_chunks(item["context"], tokenizer, 8)
    if chunks is not None and len(item["question"]) > 0:
        # Reuse `chunks` instead of tokenizing and chunking the context a second time.
        per_sample_data = {"all_ref_text": chunks}
        per_sample_data["combined_question"] = item["question"]
        per_sample_data["final_answer"] = item["gt_answers"]
        per_sample_data["prefix_q"] = item["question"]
        per_sample_data["suffix_q"] = item["question"]
        per_sample_data["prefix_a"] = item["s1_pred"]
        per_sample_data["suffix_a"] = item["s1_pred"]
        new_data_format.append(per_sample_data)

print(len(new_data_format))
hf_data = convert_jsonl_to_dict(new_data_format, format="hf")
if os.path.exists("/data/zecheng/data/processed_project/mix_chunks_v3/aug_split"):
    # A previously saved dataset directory is not empty, so os.rmdir would
    # raise OSError; remove the whole tree instead.
    shutil.rmtree("/data/zecheng/data/processed_project/mix_chunks_v3/aug_split")
hf_data.save_to_disk("/data/zecheng/data/processed_project/mix_chunks_v3/aug_split")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


371


Saving the dataset (1/1 shards): 100%|██████████| 371/371 [00:00<00:00, 18444.48 examples/s]


In [15]:
new_data_format[203]

{'all_ref_text': ['Passage:\nGaius Octavius (proconsul)\nGaius Octavius  (about 100 – 59 BC) was an ancestor to the Roman Emperors of the Julio-Claudian dynasty. He is the father of the Emperor Augustus, step-grandfather of the Emperor Tiberius, great-grandfather of the Emperor Claudius, great-great grandfather of the Emperor Caligula, and great-great-great grandfather of the Emperor Nero. Hailing from Velitrae, he descended from an old, wealthy equestrian branch of the gens Octavia. Despite being from a wealthy family, his family was plebeian, rather than patrician. As a novus homo ("new man"), he would not be of a senatorial family.\n\nHis grandfather, Gaius Octavius, fought as a military tribune in Sicily during the Second Punic War. His father Gaius Octavius was a municipal magistrate who lived to an advanced age. He is a distant relative (possibly as third cousins, through their ancestor Gnaeus Octavius Rufus) to Gnaeus Octavius, the consul of 87 BC who led the opposition to Luciu