# Merge all generated data

In [None]:
from modelzipper.tutils import *
import re
from pprint import pprint
import itertools
import random
import transformers
from datasets import Dataset, concatenate_datasets, DatasetDict, load_from_disk, load_dataset
import pandas as pd
import numpy as np

In [None]:
# Number of processed records grouped into one chunk by combine_data();
# the value is also embedded in the name of the saved dataset directory.
CHUNK_NUM = 64

def extract_qa_pairs(text):
    """Parse two question/answer pairs out of a ``####``-delimited string.

    The text is split on ``####`` and fields 2, 4, 6 and 8 of the result are
    taken, in that order, as question 1, answer 1, question 2 and answer 2.
    Surrounding whitespace is stripped from each of them; any fields beyond
    the ninth are ignored.

    Args:
        text: Raw generated text containing ``####`` separators.

    Returns:
        A list of two ``{"question": ..., "answer": ...}`` dicts, or ``None``
        when the split yields fewer than nine fields.
    """
    fields = text.split("####")
    if len(fields) < 9:
        return None

    # Every second field starting at index 2 carries payload; the fields in
    # between are the separator labels.
    q1, a1, q2, a2 = (part.strip() for part in fields[2:9:2])
    return [
        {"question": q1, "answer": a1},
        {"question": q2, "answer": a2},
    ]

def combine_data(data, chunk_num):
    """Split ``data`` into consecutive slices of at most ``chunk_num`` items.

    Args:
        data: A sliceable sequence.
        chunk_num: Size of each slice; the final slice may be shorter.

    Returns:
        A list of slices of ``data`` in their original order.
    """
    return [
        data[start:start + chunk_num]
        for start in range(0, len(data), chunk_num)
    ]

all_data = load_from_disk("/vepfs/wcf/G/zecheng/data/SlimPajama-6B/filtering_deepseek/filtered_wo_ana")

# Disabled alternative: read the raw per-thread JSONL generation outputs and
# flatten them into a single list instead of loading the filtered dataset.
# files = auto_read_dir("/vepfs/wcf/G/zecheng/data/SlimPajama-6B", file_prefix="generated_QA_pairs_thread", file_suffix=".jsonl")
# all_data = [auto_read_data(file) for file in files]
# all_data = [item for sublist in all_data for item in sublist]

print(f"total pairs: {len(all_data)}")

# Keep only the two fields that the chunking / pairing steps read.
processed_data = [
    {"reference": item['reference'], "qa_pairs": item['qa_pairs']}
    for item in all_data
]

# Group the records into chunks of CHUNK_NUM and report how many chunks exist.
combined_data = combine_data(processed_data, chunk_num=CHUNK_NUM)
print(len(combined_data))

### 构建强化学习的数据


#### DPO 数据格式
```python
return {
    "prompt": ["Question: " + question + "\n\nAnswer: " for question in samples["question"]],
    "chosen": samples["response_j"],
    "rejected": samples["response_k"],
}
```

##### 1) 任意格式数据构建
潜在问题: attention 可能会直接进行 attribution matching，而不是 fuzzy matching


##### 2) 更加细粒度的数据构建模式
潜在问题：构建规则的判断，比如KNN等聚类算法

In [None]:
def random_select_from_combined_data(all_samples, num_cases=8, selected_cases=1):
    """Build DPO preference samples from one chunk of reference/QA records.

    For every index combination produced by
    ``itertools.combinations(range(num_cases), selected_cases)``, the first
    index of the combination is the "chosen" record and one other index is
    drawn at random as the "rejected" record. Each question of the chosen
    record is paired with its own answer (``chosen``) and with every answer
    of the rejected record (``rejected``).

    Args:
        all_samples: Sequence of dicts, each holding a ``'reference'`` entry
            and a ``'qa_pairs'`` list of ``{'question', 'answer'}`` dicts.
        num_cases: Number of leading records of ``all_samples`` to draw
            chosen/rejected indices from.
        selected_cases: Size of each index combination. Only the first index
            of a combination is used as the chosen record.

    Returns:
        A list of dicts with the keys ``reference_list``, ``question``,
        ``chosen``, ``rejected``, ``chosen_span_id`` and ``rejected_span_id``.
        All samples share the same ``reference_list`` object, which holds the
        references of every record in ``all_samples``. The list is empty when
        no record other than the chosen one is available to reject against.
    """
    cases = list(range(num_cases))
    ref_lst = [item['reference'] for item in all_samples]

    batch_data = []
    for combination in itertools.combinations(cases, selected_cases):
        chosen_id = combination[0]
        remain_case_ids = list(set(cases) - {chosen_id})
        if not remain_case_ids:
            # A chunk with a single record offers nothing to reject against,
            # so no preference pair can be formed (random.choice would raise).
            continue
        reject_id = random.choice(remain_case_ids)

        # Iterate over the QA pairs themselves. Taking len() of the record
        # dict counts its keys, which only matches the number of QA pairs by
        # coincidence.
        chosen_pairs = all_samples[chosen_id]['qa_pairs']
        rejected_pairs = all_samples[reject_id]['qa_pairs']
        for chosen_pair in chosen_pairs:
            for rejected_pair in rejected_pairs:
                batch_data.append({
                    "reference_list": ref_lst,
                    "question": chosen_pair['question'],
                    "chosen": chosen_pair['answer'],
                    "rejected": rejected_pair['answer'],
                    "chosen_span_id": chosen_id,
                    "rejected_span_id": reject_id,
                })

    return batch_data


all_created_cases = []

# Generate preference samples chunk by chunk, ticking the progress bar once
# per processed chunk.
with tqdm(total=len(combined_data)) as pbar:
    for chunk in combined_data:
        all_created_cases.extend(
            random_select_from_combined_data(chunk, num_cases=len(chunk), selected_cases=1)
        )
        pbar.update(1)

print(f"finish, current data sample nums: {len(all_created_cases)}")

# Show which fields a generated sample carries.
print(all_created_cases[0].keys())

In [None]:
# The first 500 generated cases form the validation split; everything after
# them is training data.
valid_dataset = all_created_cases[:500]
train_dataset = all_created_cases[500:]

valid_df = pd.DataFrame(valid_dataset)
train_df = pd.DataFrame(train_dataset)

trans_valid_datasets = Dataset.from_pandas(valid_df, split="valid")
trans_train_datasets = Dataset.from_pandas(train_df, split="train")

# Bundle both splits and write them to a directory named after CHUNK_NUM.
combined_datasets = DatasetDict({"train": trans_train_datasets, "valid": trans_valid_datasets})
save_path = f"/vepfs/wcf/G/zecheng/data/SlimPajama-6B/dpo_data/hf_data_{CHUNK_NUM}"
combined_datasets.save_to_disk(save_path)

In [None]:
# Evaluate the validation split as the cell's last expression so the notebook
# displays it.
combined_datasets['valid']

## Backup Code

In [None]:
# tokenizer = transformers.AutoTokenizer.from_pretrained("/vepfs/wcf/G/zecheng/hf_models/Mistral-7B-Instruct-v0.2")
# dataset = auto_read_data("/vepfs/wcf/G/zecheng/data/SlimPajama-6B/dpo_data/dpo_data_v1.jsonl")

def convert_format(tmp):
    """Project a sample onto the five fields of the prompt-based DPO format.

    Args:
        tmp: Mapping that contains at least ``prompt``, ``chosen``,
            ``rejected``, ``chosen_span_id`` and ``rejected_span_id``.

    Returns:
        A new dict holding exactly those five entries, in that order.

    Raises:
        KeyError: If any of the five keys is missing from ``tmp``.
    """
    kept_keys = ("prompt", "chosen", "rejected", "chosen_span_id", "rejected_span_id")
    return {key: tmp[key] for key in kept_keys}

# Convert every generated case and split off the first 500 as validation data.
# NOTE(review): convert_format() reads tmp["prompt"], but the samples built by
# random_select_from_combined_data() carry "question"/"reference_list" and no
# "prompt" key (that field is commented out there), so this line looks like it
# raises KeyError against the current sample format — confirm before reuse.
transfer_datasets = [convert_format(item) for item in all_created_cases]
train_datasets, valid_datasets = transfer_datasets[500:], transfer_datasets[:500]

# Disabled: persist both splits as JSONL files.
# auto_save_data(train_datasets, "/vepfs/wcf/G/zecheng/data/SlimPajama-6B/dpo_data/dpo_data_chat_train_v2.jsonl")
# auto_save_data(valid_datasets, "/vepfs/wcf/G/zecheng/data/SlimPajama-6B/dpo_data/dpo_data_chat_valid_v2.jsonl")

In [None]:
# Tokenizer used below to measure prompt lengths.
tokenizer = transformers.AutoTokenizer.from_pretrained("/vepfs/wcf/G/zecheng/hf_models/Llama-2-7b-hf")

# Disabled alternative: reload the splits from previously saved JSONL files.
# train_datasets = pd.read_json("/vepfs/wcf/G/zecheng/data/SlimPajama-6B/dpo_data/dpo_data_chat_train_v2.jsonl", lines=True)
# valid_datasets = pd.read_json("/vepfs/wcf/G/zecheng/data/SlimPajama-6B/dpo_data/dpo_data_chat_valid_v2.jsonl", lines=True)
# trans_train_datasets = Dataset.from_dict(train_datasets)

train_df = pd.DataFrame(train_datasets)
valid_df = pd.DataFrame(valid_datasets)

# print(train_df.head())
# print(valid_df.head())

# Dataset.from_pandas expects a pandas DataFrame, so pass the frames built
# above rather than the raw train_datasets / valid_datasets objects.
trans_train_datasets = Dataset.from_pandas(train_df, split="train")
trans_valid_datasets = Dataset.from_pandas(valid_df, split="valid")

combined_datasets = DatasetDict({"train": trans_train_datasets, "valid": trans_valid_datasets})
# all_data = load_from_disk("/vepfs/wcf/G/zecheng/data/SlimPajama-6B/dpo_data/hf_data_v2")

# Print the summary of both splits.
train_data = combined_datasets['train']
valid_data = combined_datasets['valid']
print(train_data)
print(valid_data)

# total_length = 0
# max_length = 0
# min_length = 1e5

def map_func(sample):
    """Annotate ``sample`` with the token count of its prompt.

    The prompt is tokenized with the module-level ``tokenizer`` without
    special tokens, and the number of resulting ``input_ids`` is stored under
    ``"cur_length"``.

    Args:
        sample: Mapping with a ``'prompt'`` entry; it is modified in place.

    Returns:
        The same ``sample`` object with ``"cur_length"`` set.
    """
    token_ids = tokenizer(sample['prompt'], add_special_tokens=False)['input_ids']
    sample["cur_length"] = len(token_ids)
    return sample
    
process__data = combined_datasets.map(map_func, num_proc=24)

for k in ['train', 'valid']:
    print(f'-------- key: {k} --------')

    # Clamp lengths into [5000, 12000]: shorter prompts count as 5000 and
    # longer ones as 12000.
    lengths = np.clip(process__data[k]["cur_length"], 5000, 12000)

    # Bin edges every 1000 tokens from 5000 up to 13000.
    bins = np.arange(5000, 13001, 1000)

    # Number of samples per bin, then the share of each bin in the split.
    hist, bin_edges = np.histogram(lengths, bins=bins)
    proportions = hist / len(lengths)

    # Report each bin as a percentage.
    for lower, upper, share in zip(bins[:-1], bins[1:], proportions):
        print(f"Range {lower} - {upper}: {share:.2%}")

def filter_func(sample):
    """Return True when the sample's ``cur_length`` is at most 16000."""
    max_length = 16000  # filter with 16K length
    return sample["cur_length"] <= max_length

# Drop over-long samples from each split, then remove the helper length
# column so it is not written to disk.
filtered_train_data = (
    process__data['train']
    .filter(filter_func, num_proc=24)
    .remove_columns(["cur_length"])
)
filtered_valid_data = (
    process__data['valid']
    .filter(filter_func, num_proc=24)
    .remove_columns(["cur_length"])
)
filtered_combined_datasets = DatasetDict({"train": filtered_train_data, "valid": filtered_valid_data})

save_path = f"/vepfs/wcf/G/zecheng/data/SlimPajama-6B/dpo_data/hf_data_{CHUNK_NUM}"
filtered_combined_datasets.save_to_disk(save_path)

## 备份代码

备份代码

In [None]:
for k in ['train', 'valid']:
    print(f'-------- key: {k} --------')

    # Clamp lengths into [5000, 12000]: shorter prompts count as 5000 and
    # longer ones as 12000.
    lengths = np.clip(process__data[k]["cur_length"], 5000, 12000)

    # Bin edges every 1000 tokens from 5000 up to 13000.
    bins = np.arange(5000, 13001, 1000)

    # Number of samples per bin, then the share of each bin in the split.
    hist, bin_edges = np.histogram(lengths, bins=bins)
    proportions = hist / len(lengths)

    # Report each bin as a percentage.
    for lower, upper, share in zip(bins[:-1], bins[1:], proportions):
        print(f"Range {lower} - {upper}: {share:.2%}")

def filter_func(sample):
    """Return True when the sample's ``cur_length`` is at most 8000."""
    max_length = 8000
    return sample["cur_length"] <= max_length

# Drop over-long samples from each split, then remove the helper length
# column so it is not written to disk.
filtered_train_data = (
    process__data['train']
    .filter(filter_func, num_proc=36)
    .remove_columns(["cur_length"])
)
filtered_valid_data = (
    process__data['valid']
    .filter(filter_func, num_proc=36)
    .remove_columns(["cur_length"])
)
filtered_combined_datasets = DatasetDict({"train": filtered_train_data, "valid": filtered_valid_data})

filtered_combined_datasets.save_to_disk("/vepfs/wcf/G/zecheng/data/SlimPajama-6B/dpo_data/hf_data_v2")