In [None]:
from modelzipper.tutils import *
from transformers import AutoTokenizer
from datasets import Dataset, DatasetDict
import random
import multiprocessing
from multiprocessing import Process, Manager


# Tokenizer (later used to measure prediction length in tokens) and the two
# directories this notebook reads from.
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
dir_path = "/mnt/petrelfs/tangzecheng/local_data/processed_multi_hop/random_drop/llama"
dataset_path = "/mnt/petrelfs/tangzecheng/local_data/processed_multi_hop/filter_en"
# Alternative locations, kept disabled for reference:
# tokenizer = AutoTokenizer.from_pretrained("/data/hf_models/Meta-Llama-3.1-8B-Instruct")
# dir_path = "/data/pub_data/check_inference/check_inference/llama"
# dataset_path = "/data/pub_data/processed_multi_hop/filter_en"

# The first three files of dir_path, in sorted name order, become the three
# "drop" subsets used by the cells below.
file_names = sorted(auto_read_dir(dir_path))
content_drop_1, content_drop_2, content_drop_3 = [
    auto_read_data(os.path.join(dir_path, file_names[idx])) for idx in range(3)
]

# Concatenate every file found under dataset_path into a single list.
all_file_names = auto_read_dir(dataset_path)
content = []
for file_name in all_file_names:
    content += auto_read_data(os.path.join(dataset_path, file_name))

In [7]:
def construct_sample(content, num_sample, tokenizer=None, add_meta_info=False, return_list=None):
    """Build chosen/rejected preference samples from model predictions.

    For every record in ``content`` the first prediction (``item["pred"][0]``)
    is cut at its first blank line and used as the *rejected* answer, while
    ``item["answer"]`` is used as the *chosen* answer. Records whose truncated
    prediction is too short or too long are skipped.

    Args:
        content: Iterable of records. Each record is indexed with the keys
            ``"pred"``, ``"prompt"`` and ``"answer"``; when ``add_meta_info``
            is true, ``item["meta_data"]["clue_docs"]`` is read as well.
        num_sample: Maximum number of samples to emit. A value ``<= 0`` emits
            nothing.
        tokenizer: Optional callable tokenizer. When given, a prediction is
            kept only if its length is between 50 and 400 tokens (inclusive).
            When omitted, a prediction is kept only if it has at least 100
            characters.
        add_meta_info: When true, the ``"content"`` field of every clue
            document is stored under ``"meta_info"``; otherwise ``"meta_info"``
            is an empty list.
        return_list: Optional list-like sink (anything with ``append``) that
            receives the samples, e.g. a ``multiprocessing`` manager list.
            A new list is created when omitted.

    Returns:
        The list the samples were appended to (``return_list`` itself when one
        was supplied).
    """
    # Create a fresh list per call. A mutable default (``return_list=[]``)
    # would be shared between calls and keep growing.
    if return_list is None:
        return_list = []

    cnt = 0
    if num_sample <= 0:
        # Without this guard one sample would be emitted before the cap is
        # checked at the bottom of the loop.
        print(f"number of samples: {cnt}")
        return return_list

    for item in tqdm(content):
        # Keep only the text before the first blank line of the first prediction.
        model_pred = item["pred"][0].split('\n\n')[0]
        if tokenizer is not None:
            pred_ids = tokenizer(model_pred, return_tensors="pt", add_special_tokens=False).input_ids
            num_tokens = pred_ids.size(-1)
            if num_tokens < 50 or num_tokens > 400:
                continue
        elif len(model_pred) < 100:  # first check that the output has the expected format
            continue

        if add_meta_info:
            all_clues = [doc['content'] for doc in item['meta_data']['clue_docs']]
        else:
            all_clues = []

        return_list.append({
            "prompt": item["prompt"],
            "chosen": [
                {"role": "user", "content": item["prompt"]},
                {"role": "assistant", "content": item["answer"]}
            ],
            "rejected": [
                {"role": "user", "content": item["prompt"]},
                {"role": "assistant", "content": model_pred}
            ],
            "meta_info": all_clues,
        })

        cnt += 1
        if cnt >= num_sample:
            break
    print(f"number of samples: {cnt}")
    return return_list

### 下面的数据集仅用于开发调试：取 content_drop_1 的前 64 条构造样本，随机留出 32 条作为 validation，其余（过滤后最多 32 条）作为 train

In [None]:
# Development/debug split: build samples from the first 64 records of
# content_drop_1 only, then hold out 32 of them as the validation set.
all_training_data = []

all_content_data = content_drop_1[:64]
# The cap only has to cover the slice; every record that passes the length
# filter inside construct_sample is kept.
construct_sample(all_content_data, len(all_content_data), tokenizer, True, all_training_data)

dataset = Dataset.from_list(all_training_data)

# Number of samples held out for validation.
validation_size = 32

# Fail loudly instead of silently saving an empty train split when too few
# records survive filtering.
if len(dataset) <= validation_size:
    raise ValueError(
        f"Only {len(dataset)} samples were built; need more than "
        f"{validation_size} to leave a non-empty train split."
    )

# Shuffle the indices with a dedicated, seeded generator so the split is the
# same on every run and the global `random` state is left untouched.
indices = list(range(len(dataset)))
random.Random(42).shuffle(indices)

validation_indices = indices[:validation_size]
train_indices = indices[validation_size:]

# Materialise both splits with Hugging Face's `select`.
train_dataset = dataset.select(train_indices)
validation_dataset = dataset.select(validation_indices)

# Bundle the two splits into a single DatasetDict.
dataset_dict = DatasetDict({
    "train": train_dataset,
    "validation": validation_dataset
})

# Print the split sizes as a sanity check.
print(f"Train dataset size: {len(dataset_dict['train'])}")
print(f"Validation dataset size: {len(dataset_dict['validation'])}")

# Persist to local disk.
dataset_dict.save_to_disk("/mnt/petrelfs/tangzecheng/local_data/processed_multi_hop/random_drop/train_llama_data/merge_v1_w_clues_dev")

### 构造真实训练数据集
1. 添加evidence，辅助定位隐式推理过程中的evidence位置

In [None]:
# Sample up to 3200 records from each of the three subsets, merge them, and
# split the result into train / validation.
samples_per_subset = 3200
validation_size = 600

# A dedicated, seeded generator keeps the whole cell reproducible without
# touching the global `random` state.
train_rng = random.Random(42)

# Shuffle *copies* of the subsets: shuffling content_drop_* in place would make
# this cell non-idempotent and change what earlier cells see on a re-run.
shuffled_subsets = [
    train_rng.sample(subset, len(subset))
    for subset in (content_drop_1, content_drop_2, content_drop_3)
]

# The context manager shuts the manager's server process down when the block
# exits, so re-running the cell does not leak one process per run.
with Manager() as manager:
    # One shared list per worker: appending to a single list from several
    # processes would interleave samples in a nondeterministic order.
    return_lists = [manager.list() for _ in shuffled_subsets]

    # One worker process per subset.
    processes = [
        Process(target=construct_sample, args=(subset, samples_per_subset, tokenizer, True, shared))
        for subset, shared in zip(shuffled_subsets, return_lists)
    ]
    for proc in processes:
        proc.start()

    # Wait for every worker to finish.
    for proc in processes:
        proc.join()

    # A crashed worker would otherwise leave silently incomplete data behind.
    failed = [idx for idx, proc in enumerate(processes, start=1) if proc.exitcode != 0]
    if failed:
        raise RuntimeError(f"construct_sample worker(s) {failed} exited with a non-zero exit code")

    # Copy the results out of the proxies before the manager shuts down;
    # slicing a list proxy returns a plain list.
    all_training_data = []
    for shared in return_lists:
        all_training_data.extend(shared[:])

train_rng.shuffle(all_training_data)

dataset = Dataset.from_list(all_training_data)

# Fail loudly instead of silently saving an empty train split.
if len(dataset) <= validation_size:
    raise ValueError(
        f"Only {len(dataset)} samples were built; need more than "
        f"{validation_size} to leave a non-empty train split."
    )

indices = list(range(len(dataset)))
train_rng.shuffle(indices)

validation_indices = indices[:validation_size]
train_indices = indices[validation_size:]

# Materialise both splits with Hugging Face's `select`.
train_dataset = dataset.select(train_indices)
validation_dataset = dataset.select(validation_indices)

# Bundle the two splits into a single DatasetDict.
dataset_dict = DatasetDict({
    "train": train_dataset,
    "validation": validation_dataset
})

# Print the split sizes as a sanity check.
print(f"Train dataset size: {len(dataset_dict['train'])}")
print(f"Validation dataset size: {len(dataset_dict['validation'])}")

# Persist to local disk.
dataset_dict.save_to_disk("/mnt/petrelfs/tangzecheng/local_data/processed_multi_hop/random_drop/train_llama_data/merge_v1_w_clues")