In [11]:
from transformers import AutoTokenizer
from datasets import load_from_disk, load_dataset

In [12]:
# Load the tokenizer for the local InternLM-chat-7B checkpoint.
# NOTE: trust_remote_code=True allows transformers to execute Python code
# shipped inside the checkpoint directory; only use it with a trusted model.
tokenizer = AutoTokenizer.from_pretrained(
   "/home/ysx/models/internlm-chat-7b",
   trust_remote_code=True
)

In [13]:
def generate_prompt(example):
    """Build an Alpaca-style instruction prompt for one record.

    Args:
        example: Mapping with an ``"instruction"`` entry and an ``"input"``
            entry. A falsy ``"input"`` (e.g. an empty string) selects the
            shorter template that has no ``### Input:`` section.

    Returns:
        The prompt text, ending with ``### Response:`` (the response itself
        is not included).
    """
    context = example["input"]
    instruction = example["instruction"]

    if context:
        preamble = (
            "Below is an instruction that describes a task, "
            "paired with an input that provides further context. "
        )
        sections = f"### Instruction:\n{instruction}\n\n### Input:\n{context}\n\n### Response:"
    else:
        preamble = "Below is an instruction that describes a task. "
        sections = f"### Instruction:\n{instruction}\n\n### Response:"

    # Both templates share the same second sentence.
    return preamble + "Write a response that appropriately completes the request.\n\n" + sections

In [14]:
def generate_zy_prompt(example):
    """Prefix the doctor-role system prompt to one record's text.

    Args:
        example: Either a single string, or an iterable of strings. An
            iterable is joined with a blank line between items.

    Returns:
        The system prompt, a blank line, and then the record text.
    """
    prompt = "你是一个医生，可以对患者提出的问题给出详细有用的回答。\n\n"
    # A bare string is itself an iterable of characters, so joining it would
    # put a blank line between every character; pass it through unchanged.
    r = example if isinstance(example, str) else "\n\n".join(example)
    return f"{prompt}{r}"


In [15]:
def format_data(sample):
    """Tokenize one medical record into fixed-length causal-LM features.

    Args:
        sample: A dataset row whose ``'text'`` field is passed to
            ``generate_zy_prompt`` to build the training text.

    Returns:
        The tokenizer output with an added ``labels`` list. ``labels`` mirrors
        ``input_ids`` except that padded positions are set to -100.
    """
    r = generate_zy_prompt(sample['text'])
    # padding='max_length' only pads; without truncation=True an over-long
    # text would produce a row longer than max_length.
    result = tokenizer(r, max_length=1024, padding='max_length', truncation=True)
    # Use the attention mask to find padding and label it -100, the index
    # that the cross-entropy loss ignores, so padding is not trained on.
    result["labels"] = [
        token_id if is_real else -100
        for token_id, is_real in zip(result["input_ids"], result["attention_mask"])
    ]
    return result

In [16]:
# Load the medical dataset previously saved in the `datasets` on-disk format.
zydata = load_from_disk("/home/ysx/src/AI/llm_demo/data/datasets/zyya")

In [17]:
# Tokenize every row with format_data and drop the raw 'text' column so only
# the tokenizer features and labels remain.
mapped_dataset = zydata.map(function=format_data, remove_columns=["text"])

In [18]:
mapped_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 100
})

In [19]:
# Load the cleaned Alpaca instruction data from a local JSON file; the result
# is a DatasetDict with a single 'train' split.
junk_data = load_dataset('json', data_files='/home/ysx/src/AI/llm_demo/data/json/alpaca_data_cleaned_archive.json')


In [26]:
junk_data

DatasetDict({
    train: Dataset({
        features: ['instruction', 'output', 'input'],
        num_rows: 51759
    })
})

In [20]:
# Show the splits and columns of the loaded Alpaca data.
junk_data

DatasetDict({
    train: Dataset({
        features: ['instruction', 'output', 'input'],
        num_rows: 51759
    })
})

In [21]:
def format_alpaca_data(sample):
    """Tokenize one Alpaca record into fixed-length causal-LM features.

    Args:
        sample: A dataset row with ``'instruction'``, ``'input'`` and
            ``'output'`` fields.

    Returns:
        The tokenizer output with an added ``labels`` list. ``labels`` mirrors
        ``input_ids`` except that padded positions are set to -100.
    """
    # generate_prompt stops at "### Response:"; append the reference answer so
    # the training text actually contains the target response.
    r = f"{generate_prompt(sample)}\n{sample['output']}"
    # padding='max_length' only pads; without truncation=True an over-long
    # text would produce a row longer than max_length.
    result = tokenizer(r, max_length=1024, padding='max_length', truncation=True)
    # Use the attention mask to find padding and label it -100, the index
    # that the cross-entropy loss ignores, so padding is not trained on.
    result["labels"] = [
        token_id if is_real else -100
        for token_id, is_real in zip(result["input_ids"], result["attention_mask"])
    ]
    return result

In [22]:
# Tokenize every Alpaca row with format_alpaca_data and drop the raw text
# columns so the result has the same features as mapped_dataset.
junk_dataset = junk_data.map(
    function=format_alpaca_data,
    remove_columns=["instruction", "output", "input"],
)

Map:   0%|          | 116/51759 [00:00<00:45, 1135.94 examples/s]

Map: 100%|██████████| 51759/51759 [00:52<00:00, 978.33 examples/s] 


In [23]:
junk_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 51759
    })
})

In [25]:
mapped_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 100
})

In [32]:
from datasets import concatenate_datasets

# Merge the tokenized medical set with the Alpaca 'train' split; both have the same columns.
combined_dataset = concatenate_datasets([mapped_dataset, junk_dataset['train']])

# combined_dataset can now be used for further processing and analysis.


In [33]:
combined_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 51859
})

In [34]:
# Save in the Hugging Face `datasets` on-disk format. Use the same absolute
# path that is loaded back afterwards, so the round trip does not depend on
# the kernel's current working directory.
combined_dataset.save_to_disk("/home/ysx/src/AI/llm_demo/data/datasets/lora")

Saving the dataset (2/2 shards): 100%|██████████| 51859/51859 [00:01<00:00, 45408.59 examples/s]


In [35]:
# Reload the saved dataset from disk to check that it round-trips.
ll = load_from_disk("/home/ysx/src/AI/llm_demo/data/datasets/lora")

In [36]:
ll

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 51859
})