### tokenizer

In [1]:
import os
import transformers
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    pipeline,
    logging,)
from peft import LoraConfig
from tqdm import tqdm
import torch
import time


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def load_dataset(filename):
    data_list = []
    with open(filename, "r", encoding="gb18030") as f:
        i = 0
        for line in f:
            i += 1
            if i < 10:
                print(line)
            try:
                dept, title, ques, ans = line.strip("\n").split(',', 4)
                data_list.append(
                    {
                    'department': dept,
                    'input': ques,
                    'output': ans
                    }
                )
            except:
                pass
    return data_list

In [3]:
data_list = load_dataset("./Chinese-medical-dialogue-data-master/Data_数据/Pediatric_儿科/儿科5-14000.csv")

print(len(data_list))

department,title,ask,answer

营养保健科,小儿肥胖超重该如何治疗,女宝宝，刚7岁，这一年，察觉到，我家孩子身上肉很多，而且，食量非常的大，平时都不喜欢吃去玩，请问：小儿肥胖超重该如何治疗。,孩子出现肥胖症的情况。家长要通过孩子运功和健康的饮食来缓解他的症状，可以先让他做一些有氧运动，比如慢跑，爬坡，游泳等，并且饮食上孩子多吃黄瓜，胡萝卜，菠菜等，禁止孩子吃一些油炸食品和干果类食物，这些都是干热量高脂肪的食物，而且不要让孩子总是吃完就躺在床上不动，家长在治疗小儿肥胖期间如果孩子情况严重就要及时去医院在医生的指导下给孩子治疗。

营养保健科,小儿肥胖超重该怎样医治,男孩子，刚4岁，最近，发现，我家孩子体重要比别的孩子重很多，而且，最近越来越能吃了，还特别的懒，请问：小儿肥胖超重该怎样医治。,孩子一旦患上肥胖症家长要先通过运动和饮食来改变孩子的情况，要让孩子做一些他这个年龄段能做的运动，如游泳，慢跑等，要给孩子多吃一些像苹果，猕猴桃，胡萝卜等食物，禁止孩子吃高热量，高脂肪的食物，像蛋糕，干果，曲奇饼干等，严格的控制孩子的饮食，不要让他暴饮暴食，多运动对改变孩子肥胖都是有好处的，在治疗小儿肥胖期间如果情况严重，建议家长先带孩子去医院检查一下孩子肥胖症的原因在针对性的治疗。

营养保健科,小儿肥胖能吃该如何治疗,男宝，已经5岁，今年，察觉到，孩子身上越来越肉乎了，同时，吃的饭也比一般孩子多，平时都不喜欢吃去玩，请问：小儿肥胖能吃该如何治疗。,当孩子患上肥胖症的时候家长可以增加孩子的运动量和控制他的饮食来改变症状，像游泳，爬坡这类游泳运动对肥胖的症状都很好的效果，像冬瓜，西红柿这样高纤维的蔬菜要多吃一些，孩子不可以吃像蛋糕，夏威夷果这些高热量的食物，而且不要让孩子总是吃完就躺在床上不动，家长在治疗小儿肥胖期间如果孩子情况严重就要及时去医院在医生的指导下给孩子治疗。

营养保健科,小儿肥胖能吃该如何医治,女宝宝，目前2岁，近期，观察到，我家孩子越来越胖了，而且，吃起来好像也特别不节制，叫他运动也不愿意，请问：小儿肥胖能吃该如何医治。,当孩子患上肥胖症的时候家长可以增加孩子的运动量和控制他的饮食来改变症状，家长要监督孩子做一些有氧运动像慢跑，游泳等，要给孩子多吃一些像苹果，猕猴桃，胡萝卜等食物，一定要禁止孩子吃蛋糕，板栗这些高热量的食物，生活中不要让孩子

In [4]:
def prepare_message(data_list):
    '''
    格式样例：
    [
        {
            "id": "identity_0",
            "conversations": [
                {
                    "from": "user",
                    "value": "你好"
                },
                {
                    "from": "assistant",
                    "value": "我是⼀个语⾔模型，我叫通义千问。"
                }
            ]
        }
    ]
    '''
    new_list = []
    for i, data in enumerate(data_list):
        _id = f"identity_{i}"
        new_list.append(
            {
            "id": _id,
            "conversations": [
                {
                    "from": "user",
                    "value": data["input"]
                },
                {
                    "from": "assistant",
                    "value": data["output"]
                }
        ]
    })
    return new_list

In [5]:
def replace_name(s):
    s = s.replace('<NAME>', '智能医⽣客服机器⼈⼩D')
    s = s.replace('<AUTHOR>', 'Greedy AI')
    return s

In [6]:
import json
import random
def load_self_cong_data(filename):
    data_list = []
    for d in json.load(open(filename, "r", encoding="utf-8")):
        d["instruction"] = replace_name(d["instruction"])
        d["output"] = replace_name(d["output"])
        data_list.append({
            "id": random.randint(10000, 100000),
            "conversations": [
                {
                    "from": "user",
                    "value": d["instruction"]
                },
                {
                    "from": "assistant",
                    "value": d["output"]
                }
            ]
        })
    return data_list

In [7]:
self_cong_data = load_self_cong_data("self_cogniton.json")
print(self_cong_data[1])

{'id': 46863, 'conversations': [{'from': 'user', 'value': '你好'}, {'from': 'assistant', 'value': '您好，我是 智能医⽣客服机器⼈⼩D，一个由 Greedy AI 打造的人工智能助手，请问有什么可以帮助您的吗？'}]}


In [8]:
import random
format_data_list = prepare_message(data_list)
random.shuffle(format_data_list)
format_data_list = self_cong_data + format_data_list
print(format_data_list[0])
print(format_data_list[100])

{'id': 69944, 'conversations': [{'from': 'user', 'value': '你好'}, {'from': 'assistant', 'value': '您好，我是 智能医⽣客服机器⼈⼩D，一个由 Greedy AI 开发的 AI 助手，很高兴认识您。请问我能为您做些什么？'}]}
{'id': 'identity_62691', 'conversations': [{'from': 'user', 'value': '血液细胞检测报告各项指数都正常只是在血液细胞检测报告右上角贴了个小纸片，内容是:医生说最后这项26的指数正常的应该是5以内，说我的女儿有炎症，体温37.7度，浑身发烫脸通红，不哭啼，无厌食症状 曾经的治疗情况和效果：医生说要打点滴个星期，今天第一天打了两小时点滴，好象是消炎的就一瓶，除了一盒小儿解感颗粒和一盒金瓜子颗粒在乎怎样的帮助：您好我女儿的病情真向医生说的那么严重吗，现在除了会一点儿发烫，没别的还是笑眯眯的。谢谢'}, {'from': 'assistant', 'value': '根据报告不是很严重，就是有点上呼吸道感染，打几天点滴就好了，祝您的孩子早日康复。。根据报告不是很严重，就是有点上呼吸道感染，打几天点滴就好了，祝您的孩子早日康复。以上是对“十个月女婴病症解方程！”这个问题的建议，期望对您有帮助，祝您健康！！'}]}


In [9]:
train_data = format_data_list[:84000]
test_data = format_data_list[84000:]
print("train data size:", len(train_data))
print("test data size:", len(test_data))

train data size: 84000
test data size: 424


In [10]:
compute_dtype = getattr(torch, "float16")
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=True,
)

In [11]:
model_path = "/mnt/AlgoTempData1/llm-weights/qwen/Qwen2-1_5B-Instruct"
original_model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=compute_dtype,
    device_map={"": 0},
    quantization_config=quant_config,
    trust_remote_code=True
)

In [12]:
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False, trust_remote_code=True)
tokenizer.pad_token_id = tokenizer.eos_token_id

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


cell 14

In [17]:
from transformers import set_seed
seed = 42
set_seed(seed)
index = 10

prompt = "你好"
prompt1 = "孩⼦积⻝了怎么办？"
prompt2 = "孩⼦身上⻓疹⼦了是啥原因呢"
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": prompt}
]
model_inputs = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors='pt').to('cuda:0')
generated_ids = original_model.generate(model_inputs, max_new_tokens=512)

generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs, generated_ids)]
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(response)

# response, history = original_model.chat(tokenizer, "你好", history=None)
# print(response)
# response, history = original_model.chat(tokenizer, "孩⼦积⻝了怎么办？", history=history)
# print(response)
# response, history = original_model.chat(tokenizer, "孩⼦身上⻓疹⼦了是啥原因呢", history=history)
# print(response)

你好！很高兴能够帮助你。请问有什么可以帮助你的吗？


cell 15

In [33]:
from transformers.trainer_pt_utils import LabelSmoother
IGNORE_TOKEN_ID = LabelSmoother.ignore_index
def preprocess(
        sources,
        tokenizer: AutoTokenizer,
        max_len: int,
        system_message: str = "You are a helpful assistant."
    ):
    roles = {"user": "<|im_start|>user", "assistant": "<|im_start|>assistant"}

    im_start = 151644 #tokenizer.bos_token_id
    print(im_start)
    im_end = tokenizer.eos_token_id
    print(im_end)
    nl_tokens = tokenizer('\n').input_ids
    _system = tokenizer('system').input_ids + nl_tokens
    _user = tokenizer('user').input_ids + nl_tokens
    _assistant = tokenizer('assistant').input_ids + nl_tokens
    
    # Apply prompt templates
    input_ids, targets = [], []
    for i, source in enumerate(sources):
        if roles[source[0]["from"]] != roles["user"]:
            source = source[1:]

        input_id, target = [], []
        system = [im_start] + _system + tokenizer(system_message).input_ids + [im_end] + nl_tokens
        input_id += system
        target += [im_start] + [IGNORE_TOKEN_ID] * (len(system)-3) + [im_end] + nl_tokens
        assert len(input_id) == len(target)
        for j, sentence in enumerate(source):
            role = roles[sentence["from"]]
            _input_id = tokenizer(role).input_ids + nl_tokens + tokenizer(sentence["value"]).input_ids + [im_end] + nl_tokens
            input_id += _input_id
            if role == '<|im_start|>user':
                _target = [im_start] + [IGNORE_TOKEN_ID] * (len(_input_id)-3) + [im_end] + nl_tokens
            elif role == '<|im_start|>assistant':
                _target = [im_start] + [IGNORE_TOKEN_ID] * len(tokenizer(role).input_ids) + \
                          _input_id[len(tokenizer(role).input_ids)+1:-2] + [im_end] + nl_tokens
            else:
                raise NotImplementedError
            target += _target
        assert len(input_id) == len(target)
        input_id += [tokenizer.pad_token_id] * (max_len - len(input_id))
        target += [IGNORE_TOKEN_ID] * (max_len - len(target))
        input_ids.append(input_id[:max_len])
        targets.append(target[:max_len])
    input_ids = torch.tensor(input_ids, dtype=torch.int)
    targets = torch.tensor(targets, dtype=torch.int)
    return dict(
        input_ids=input_ids,
        labels=targets,
        attention_mask=input_ids.ne(tokenizer.pad_token_id),
    )

cell 16

In [34]:
from torch.utils.data import Dataset
class SupervisedDataset(Dataset):
    """Dataset for supervised fine-tuning."""
    def __init__(self, raw_data, tokenizer, max_len: int):
        super(SupervisedDataset, self).__init__()
        print("Formatting inputs...")
        sources = [example["conversations"] for example in raw_data]
        data_dict = preprocess(sources, tokenizer, max_len)
        self.input_ids = data_dict["input_ids"]
        self.labels = data_dict["labels"]
        self.attention_mask = data_dict["attention_mask"]
        print("Formatting done...")
    def __len__(self):
        return len(self.input_ids)
    def __getitem__(self, i):
        return dict(
            input_ids=self.input_ids[i],
            labels=self.labels[i],
            attention_mask=self.attention_mask[i],
        )

cell 17

In [35]:
train_dataset = SupervisedDataset(train_data[:1000], tokenizer, max_len=1024)
test_dataset = SupervisedDataset(test_data, tokenizer, max_len=1024)
print(train_data[0])

Formatting inputs...
151644
151645
Formatting done...
Formatting inputs...
151644
151645
Formatting done...
{'id': 69944, 'conversations': [{'from': 'user', 'value': '你好'}, {'from': 'assistant', 'value': '您好，我是 智能医⽣客服机器⼈⼩D，一个由 Greedy AI 开发的 AI 助手，很高兴认识您。请问我能为您做些什么？'}]}


cell 18

In [39]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
config = LoraConfig(
    r=32, #Rank
    lora_alpha=16,
    #target_modules=["c_attn", "c_proj", "w1", "w2"],
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "up_proj", "gate_proj", "down_proj"],
    bias="none",
    lora_dropout=0.05, # Conventional
    task_type="CAUSAL_LM",
)
# 1 - Enabling gradient checkpointing to reduce memory usage during fine-tuning
#original_model.gradient_checkpointing_enable()
# 2 - Using the prepare_model_for_kbit_training method from PEFT
original_model = prepare_model_for_kbit_training(original_model)
peft_model = get_peft_model(original_model, config)
original_model.enable_input_require_grads()

cell 19

In [40]:
output_dir = './checkpoints_self_cong-qwen2-1_5b/'
peft_training_args = TrainingArguments(
    output_dir = output_dir,
    warmup_steps=1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=1,
    num_train_epochs=1,
    learning_rate=2e-4,
    optim="paged_adamw_8bit",
    logging_steps=100,
    logging_dir="./logs",
    save_strategy="steps",
    max_steps=1000,
    save_steps=100,
    evaluation_strategy="steps",
    eval_steps=1001,
    do_eval=True,
    gradient_checkpointing=True,
    report_to="none",
    overwrite_output_dir = 'True',
    group_by_length=True,
)
peft_model.config.use_cache = False
peft_trainer = Trainer(
    model=peft_model,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    args=peft_training_args,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


cell 20

In [41]:
torch.cuda.empty_cache()
peft_trainer.train()



{'loss': 2.1139, 'grad_norm': 1.0551625490188599, 'learning_rate': 0.00018018018018018018, 'epoch': 0.1}




{'loss': 1.7448, 'grad_norm': 1.0395786762237549, 'learning_rate': 0.00016016016016016018, 'epoch': 0.2}




{'loss': 1.8901, 'grad_norm': 0.934403121471405, 'learning_rate': 0.00014014014014014013, 'epoch': 0.3}




{'loss': 1.9392, 'grad_norm': 0.6916159391403198, 'learning_rate': 0.00012012012012012013, 'epoch': 0.4}




{'loss': 1.9117, 'grad_norm': 1.0845364332199097, 'learning_rate': 0.00010010010010010012, 'epoch': 0.5}




{'loss': 1.7625, 'grad_norm': 0.6826773285865784, 'learning_rate': 8.008008008008009e-05, 'epoch': 0.6}




{'loss': 1.7058, 'grad_norm': 0.8416334390640259, 'learning_rate': 6.0060060060060066e-05, 'epoch': 0.7}




{'loss': 1.7114, 'grad_norm': 0.9977681636810303, 'learning_rate': 4.0040040040040046e-05, 'epoch': 0.8}




{'loss': 1.7907, 'grad_norm': 0.6769275069236755, 'learning_rate': 2.0020020020020023e-05, 'epoch': 0.9}




{'loss': 1.7318, 'grad_norm': 0.79654461145401, 'learning_rate': 0.0, 'epoch': 1.0}


100%|██████████| 1000/1000 [08:49<00:00,  1.89it/s]

{'train_runtime': 529.9876, 'train_samples_per_second': 1.887, 'train_steps_per_second': 1.887, 'train_loss': 1.830198486328125, 'epoch': 1.0}





TrainOutput(global_step=1000, training_loss=1.830198486328125, metrics={'train_runtime': 529.9876, 'train_samples_per_second': 1.887, 'train_steps_per_second': 1.887, 'train_loss': 1.830198486328125, 'epoch': 1.0})


统计参数

In [42]:
def print_number_of_trainable_model_parameters(model):
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()
    return f"trainable model parameters: {trainable_model_params}\n \
            all model parameters: {all_model_params}\n \
            percentage of trainable model parameters: \
            {100 * trainable_model_params / all_model_params:.2f}%"

print_number_of_trainable_model_parameters(original_model)

'trainable model parameters: 36929536\n             all model parameters: 925545984\n             percentage of trainable model parameters:             3.99%'

可以重启一下内核，释放显存。否则显存小的话容易OOM。
重启的话再执行一下最开始的导入包的cell。

cell 21

In [2]:
compute_dtype = getattr(torch, "float16")
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=True,
)
model_path = "/mnt/AlgoTempData1/llm-weights/qwen/Qwen2-1_5B-Instruct"
original_model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=compute_dtype,
    device_map={"": 0},
    quantization_config=quant_config,
    trust_remote_code=True
)
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False, trust_remote_code=True)
tokenizer.pad_token_id = tokenizer.eos_token_id

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


cell 22

In [3]:
from peft import PeftModel
output_dir = './checkpoints_self_cong-qwen2-1_5b/'
ft_model = PeftModel.from_pretrained(
    original_model,
    output_dir + '/checkpoint-1000',
    torch_dtype=compute_dtype,
    device_map={"": 0},
    quantization_config=quant_config
)

cell 23

In [5]:
prompt = "孩⼦身上⻓疹⼦了是啥原因呢"
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": prompt}
]
model_inputs = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors='pt').to('cuda:0')
generated_ids = ft_model.generate(model_inputs, max_new_tokens=512)

generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs, generated_ids)]
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(response)

婴幼儿时期皮肤的角质层较薄，对刺激的敏感性也较高。而宝宝又比较活泼好动，因此，一旦接触到某些不洁的东西，就容易引发湿疹等过敏性疾病。另外，一些过敏原、刺激物也会引起孩子患湿疹。比如：衣物洗涤剂中的色素；奶制品中添加的人工香精或防腐剂；护肤品中有害成分的重金属如铅、汞等等，都会诱发婴儿湿疹。指导意见：
      1.首先要观察有无其它不适的症状，如有发烫、发热则需排除感染的可能性；
      2.对于过敏体质者，平时应尽量避免接触致敏物质，外出时最好带上口罩，防止吸入花粉、灰尘等物质；
      3.勤剪指甲，勤洗手，并用温水浸泡手脚；
      4.如果孩子患的是特殊类型的皮炎，在医生指导下服用抗过敏药。 
      以上是对“新生儿身上出现红点是什么原因引起的”这个问题的建议，希望对您有帮助，祝您健康！
<|endoftext|>
