In [1]:
class args:
    """Static configuration namespace for the fine-tuning run.

    Used as a plain attribute container (never instantiated); the other cells
    read ``args.<name>`` directly.
    """

    # Upper bound on the number of tokens kept per training sample.
    max_seq_length = 1024
    # Path handed to ``from_pretrained`` for both the model and the tokenizer.
    model_name = "/gemini/data-2"
    # JSON file with the training conversations.
    train_file = "/gemini/data-3/vicunaformatfixedfinal.json"
    # output_file = "/gemini/code/build_psymodel/E_output"
    # Directory the trained adapter is saved to.
    output_file = "/gemini/E_output"
    # Default system prompt, used when a record carries no 'system' field.
    # Each fragment ends with a space: adjacent string literals are joined
    # verbatim, so without it the sentences run together ("manner.You're ...").
    # NOTE(review): adapters trained with the earlier, unspaced prompt saw a
    # slightly different system text - confirm before mixing checkpoints.
    prompt = (
        "You're a psychotherapist, and you're engaging in a conversation with visitors in a warm and welcoming manner. "
        "You're encouraged to inquire multiple times about the visitors' inner desires, emphasizing empathy and respecting their feelings. "
        "Adjust your responses based on their feedback, ensuring they align with the visitors' circumstances and needs."
    )

In [2]:
import os
import torch
import torch.nn as nn
import bitsandbytes as bnb
from peft import prepare_model_for_kbit_training
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM
from transformers import BitsAndBytesConfig

# os.environ["CUDA_VISIBLE_DEVICES"]="0,1"

# 4-bit NF4 quantization with double quantization enabled; the 4-bit compute
# dtype is bfloat16.
quantization_config = BitsAndBytesConfig(
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    load_in_4bit=True,
)

# Load the quantized base model and pass it straight through PEFT's k-bit
# training preparation helper.
# NOTE(review): torch_dtype is float16 while the 4-bit compute dtype above is
# bfloat16 - confirm that this mix is intentional.
model = prepare_model_for_kbit_training(
    AutoModelForCausalLM.from_pretrained(
        args.model_name,
        quantization_config=quantization_config,
        torch_dtype=torch.float16,
        device_map='auto',
        # trust_remote_code=True,
    )
)

tokenizer = AutoTokenizer.from_pretrained(args.model_name)  # trust_remote_code=True


# Bare expression: the notebook renders the module tree as the cell output.
model

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 8/8 [00:40<00:00,  5.04s/it]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(152064, 5120)
    (layers): ModuleList(
      (0-39): 40 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear4bit(in_features=5120, out_features=5120, bias=True)
          (k_proj): Linear4bit(in_features=5120, out_features=5120, bias=True)
          (v_proj): Linear4bit(in_features=5120, out_features=5120, bias=True)
          (o_proj): Linear4bit(in_features=5120, out_features=5120, bias=False)
          (rotary_emb): Qwen2RotaryEmbedding()
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear4bit(in_features=5120, out_features=13696, bias=False)
          (up_proj): Linear4bit(in_features=5120, out_features=13696, bias=False)
          (down_proj): Linear4bit(in_features=13696, out_features=5120, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm()
        (post_attention_layernorm): Qwen2RMSNorm()
      )
    )
    (norm): Qwen2RMSN

In [3]:
# Cache the ids of the ChatML delimiters on the tokenizer object; the dataset
# class reads `tokenizer.im_start_id` / `tokenizer.im_end_id` from here.
for _attr_name, _marker in (('im_start_id', '<|im_start|>'), ('im_end_id', '<|im_end|>')):
    setattr(tokenizer, _attr_name, tokenizer.encode(_marker)[0])

# Reuse the pad token as the end-of-sequence token.
# NOTE(review): this assumes the tokenizer already defines a pad token; if it
# does not, eos_token is set to None here - verify for the checkpoint in use.
tokenizer.eos_token = tokenizer.pad_token

In [4]:
from peft import LoraConfig, get_peft_model, PeftModel


# LoRA adapter settings. Adapters are attached to every attention projection
# and every MLP projection listed in `target_modules`.
config = LoraConfig(
    task_type="CAUSAL_LM",  # set this for CLM or Seq2Seq
    r=8,                    # LoRA rank (not the number of attention heads)
    lora_alpha=16,          # alpha scaling
    lora_dropout=0.05,
    bias="none",
    # target_modules=["c_attn","c_proj","w1","w2"],
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
)

# Wrap the base model with the adapter. This rebinds `model`, so re-running the
# cell without reloading the base model wraps an already-wrapped model.
model = get_peft_model(model, config)

# Load the Lora model
# model = PeftModel.from_pretrained(model, "yooshijay/qwen-14b-version2", is_trainable=True)

In [5]:
import transformers
from datasets import load_dataset
from torch.utils.data import Dataset
from typing import Any, Dict, List
import json

class QwenSFTDataset(Dataset):
    """Multi-turn SFT dataset that renders conversations in ChatML layout.

    The training file is a JSON list of records. Each record has a
    'conversations' list of ``{'from': ..., 'value': ...}`` turns and may carry
    its own 'system' text; otherwise the default ``prompt`` is used.

    Args:
        file: Path to the JSON training file (read as UTF-8).
        tokenizer: Tokenizer exposing ``encode``, ``eos_token``,
            ``im_start_id`` and ``im_end_id``.
        max_seq_length: Samples are truncated to this many tokens.
        prompt: Fallback system prompt for records without a 'system' key.
    """

    def __init__(self, file, tokenizer, max_seq_length, prompt):
        self.tokenizer = tokenizer
        self.im_start_id = tokenizer.im_start_id
        self.im_end_id = tokenizer.im_end_id
        self.enter_token_ids = tokenizer.encode('\n')   # token ids of a newline
        self.max_seq_length = max_seq_length
        self.prompt = prompt
        with open(file, 'r', encoding='utf8') as f:
            data_list = json.load(f)

        self.data_list = data_list

    def __len__(self):
        """Return the number of conversation records."""
        return len(self.data_list)

    def __getitem__(self, index):
        """Tokenize one record and build its loss mask.

        The pieces are concatenated in the following layout:
        <|im_start|>system
        You are a helpful assistant.<|im_end|>
        <|im_start|>user
        Hello<|im_end|>
        <|im_start|>assistant
        Hi, I am xxx, glad to help<|im_end|><|endoftext|>

        Returns:
            dict with 'input_ids', 'attention_mask' (all ones) and
            'target_mask' (1 on tokens that contribute to the loss), each a
            list truncated to ``max_seq_length``.
        """
        data = self.data_list[index]
        if 'system' in data:
            system = data['system'].strip()
        else:
            system = self.prompt
        conversations = data['conversations']

        # The system segment is context only: never part of the loss.
        system_text = f'<|im_start|>system\n{system}<|im_end|>\n'
        input_ids = self.tokenizer.encode(system_text, add_special_tokens=False)
        target_mask = [0] * len(input_ids)

        # Append every turn of the conversation.
        for conv in conversations:
            if conv['from'] == 'human':
                human = conv['value'].strip()
                input_tokens = self.tokenizer.encode(f'<|im_start|>user\n{human}<|im_end|>\n', add_special_tokens=False)
                input_ids += input_tokens
                # User turns are not trained on.
                target_mask += [0] * len(input_tokens)
            else:
                assistant = conv['value'].strip()
                # Use the tokenizer owned by this dataset (not a notebook
                # global) so the class works with whatever tokenizer it was
                # constructed with.
                output_tokens = self.tokenizer.encode(
                    f'<|im_start|>assistant\n{assistant}<|im_end|>' + self.tokenizer.eos_token + '\n',
                    add_special_tokens=False,
                )
                input_ids += output_tokens
                # '<|im_start|>assistant\n' is assumed to take 3 tokens and the
                # trailing '\n' 1 token; neither contributes to the loss.
                target_mask += [0] * 3 + [1] * (len(output_tokens) - 4) + [0]

        assert len(input_ids) == len(target_mask)
        # Truncate to the maximum sequence length.
        input_ids = input_ids[:self.max_seq_length]
        target_mask = target_mask[:self.max_seq_length]
        attention_mask = [1] * len(input_ids)
        assert len(input_ids) == len(target_mask) == len(attention_mask)
        inputs = {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'target_mask': target_mask
        }
        return inputs

class SFTDataCollator(object):
    """Pads a list of tokenized samples into one batch of tensors.

    Each sample is a dict with 'input_ids', 'attention_mask' and 'target_mask'
    lists. Samples whose 'input_ids' is None are dropped. The result holds
    'input_ids', 'attention_mask' and 'labels', where labels equal the input
    ids on positions with target_mask == 1 and -100 everywhere else.
    """

    def __init__(self, tokenizer, max_seq_length):
        self.tokenizer = tokenizer
        self.max_seq_length = max_seq_length
        self.pad_token_id = tokenizer.pad_token_id

    def __call__(self, batch: List[Dict[str, Any]]) -> Dict[str, Any]:
        limit = self.max_seq_length
        # Pad to the longest usable sample, but never beyond max_seq_length.
        longest = max(
            len(sample['input_ids'])
            for sample in batch
            if sample['input_ids'] is not None
        )
        width = min(longest, limit)

        id_rows, attn_rows, tgt_rows = [], [], []
        for sample in batch:
            ids = sample['input_ids']
            attn = sample['attention_mask']
            tgt = sample['target_mask']
            if ids is None:
                continue
            # A negative count yields an empty pad list, so over-long samples
            # are simply cut by the slice below.
            n_pad = width - len(ids)
            id_rows.append((ids + [self.pad_token_id] * n_pad)[:limit])
            attn_rows.append((attn + [0] * n_pad)[:limit])
            tgt_rows.append((tgt + [0] * n_pad)[:limit])

        # Convert the padded rows to tensors.
        ids_t = torch.tensor(id_rows, dtype=torch.long)
        attn_t = torch.tensor(attn_rows, dtype=torch.long)
        tgt_t = torch.tensor(tgt_rows, dtype=torch.long)

        # Positions outside the target mask get the label -100.
        return {
            'input_ids': ids_t,
            'attention_mask': attn_t,
            'labels': torch.where(tgt_t == 1, ids_t, -100),
        }

# Build the training dataset and the batch collator from the shared config.
train_dataset = QwenSFTDataset(
    file=args.train_file,
    tokenizer=tokenizer,
    max_seq_length=args.max_seq_length,
    prompt=args.prompt,
)
data_collator = SFTDataCollator(
    tokenizer=tokenizer,
    max_seq_length=args.max_seq_length,
)

In [None]:
from transformers import (DataCollatorForSeq2Seq, Trainer)

# Hyper-parameters for the run. Named `training_args` so the notebook's `args`
# config class is not shadowed.
training_args = transformers.TrainingArguments(
    output_dir='temp',
    # output_dir='/gemini/code/build_psymodel/temp',
    num_train_epochs=2,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=8,  # 4 x 8 = 32 samples per optimizer step per device
    learning_rate=2e-4,
    # warmup_steps=100,
    # max_steps=200,
    fp16=True,
    logging_steps=10,
    # presumably needed so the custom 'target_mask' field reaches the collator - TODO confirm
    remove_unused_columns=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator,
)

# Train from scratch (no checkpoint resume), then save the trained model to
# the configured output location.
trainer.train(resume_from_checkpoint=False)
model.save_pretrained(args.output_file)

# model.push_to_hub("yooshijay/qwen-14b-version4",
#                   use_auth_token=True,
#                   commit_message="basic training",
#                   private=True)

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss
10,1.3294
20,0.997
30,0.9555
