In [1]:
# 定义全局变量和参数
model_name_or_path = 'THUDM/chatglm3-6b'  # 模型ID或本地路径
train_data_path = 'shibing624/AdvertiseGen'    # 训练数据路径
eval_data_path = None                     # 验证数据路径，如果没有则设置为None
seed = 8                                 # 随机种子
max_input_length = 512                    # 输入的最大长度
max_output_length = 1536                  # 输出的最大长度
lora_rank = 4                             # LoRA秩
lora_alpha = 32                           # LoRA alpha值
lora_dropout = 0.05                       # LoRA Dropout率
resume_from_checkpoint = None             # 如果从checkpoint恢复训练，指定路径
prompt_text = ''                          # 所有数据前的指令文本
compute_dtype = 'fp32'                    # 计算数据类型（fp32, fp16, bf16）

In [2]:
from datasets import load_dataset

dataset = load_dataset(train_data_path)

Downloading readme:   0%|          | 0.00/1.34k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/53.8M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/498k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

In [3]:
dataset

DatasetDict({
    train: Dataset({
        features: ['content', 'summary'],
        num_rows: 114599
    })
    validation: Dataset({
        features: ['content', 'summary'],
        num_rows: 1070
    })
})

In [4]:
from datasets import ClassLabel, Sequence
import random
import pandas as pd
from IPython.display import display, HTML

def show_random_elements(dataset, num_examples=10):
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    picks = []
    for _ in range(num_examples):
        pick = random.randint(0, len(dataset)-1)
        while pick in picks:
            pick = random.randint(0, len(dataset)-1)
        picks.append(pick)
    
    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, ClassLabel):
            df[column] = df[column].transform(lambda i: typ.names[i])
        elif isinstance(typ, Sequence) and isinstance(typ.feature, ClassLabel):
            df[column] = df[column].transform(lambda x: [typ.feature.names[i] for i in x])
    display(HTML(df.to_html()))

In [11]:
show_random_elements(dataset["train"], num_examples=3)

Unnamed: 0,content,summary
0,类型#裙*版型#显瘦*材质#雪纺*图案#线条*裙领型#高领*裙款式#亮片*裙款式#抽褶,雪纺材质的小衫，非常适合春天穿着，带来春光烂漫的舒适感。修身的小衫，衣身带有松紧的褶皱设计，搭配亮片的装饰，在阳光下闪烁着耀眼的光芒，非常漂亮。略带高领的版型，能够很好的修饰颈部线条，带来更加乖巧和优雅的穿着效果。小衫非常轻薄，透气感十足，无论是单穿还是内搭，都非常显气质。
1,类型#裙*材质#棉*颜色#黑白*风格#青春*风格#性感*图案#条纹*裙型#a字*裙腰型#高腰*裙领型#一字领*裙款式#收腰,此款纯棉a字裙，带着少女的娇俏，展现了不一样的青春姿态。此款简单的黑白条纹样式，在提升肤色的同时也能瞬间抓住人们的眼球。一字领的设计可以展现出自己的香肩和锁骨，尽显优雅的性感迷人气息。高腰的设计能够拉长腰线，轻松凹出大长腿，a字型的版型包容度高，不挑身材。收腰的款式，不仅能给女士增添一种神秘感，而且还能塑造出玲珑有致的身材曲线。
2,类型#裙*风格#复古*风格#简约*图案#复古*图案#波点*图案#印花*裙型#直筒裙*裙长#连衣裙*裙袖长#短袖*裙款式#腰带,波点元素强势回潮，设计师选用较大的波点印花来演绎，显大方之余还使得简约的短袖连衣裙多了摩登复古的感觉。直筒的版型设计配上腰带装饰，上身更显好比例，打造层次感造型。


In [12]:
from transformers import AutoTokenizer

# revision='b098244' 版本对应的 ChatGLM3-6B 设置 use_reentrant=False
# 最新版本 use_reentrant 被设置为 True，会增加不必要的显存开销
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path,
                                          trust_remote_code=True,
                                          revision='b098244')

In [13]:
# tokenize_func 函数
def tokenize_func(example, tokenizer, ignore_label_id=-100):
    """
    对单个数据样本进行tokenize处理。

    参数:
    example (dict): 包含'content'和'summary'键的字典，代表训练数据的一个样本。
    tokenizer (transformers.PreTrainedTokenizer): 用于tokenize文本的tokenizer。
    ignore_label_id (int, optional): 在label中用于填充的忽略ID，默认为-100。

    返回:
    dict: 包含'tokenized_input_ids'和'labels'的字典，用于模型训练。
    """

    # 构建问题文本
    question = prompt_text + example['content']
    if example.get('input', None) and example['input'].strip():
        question += f'\n{example["input"]}'

    # 构建答案文本
    answer = example['summary']

    # 对问题和答案文本进行tokenize处理
    q_ids = tokenizer.encode(text=question, add_special_tokens=False)
    a_ids = tokenizer.encode(text=answer, add_special_tokens=False)

    # 如果tokenize后的长度超过最大长度限制，则进行截断
    if len(q_ids) > max_input_length - 2:  # 保留空间给gmask和bos标记
        q_ids = q_ids[:max_input_length - 2]
    if len(a_ids) > max_output_length - 1:  # 保留空间给eos标记
        a_ids = a_ids[:max_output_length - 1]

    # 构建模型的输入格式
    input_ids = tokenizer.build_inputs_with_special_tokens(q_ids, a_ids)
    question_length = len(q_ids) + 2  # 加上gmask和bos标记

    # 构建标签，对于问题部分的输入使用ignore_label_id进行填充
    labels = [ignore_label_id] * question_length + input_ids[question_length:]

    return {'input_ids': input_ids, 'labels': labels}


In [14]:
column_names = dataset['train'].column_names
tokenized_dataset = dataset['train'].map(
    lambda example: tokenize_func(example, tokenizer),
    batched=False, 
    remove_columns=column_names
)

Map:   0%|          | 0/114599 [00:00<?, ? examples/s]

In [15]:
show_random_elements(tokenized_dataset, num_examples=1)

Unnamed: 0,input_ids,labels
0,"[64790, 64792, 30910, 33467, 31010, 56532, 30998, 55090, 54888, 31010, 49899, 30998, 38317, 31010, 38683, 54901, 30998, 32799, 31010, 31903, 30998, 32799, 31010, 34435, 30998, 37505, 31010, 37216, 30998, 56532, 54888, 31010, 55509, 55932, 56532, 30998, 56532, 40877, 31010, 55097, 55759, 30998, 56532, 56278, 54888, 31010, 54589, 56278, 30910, 31992, 31986, 56055, 54802, 42927, 56778, 55091, 31123, 45762, 35066, 31677, 43385, 54530, 33055, 31155, 41180, 31873, 43385, 54530, 41945, 31123, 12345, 7471, 35462, 43385, 36480, 34435, 54530, 55509, 55932, 56532, 55090, 54888, 31123, 36477, 34319, 32922, 31123, 31903, 34435, 31123, 55097, 55759, 34481, 41806, 54892, 36436, 31155, 54589, 56278, 54530, ...]","[-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 30910, 31992, 31986, 56055, 54802, 42927, 56778, 55091, 31123, 45762, 35066, 31677, 43385, 54530, 33055, 31155, 41180, 31873, 43385, 54530, 41945, 31123, 12345, 7471, 35462, 43385, 36480, 34435, 54530, 55509, 55932, 56532, 55090, 54888, 31123, 36477, 34319, 32922, 31123, 31903, 34435, 31123, 55097, 55759, 34481, 41806, 54892, 36436, 31155, 54589, 56278, 54530, ...]"


In [16]:
tokenized_dataset = tokenized_dataset.shuffle(seed=seed)

In [17]:
tokenized_dataset = tokenized_dataset.flatten_indices()

Flattening the indices:   0%|          | 0/114599 [00:00<?, ? examples/s]

In [18]:
import torch
from typing import List, Dict, Optional

# DataCollatorForChatGLM 类
class DataCollatorForChatGLM:
    """
    用于处理批量数据的DataCollator，尤其是在使用 ChatGLM 模型时。

    该类负责将多个数据样本（tokenized input）合并为一个批量，并在必要时进行填充(padding)。

    属性:
    pad_token_id (int): 用于填充(padding)的token ID。
    max_length (int): 单个批量数据的最大长度限制。
    ignore_label_id (int): 在标签中用于填充的ID。
    """

    def __init__(self, pad_token_id: int, max_length: int = 2048, ignore_label_id: int = -100):
        """
        初始化DataCollator。

        参数:
        pad_token_id (int): 用于填充(padding)的token ID。
        max_length (int): 单个批量数据的最大长度限制。
        ignore_label_id (int): 在标签中用于填充的ID，默认为-100。
        """
        self.pad_token_id = pad_token_id
        self.ignore_label_id = ignore_label_id
        self.max_length = max_length

    def __call__(self, batch_data: List[Dict[str, List]]) -> Dict[str, torch.Tensor]:
        """
        处理批量数据。

        参数:
        batch_data (List[Dict[str, List]]): 包含多个样本的字典列表。

        返回:
        Dict[str, torch.Tensor]: 包含处理后的批量数据的字典。
        """
        # 计算批量中每个样本的长度
        len_list = [len(d['input_ids']) for d in batch_data]
        batch_max_len = max(len_list)  # 找到最长的样本长度

        input_ids, labels = [], []
        for len_of_d, d in sorted(zip(len_list, batch_data), key=lambda x: -x[0]):
            pad_len = batch_max_len - len_of_d  # 计算需要填充的长度
            # 添加填充，并确保数据长度不超过最大长度限制
            ids = d['input_ids'] + [self.pad_token_id] * pad_len
            label = d['labels'] + [self.ignore_label_id] * pad_len
            if batch_max_len > self.max_length:
                ids = ids[:self.max_length]
                label = label[:self.max_length]
            input_ids.append(torch.LongTensor(ids))
            labels.append(torch.LongTensor(label))

        # 将处理后的数据堆叠成一个tensor
        input_ids = torch.stack(input_ids)
        labels = torch.stack(labels)

        return {'input_ids': input_ids, 'labels': labels}


In [19]:
# 准备数据整理器
data_collator = DataCollatorForChatGLM(pad_token_id=tokenizer.pad_token_id)

In [20]:
from transformers import AutoModel, BitsAndBytesConfig

_compute_dtype_map = {
    'fp32': torch.float32,
    'fp16': torch.float16,
    'bf16': torch.bfloat16
}

# QLoRA 量化配置
q_config = BitsAndBytesConfig(load_in_4bit=True,
                              bnb_4bit_quant_type='nf4',
                              bnb_4bit_use_double_quant=True,
                              bnb_4bit_compute_dtype=_compute_dtype_map['bf16'])


In [21]:
# revision='b098244' 版本对应的 ChatGLM3-6B 设置 use_reentrant=False
# 最新版本 use_reentrant 被设置为 True，会增加不必要的显存开销
model = AutoModel.from_pretrained(model_name_or_path,
                                  quantization_config=q_config,
                                  device_map='auto',
                                  trust_remote_code=True,
                                  revision='b098244')

Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]

In [22]:
# 获取当前模型占用的 GPU显存（差值为预留给 PyTorch 的显存）
memory_footprint_bytes = model.get_memory_footprint()
memory_footprint_mib = memory_footprint_bytes / (1024 ** 2)  # 转换为 MiB

print(f"{memory_footprint_mib:.2f}MiB")

3739.69MiB


In [23]:
from peft import TaskType, LoraConfig, get_peft_model, prepare_model_for_kbit_training

kbit_model = prepare_model_for_kbit_training(model)

You are using an old version of the checkpointing format that is deprecated (We will also silently ignore `gradient_checkpointing_kwargs` in case you passed it).Please update to the new format on your modeling file. To use the new format, you need to completely remove the definition of the method `_set_gradient_checkpointing` in your model.


In [24]:
from peft.utils import TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING

target_modules = TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING['chatglm']

In [25]:
target_modules

['query_key_value']

In [26]:
lora_config = LoraConfig(
    target_modules=target_modules,
    r=lora_rank,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    bias='none',
    inference_mode=False,
    task_type=TaskType.CAUSAL_LM
)

In [27]:
qlora_model = get_peft_model(kbit_model, lora_config)

In [28]:
qlora_model.print_trainable_parameters()

trainable params: 974,848 || all params: 6,244,558,848 || trainable%: 0.01561115883009451


In [29]:
from transformers import TrainingArguments, Trainer

training_demo_args = TrainingArguments(
    output_dir=f"models/{model_name_or_path}",          # 输出目录
    per_device_train_batch_size=16,                     # 每个设备的训练批量大小
    gradient_accumulation_steps=8,                     # 梯度累积步数
    learning_rate=1e-3,                                # 学习率
    max_steps=800,                                     # 训练步数
    lr_scheduler_type="linear",                        # 学习率调度器类型
    warmup_ratio=0.1,                                  # 预热比例
    logging_steps=20,                                 # 日志记录步数
    save_strategy="steps",                             # 模型保存策略
    save_steps=40,                                    # 模型保存步数
    optim="adamw_torch",                               # 优化器类型
    fp16=True,                                        # 是否使用混合精度训练
)

2024-05-13 15:48:47.532113: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-05-13 15:48:48.545008: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [30]:
trainer = Trainer(
        model=qlora_model,
        args=training_demo_args,
        train_dataset=tokenized_dataset,
        data_collator=data_collator
    )

In [31]:
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss
20,4.7653
40,3.7865
60,3.4948
80,3.3724
100,3.3475
120,3.2665
140,3.2608
160,3.2464
180,3.2354
200,3.2161




TrainOutput(global_step=800, training_loss=3.1983706760406494, metrics={'train_runtime': 12489.7847, 'train_samples_per_second': 8.199, 'train_steps_per_second': 0.064, 'total_flos': 6.185489754579272e+17, 'train_loss': 3.1983706760406494, 'epoch': 0.89})

In [32]:
trainer.model.save_pretrained(f"models/glm/{model_name_or_path}")

In [35]:
import torch
from transformers import AutoModel, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel, PeftConfig

# 定义全局变量和参数
model_name_or_path = 'THUDM/chatglm3-6b'  # 模型ID或本地路径
peft_model_path = f"models/glm/{model_name_or_path}"

In [36]:
config = PeftConfig.from_pretrained(peft_model_path)

q_config = BitsAndBytesConfig(load_in_4bit=True,
                              bnb_4bit_quant_type='nf4',
                              bnb_4bit_use_double_quant=True,
                              bnb_4bit_compute_dtype=torch.float32)

base_model = AutoModel.from_pretrained(config.base_model_name_or_path,
                                       quantization_config=q_config,
                                       trust_remote_code=True,
                                       device_map='auto')
base_model.requires_grad_(False)
base_model.eval()

Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]

ChatGLMForConditionalGeneration(
  (transformer): ChatGLMModel(
    (embedding): Embedding(
      (word_embeddings): Embedding(65024, 4096)
    )
    (rotary_pos_emb): RotaryEmbedding()
    (encoder): GLMTransformer(
      (layers): ModuleList(
        (0-27): 28 x GLMBlock(
          (input_layernorm): RMSNorm()
          (self_attention): SelfAttention(
            (query_key_value): Linear4bit(in_features=4096, out_features=4608, bias=True)
            (core_attention): CoreAttention(
              (attention_dropout): Dropout(p=0.0, inplace=False)
            )
            (dense): Linear4bit(in_features=4096, out_features=4096, bias=False)
          )
          (post_attention_layernorm): RMSNorm()
          (mlp): MLP(
            (dense_h_to_4h): Linear4bit(in_features=4096, out_features=27392, bias=False)
            (dense_4h_to_h): Linear4bit(in_features=13696, out_features=4096, bias=False)
          )
        )
      )
      (final_layernorm): RMSNorm()
    )
    (output_la

In [50]:
input_text = '请按照以下内容整理出广告词：类型#裙*版型#显瘦*风格#文艺*风格#简约*图案#印花*图案#撞色*裙下摆#压褶*裙长#连衣裙*裙领型#圆领'
print(f'输入：\n{input_text}')
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path, trust_remote_code=True)

输入：
请按照以下内容整理出广告词：类型#裙*版型#显瘦*风格#文艺*风格#简约*图案#印花*图案


In [51]:
response, history = base_model.chat(tokenizer=tokenizer, query=input_text)
print(f'ChatGLM3-6B 微调前：\n{response}')

ValueError: not enough values to unpack (expected 2, got 1)

In [45]:
model = PeftModel.from_pretrained(base_model, peft_model_path)


In [49]:
response, history = model.chat(tokenizer=tokenizer, query=input_text)
print(f'ChatGLM3-6B 微调后: \n{response}')

ChatGLM3-6B 微调后: 
1. 时尚百搭，经典款式，走在大街上，吸睛无数，穿出美丽自信，展现迷人魅力。
2. 面料轻盈，透气性好，穿着舒适，不勒肉，不透视，彰显品质，让你轻松驾驭各种场合。
3. 时尚设计，精致剪裁，立体剪裁，打造完美身材，展现迷人曲线，凸显优雅气质。
4. 质量上乘，做工精细，做工精细，穿着舒适，展现魅力，尽显优雅气质。
5. 时尚百搭，经典款式，走在大街上，吸睛无数，穿出美丽自信，展现迷人魅力。


In [1]:
import torch
from transformers import AutoModel, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel, PeftConfig

# 定义全局变量和参数
model_name_or_path = 'THUDM/chatglm3-6b'  # 模型ID或本地路径
peft_model_path = f"models/{model_name_or_path}"

In [2]:
config = PeftConfig.from_pretrained(peft_model_path)

q_config = BitsAndBytesConfig(load_in_4bit=True,
                              bnb_4bit_quant_type='nf4',
                              bnb_4bit_use_double_quant=True,
                              bnb_4bit_compute_dtype=torch.float32)

base_model = AutoModel.from_pretrained(config.base_model_name_or_path,
                                       quantization_config=q_config,
                                       trust_remote_code=True,
                                       device_map='auto')
base_model.requires_grad_(False)
base_model.eval()

Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]

ChatGLMForConditionalGeneration(
  (transformer): ChatGLMModel(
    (embedding): Embedding(
      (word_embeddings): Embedding(65024, 4096)
    )
    (rotary_pos_emb): RotaryEmbedding()
    (encoder): GLMTransformer(
      (layers): ModuleList(
        (0-27): 28 x GLMBlock(
          (input_layernorm): RMSNorm()
          (self_attention): SelfAttention(
            (query_key_value): Linear4bit(in_features=4096, out_features=4608, bias=True)
            (core_attention): CoreAttention(
              (attention_dropout): Dropout(p=0.0, inplace=False)
            )
            (dense): Linear4bit(in_features=4096, out_features=4096, bias=False)
          )
          (post_attention_layernorm): RMSNorm()
          (mlp): MLP(
            (dense_h_to_4h): Linear4bit(in_features=4096, out_features=27392, bias=False)
            (dense_4h_to_h): Linear4bit(in_features=13696, out_features=4096, bias=False)
          )
        )
      )
      (final_layernorm): RMSNorm()
    )
    (output_la

In [3]:
input_text = '类型#裙*版型#显瘦*风格#文艺*风格#简约*图案#印花*图案#撞色*裙下摆#压褶*裙长#连衣裙*裙领型#圆领'
print(f'输入：\n{input_text}')
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path, trust_remote_code=True)


输入：
类型#裙*版型#显瘦*风格#文艺*风格#简约*图案#印花*图案#撞色*裙下摆#压褶*裙长#连衣裙*裙领型#圆领


In [4]:
response, history = base_model.chat(tokenizer=tokenizer, query=input_text)
print(f'ChatGLM3-6B 微调前：\n{response}')

2024-05-14 14:58:37.533953: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-05-14 14:58:38.954989: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


ChatGLM3-6B 微调前：
你好，看起来你提供的信息有些混乱，但我能理解你在询问一些关于裙子的信息。以下是我理解的一些信息：

- "裙"：指的是一条裙子。
- "#裙"：可能是一种特定的款式或者风格，但我目前无法理解它具体代表什么。
- "版型"：指的是衣服的形状和剪裁，可能用于描述裙子的形状和剪裁。
- "显瘦"：可能是一种要求，指的是裙子设计能让身材看起来更瘦。
- "风格"：指的是裙子的设计或者整体感觉。
- "文艺"、"简约"、"图案"、"印花"、"撞色"：可能都是裙子的设计元素或风格。
- "裙下摆"：指的是裙子的下摆设计。
- "压褶"：可能是一种裙子的装饰或者设计元素。
- "裙长"：指的是裙子的长度。
- "连衣裙"：指的是一类长度及以上的裙子。
- "裙领型"：指的是裙子的领口形状。
- "圆领"：指的是裙子的领口是圆形的。

如果你能提供更清晰的问题或者信息，我会很乐意为你提供更多的帮助。


In [5]:
model = PeftModel.from_pretrained(base_model, peft_model_path)


In [6]:
response, history = model.chat(tokenizer=tokenizer, query=input_text)
print(f'ChatGLM3-6B 微调后: \n{response}')

ChatGLM3-6B 微调后: 
文艺气息浓厚的连衣裙，简约圆领设计，修饰颈部曲线，凸显优美脖颈。裙身采用撞色印花装饰，视觉冲击力强，时尚有型，修饰身形。修身显瘦，裙摆采用压褶设计，增加裙子的层次感，凸显时尚感。
