In [1]:
import os
import torch
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM, BitsAndBytesConfig
from transformers import Trainer, TrainingArguments 

# Root directory that holds locally downloaded checkpoints. It can be
# overridden with the LLM_MODEL_DIR environment variable so the notebook is
# not tied to one machine; the default is the original hard-coded location.
model_path = os.environ.get("LLM_MODEL_DIR", "/Users/jiangyushi/Work/llm_model")
# Repository-style model id, used here as the sub-directory under model_path.
model_name = "meta-llama/Llama-3.2-3B-Instruct"
# Full local checkpoint path handed to the from_pretrained() calls below.
llama3_model = os.path.join(model_path, model_name)


# Load the model and tokenizer
def load_model_tokenizer(use_4bit=False):
    """Load the causal LM checkpoint at ``llama3_model`` and its tokenizer.

    Args:
        use_4bit: When True, pass a ``BitsAndBytesConfig`` (4-bit NF4, double
            quantization, bfloat16 compute) as ``quantization_config``.
            Defaults to False, which loads the weights in bfloat16 without
            any quantization config, exactly as the function did before this
            parameter existed.

    Returns:
        tuple: ``(model, tokenizer)``.
    """
    tokenizer = AutoTokenizer.from_pretrained(llama3_model)
    # Use the EOS token id as the padding id as well.
    tokenizer.pad_token_id = tokenizer.eos_token_id

    load_kwargs = dict(
        device_map='auto',
        torch_dtype=torch.bfloat16,
    )
    if use_4bit:
        # Build the quantization config only on request; previously it was
        # constructed on every call and then never passed to the model.
        # NOTE(review): 4-bit loading presumably needs the bitsandbytes
        # backend to be installed and supported on this machine - confirm.
        load_kwargs['quantization_config'] = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.bfloat16,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
        )

    model = AutoModelForCausalLM.from_pretrained(llama3_model, **load_kwargs)

    return model, tokenizer

# Run the loader and keep the resulting model and tokenizer at notebook scope.
model, tokenizer = load_model_tokenizer()

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 2/2 [00:06<00:00,  3.07s/it]
Some parameters are on the meta device because they were offloaded to the disk.


In [2]:
# Chat template definition.
#   SYSTEM       wraps the system prompt in a system header and ends the turn.
#   INSTRUCTION  wraps one user turn, then opens the assistant header so the
#                text that follows is the assistant's reply.
#   BEGIN_WORDS  marker expected at the very start of a decoded sequence.
#   STOP_WORDS   end-of-turn marker.
llama3_chat_template = {
    'SYSTEM': '<|start_header_id|>system<|end_header_id|>\n\n{system}<|eot_id|>',
    'INSTRUCTION': (
        '<|start_header_id|>user<|end_header_id|>\n\n{input}<|eot_id|>'
        '<|start_header_id|>assistant<|end_header_id|>\n\n'
    ),
    'BEGIN_WORDS': '<|begin_of_text|>',
    'STOP_WORDS': '<|eot_id|>',
}

# # Example conversation
# conversation = [
#     {"role": "system", "content": "您是一个有用的助手。"},
#     {"role": "user", "content": "今天天气怎么样？"},
# ]

# Apply the chat template
def apply_chat_template(conversation, template):
    """Render a list of chat messages into one prompt string.

    Args:
        conversation: Iterable of dicts with ``'role'`` and ``'content'``
            keys. Roles handled are ``'system'``, ``'user'`` and
            ``'assistant'``; messages with any other role are skipped.
        template: Mapping with ``'SYSTEM'`` and ``'INSTRUCTION'`` format
            strings (placeholders ``{system}`` and ``{input}``) and a
            ``'STOP_WORDS'`` end-of-turn marker.

    Returns:
        str: The concatenated prompt. ``template['BEGIN_WORDS']`` is not
        prepended (presumably because the tokenizer adds it when encoding).
    """
    pieces = []
    for message in conversation:
        role = message['role']
        if role == 'system':
            pieces.append(template['SYSTEM'].format(system=message['content']))
        elif role == 'user':
            pieces.append(template['INSTRUCTION'].format(input=message['content']))
        elif role == 'assistant':
            # The INSTRUCTION template already ends with the assistant
            # header, so a past assistant reply is just its text followed by
            # the end-of-turn marker. Formatting it with INSTRUCTION (as was
            # done before) labelled the reply as a user turn.
            pieces.append(message['content'] + template.get('STOP_WORDS', ''))
    return ''.join(pieces)


In [3]:
# Extract the model's answer
def clear_output(complete_txt, original_txt, template=None):
    """Cut the generated answer out of a fully decoded sequence.

    Args:
        complete_txt: Decoded text of the whole sequence (prompt + answer),
            decoded with special tokens kept.
        original_txt: The prompt text that was fed to the tokenizer.
        template: Mapping with ``'BEGIN_WORDS'`` and ``'STOP_WORDS'``.
            Defaults to the module-level ``llama3_chat_template``.

    Returns:
        str: The text after the prompt, up to (not including) the first stop
        marker. If no stop marker is present the whole remainder is returned.
    """
    if template is None:
        template = llama3_chat_template
    begin = template['BEGIN_WORDS']
    stop = template['STOP_WORDS']

    prompt = begin + original_txt
    if complete_txt.startswith(prompt):
        answer = complete_txt[len(prompt):]
    elif complete_txt.startswith(original_txt):
        # The decoded text carries no begin marker; skip only the prompt.
        answer = complete_txt[len(original_txt):]
    else:
        # The decoded text does not reproduce the prompt verbatim; fall back
        # to the positional cut the function has always used.
        answer = complete_txt[len(prompt):]

    # Cut at the stop marker only when it is actually there. Slicing a fixed
    # number of characters off the end would eat real answer text when
    # generation stops for another reason (e.g. the new-token limit).
    if stop:
        stop_at = answer.find(stop)
        if stop_at != -1:
            answer = answer[:stop_at]
    return answer


# Generate an answer
def generate_answer(model, tokenizer, usr_txt,
                    sys_txt="You are a helpful assistant",
                    max_new_tokens=512):
    """Generate the assistant's reply to a single user message.

    Args:
        model: Model object exposing ``generate`` and ``device``.
        tokenizer: Tokenizer matching ``model``.
        usr_txt: The user's message.
        sys_txt: System prompt placed before the user message. Defaults to
            the prompt that was previously hard-coded.
        max_new_tokens: Upper bound on generated tokens. Defaults to 512,
            the previously hard-coded value.

    Returns:
        str: The answer text as extracted by ``clear_output``.
    """
    conversation = [
        {"role": "system", "content": sys_txt},
        {"role": "user", "content": usr_txt},
    ]

    # Alternative: tokenizer.apply_chat_template(conversation, ...,
    # add_generation_prompt=True) could render the prompt instead.
    # NOTE(review): the original note here says encoding the rendered text
    # adds the begin token a second time - confirm before switching.

    # Render the prompt with the notebook's own template.
    input_txt = apply_chat_template(conversation, llama3_chat_template)
    # The tokenizer call returns the full encoding (not just input ids), which
    # is unpacked into generate() below.
    inputs = tokenizer(input_txt, return_tensors="pt").to(model.device)

    outputs_id = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id,
    )
    # Keep special tokens so clear_output can locate the begin/stop markers.
    outputs = tokenizer.batch_decode(outputs_id, skip_special_tokens=False)

    return clear_output(outputs[0], input_txt)