In [1]:
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

In [2]:
# Keep Hugging Face downloads in a dedicated local cache directory.
cache_path = r"D:\TrainedModel"
# Load the base Qwen3-0.6B causal LM; the following cells wrap and move it.
model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen3-0.6B",
    cache_dir=cache_path,
)

In [3]:
# Raw string on purpose: "\C" and "\S" are not valid escape sequences, so the
# plain literal only worked by accident and emits a warning on recent Python
# versions. The resulting path value is unchanged.
adapter_dir = r"D:\CoachBot\SFTuned"

# Wrap the base model with the adapter stored in `adapter_dir`.
# Note: this rebinds `model`, so re-running this cell without first reloading
# the base model would wrap an already-wrapped model.
model = PeftModel.from_pretrained(model, adapter_dir)
# The tokenizer is loaded from the same directory as the adapter.
tokenizer = AutoTokenizer.from_pretrained(adapter_dir, use_fast=False)

In [4]:
# Prefer the GPU, but fall back to the CPU so the notebook still runs on
# machines without CUDA instead of raising on `.to("cuda")`.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)
# Switch to evaluation mode (disables dropout). Kept as the last expression so
# the module repr is still rendered as the cell output.
model.eval()

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Qwen3ForCausalLM(
      (model): Qwen3Model(
        (embed_tokens): Embedding(151936, 1024)
        (layers): ModuleList(
          (0-27): 28 x Qwen3DecoderLayer(
            (self_attn): Qwen3Attention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=1024, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=1024, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): lora.Linear(
                (base_layer): Linear(in_features=1024

In [None]:
import torch
# 2. Single-turn chat helper
def chat_qwen3(messages,
               enable_thinking: bool = True,
               max_new_tokens: int = 512,
               temperature: float = 0.7,
               top_p: float = 0.8,
               top_k: int = 20):
    """Generate one assistant reply for a chat transcript.

    Relies on the notebook-level ``tokenizer`` and ``model`` globals.

    Args:
        messages: List[Dict[str, str]] of chat turns, e.g.
            [{"role": "user", "content": "..."}].
        enable_thinking: Forwarded to the chat template to toggle the
            <think> mode.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature passed to ``model.generate``.
        top_p: Nucleus-sampling threshold passed to ``model.generate``.
        top_k: Top-k cutoff passed to ``model.generate``.

    Returns:
        The decoded text of the newly generated tokens only (the prompt is
        stripped and special tokens are skipped).
    """
    # NOTE(review): the default sampling values are the ones that used to be
    # hard-coded here. The Qwen3 model card suggests different values for
    # thinking mode (temperature=0.6, top_p=0.95) -- confirm which is intended.

    # 2.1 Apply the chat template: render the turns plus the generation prompt.
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=enable_thinking
    )
    # 2.2 Tokenize and move the encoded batch to the model's device.
    inputs = tokenizer([text], return_tensors="pt").to(model.device)
    # 2.3 Sample a continuation.
    generation = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=temperature,
        top_p=top_p,
        top_k=top_k
    )
    # 2.4 Drop the prompt tokens and keep only what the model generated.
    prompt_len = len(inputs.input_ids[0])
    output_ids = generation[0][prompt_len:]
    return tokenizer.decode(output_ids, skip_special_tokens=True)

# 3. Single-turn smoke test
messages = [dict(role="user", content="给我写一段关于大模型的简介。")]
response = chat_qwen3(messages=messages)
print("Bot:", response)

# 4. Multi-turn example (optional)
class QwenChatbot:
    """Minimal multi-turn chat wrapper that keeps the conversation history.

    ``history`` is a list of ``{"role": ..., "content": ...}`` dicts holding
    the alternating user/assistant turns of completed exchanges.
    """

    def __init__(self, model_name="Qwen/Qwen3-8B", model=None, tokenizer=None):
        """Set up the tokenizer, the model and an empty history.

        Args:
            model_name: Checkpoint name/path used to load whichever of
                ``model`` / ``tokenizer`` is not supplied.
            model: Optional already-loaded model to reuse instead of loading
                ``model_name``.
            tokenizer: Optional already-loaded tokenizer to reuse instead of
                loading one from ``model_name``.
        """
        if tokenizer is None:
            tokenizer = AutoTokenizer.from_pretrained(model_name)
        if model is None:
            model = AutoModelForCausalLM.from_pretrained(model_name,
                                                         torch_dtype=torch.float16,
                                                         device_map="auto")
        self.tokenizer = tokenizer
        self.model = model
        self.history = []

    def generate(self, user_input, enable_thinking=True, max_new_tokens=512):
        """Answer ``user_input`` in the context of the stored history.

        Args:
            user_input: Text of the new user turn.
            enable_thinking: Forwarded to the chat template to toggle the
                <think> mode.
            max_new_tokens: Upper bound on the number of generated tokens.

        Returns:
            The decoded reply (prompt stripped, special tokens skipped).

        The history is only updated after generation succeeds, so a failed
        call does not leave an unanswered user turn behind.
        """
        user_turn = {"role": "user", "content": user_input}
        # Build the prompt from a temporary transcript; self.history stays
        # untouched until a reply has actually been produced.
        conversation = self.history + [user_turn]
        text = self.tokenizer.apply_chat_template(
            conversation,
            tokenize=False,
            add_generation_prompt=True,
            enable_thinking=enable_thinking
        )
        inputs = self.tokenizer([text], return_tensors="pt").to(self.model.device)
        gen = self.model.generate(**inputs, max_new_tokens=max_new_tokens)
        # Drop the prompt tokens and keep only what the model generated.
        prompt_len = len(inputs.input_ids[0])
        reply = self.tokenizer.decode(gen[0][prompt_len:], skip_special_tokens=True)
        # Commit both turns together, mutating the existing list in place.
        self.history.extend([user_turn, {"role": "assistant", "content": reply}])
        return reply

# Multi-turn chatbot demo: the second turn switches thinking mode off.
bot = QwenChatbot()
print("Bot:", bot.generate(user_input="你好，Qwen3！", enable_thinking=True))
print("Bot:", bot.generate(user_input="请告诉我，AI 的未来是什么？", enable_thinking=False))