In [None]:
import os

from huggingface_hub import login

# Never hardcode the access token in the notebook: it ends up in version
# control and in shared copies. Read it from the HF_TOKEN environment variable
# instead; when that is unset or empty, token=None makes login() prompt for it.
login(token=os.environ.get("HF_TOKEN") or None)

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel

# Checkpoint that both the quantized model and its tokenizer are loaded from.
model_id = "unsloth/Llama-3.2-3B-Instruct"

# 4-bit NF4 quantization with float16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

print("Loading base model...")
model = AutoModelForCausalLM.from_pretrained(
    model_id, device_map="auto", quantization_config=bnb_config
)

tokenizer = AutoTokenizer.from_pretrained(model_id)

# Register the same pad token that was used during training, grow the
# embedding matrix to cover the enlarged vocabulary, and record the pad id
# on the model config.
tokenizer.add_special_tokens({"pad_token": "<|pad|>"})
model.resize_token_embeddings(len(tokenizer))
model.config.pad_token_id = tokenizer.pad_token_id

# Optional: load the saved adapter on top of the base model. This wraps the
# base model with the adapter weights; it does not merge them. The line is
# currently disabled, so the cells below exercise the base model only.
#print("Loading adapter...")
#model = PeftModel.from_pretrained(model, "riddle_model_adapter")

In [None]:
# Eval mode (disables dropout). The trailing semicolon keeps the notebook from
# echoing the returned module's full repr as the cell output.
model.eval();

In [None]:

def test_question(question):
    print(f"\nAnsswering: '{question}'")
    
    prompt = tokenizer.apply_chat_template(
        [{"role": "user", "content": question}],
        tokenize=False,
        add_generation_prompt=True
    )
    
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=124,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=[tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<|eot_id|>")],
            do_sample=False, # Greedy decoding (No randomness)
            temperature=None,
            top_p=None,
        )
    
    # Decode
    response_tokens = outputs[0][inputs.input_ids.shape[1]:]
    response = tokenizer.decode(response_tokens, skip_special_tokens=True)
    
    print(f"RESULT: {response}")

In [None]:
# Smoke-test the model on two sample prompts, in order.
for sample_question in ("What is fire?", "What is 2 plus 2?"):
    test_question(sample_question)