In [None]:
# 🛠️ Step 0: Install dependencies (run once)
!pip install -q transformers accelerate peft bitsandbytes

def gen_prompt(tokenizer, sentence):
    """Wrap one user message in the tokenizer's chat template and return the prompt.

    Args:
        tokenizer: Object exposing ``apply_chat_template``.
        sentence: Text used as the ``content`` of the single ``user`` turn.

    Returns:
        The value returned by ``tokenizer.apply_chat_template`` when called with
        ``tokenize=False`` and ``add_generation_prompt=True``.
    """
    # A one-turn conversation: just the user's message.
    user_turn = {"role": "user", "content": sentence}
    return tokenizer.apply_chat_template(
        [user_turn],
        tokenize=False,
        add_generation_prompt=True,
    )
def generate(model, tokenizer, prompt, max_new_tokens=64, skip_special_tokens=False):
    """Run ``model.generate`` on ``prompt`` and return the first decoded sequence.

    Args:
        model: Object exposing ``device``, ``eval()`` and ``generate(**kwargs)``.
        tokenizer: Callable tokenizer exposing ``eos_token_id`` and ``batch_decode``.
        prompt: Text to tokenize and feed to the model.
        max_new_tokens: Forwarded to ``model.generate`` as ``max_new_tokens``.
        skip_special_tokens: Forwarded to ``tokenizer.batch_decode``.

    Returns:
        Element 0 of the list produced by ``tokenizer.batch_decode`` over the
        full output of ``model.generate``.
    """
    # add_special_tokens=False: presumably the prompt already carries its
    # chat-template markers (e.g. built by gen_prompt) -- confirm for other callers.
    encoded = tokenizer(prompt, add_special_tokens=False, return_tensors="pt")
    encoded = encoded.to(model.device)

    model.eval()
    sequences = model.generate(
        **encoded,
        eos_token_id=tokenizer.eos_token_id,
        max_new_tokens=max_new_tokens,
    )

    decoded = tokenizer.batch_decode(sequences, skip_special_tokens=skip_special_tokens)
    return decoded[0]

# ✅ Step 1: Import modules
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel
import torch

# ✅ Step 2: Report the device -- "cuda" when torch can see a GPU, otherwise "cpu"
device = "cpu" if not torch.cuda.is_available() else "cuda"
print("Using device:", device)

# ✅ Step 3: 4-bit quantization settings (NF4, double quantization, float32 compute)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float32,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)

# ✅ Step 4: Hub IDs of the base model and of the LoRA adapter applied on top of it
base_model_id, lora_model_id = (
    "microsoft/Phi-3-mini-4k-instruct",
    "zhtushar23/phi3-mini-yoda-adapter",
)

# ✅ Step 5: Load the quantized base model; device_map="auto" delegates placement
model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    quantization_config=bnb_config,
    device_map="auto",
)

# ✅ Step 6: Tokenizer comes from the base model ID, not from the adapter
tokenizer = AutoTokenizer.from_pretrained(base_model_id, use_fast=True)

# ✅ Step 7: Wrap the base model with the LoRA adapter and switch to eval mode
model = PeftModel.from_pretrained(model, lora_model_id)
model.eval()

# 🚀 Optional: merge the LoRA weights into the base model for faster inference
# model = model.merge_and_unload()



Using device: cuda


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

The Force is strong in you!
<|user|> The Force is strong in you!<|end|><|assistant|> Strong in you, the Force is!<|end|><|endoftext|>


In [None]:
# ✅ Step 8: Inference -- template the sentence, then print the input and the raw generation
sentence = 'You are Yoda. Explain the force to a child!'
prompt = gen_prompt(tokenizer, sentence)

print(sentence)
# Special tokens are kept so the chat-template markers stay visible in the output.
print(
    generate(
        model,
        tokenizer,
        prompt,
        max_new_tokens=64,
        skip_special_tokens=False,
    )
)

You are Yoda. Explain the force to a child!
<|user|> You are Yoda. Explain the force to a child!<|end|><|assistant|> Hrrmmm. To a child, explain the force, you are. Yes, hrrmmm.<|end|><|endoftext|>
