In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load model and tokenizer from local path
model_path = "../models/Qwen3-1.7B"

# Half precision saves memory on an accelerator, but device_map="auto" can
# fall back to CPU, where float16 is typically much slower (and not every
# operator supports it on every PyTorch build). Only request float16 when a
# GPU backend (CUDA or Apple MPS) is actually available; otherwise use float32.
has_accelerator = torch.cuda.is_available() or torch.backends.mps.is_available()
model_dtype = torch.float16 if has_accelerator else torch.float32

print("Loading model...")
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=model_dtype,  # float16 on GPU to save memory, float32 on CPU
    device_map="auto",  # Auto-assign to GPU if available
)

tokenizer = AutoTokenizer.from_pretrained(model_path)
print("Model loaded successfully!")


In [5]:
def generate_response(prompt, max_new_tokens=200, temperature=0.7, top_p=0.9):
    """
    Generate text from a prompt.

    Uses the module-level ``model`` and ``tokenizer``.

    Args:
        prompt: Text that is tokenized and passed to ``model.generate``.
        max_new_tokens: Forwarded to ``model.generate`` as ``max_new_tokens``.
        temperature: Sampling temperature forwarded to ``model.generate``.
            A non-positive value (e.g. ``0``) selects greedy decoding
            instead of sampling, because the sampler cannot work with a
            temperature that is not strictly positive.
        top_p: Nucleus-sampling threshold forwarded to ``model.generate``;
            only used when sampling.

    Returns:
        The first generated sequence decoded to a string with special tokens
        skipped. The full sequence returned by ``model.generate`` is decoded
        as-is (NOTE(review): for causal LMs this normally includes the prompt
        tokens as well - confirm if callers want only the continuation).
    """
    # Tokenize input and move the tensors to the device the model lives on
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    generation_kwargs = {
        "max_new_tokens": max_new_tokens,
        # Reuse EOS as the padding token so generate() has a pad id to work with
        "pad_token_id": tokenizer.eos_token_id,
    }

    # temperature <= 0 is the common way to ask for deterministic output, but
    # it is invalid for sampling; fall back to greedy decoding in that case.
    use_greedy = temperature is not None and temperature <= 0
    if use_greedy:
        generation_kwargs["do_sample"] = False
    else:
        generation_kwargs["do_sample"] = True
        generation_kwargs["temperature"] = temperature
        generation_kwargs["top_p"] = top_p

    # Generate without tracking gradients (inference only)
    with torch.no_grad():
        outputs = model.generate(**inputs, **generation_kwargs)

    # Decode and return
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

# Confirm that this cell ran and generate_response is now defined in the kernel.
print("Inference function ready!")

Inference function ready!


In [6]:
# Test the model
prompt = "Explain quantum computing in simple terms:"

# generate_response samples tokens by default, so seed PyTorch's RNG first to
# make this cell's output repeatable on a re-run (same hardware/software).
torch.manual_seed(42)

print(f"Prompt: {prompt}\n")
response = generate_response(prompt, max_new_tokens=150)
print(f"Response:\n{response}")

Prompt: Explain quantum computing in simple terms:



KeyboardInterrupt: 