In [1]:
import torch
print(torch.__version__)                     # e.g. 2.7.0+cu128 or higher
print(torch.version.cuda)                    # should be 12.8
print(torch.cuda.is_available())             # True
print(torch.cuda.get_device_name(0))         # NVIDIA GeForce RTX 5070 ...
print(torch.cuda.get_device_capability(0))   # (12, 0)
print(torch.cuda.get_arch_list())            # Should include 'sm_120', 'compute_120'

2.10.0+cu128
12.8
True
NVIDIA GeForce RTX 5070
(12, 0)
['sm_70', 'sm_75', 'sm_80', 'sm_86', 'sm_90', 'sm_100', 'sm_120']


In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
MODEL_PATH = "./qwen25-coder"   # local folder

tokenizer = AutoTokenizer.from_pretrained(
    MODEL_PATH,
    trust_remote_code=True
)

from accelerate import infer_auto_device_map, init_empty_weights

# Optional: custom device map to minimize offload
model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    dtype=torch.float16,
    low_cpu_mem_usage=True,
    device_map="auto",  # or try "cuda:0" to force single GPU
    offload_folder="offload",  # creates folder for disk offload if needed
    trust_remote_code=True
)
# model = AutoModelForCausalLM.from_pretrained(
#     MODEL_PATH,
#     torch_dtype=torch.float16,
#     low_cpu_mem_usage=True,
#     device_map="auto",  # or try "cuda:0" to force single GPU
#     offload_folder="offload",  # creates folder for disk offload if needed
#     trust_remote_code=True
# )
messages = [
    {"role": "system", "content": "You are a precise coding assistant."},
    {"role": "user", "content": "Implement a Fenwick tree in Python."}
]

prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)

inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

with torch.no_grad():
    output = model.generate(
        **inputs,
        max_new_tokens=400,
        temperature=0.2,
    )

print(tokenizer.decode(output[0], skip_special_tokens=True))


In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from accelerate import infer_auto_device_map, init_empty_weights

MODEL_PATH = "./qwen25-coder"   # local folder with the model files

tokenizer = AutoTokenizer.from_pretrained(
    MODEL_PATH,
    trust_remote_code=True
)

# Load model (your current setup – keep low_cpu_mem_usage + offload if VRAM is tight)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    dtype=torch.float16,
    low_cpu_mem_usage=True,
    device_map="auto",          # "cuda:0" if you want to force single GPU
    offload_folder="offload",   # for disk offload if needed
    trust_remote_code=True
)

# Optional: for better quality with temperature > 0
# You can also add eos_token_id=tokenizer.eos_token_id if it generates forever

print("Qwen2.5-Coder chat ready! Type your query below.")
print("Type 'exit', 'quit' or 'q' to stop.\n")

# Simple conversation loop
while True:
    user_query = input("You: ").strip()
    
    if user_query.lower() in ["exit", "quit", "q"]:
        print("Goodbye!")
        break
    
    if not user_query:
        print("Please enter a query.\n")
        continue
    
    # Build messages dynamically – add new user message each time
    messages = [
        {"role": "system", "content": "You are a precise coding assistant."},
        {"role": "user", "content": user_query}
    ]
    
    # Format with Qwen's chat template
    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    
    # Tokenize and move to device
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    
    # Generate
    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=800,           # increased a bit for longer answers
            temperature=0.2,
            do_sample=True,               # usually better with temp > 0
            top_p=0.95,                   # helps avoid repetition
        )
    
    # Decode – skip the input prompt part, only show new generated tokens
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    
    # Optional: strip the echoed prompt if you want cleaner output
    # (Qwen often repeats the user/system part)
    if generated_text.startswith(prompt.rstrip()):
        response = generated_text[len(prompt.rstrip()):].strip()
    else:
        response = generated_text
    
    print("\nAssistant:", response)
    print("-" * 60 + "\n")

Loading weights:   0%|          | 0/339 [00:00<?, ?it/s]

Qwen2.5-Coder chat ready! Type your query below.
Type 'exit', 'quit' or 'q' to stop.


Assistant: system
You are a precise coding assistant.
user
hi whats your name
assistant
Hello! I'm an AI language model created by OpenAI, and you can call me Assistant. How can I help you today?
------------------------------------------------------------



KeyboardInterrupt: 

: 