In [None]:
import torch

# Sanity check: report whether PyTorch can see a CUDA device. The device name is
# only queried when a device exists, because get_device_name() raises otherwise
# and would abort this diagnostic cell on a CPU-only setup.
cuda_available = torch.cuda.is_available()
print(cuda_available)  # Should print True
if cuda_available:
    print(torch.cuda.get_device_name(0))  # Prints your GPU name
else:
    print("No CUDA device detected.")

In [None]:
import os

import torch
from dotenv import load_dotenv
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Enable CUDA debugging.
# CUDA_LAUNCH_BLOCKING=1 makes kernel launches synchronous, so a failing kernel is
# reported at the call that triggered it instead of at some later call.
# NOTE(review): this is presumably only honoured if set before CUDA is initialised
# in this process — restart the kernel after changing it; confirm.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
# NOTE(review): TORCH_USE_CUDA_DSA appears to be a PyTorch build-time option, so
# setting it at runtime is presumably a no-op on stock wheels — kept as-is, confirm.
os.environ["TORCH_USE_CUDA_DSA"] = "1"

# Release cached GPU memory left over from earlier work in this kernel
torch.cuda.empty_cache()

# Load environment variables from .env file
load_dotenv()

# Retrieve the Hugging Face token; fail early with a clear message if it is missing
hf_token = os.getenv("HF_TOKEN")
if not hf_token:
    raise ValueError("HF_TOKEN not found in .env file. Please set it as HF_TOKEN=your_token")

# Specify the model
model_name = "google/gemma-3-4b-it"

# 4-bit quantization settings. Passing load_in_4bit directly to from_pretrained is
# the deprecated spelling; the supported way is a BitsAndBytesConfig object.
quantization_config = BitsAndBytesConfig(load_in_4bit=True)

# Load tokenizer and model with 4-bit quantization
tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    token=hf_token,
    # NOTE(review): float16 can overflow on checkpoints trained in bfloat16;
    # consider torch.bfloat16 if the GPU supports it — confirm against the model card.
    torch_dtype=torch.float16,
    device_map="auto",  # let accelerate decide where the weights are placed
    quantization_config=quantization_config,
    # NOTE(review): trust_remote_code=True allows executing Python shipped in the
    # model repo; drop it if the installed transformers supports this model natively.
    trust_remote_code=True,
)

In [None]:
# Prepare input prompt
# NOTE(review): the "-it" suffix in the model name suggests an instruction-tuned
# checkpoint; tokenizer.apply_chat_template may give better results than a raw
# prompt — confirm against the model card.
prompt = "Tell me a fun fact about the moon."

# Move the inputs to wherever the model actually lives instead of hard-coding
# "cuda": the model is loaded with device_map="auto", so its placement is decided
# at load time and a fixed device string can mismatch it.
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

# Generate response (greedy search to avoid multinomial)
outputs = model.generate(
    **inputs,
    max_new_tokens=50,  # Reduced for lower memory usage
    do_sample=False,  # Disable sampling to avoid multinomial
)

# Decode and print response (outputs[0] is the single sequence in the batch)
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(response)

In [None]:
import torch

# Free GPU memory that is no longer in use. empty_cache() only returns cached
# blocks that no live tensor occupies, so run the garbage collector first to drop
# tensors that are kept alive solely by reference cycles.
import gc

gc.collect()
torch.cuda.empty_cache()

In [None]:
import torch

# Show the CUDA version reported by this PyTorch build
cuda_build_version = torch.version.cuda
print(cuda_build_version)