In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Checkpoint used throughout the notebook; the tokenizer and the weights are
# both fetched under this one identifier.
model_name = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Inference-only load: request bfloat16 weights and the low-CPU-memory
# loading path, then place the model on whichever device was selected above
# (this is the CPU when no GPU is present, not necessarily CUDA).
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    torch_dtype=torch.bfloat16,
)
model = model.to(device)

In [None]:
import torch
import random

# Pick one training example, ask the model for a step-by-step solution, and
# print the completion next to the reference answer.

# Seed every RNG this cell depends on. `random` picks the sample, but
# `generate(do_sample=True)` draws tokens from torch's generator, which
# `random.seed` does not touch -- without the torch seed the printed solution
# changes from run to run.
SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)

# Get a random sample from the dataset
# NOTE(review): `dataset` is not defined in the cells visible here; this
# assumes an earlier cell loaded it with a "train" split whose rows expose
# "question" and "reference_answer" -- confirm.
sample_idx = random.randint(0, len(dataset["train"]) - 1)
sample = dataset["train"][sample_idx]

print("Sample problem:")
print(sample["question"])
print("\nReference answer:")
print(sample["reference_answer"])

# Prepare the prompt for the model
prompt = f"Question: {sample['question']}\n\nSolve this step-by-step:"

# Tokenize the prompt and put the tensors on the device the model weights
# actually live on, rather than re-deriving the device independently.
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

# Number of prompt tokens; used below to separate prompt from continuation.
prompt_token_count = inputs.input_ids.shape[1]

# Generate the solution
with torch.no_grad():
    output = model.generate(
        inputs.input_ids,
        attention_mask=inputs.attention_mask,
        # Budget applies to newly generated tokens only. `max_length` would
        # also count the prompt, so long questions would silently get a
        # shorter (or empty) answer.
        max_new_tokens=2048,
        temperature=0.3,
        top_p=0.9,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
    )

# Full decoded sequence (prompt + continuation).
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

# Continuation only. Slice at the token level: the returned sequence starts
# with the prompt tokens, whereas slicing the decoded string by `len(prompt)`
# assumes decoding reproduces the prompt character-for-character, which is
# not guaranteed and can clip or leak text.
solution_text = tokenizer.decode(
    output[0][prompt_token_count:], skip_special_tokens=True
)

# Print the model's solution
print("\nModel's solution:")
print(solution_text)
